Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repo sync #32278

Merged
merged 1 commit into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts",
"dev": "cross-env npm start",
"find-orphaned-assets": "node src/assets/scripts/find-orphaned-assets.js",
"find-orphaned-features": "tsx src/data-directory/scripts/find-orphaned-features/index.ts",
"find-past-built-pr": "tsx src/workflows/find-past-built-pr.ts",
"fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start",
"fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",
Expand Down
300 changes: 300 additions & 0 deletions src/data-directory/scripts/find-orphaned-features/find.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
/**
* This script will loop over all pages, in all languages, and look at
* the following:
*
* 1. `title` in frontmatter
* 2. `intro` in frontmatter
* 3. `shortTitle` in frontmatter (if present)
* 4. the markdown body itself
* 5. The `versions:` frontmatter key (if the page is in English)
*
* Then it will search out the features mentioned based on `data/features/*.yml`
* It will make a Set of these (e.g. `dependabot-grouped-dependencies` and
* `ghas-enablement-webhook`) and one by one pluck them away.
*
* After the pages, it will loop over the reusables in English, and do the
* same search there. Once it's done with the English reusables, it loops
* over the reusables in the translations (if they exist) and does the same
* search.
*
* Lastly, it will output the remaining features, as relative file paths.
* For example, `data/features/havent-been-used-in-years.yml` so now you
* know that file can be deleted.
*
* NOTE: A lot of translations have corrupted Liquid. So if we can't parse
* the Liquid we fall back to string search. A regex will try to find
* all `{% ifversion ... %}` (and `elsif`) and search for any features
* mentioned inside that as a string.
*
*/

import fs from 'fs'
import path from 'path'

import chalk from 'chalk'
import { TokenizationError } from 'liquidjs'

import warmServer from '@/frame/lib/warm-server.js'
import { getDeepDataByLanguage } from '@/data-directory/lib/get-data.js'
import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
import languages from '@/languages/lib/languages.js'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content.js'

/** Options accepted by `find`. */
interface Options {
  /** Directory prefix joined onto each orphaned `<feature>.yml` in the report. */
  sourceDirectory: string
  /** Optional file to write the report to; a `.json` suffix selects JSON output. */
  output?: string
  /** When truthy, progress and timing details are logged. */
  verbose?: boolean
}

/** The subset of a page object that this script reads. */
interface Page {
  permalinks: Permalink[]
  relativePath: string
  /** Path used in log output when the page's Liquid cannot be tokenized. */
  fullPath: string
  title: string
  shortTitle?: string
  intro: string
  /** The Markdown body, searched for `ifversion`/`elsif` tags. */
  markdown: string
  /** Language of the page; `versions` is only consulted when this is `'en'`. */
  languageCode: string
  /** The `versions` frontmatter; a `feature` key names a feature in use. */
  versions: Record<string, string>
}

/** A single permalink belonging to a page. */
interface Permalink {
  href: string
  languageCode: string
}

/**
 * Find the features that are not mentioned by any page or reusable and
 * report them as `<sourceDirectory>/<feature>.yml` paths.
 *
 * Every English feature name starts out as a candidate orphan;
 * `searchAndRemove` plucks away each one it finds in use. Whatever is left
 * is written to `options.output` (JSON when the file name ends in `.json`,
 * otherwise one path per line) and/or printed to stdout. When an output
 * file is given, the list is only printed as well if `verbose` is set.
 *
 * @param options - Source directory, optional output file and verbosity.
 */
export async function find(options: Options): Promise<void> {
  const { sourceDirectory } = options
  const verbose = Boolean(options.verbose)

  if (process.env.ENABLED_LANGUAGES === 'en') {
    console.warn(
      chalk.yellow(
        // The message is wrapped for readability; collapse the line breaks
        // and indentation back into single spaces.
        `Only English is enabled. Be careful with the output.
        To include all translations make sure they're available and that
        ENABLED_LANGUAGES is not set or set to 'all'.`.replaceAll(/\s\s+/g, ' '),
      ),
    )
  }
  const site = await warmServer([])

  const features = new Set(Object.keys(getDeepDataByLanguage('features', 'en')))
  if (verbose) {
    console.log(`Found ${features.size} features`)
  }

  const pageList: Page[] = site.pageList
  if (verbose) {
    console.log(`Searching ${pageList.length.toLocaleString()} pages`)
  }

  const t0 = new Date()
  searchAndRemove(features, pageList, verbose)
  const t1 = new Date()

  if (verbose) {
    const color = features.size === 0 ? chalk.green : chalk.yellow
    console.log(
      color(
        // Global flag so that *every* wrapped line break is collapsed,
        // matching the warning message above.
        `Searched ${pageList.length.toLocaleString()} pages in ${formatDelta(t0, t1)}.
        And found ${features.size} features remaining (i.e. orphans).`.replaceAll(/\s\s+/g, ' '),
      ),
    )
  }

  const remaining = Array.from(features).map((feature) =>
    path.join(sourceDirectory, `${feature}.yml`),
  )
  if (options.output) {
    if (options.output.endsWith('.json')) {
      fs.writeFileSync(options.output, JSON.stringify(remaining, null, 2))
    } else {
      fs.writeFileSync(options.output, remaining.join('\n'))
    }
    // With an output file, only echo the list to stdout in verbose mode.
    if (!verbose) {
      return
    }
  }
  console.log(chalk.bold(`Orphans found (${remaining.length}):`))
  for (const feature of remaining) {
    console.log(chalk.green(feature))
  }
}

/**
 * Render the time elapsed between two instants as seconds with one
 * decimal place, e.g. `1.5 seconds`.
 */
function formatDelta(t0: Date, t1: Date) {
  const elapsedSeconds = (t1.getTime() - t0.getTime()) / 1000
  return elapsedSeconds.toFixed(1) + ' seconds'
}

/**
 * Remove from `features` every feature that is mentioned by a page or by a
 * reusable, in any language. The set is mutated in place; whatever remains
 * afterwards was not found anywhere.
 *
 * @param features - Candidate orphan feature names (mutated).
 * @param pages - The pages to search.
 * @param verbose - Forwarded to `checkString` to enable its logging.
 */
function searchAndRemove(features: Set<string>, pages: Page[], verbose = false) {
  for (const page of pages) {
    const { languageCode } = page

    // The `versions:` frontmatter key is only looked at in English, so it
    // doesn't matter if translated frontmatter still says
    // `versions: some-old-feature`.
    if (languageCode === 'en') {
      Object.entries(page.versions)
        .filter(([key]) => key === 'feature')
        .forEach(([, value]) => {
          features.delete(value)
        })
    }

    // Search the body and the frontmatter strings in one go.
    const combined = `\n${page.markdown}\n${page.title || ''}\n${page.shortTitle || ''}\n${page.intro || ''}\n`

    checkString(combined, features, { page, verbose, languageCode })
  }

  // Reusables are a bit special, as they are shared between languages.
  // There'll always be a slight mismatch between the files present on disk
  // in English vs. the translations: translations never delete files, so
  // they often carry excess reusables, and English might be ahead, meaning
  // a file has been introduced in English but not yet translated.
  // So: first walk the English reusables, remembering their relative paths
  // and content. Then use those relative paths to decide which files to
  // check in each translation. The English content is kept because
  // correctTranslatedContentStrings needs it alongside the translation.
  const englishRoot = languages.en.dir
  const englishReusables = new Map<string, string>()
  for (const filePath of getReusableFiles(path.join(englishRoot, 'data', 'reusables'))) {
    const englishContent = fs.readFileSync(filePath, 'utf-8')
    checkString(englishContent, features, { filePath, verbose, languageCode: 'en' })
    englishReusables.set(path.relative(englishRoot, filePath), englishContent)
  }

  // English was handled by the loop above.
  const translations = Object.values(languages).filter((language) => language.code !== 'en')
  for (const language of translations) {
    for (const [relativePath, englishFileContent] of Array.from(englishReusables)) {
      const filePath = path.join(language.dir, relativePath)
      try {
        const translated = fs.readFileSync(filePath, 'utf-8')
        const corrected = correctTranslatedContentStrings(translated, englishFileContent, {
          code: language.code,
          relativePath,
        })

        checkString(corrected, features, {
          filePath,
          verbose,
          languageCode: language.code,
        })
      } catch (error) {
        // That a reusable does *not* exist in a translation is perfectly
        // expected. It means the English reusable was most likely added
        // recently and hasn't been translated yet. Anything else is fatal.
        const isMissingFile = error instanceof Error && 'code' in error && error.code === 'ENOENT'
        if (!isMissingFile) throw error
      }
    }
  }
}

/**
 * Recursively list the Markdown files below `root`, skipping any file
 * named `README.md`.
 *
 * @param root - Directory to walk.
 * @returns Paths built as `${root}/${name}`, in directory-listing order.
 */
function getReusableFiles(root: string): string[] {
  return fs.readdirSync(root).flatMap((entry): string[] => {
    const entryPath = `${root}/${entry}`
    if (fs.statSync(entryPath).isDirectory()) {
      return getReusableFiles(entryPath)
    }
    const isReusable = entry.endsWith('.md') && entry !== 'README.md'
    return isReusable ? [entryPath] : []
  })
}

// Words inside `{% ifversion ... %}` that can never be feature names.
const IGNORE_ARGS = new Set(['or', 'and', 'not', '<', '>', 'ghes', 'fpt', 'ghec', '!=', '='])

/**
 * Look through the `ifversion`/`elsif` Liquid tags of `string` and remove
 * every feature mentioned there from `features` (mutated in place).
 *
 * If the Liquid can't be tokenized, the error is rethrown for English
 * content. For any other language the content is searched with a regex
 * instead (see `findByRegex`).
 *
 * @param string - Content to search.
 * @param features - Candidate orphan feature names (mutated).
 * @param context - Where the content came from: `page` or `filePath` is
 *   used in the verbose log line, `languageCode` decides whether a
 *   tokenization failure is fatal, and `verbose` enables that log line.
 */
function checkString(
  string: string,
  features: Set<string>,
  {
    page,
    filePath,
    languageCode,
    verbose = false,
  }: { page?: Page; filePath?: string; languageCode?: string; verbose?: boolean } = {},
) {
  try {
    for (const token of getLiquidTokens(string)) {
      if (token.name !== 'ifversion' && token.name !== 'elsif') continue

      token.args
        .split(/\s+/)
        .filter((arg: string) => !IGNORE_ARGS.has(arg) && !isFloat(arg))
        .forEach((arg: string) => {
          features.delete(arg)
        })
    }
  } catch (error) {
    if (!(error instanceof TokenizationError)) throw error

    // If it happens in English, it's a serious error
    if (languageCode === 'en') throw error

    // The translation might, currently, have corrupted liquid
    // So treat it as a string
    if (verbose) {
      const source = page ? page.fullPath : filePath
      console.log(
        `TokenizationError in ${source}. Treating ${source} as a string and using regex`,
      )
    }

    for (const feature of Array.from(findByRegex(features, string))) {
      features.delete(feature)
    }
  }
}

/**
 * String-based fallback for content whose Liquid can't be tokenized.
 *
 * Finds every `{% ifversion ... %}` / `{% elsif ... %}` tag in `string` and
 * collects the features that appear inside one as a whole word: preceded
 * by whitespace and followed by whitespace or `%`, compared
 * case-insensitively.
 *
 * @param features - Feature names to look for (not mutated).
 * @param string - Content to search.
 * @returns The subset of `features` mentioned in such a tag.
 */
function findByRegex(features: Set<string>, string: string) {
  const tags = string.match(/\{%\s*(ifversion|elsif)\s*(.*?)\s*%\}/g) ?? []
  const found = new Set<string>()
  for (const tag of tags) {
    Array.from(features)
      .filter((feature) => new RegExp(`\\s${escapeRegex(feature)}(\\s|%)`, 'i').test(tag))
      .forEach((feature) => {
        found.add(feature)
      })
  }
  return found
}

// Import-time sanity check of `findByRegex`: a feature must only match as
// a whole word inside an `ifversion`/`elsif` tag, never as part of a longer
// name (`placeholder-foo`, `not-placeholder`) or inside another tag
// (`{% data ... %}`). `console.assert` only logs on failure; it never throws.
const test = findByRegex(
  new Set(['placeholder', 'foo-bar']),
  `
placeholder

{%ifversion placeholder-foo or fpt%}
{% elsif not-placeholder %}
{% elsif foo-bar%}
{%endif %}

{% data reusables.enterprise-migration-tool.placeholder-table %}
{% data placeholder %}
`,
)
// `Set.prototype.toString()` yields "[object Set]", which says nothing about
// what was matched, so spell out the members in the failure message.
console.assert(test.has('foo-bar'), Array.from(test).join(', '))
console.assert(!test.has('placeholder'), Array.from(test).join(', '))

/**
 * Backslash-escape the characters that have special meaning in a regular
 * expression, so `string` can be embedded in a `RegExp` and match literally.
 */
function escapeRegex(string: string) {
  const specialCharacters = /[/\-\\^$*+?.()|[\]{}]/g
  return string.replace(specialCharacters, (character) => `\\${character}`)
}

/**
 * Tell whether `x` is a number or a string that is, in its entirety, a
 * finite number (e.g. `"3.10"`).
 *
 * The whole string has to be numeric. `parseFloat` would accept any
 * numeric prefix, which makes a feature name that merely starts with a
 * digit (e.g. `2fa-something`) look like a version number.
 *
 * @param x - Value to test; typically one whitespace-separated argument of
 *   an `ifversion` tag.
 * @returns `true` only for finite numbers and fully numeric strings.
 */
function isFloat(x: unknown): boolean {
  if (typeof x === 'number') return Number.isFinite(x)
  if (typeof x !== 'string') return false
  const trimmed = x.trim()
  // `Number('')` is 0, so an empty string has to be rejected explicitly.
  return trimmed !== '' && Number.isFinite(Number(trimmed))
}
18 changes: 18 additions & 0 deletions src/data-directory/scripts/find-orphaned-features/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import { program } from 'commander'
import { find } from './find'

// Command-line entry point: declares the `find` sub-command and hands the
// parsed options to `find`.
program
  .name('find-orphaned-features')
  .description(
    "Compare what's in data/features/*.yml with what's mentioned in Markdown and frontmatter",
  )

program
  .command('find')
  .description('Figure out what features are not being used')
  .option('-s, --source-directory <directory>', 'Source directory', 'data/features')
  .option('-o, --output <output-file>', 'Output file')
  .option('-v, --verbose', 'Verbose')
  .action(find)

// `find` is async, so use `parseAsync`: it waits for the action handler and
// surfaces a rejection here instead of leaving the promise unobserved.
program.parseAsync(process.argv).catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})
Loading