diff --git a/package.json b/package.json
index 08372d1141d3..3d611eb798b0 100644
--- a/package.json
+++ b/package.json
@@ -27,6 +27,7 @@
     "delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts",
     "dev": "cross-env npm start",
     "find-orphaned-assets": "node src/assets/scripts/find-orphaned-assets.js",
+    "find-orphaned-features": "tsx src/data-directory/scripts/find-orphaned-features/index.ts",
     "find-past-built-pr": "tsx src/workflows/find-past-built-pr.ts",
     "fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start",
     "fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",
diff --git a/src/data-directory/scripts/find-orphaned-features/find.ts b/src/data-directory/scripts/find-orphaned-features/find.ts
new file mode 100644
index 000000000000..c516fad8f8eb
--- /dev/null
+++ b/src/data-directory/scripts/find-orphaned-features/find.ts
@@ -0,0 +1,334 @@
+/**
+ * This script will loop over all pages, in all languages, and look at
+ * the following:
+ *
+ * 1. `title` in frontmatter
+ * 2. `intro` in frontmatter
+ * 3. `shortTitle` in frontmatter (if present)
+ * 4. the markdown body itself
+ * 5. The `versions:` frontmatter key (if the page is in English)
+ *
+ * Then it will search out the features mentioned based on `data/features/*.yml`
+ * It will make a Set of these (e.g. `dependabot-grouped-dependencies` and
+ * `ghas-enablement-webhook`) and one by one pluck them away.
+ *
+ * After the pages, it will loop over the reusables in English, and do the
+ * same search there. Once it's done the English, it loops over the
+ * reusables in the translations (if they exist) and does the same search.
+ *
+ * Lastly, it will output the remaining features, as relative file paths.
+ * For example, `data/features/havent-been-used-in-years.yml` so now you
+ * know that file can be deleted.
+ *
+ * NOTE: A lot of translations have corrupted Liquid. So if we can't parse
+ * the Liquid we fall back to string search. A regex will try to find
+ * all `{% ifversion ... %}` (and `elsif`) and search for any features
+ * mentioned inside that as a string.
+ *
+ */
+
+import fs from 'fs'
+import path from 'path'
+
+import chalk from 'chalk'
+import { TokenizationError } from 'liquidjs'
+
+import warmServer from '@/frame/lib/warm-server.js'
+import { getDeepDataByLanguage } from '@/data-directory/lib/get-data.js'
+import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
+import languages from '@/languages/lib/languages.js'
+import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content.js'
+
+type Options = {
+  sourceDirectory: string
+  output?: string
+  verbose?: boolean
+}
+
+type Page = {
+  permalinks: Permalink[]
+  relativePath: string
+  fullPath: string
+  title: string
+  shortTitle?: string
+  intro: string
+  markdown: string
+  languageCode: string
+  versions: Record<string, string>
+}
+type Permalink = {
+  href: string
+  languageCode: string
+}
+
+/**
+ * Entry point of the `find` command. Starts from every feature known in
+ * English, removes the ones that are referenced somewhere, and reports
+ * whatever is left as orphans.
+ *
+ * When `options.output` is set the orphans are written to that file (as a
+ * JSON array if the name ends with `.json`, otherwise one path per line).
+ * They are also printed, unless an output file is used without `verbose`.
+ */
+export async function find(options: Options): Promise<void> {
+  const { sourceDirectory } = options
+  if (process.env.ENABLED_LANGUAGES === 'en') {
+    console.warn(
+      chalk.yellow(
+        `Only English is enabled. Be careful with the output.
+        To include all translations make sure they're available and that
+        ENABLED_LANGUAGES is not set or set to 'all'.`.replaceAll(/\s\s+/g, ' '),
+      ),
+    )
+  }
+  const site = await warmServer([])
+
+  const features = new Set<string>(Object.keys(getDeepDataByLanguage('features', 'en')))
+  if (options.verbose) {
+    console.log(`Found ${features.size} features`)
+  }
+
+  const pageList: Page[] = site.pageList
+  if (options.verbose) {
+    console.log(`Searching ${pageList.length.toLocaleString()} pages`)
+  }
+
+  const t0 = new Date()
+  searchAndRemove(features, pageList, Boolean(options.verbose))
+  const t1 = new Date()
+
+  if (options.verbose) {
+    const color = features.size === 0 ? chalk.green : chalk.yellow
+    console.log(
+      color(
+        `Searched ${pageList.length.toLocaleString()} pages in ${formatDelta(t0, t1)}.
+        And found ${features.size} features remaining (i.e. orphans).`.replaceAll(/\s\s+/g, ' '),
+      ),
+    )
+  }
+
+  const remaining = Array.from(features).map((feature) =>
+    path.join(sourceDirectory, `${feature}.yml`),
+  )
+  if (options.output) {
+    if (options.output.endsWith('.json')) {
+      fs.writeFileSync(options.output, JSON.stringify(remaining, null, 2))
+    } else {
+      fs.writeFileSync(options.output, remaining.join('\n'))
+    }
+    if (!options.verbose) {
+      return
+    }
+  }
+  console.log(chalk.bold(`Orphans found (${remaining.length}):`))
+  for (const feature of remaining) {
+    console.log(chalk.green(feature))
+  }
+}
+
+/** Formats the time elapsed between `t0` and `t1` as seconds with one decimal. */
+function formatDelta(t0: Date, t1: Date): string {
+  const ms = t1.getTime() - t0.getTime()
+  return `${(ms / 1000).toFixed(1)} seconds`
+}
+
+/**
+ * Deletes from `features`, in place, every feature that is referenced by
+ * one of the given pages or by a reusable, in English or in a translation.
+ */
+function searchAndRemove(features: Set<string>, pages: Page[], verbose = false): void {
+  for (const page of pages) {
+    const content = page.markdown
+    // We actually never bother looking at the `versions:` frontmatter
+    // key in translations, so it doesn't matter if the translated
+    // frontmatter might have `versions: some-old-feature`.
+    if (page.languageCode === 'en') {
+      for (const [key, value] of Object.entries(page.versions)) {
+        if (key === 'feature') {
+          features.delete(value)
+        }
+      }
+    }
+
+    const combined = `
+      ${content}
+      ${page.title || ''}
+      ${page.shortTitle || ''}
+      ${page.intro || ''}
+    `
+
+    checkString(combined, features, { page, verbose, languageCode: page.languageCode })
+  }
+
+  // Reusables are a bit special, as they are shared between languages.
+  // There'll always be a slight mismatch between files present on disk
+  // in English vs. translations.
+  // The translations never delete files, so there's often excess reusables
+  // on disk in translations. And the English might be ahead, meaning a file
+  // has been introduced in English but not yet translated.
+  // The code below loops over the English reusables, and takes note of
+  // their relative paths and content. Then, we re-use the keys of that map
+  // to know which files, in the translations, to check. And when we read
+  // them in, we'll need the English equivalent content to be able to
+  // use the correctTranslatedContentStrings function.
+
+  const englishReusables = new Map<string, string>()
+  for (const filePath of getReusableFiles(path.join(languages.en.dir, 'data', 'reusables'))) {
+    const relativePath = path.relative(languages.en.dir, filePath)
+    const fileContent = fs.readFileSync(filePath, 'utf-8')
+    checkString(fileContent, features, { filePath, verbose, languageCode: 'en' })
+    englishReusables.set(relativePath, fileContent)
+  }
+  for (const language of Object.values(languages)) {
+    if (language.code === 'en') continue // Already did that in the loop above
+
+    for (const [relativePath, englishFileContent] of Array.from(englishReusables.entries())) {
+      const filePath = path.join(language.dir, relativePath)
+      try {
+        const fileContent = fs.readFileSync(filePath, 'utf-8')
+        const correctedFileContent = correctTranslatedContentStrings(
+          fileContent,
+          englishFileContent,
+          {
+            code: language.code,
+            relativePath,
+          },
+        )
+
+        checkString(correctedFileContent, features, {
+          filePath,
+          verbose,
+          languageCode: language.code,
+        })
+      } catch (error) {
+        if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
+          // That a reusable does *not* exist in a translation is
+          // perfectly expected. It means that English reusable was
+          // most likely added recently and the translation hasn't been
+          // translated yet.
+          continue
+        }
+        throw error
+      }
+    }
+  }
+}
+
+/** Recursively lists the `.md` files below `root`, leaving out `README.md` files. */
+function getReusableFiles(root: string): string[] {
+  const here: string[] = []
+  for (const file of fs.readdirSync(root)) {
+    const filePath = path.join(root, file)
+    if (fs.statSync(filePath).isDirectory()) {
+      here.push(...getReusableFiles(filePath))
+    } else if (file.endsWith('.md') && file !== 'README.md') {
+      here.push(filePath)
+    }
+  }
+  return here
+}
+
+// Arguments of `ifversion`/`elsif` tags that are skipped right away when
+// looking for feature names.
+const IGNORE_ARGS = new Set(['or', 'and', 'not', '<', '>', 'ghes', 'fpt', 'ghec', '!=', '='])
+
+/**
+ * Deletes from `features`, in place, every feature used as an argument of
+ * an `ifversion` or `elsif` Liquid tag in `string`.
+ *
+ * If the Liquid can't be tokenized the error is rethrown for English;
+ * for any other language the search falls back to `findByRegex`.
+ */
+function checkString(
+  string: string,
+  features: Set<string>,
+  {
+    page,
+    filePath,
+    languageCode,
+    verbose = false,
+  }: { page?: Page; filePath?: string; languageCode?: string; verbose?: boolean } = {},
+): void {
+  try {
+    for (const token of getLiquidTokens(string)) {
+      if (token.name === 'ifversion' || token.name === 'elsif') {
+        for (const arg of token.args.split(/\s+/)) {
+          if (IGNORE_ARGS.has(arg)) continue
+          if (isFloat(arg)) continue
+
+          features.delete(arg)
+        }
+      }
+    }
+  } catch (error) {
+    if (error instanceof TokenizationError) {
+      // If it happens in English, it's a serious error
+      if (languageCode === 'en') throw error
+
+      // The translation might, currently, have corrupted liquid
+      // So treat it as a string
+      if (verbose)
+        console.log(
+          `TokenizationError in ${page ? page.fullPath : filePath}. Treating ${page ? page.fullPath : filePath} as a string and using regex`,
+        )
+
+      for (const feature of Array.from(findByRegex(features, string))) {
+        features.delete(feature)
+      }
+    } else {
+      throw error
+    }
+  }
+}
+
+/**
+ * String-based fallback for content whose Liquid can't be tokenized.
+ * Returns the features that appear, case-insensitively, preceded by
+ * whitespace and followed by whitespace or `%` inside an
+ * `{% ifversion ... %}` or `{% elsif ... %}` tag.
+ */
+function findByRegex(features: Set<string>, string: string): Set<string> {
+  const found = new Set<string>()
+  const tags = string.match(/\{%\s*(ifversion|elsif)\s*(.*?)\s*%\}/g) || []
+  if (tags.length === 0) return found
+
+  for (const feature of Array.from(features)) {
+    // One regex per feature, instead of one per feature per tag.
+    const regex = new RegExp(`\\s${escapeRegex(feature)}(\\s|%)`, 'i')
+    if (tags.some((tag) => regex.test(tag))) {
+      found.add(feature)
+    }
+  }
+  return found
+}
+
+// Cheap sanity check of `findByRegex` that runs when the module is loaded.
+const test = findByRegex(
+  new Set(['placeholder', 'foo-bar']),
+  `
+  placeholder
+
+  {%ifversion placeholder-foo or fpt%}
+  {% elsif not-placeholder %}
+  {% elsif foo-bar%}
+  {%endif %}
+
+  {% data reusables.enterprise-migration-tool.placeholder-table %}
+  {% data placeholder %}
+`,
+)
+console.assert(test.has('foo-bar'), Array.from(test).join(', '))
+console.assert(!test.has('placeholder'), Array.from(test).join(', '))
+
+/** Backslash-escapes regular-expression metacharacters (plus `/` and `-`) in `string`. */
+function escapeRegex(string: string): string {
+  return string.replace(/[/\-\\^$*+?.()|[\]{}]/g, '\\$&')
+}
+
+/**
+ * Whether `x` is, in its entirety, a decimal number such as `3` or `3.10`.
+ * A mere numeric prefix (e.g. a name like `2fa-something`) doesn't count.
+ */
+function isFloat(x: unknown): boolean {
+  return /^[+-]?(\d+\.?\d*|\.\d+)$/.test(String(x).trim())
+}
diff --git a/src/data-directory/scripts/find-orphaned-features/index.ts b/src/data-directory/scripts/find-orphaned-features/index.ts
new file mode 100644
index 000000000000..b4ec025170ca
--- /dev/null
+++ b/src/data-directory/scripts/find-orphaned-features/index.ts
@@ -0,0 +1,18 @@
+import { program } from 'commander'
+import { find } from './find'
+
+program
+  .name('find-orphaned-features')
+  .description(
+    "Compare what's in data/features/*.yml with what's mentioned in Markdown and frontmatter",
+  )
+
+program
+  .command('find')
+  .description('Figure out what features are not being used')
+  .option('-s, --source-directory <directory>', 'Source directory', 'data/features')
+  .option('-o, --output <output-file>', 'Output file')
+  .option('-v, --verbose', 'Verbose')
+  .action(find)
+
+program.parse(process.argv)