From 2cd06026ea063315f3b98516c5ed129d9fea5706 Mon Sep 17 00:00:00 2001 From: Alex Nguyen <150945400+nguyenalex836@users.noreply.github.com> Date: Wed, 27 Mar 2024 09:49:02 -0700 Subject: [PATCH 1/3] Update sme-review-tracking-issue.yml (#49901) --- .github/workflows/sme-review-tracking-issue.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/sme-review-tracking-issue.yml b/.github/workflows/sme-review-tracking-issue.yml index 327a4a0d5ada..f642a846170e 100644 --- a/.github/workflows/sme-review-tracking-issue.yml +++ b/.github/workflows/sme-review-tracking-issue.yml @@ -13,10 +13,6 @@ on: types: - labeled - pull_request: - types: - - labeled - permissions: contents: read From ebf4aad5a0d9d082c5b9576d323f8b5bff1bf95f Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Wed, 27 Mar 2024 12:57:23 -0400 Subject: [PATCH 2/3] Change sort order of queries by attributes (#49882) --- .../generate-code-scanning-query-list.ts | 47 +++++++++++++------ 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/src/code-scanning/scripts/generate-code-scanning-query-list.ts b/src/code-scanning/scripts/generate-code-scanning-query-list.ts index f05d2f2fabae..3b63af44eaac 100644 --- a/src/code-scanning/scripts/generate-code-scanning-query-list.ts +++ b/src/code-scanning/scripts/generate-code-scanning-query-list.ts @@ -90,6 +90,12 @@ type Query = { autofixSupport: 'none' | 'default' } +type QueryExtended = Query & { + inDefault: boolean + inExtended: boolean + inAutofix: boolean +} + const opts = program.opts() main( { @@ -162,8 +168,28 @@ async function main(options: Options, language: string) { } } - const entries = Object.values(queries) - entries.sort((a, b) => a.name.localeCompare(b.name)) + function decorate(query: Query): QueryExtended { + return { + ...query, + inDefault: query.packs.includes('code-scanning'), + inExtended: query.packs.includes('security-extended'), + inAutofix: query.autofixSupport === 'default', + } + } + + const entries = Object.values(queries).map(decorate) + + // Spec: "Queries that are both in Default and Extended should come first, + // in alphabetical order. Followed by the queries that are in Extended only." + entries.sort((a, b) => { + if (a.inDefault && !b.inDefault) return -1 + else if (!a.inDefault && b.inDefault) return 1 + + if (a.inExtended && !b.inExtended) return -1 + else if (!a.inExtended && b.inExtended) return 1 + + return a.name.localeCompare(b.name) + }) // At the moment, our chosen business logic is that we omit the Autofix // column if there are no queries that support it. @@ -174,7 +200,7 @@ async function main(options: Options, language: string) { printQueries(options, entries, includeAutofix) } -function printQueries(options: Options, queries: Query[], includeAutofix: boolean) { +function printQueries(options: Options, queries: QueryExtended[], includeAutofix: boolean) { const markdown = [] markdown.push('{% rowheaders %}') markdown.push('') // blank line @@ -190,18 +216,9 @@ function printQueries(options: Options, queries: Query[], includeAutofix: boolea for (const query of queries) { const markdownLink = `[${query.name}](${query.url})` - let defaultIcon = notIncludedOcticon - let extendedIcon = notIncludedOcticon - let autofixIcon = notIncludedOcticon - if (query.packs.includes('code-scanning')) { - defaultIcon = includedOcticon - } - if (query.packs.includes('security-extended')) { - extendedIcon = includedOcticon - } - if (query.autofixSupport === 'default') { - autofixIcon = includedOcticon - } + const defaultIcon = query.inDefault ? includedOcticon : notIncludedOcticon + const extendedIcon = query.inExtended ? includedOcticon : notIncludedOcticon + const autofixIcon = query.inAutofix ? includedOcticon : notIncludedOcticon const row = [markdownLink, query.cwes.join(', '), defaultIcon, extendedIcon] if (includeAutofix) { row.push(autofixIcon) From b141a77a4595d634991c1a662547e75101af603b Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Wed, 27 Mar 2024 12:58:04 -0400 Subject: [PATCH 3/3] Count corrupted liquid (#49879) --- .../count-translation-corruptions.yml | 43 +++++ package.json | 1 + src/frame/lib/page-data.js | 16 +- src/frame/lib/warm-server.js | 8 +- .../scripts/count-translation-corruptions.ts | 168 ++++++++++++++++++ 5 files changed, 228 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/count-translation-corruptions.yml create mode 100644 src/languages/scripts/count-translation-corruptions.ts diff --git a/.github/workflows/count-translation-corruptions.yml b/.github/workflows/count-translation-corruptions.yml new file mode 100644 index 000000000000..d955dea5c3ec --- /dev/null +++ b/.github/workflows/count-translation-corruptions.yml @@ -0,0 +1,43 @@ +name: Count translation corruptions + +# **What it does**: Generates a summary of Liquid corruptions per language. +# **Why we have it**: For insights into the state of translations and things we can do to fix them +# **Who does it impact**: Engineering + +on: + workflow_dispatch: + pull_request: + paths: + - src/languages/scripts/count-translation-corruptions.ts + - .github/workflows/count-translation-corruptions.yml + - .github/actions/node-npm-setup/action.yml + - .github/actions/clone-translations/action.yml + - 'package**.json' + +permissions: + contents: read + +jobs: + count-translation-corruptions: + if: github.repository == 'github/docs-internal' + runs-on: ubuntu-20.04-xl + steps: + - name: Checkout English repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + # Using a PAT is necessary so that the new commit will trigger the + # CI in the PR. (Events from GITHUB_TOKEN don't trigger new workflows.) + token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }} + + # It's important because translations are often a bit behind. + # So if a translation is a bit behind, it might still be referencing + # an asset even though none of the English content does. + - name: Clone all translations + uses: ./.github/actions/clone-translations + with: + token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }} + + - uses: ./.github/actions/node-npm-setup + + - name: Run count + run: npm run count-translation-corruptions diff --git a/package.json b/package.json index 2db69abe1f4e..b7a5f29fca6b 100644 --- a/package.json +++ b/package.json @@ -22,6 +22,7 @@ "check-content-type": "node src/workflows/check-content-type.js", "check-github-github-links": "node src/links/scripts/check-github-github-links.js", "copy-fixture-data": "node src/tests/scripts/copy-fixture-data.js", + "count-translation-corruptions": "tsx src/languages/scripts/count-translation-corruptions.ts", "debug": "cross-env NODE_ENV=development ENABLED_LANGUAGES=en nodemon --inspect src/frame/server.js", "delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts", "dev": "cross-env npm start", diff --git a/src/frame/lib/page-data.js b/src/frame/lib/page-data.js index 013787f342ff..5018d7f8acba 100644 --- a/src/frame/lib/page-data.js +++ b/src/frame/lib/page-data.js @@ -266,13 +266,17 @@ async function translateTree(dir, langObj, enTree) { * * Order of languages and versions doesn't matter, but order of child page arrays DOES matter (for navigation). */ -export async function loadSiteTree(unversionedTree) { - const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree())) +export async function loadSiteTree(unversionedTree, languagesOnly = []) { + const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree(languagesOnly))) const siteTree = {} + const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages) // For every language... await Promise.all( - Object.keys(languages).map(async (langCode) => { + langCodes.map(async (langCode) => { + if (!(langCode in rawTree)) { + throw new Error(`No tree for language ${langCode}`) + } const treePerVersion = {} // in every version... await Promise.all( @@ -329,8 +333,12 @@ export async function loadPageList(unversionedTree, languagesOnly = []) { const rawTree = unversionedTree || (await loadUnversionedTree(languagesOnly)) const pageList = [] + const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages) await Promise.all( - ((languagesOnly.length && languagesOnly) || Object.keys(languages)).map(async (langCode) => { + langCodes.map(async (langCode) => { + if (!(langCode in rawTree)) { + throw new Error(`No tree for language ${langCode}`) + } await addToCollection(rawTree[langCode], pageList) }), ) diff --git a/src/frame/lib/warm-server.js b/src/frame/lib/warm-server.js index 3dca776558c3..0bac4892e413 100644 --- a/src/frame/lib/warm-server.js +++ b/src/frame/lib/warm-server.js @@ -26,8 +26,8 @@ async function warmServer(languagesOnly = []) { } const unversionedTree = await dog.loadUnversionedTree(languagesOnly) - const siteTree = await dog.loadSiteTree(unversionedTree) - const pageList = await dog.loadPages(unversionedTree) + const siteTree = await dog.loadSiteTree(unversionedTree, languagesOnly) + const pageList = await dog.loadPages(unversionedTree, languagesOnly) const pageMap = await dog.loadPageMap(pageList) const redirects = await dog.loadRedirects(pageList) @@ -52,12 +52,12 @@ dog.warmServer = statsd.asyncTimer(warmServer, 'warm_server') // We only want statistics if the priming needs to occur, so let's wrap the // real method and return early [without statistics] whenever possible -export default async function warmServerWrapper() { +export default async function warmServerWrapper(languagesOnly = []) { // Handle receiving multiple calls to this method from multiple page requests // by holding the in-progress Promise and returning it instead of allowing // the server to actually load all of the files multiple times. if (!promisedWarmServer) { - promisedWarmServer = dog.warmServer() + promisedWarmServer = dog.warmServer(languagesOnly) } return promisedWarmServer } diff --git a/src/languages/scripts/count-translation-corruptions.ts b/src/languages/scripts/count-translation-corruptions.ts new file mode 100644 index 000000000000..8a7699bbc2f4 --- /dev/null +++ b/src/languages/scripts/count-translation-corruptions.ts @@ -0,0 +1,168 @@ +import path from 'path' +import fs from 'fs' + +import { program } from 'commander' +import chalk from 'chalk' +import { TokenizationError } from 'liquidjs' +import walk from 'walk-sync' + +import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js' +import languages from '@/languages/lib/languages.js' +import warmServer, { type Site } from '@/frame/lib/warm-server.js' +import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content.js' + +program + .description('Tally the number of liquid corruptions in a translation') + .argument('[language...]', 'language(s) to compare against') + .action(main) +program.parse(process.argv) + +type Page = { + relativePath: string + fullPath: string + title: string + shortTitle?: string + intro: string + markdown: string + languageCode: string +} + +type Reusables = Map + +async function main(languageCodes: string[]) { + const langCodes = languageCodes.length + ? languageCodes + : Object.keys(languages).filter((x) => x !== 'en') + const site = await warmServer(languageCodes.length ? ['en', ...langCodes] : []) + + // When checking reusables, we only want to check the files that + // have an English equivalent. + const reusables = getReusables() + + const totalErrors = new Map() + + for (const languageCode of langCodes) { + if (!(languageCode in languages)) { + console.error(chalk.red(`Language ${languageCode} not found`)) + return process.exit(1) + } + if (languageCode === 'en') { + console.error(chalk.red("Can't test in English ('en')")) + return process.exit(1) + } + const { errors } = run(languageCode, site, reusables) + for (const [error, count] of Array.from(errors.entries())) { + totalErrors.set(error, (totalErrors.get(error) || 0) + count) + } + } + + const sumTotal = Array.from(totalErrors.values()).reduce((acc, count) => acc + count, 0) + console.log('\nGRAND TOTAL ERRORS:', sumTotal) +} + +function getReusables(): Reusables { + const reusables = new Map() + const files = walk('data/reusables', { + includeBasePath: true, + globs: ['**/*.md'], + ignore: ['**/README.md'], + }) + for (const file of files) { + const content = fs.readFileSync(file, 'utf8') + reusables.set(file, content) + } + return reusables +} + +function run(languageCode: string, site: Site, englishReusables: Reusables) { + const PADDING = 60 + const language = languages[languageCode as keyof typeof languages] + + console.log(`--- Tallying liquid corruptions in ${languageCode} (${language.name}) ---`) + + const pageList: Page[] = site.pageList + const errors = new Map() + const wheres = new Map() + const illegalTags = new Map() + + function countError(error: TokenizationError, where: string) { + const errorString = (error as any).originalError.message as string + if (errorString.includes('illegal tag syntax')) { + const illegalTag = (error as any).token.content + illegalTags.set(illegalTag, (illegalTags.get(illegalTag) || 0) + 1) + } + errors.set(errorString, (errors.get(errorString) || 0) + 1) + wheres.set(where, (wheres.get(where) || 0) + 1) + } + + for (const page of pageList) { + if (page.languageCode !== languageCode) continue + + const strings: string[][] = [ + ['title', page.title], + ['shortTitle', page.shortTitle || ''], + ['intro', page.intro || ''], + ['markdown', page.markdown], + ].filter(([, string]) => Boolean(string)) + + for (const [where, string] of strings) { + try { + getLiquidTokens(string) + } catch (error) { + if (error instanceof TokenizationError) { + countError(error, where) + } else { + throw error + } + } + } + } + + for (const [relativePath, englishContent] of Array.from(englishReusables.entries())) { + try { + const filePath = path.join(language.dir, relativePath) + const rawContent = fs.readFileSync(filePath, 'utf8') + const correctedContent = correctTranslatedContentStrings(rawContent, englishContent, { + code: languageCode, + relativePath, + }) + getLiquidTokens(correctedContent) + } catch (error) { + if (error instanceof TokenizationError) { + countError(error, 'reusable') + } else if (error instanceof Error && error.message.startsWith('ENOENT')) { + continue + } else { + throw error + } + } + } + + const flat = Array.from(errors.entries()).sort((a, b) => b[1] - a[1]) + const sumTotal = flat.reduce((acc, [, count]) => acc + count, 0) + + console.log('\nMost common errors') + flat.forEach(([error, count], i) => { + console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count) + }) + console.log(`${'TOTAL:'.padEnd(3 + 1 + PADDING)}`, sumTotal) + + if (sumTotal) { + const whereFlat = Array.from(wheres.entries()).sort((a, b) => b[1] - a[1]) + console.log('\nMost common places') + whereFlat.forEach(([error, count], i) => { + console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count) + }) + + const illegalTagsFlat = Array.from(illegalTags.entries()).sort((a, b) => b[1] - a[1]) + if (illegalTagsFlat.reduce((acc, [, count]) => acc + count, 0)) { + console.log('\nMost common illegal tags', illegalTagsFlat.length > 10 ? ' (Top 10)' : '') + illegalTagsFlat.slice(0, 10).forEach(([error, count], i) => { + console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count) + }) + } + } + console.log('\n') + + return { errors } +}