Skip to content

Commit

Permalink
Merge pull request #32252 from github/repo-sync
Browse files Browse the repository at this point in the history
Repo sync
  • Loading branch information
docs-bot authored Mar 27, 2024
2 parents 18e7fd4 + b141a77 commit ae168ac
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 27 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/count-translation-corruptions.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
name: Count translation corruptions

# **What it does**: Generates a summary of Liquid corruptions per language.
# **Why we have it**: For insights into the state of translations and things we can do to fix them.
# **Who does it impact**: Engineering

on:
  workflow_dispatch:
  pull_request:
    # Only run on PRs that touch the script itself, this workflow, or the
    # composite actions it depends on.
    paths:
      - src/languages/scripts/count-translation-corruptions.ts
      - .github/workflows/count-translation-corruptions.yml
      - .github/actions/node-npm-setup/action.yml
      - .github/actions/clone-translations/action.yml
      - 'package**.json'

permissions:
  contents: read

jobs:
  count-translation-corruptions:
    # Only meaningful in the internal repo, where the translations and the
    # bot PAT are available.
    if: github.repository == 'github/docs-internal'
    runs-on: ubuntu-20.04-xl
    steps:
      - name: Checkout English repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          # Using a PAT is necessary so that the new commit will trigger the
          # CI in the PR. (Events from GITHUB_TOKEN don't trigger new workflows.)
          token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}

      # Cloning all translations matters because translations are often a bit
      # behind. So if a translation is a bit behind, it might still be
      # referencing an asset even though none of the English content does.
      - name: Clone all translations
        uses: ./.github/actions/clone-translations
        with:
          token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}

      - uses: ./.github/actions/node-npm-setup

      - name: Run count
        run: npm run count-translation-corruptions
4 changes: 0 additions & 4 deletions .github/workflows/sme-review-tracking-issue.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ on:
types:
- labeled

pull_request:
types:
- labeled

permissions:
contents: read

Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"check-content-type": "node src/workflows/check-content-type.js",
"check-github-github-links": "node src/links/scripts/check-github-github-links.js",
"copy-fixture-data": "node src/tests/scripts/copy-fixture-data.js",
"count-translation-corruptions": "tsx src/languages/scripts/count-translation-corruptions.ts",
"debug": "cross-env NODE_ENV=development ENABLED_LANGUAGES=en nodemon --inspect src/frame/server.js",
"delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts",
"dev": "cross-env npm start",
Expand Down
47 changes: 32 additions & 15 deletions src/code-scanning/scripts/generate-code-scanning-query-list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ type Query = {
autofixSupport: 'none' | 'default'
}

type QueryExtended = Query & {
inDefault: boolean
inExtended: boolean
inAutofix: boolean
}

const opts = program.opts()
main(
{
Expand Down Expand Up @@ -162,8 +168,28 @@ async function main(options: Options, language: string) {
}
}

const entries = Object.values(queries)
entries.sort((a, b) => a.name.localeCompare(b.name))
// Augment a query with precomputed membership booleans so the sort and
// the table rendering below don't have to repeat the pack/autofix lookups.
function decorate(query: Query): QueryExtended {
  const { packs } = query
  const decorated: QueryExtended = {
    ...query,
    inDefault: packs.includes('code-scanning'),
    inExtended: packs.includes('security-extended'),
    inAutofix: query.autofixSupport === 'default',
  }
  return decorated
}

const entries = Object.values(queries).map(decorate)

// Spec: "Queries that are both in Default and Extended should come first,
// in alphabetical order. Followed by the queries that are in Extended only."
entries.sort((a, b) => {
if (a.inDefault && !b.inDefault) return -1
else if (!a.inDefault && b.inDefault) return 1

if (a.inExtended && !b.inExtended) return -1
else if (!a.inExtended && b.inExtended) return 1

return a.name.localeCompare(b.name)
})

// At the moment, our chosen business logic is that we omit the Autofix
// column if there are no queries that support it.
Expand All @@ -174,7 +200,7 @@ async function main(options: Options, language: string) {
printQueries(options, entries, includeAutofix)
}

function printQueries(options: Options, queries: Query[], includeAutofix: boolean) {
function printQueries(options: Options, queries: QueryExtended[], includeAutofix: boolean) {
const markdown = []
markdown.push('{% rowheaders %}')
markdown.push('') // blank line
Expand All @@ -190,18 +216,9 @@ function printQueries(options: Options, queries: Query[], includeAutofix: boolea

for (const query of queries) {
const markdownLink = `[${query.name}](${query.url})`
let defaultIcon = notIncludedOcticon
let extendedIcon = notIncludedOcticon
let autofixIcon = notIncludedOcticon
if (query.packs.includes('code-scanning')) {
defaultIcon = includedOcticon
}
if (query.packs.includes('security-extended')) {
extendedIcon = includedOcticon
}
if (query.autofixSupport === 'default') {
autofixIcon = includedOcticon
}
const defaultIcon = query.inDefault ? includedOcticon : notIncludedOcticon
const extendedIcon = query.inExtended ? includedOcticon : notIncludedOcticon
const autofixIcon = query.inAutofix ? includedOcticon : notIncludedOcticon
const row = [markdownLink, query.cwes.join(', '), defaultIcon, extendedIcon]
if (includeAutofix) {
row.push(autofixIcon)
Expand Down
16 changes: 12 additions & 4 deletions src/frame/lib/page-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,17 @@ async function translateTree(dir, langObj, enTree) {
*
* Order of languages and versions doesn't matter, but order of child page arrays DOES matter (for navigation).
*/
export async function loadSiteTree(unversionedTree) {
const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree()))
export async function loadSiteTree(unversionedTree, languagesOnly = []) {
const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree(languagesOnly)))
const siteTree = {}

const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages)
// For every language...
await Promise.all(
Object.keys(languages).map(async (langCode) => {
langCodes.map(async (langCode) => {
if (!(langCode in rawTree)) {
throw new Error(`No tree for language ${langCode}`)
}
const treePerVersion = {}
// in every version...
await Promise.all(
Expand Down Expand Up @@ -329,8 +333,12 @@ export async function loadPageList(unversionedTree, languagesOnly = []) {
const rawTree = unversionedTree || (await loadUnversionedTree(languagesOnly))
const pageList = []

const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages)
await Promise.all(
((languagesOnly.length && languagesOnly) || Object.keys(languages)).map(async (langCode) => {
langCodes.map(async (langCode) => {
if (!(langCode in rawTree)) {
throw new Error(`No tree for language ${langCode}`)
}
await addToCollection(rawTree[langCode], pageList)
}),
)
Expand Down
8 changes: 4 additions & 4 deletions src/frame/lib/warm-server.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ async function warmServer(languagesOnly = []) {
}

const unversionedTree = await dog.loadUnversionedTree(languagesOnly)
const siteTree = await dog.loadSiteTree(unversionedTree)
const pageList = await dog.loadPages(unversionedTree)
const siteTree = await dog.loadSiteTree(unversionedTree, languagesOnly)
const pageList = await dog.loadPages(unversionedTree, languagesOnly)
const pageMap = await dog.loadPageMap(pageList)
const redirects = await dog.loadRedirects(pageList)

Expand All @@ -52,12 +52,12 @@ dog.warmServer = statsd.asyncTimer(warmServer, 'warm_server')

// We only want statistics if the priming needs to occur, so let's wrap the
// real method and return early [without statistics] whenever possible
export default async function warmServerWrapper() {
export default async function warmServerWrapper(languagesOnly = []) {
// Handle receiving multiple calls to this method from multiple page requests
// by holding the in-progress Promise and returning it instead of allowing
// the server to actually load all of the files multiple times.
if (!promisedWarmServer) {
promisedWarmServer = dog.warmServer()
promisedWarmServer = dog.warmServer(languagesOnly)
}
return promisedWarmServer
}
168 changes: 168 additions & 0 deletions src/languages/scripts/count-translation-corruptions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import path from 'path'
import fs from 'fs'

import { program } from 'commander'
import chalk from 'chalk'
import { TokenizationError } from 'liquidjs'
import walk from 'walk-sync'

import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
import languages from '@/languages/lib/languages.js'
import warmServer, { type Site } from '@/frame/lib/warm-server.js'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content.js'

// CLI definition. The variadic `[language...]` argument lets the caller name
// zero or more language codes; with none given, every non-English language
// configured in `languages` is checked (resolved inside main()).
program
  .description('Tally the number of liquid corruptions in a translation')
  .argument('[language...]', 'language(s) to compare against')
  .action(main)
program.parse(process.argv)

// Shape of the entries this script reads from the warmed server's pageList.
// Only the fields actually inspected below are declared.
type Page = {
  relativePath: string
  fullPath: string
  title: string
  shortTitle?: string
  intro: string
  markdown: string
  languageCode: string
}

// Map of reusable file path (repo-relative, e.g. `data/reusables/...`) to its
// raw English Markdown content.
type Reusables = Map<string, string>

/**
 * Entry point. Resolves which languages to check, warms the server (English
 * plus the requested languages), tallies Liquid tokenization errors per
 * language, and prints a grand total across all of them.
 *
 * @param languageCodes - language codes from the CLI; empty means "all
 *   non-English languages".
 */
async function main(languageCodes: string[]) {
  const langCodes = languageCodes.length
    ? languageCodes
    : Object.keys(languages).filter((x) => x !== 'en')

  // Validate the requested languages *before* warming the server, which is
  // expensive. (Previously a bad CLI argument was only caught after the full
  // warm-up had already run.)
  for (const languageCode of langCodes) {
    if (!(languageCode in languages)) {
      console.error(chalk.red(`Language ${languageCode} not found`))
      return process.exit(1)
    }
    if (languageCode === 'en') {
      console.error(chalk.red("Can't test in English ('en')"))
      return process.exit(1)
    }
  }

  // English must always be warmed too, because translations are compared
  // against the English reusables. An empty array means "all languages".
  const site = await warmServer(languageCodes.length ? ['en', ...langCodes] : [])

  // When checking reusables, we only want to check the files that
  // have an English equivalent.
  const reusables = getReusables()

  // error message -> total count across all checked languages
  const totalErrors = new Map<string, number>()

  for (const languageCode of langCodes) {
    const { errors } = run(languageCode, site, reusables)
    for (const [error, count] of Array.from(errors.entries())) {
      totalErrors.set(error, (totalErrors.get(error) || 0) + count)
    }
  }

  const sumTotal = Array.from(totalErrors.values()).reduce((acc, count) => acc + count, 0)
  console.log('\nGRAND TOTAL ERRORS:', sumTotal)
}

/**
 * Read every English reusable Markdown file under `data/reusables`.
 * Returns a map of file path (repo-relative) to its raw content.
 * README.md files are skipped — they are docs, not content reusables.
 */
function getReusables(): Reusables {
  const contentByPath: Reusables = new Map()
  const markdownFiles = walk('data/reusables', {
    includeBasePath: true,
    globs: ['**/*.md'],
    ignore: ['**/README.md'],
  })
  for (const filePath of markdownFiles) {
    contentByPath.set(filePath, fs.readFileSync(filePath, 'utf8'))
  }
  return contentByPath
}

/**
 * Tally Liquid tokenization errors for a single translation.
 *
 * Two sources of translated Liquid are checked:
 *  1. Every page in the warmed site's pageList for this language — the
 *     title, shortTitle, intro, and markdown strings are tokenized separately.
 *  2. Every reusable that has an English equivalent, after applying the
 *     standard translation-content corrections.
 *
 * Prints a summary (most common errors, where they occur, and the most common
 * illegal tags) and returns the per-error-message counts so the caller can
 * aggregate across languages.
 */
function run(languageCode: string, site: Site, englishReusables: Reusables) {
  // Column width for the printed summary tables.
  const PADDING = 60
  const language = languages[languageCode as keyof typeof languages]

  console.log(`--- Tallying liquid corruptions in ${languageCode} (${language.name}) ---`)

  const pageList: Page[] = site.pageList
  // error message -> occurrence count
  const errors = new Map<string, number>()
  // location ('title' | 'shortTitle' | 'intro' | 'markdown' | 'reusable') -> count
  const wheres = new Map<string, number>()
  // raw content of tags rejected as "illegal tag syntax" -> count
  const illegalTags = new Map<string, number>()

  function countError(error: TokenizationError, where: string) {
    // NOTE(review): `originalError` and `token` are not on liquidjs's public
    // TokenizationError type, hence the `as any` casts — confirm these
    // internals survive liquidjs upgrades.
    const errorString = (error as any).originalError.message as string
    if (errorString.includes('illegal tag syntax')) {
      const illegalTag = (error as any).token.content
      illegalTags.set(illegalTag, (illegalTags.get(illegalTag) || 0) + 1)
    }
    errors.set(errorString, (errors.get(errorString) || 0) + 1)
    wheres.set(where, (wheres.get(where) || 0) + 1)
  }

  for (const page of pageList) {
    // pageList contains every language; only look at this one.
    if (page.languageCode !== languageCode) continue

    // [where, string] pairs; empty/missing strings are filtered out up front.
    const strings: string[][] = [
      ['title', page.title],
      ['shortTitle', page.shortTitle || ''],
      ['intro', page.intro || ''],
      ['markdown', page.markdown],
    ].filter(([, string]) => Boolean(string))

    for (const [where, string] of strings) {
      try {
        getLiquidTokens(string)
      } catch (error) {
        if (error instanceof TokenizationError) {
          countError(error, where)
        } else {
          // Anything other than a tokenization failure is unexpected.
          throw error
        }
      }
    }
  }

  for (const [relativePath, englishContent] of Array.from(englishReusables.entries())) {
    try {
      const filePath = path.join(language.dir, relativePath)
      const rawContent = fs.readFileSync(filePath, 'utf8')
      // Apply the same corrections the site applies at runtime, so we only
      // count corruptions that would actually reach rendering.
      const correctedContent = correctTranslatedContentStrings(rawContent, englishContent, {
        code: languageCode,
        relativePath,
      })
      getLiquidTokens(correctedContent)
    } catch (error) {
      if (error instanceof TokenizationError) {
        countError(error, 'reusable')
      } else if (error instanceof Error && error.message.startsWith('ENOENT')) {
        // The translation simply doesn't have this reusable (yet); skip it.
        continue
      } else {
        throw error
      }
    }
  }

  // Sort each tally by descending count for display.
  const flat = Array.from(errors.entries()).sort((a, b) => b[1] - a[1])
  const sumTotal = flat.reduce((acc, [, count]) => acc + count, 0)

  console.log('\nMost common errors')
  flat.forEach(([error, count], i) => {
    console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
  })
  // 3 + 1 + PADDING aligns "TOTAL:" with the count column above.
  console.log(`${'TOTAL:'.padEnd(3 + 1 + PADDING)}`, sumTotal)

  if (sumTotal) {
    const whereFlat = Array.from(wheres.entries()).sort((a, b) => b[1] - a[1])
    console.log('\nMost common places')
    whereFlat.forEach(([error, count], i) => {
      console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
    })

    const illegalTagsFlat = Array.from(illegalTags.entries()).sort((a, b) => b[1] - a[1])
    if (illegalTagsFlat.reduce((acc, [, count]) => acc + count, 0)) {
      // NOTE(review): console.log's comma separator plus the leading space in
      // ' (Top 10)' renders a double space in the heading — harmless, but
      // worth tidying in a follow-up.
      console.log('\nMost common illegal tags', illegalTagsFlat.length > 10 ? ' (Top 10)' : '')
      illegalTagsFlat.slice(0, 10).forEach(([error, count], i) => {
        console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
      })
    }
  }
  console.log('\n')

  return { errors }
}

0 comments on commit ae168ac

Please sign in to comment.