Skip to content

Commit

Permalink
Merge pull request #32252 from github/repo-sync
Browse files Browse the repository at this point in the history
Repo sync
  • Loading branch information
docs-bot authored Mar 27, 2024
2 parents 18e7fd4 + b141a77 commit ae168ac
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 27 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/count-translation-corruptions.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
name: Count translation corruptions

# **What it does**: Generates a summary of Liquid corruptions per language.
# **Why we have it**: For insights into the state of translations and things we can do to fix them.
# **Who does it impact**: Engineering

on:
  workflow_dispatch:
  pull_request:
    # Only run on PRs that touch the script itself, this workflow, or the
    # composite actions it depends on.
    paths:
      - src/languages/scripts/count-translation-corruptions.ts
      - .github/workflows/count-translation-corruptions.yml
      - .github/actions/node-npm-setup/action.yml
      - .github/actions/clone-translations/action.yml
      - 'package**.json'

permissions:
  contents: read

jobs:
  count-translation-corruptions:
    # Only meaningful in the internal repo, where the translations and the
    # bot PAT are available.
    if: github.repository == 'github/docs-internal'
    runs-on: ubuntu-20.04-xl
    steps:
      - name: Checkout English repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          # Using a PAT is necessary so that the new commit will trigger the
          # CI in the PR. (Events from GITHUB_TOKEN don't trigger new workflows.)
          token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}

      # Cloning all translations matters because translations are often a bit
      # behind. So if a translation is a bit behind, it might still be
      # referencing an asset even though none of the English content does.
      - name: Clone all translations
        uses: ./.github/actions/clone-translations
        with:
          token: ${{ secrets.DOCS_BOT_PAT_READPUBLICKEY }}

      - uses: ./.github/actions/node-npm-setup

      - name: Run count
        run: npm run count-translation-corruptions
4 changes: 0 additions & 4 deletions .github/workflows/sme-review-tracking-issue.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ on:
types:
- labeled

pull_request:
types:
- labeled

permissions:
contents: read

Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"check-content-type": "node src/workflows/check-content-type.js",
"check-github-github-links": "node src/links/scripts/check-github-github-links.js",
"copy-fixture-data": "node src/tests/scripts/copy-fixture-data.js",
"count-translation-corruptions": "tsx src/languages/scripts/count-translation-corruptions.ts",
"debug": "cross-env NODE_ENV=development ENABLED_LANGUAGES=en nodemon --inspect src/frame/server.js",
"delete-orphan-translation-files": "tsx src/workflows/delete-orphan-translation-files.ts",
"dev": "cross-env npm start",
Expand Down
47 changes: 32 additions & 15 deletions src/code-scanning/scripts/generate-code-scanning-query-list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ type Query = {
autofixSupport: 'none' | 'default'
}

type QueryExtended = Query & {
inDefault: boolean
inExtended: boolean
inAutofix: boolean
}

const opts = program.opts()
main(
{
Expand Down Expand Up @@ -162,8 +168,28 @@ async function main(options: Options, language: string) {
}
}

const entries = Object.values(queries)
entries.sort((a, b) => a.name.localeCompare(b.name))
// Augment a query with precomputed membership booleans so the sort and
// the table rendering below don't have to repeat the pack/autofix lookups.
function decorate(query: Query): QueryExtended {
  const { packs } = query
  const decorated: QueryExtended = {
    ...query,
    inDefault: packs.includes('code-scanning'),
    inExtended: packs.includes('security-extended'),
    inAutofix: query.autofixSupport === 'default',
  }
  return decorated
}

const entries = Object.values(queries).map(decorate)

// Spec: "Queries that are both in Default and Extended should come first,
// in alphabetical order. Followed by the queries that are in Extended only."
entries.sort((a, b) => {
if (a.inDefault && !b.inDefault) return -1
else if (!a.inDefault && b.inDefault) return 1

if (a.inExtended && !b.inExtended) return -1
else if (!a.inExtended && b.inExtended) return 1

return a.name.localeCompare(b.name)
})

// At the moment, our chosen business logic is that we omit the Autofix
// column if there are no queries that support it.
Expand All @@ -174,7 +200,7 @@ async function main(options: Options, language: string) {
printQueries(options, entries, includeAutofix)
}

function printQueries(options: Options, queries: Query[], includeAutofix: boolean) {
function printQueries(options: Options, queries: QueryExtended[], includeAutofix: boolean) {
const markdown = []
markdown.push('{% rowheaders %}')
markdown.push('') // blank line
Expand All @@ -190,18 +216,9 @@ function printQueries(options: Options, queries: Query[], includeAutofix: boolea

for (const query of queries) {
const markdownLink = `[${query.name}](${query.url})`
let defaultIcon = notIncludedOcticon
let extendedIcon = notIncludedOcticon
let autofixIcon = notIncludedOcticon
if (query.packs.includes('code-scanning')) {
defaultIcon = includedOcticon
}
if (query.packs.includes('security-extended')) {
extendedIcon = includedOcticon
}
if (query.autofixSupport === 'default') {
autofixIcon = includedOcticon
}
const defaultIcon = query.inDefault ? includedOcticon : notIncludedOcticon
const extendedIcon = query.inExtended ? includedOcticon : notIncludedOcticon
const autofixIcon = query.inAutofix ? includedOcticon : notIncludedOcticon
const row = [markdownLink, query.cwes.join(', '), defaultIcon, extendedIcon]
if (includeAutofix) {
row.push(autofixIcon)
Expand Down
16 changes: 12 additions & 4 deletions src/frame/lib/page-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,17 @@ async function translateTree(dir, langObj, enTree) {
*
* Order of languages and versions doesn't matter, but order of child page arrays DOES matter (for navigation).
*/
export async function loadSiteTree(unversionedTree) {
const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree()))
export async function loadSiteTree(unversionedTree, languagesOnly = []) {
const rawTree = Object.assign({}, unversionedTree || (await loadUnversionedTree(languagesOnly)))
const siteTree = {}

const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages)
// For every language...
await Promise.all(
Object.keys(languages).map(async (langCode) => {
langCodes.map(async (langCode) => {
if (!(langCode in rawTree)) {
throw new Error(`No tree for language ${langCode}`)
}
const treePerVersion = {}
// in every version...
await Promise.all(
Expand Down Expand Up @@ -329,8 +333,12 @@ export async function loadPageList(unversionedTree, languagesOnly = []) {
const rawTree = unversionedTree || (await loadUnversionedTree(languagesOnly))
const pageList = []

const langCodes = (languagesOnly.length && languagesOnly) || Object.keys(languages)
await Promise.all(
((languagesOnly.length && languagesOnly) || Object.keys(languages)).map(async (langCode) => {
langCodes.map(async (langCode) => {
if (!(langCode in rawTree)) {
throw new Error(`No tree for language ${langCode}`)
}
await addToCollection(rawTree[langCode], pageList)
}),
)
Expand Down
8 changes: 4 additions & 4 deletions src/frame/lib/warm-server.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ async function warmServer(languagesOnly = []) {
}

const unversionedTree = await dog.loadUnversionedTree(languagesOnly)
const siteTree = await dog.loadSiteTree(unversionedTree)
const pageList = await dog.loadPages(unversionedTree)
const siteTree = await dog.loadSiteTree(unversionedTree, languagesOnly)
const pageList = await dog.loadPages(unversionedTree, languagesOnly)
const pageMap = await dog.loadPageMap(pageList)
const redirects = await dog.loadRedirects(pageList)

Expand All @@ -52,12 +52,12 @@ dog.warmServer = statsd.asyncTimer(warmServer, 'warm_server')

// We only want statistics if the priming needs to occur, so let's wrap the
// real method and return early [without statistics] whenever possible
export default async function warmServerWrapper() {
export default async function warmServerWrapper(languagesOnly = []) {
// Handle receiving multiple calls to this method from multiple page requests
// by holding the in-progress Promise and returning it instead of allowing
// the server to actually load all of the files multiple times.
if (!promisedWarmServer) {
promisedWarmServer = dog.warmServer()
promisedWarmServer = dog.warmServer(languagesOnly)
}
return promisedWarmServer
}
168 changes: 168 additions & 0 deletions src/languages/scripts/count-translation-corruptions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import path from 'path'
import fs from 'fs'

import { program } from 'commander'
import chalk from 'chalk'
import { TokenizationError } from 'liquidjs'
import walk from 'walk-sync'

import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils.js'
import languages from '@/languages/lib/languages.js'
import warmServer, { type Site } from '@/frame/lib/warm-server.js'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content.js'

// CLI definition. The variadic `[language...]` argument lets the caller name
// zero or more language codes; with none given, every non-English language
// configured in `languages` is checked (resolved inside main()).
program
  .description('Tally the number of liquid corruptions in a translation')
  .argument('[language...]', 'language(s) to compare against')
  .action(main)
program.parse(process.argv)

// Shape of the entries this script reads from the warmed server's pageList.
// Only the fields actually inspected below are declared.
type Page = {
  relativePath: string
  fullPath: string
  title: string
  shortTitle?: string
  intro: string
  markdown: string
  languageCode: string
}

// Map of reusable file path (repo-relative, e.g. `data/reusables/...`) to its
// raw English Markdown content.
type Reusables = Map<string, string>

/**
 * Entry point. Resolves which languages to check, warms the server (English
 * plus the requested languages), tallies Liquid tokenization errors per
 * language, and prints a grand total across all of them.
 *
 * @param languageCodes - language codes from the CLI; empty means "all
 *   non-English languages".
 */
async function main(languageCodes: string[]) {
  const langCodes = languageCodes.length
    ? languageCodes
    : Object.keys(languages).filter((x) => x !== 'en')

  // Validate the requested languages *before* warming the server, which is
  // expensive. (Previously a bad CLI argument was only caught after the full
  // warm-up had already run.)
  for (const languageCode of langCodes) {
    if (!(languageCode in languages)) {
      console.error(chalk.red(`Language ${languageCode} not found`))
      return process.exit(1)
    }
    if (languageCode === 'en') {
      console.error(chalk.red("Can't test in English ('en')"))
      return process.exit(1)
    }
  }

  // English must always be warmed too, because translations are compared
  // against the English reusables. An empty array means "all languages".
  const site = await warmServer(languageCodes.length ? ['en', ...langCodes] : [])

  // When checking reusables, we only want to check the files that
  // have an English equivalent.
  const reusables = getReusables()

  // error message -> total count across all checked languages
  const totalErrors = new Map<string, number>()

  for (const languageCode of langCodes) {
    const { errors } = run(languageCode, site, reusables)
    for (const [error, count] of Array.from(errors.entries())) {
      totalErrors.set(error, (totalErrors.get(error) || 0) + count)
    }
  }

  const sumTotal = Array.from(totalErrors.values()).reduce((acc, count) => acc + count, 0)
  console.log('\nGRAND TOTAL ERRORS:', sumTotal)
}

/**
 * Read every English reusable Markdown file under `data/reusables`.
 * Returns a map of file path (repo-relative) to its raw content.
 * README.md files are skipped — they are docs, not content reusables.
 */
function getReusables(): Reusables {
  const contentByPath: Reusables = new Map()
  const markdownFiles = walk('data/reusables', {
    includeBasePath: true,
    globs: ['**/*.md'],
    ignore: ['**/README.md'],
  })
  for (const filePath of markdownFiles) {
    contentByPath.set(filePath, fs.readFileSync(filePath, 'utf8'))
  }
  return contentByPath
}

/**
 * Tally Liquid tokenization errors for a single translation.
 *
 * Two sources of translated Liquid are checked:
 *  1. Every page in the warmed site's pageList for this language — the
 *     title, shortTitle, intro, and markdown strings are tokenized separately.
 *  2. Every reusable that has an English equivalent, after applying the
 *     standard translation-content corrections.
 *
 * Prints a summary (most common errors, where they occur, and the most common
 * illegal tags) and returns the per-error-message counts so the caller can
 * aggregate across languages.
 */
function run(languageCode: string, site: Site, englishReusables: Reusables) {
  // Column width for the printed summary tables.
  const PADDING = 60
  const language = languages[languageCode as keyof typeof languages]

  console.log(`--- Tallying liquid corruptions in ${languageCode} (${language.name}) ---`)

  const pageList: Page[] = site.pageList
  // error message -> occurrence count
  const errors = new Map<string, number>()
  // location ('title' | 'shortTitle' | 'intro' | 'markdown' | 'reusable') -> count
  const wheres = new Map<string, number>()
  // raw content of tags rejected as "illegal tag syntax" -> count
  const illegalTags = new Map<string, number>()

  function countError(error: TokenizationError, where: string) {
    // NOTE(review): `originalError` and `token` are not on liquidjs's public
    // TokenizationError type, hence the `as any` casts — confirm these
    // internals survive liquidjs upgrades.
    const errorString = (error as any).originalError.message as string
    if (errorString.includes('illegal tag syntax')) {
      const illegalTag = (error as any).token.content
      illegalTags.set(illegalTag, (illegalTags.get(illegalTag) || 0) + 1)
    }
    errors.set(errorString, (errors.get(errorString) || 0) + 1)
    wheres.set(where, (wheres.get(where) || 0) + 1)
  }

  for (const page of pageList) {
    // pageList contains every language; only look at this one.
    if (page.languageCode !== languageCode) continue

    // [where, string] pairs; empty/missing strings are filtered out up front.
    const strings: string[][] = [
      ['title', page.title],
      ['shortTitle', page.shortTitle || ''],
      ['intro', page.intro || ''],
      ['markdown', page.markdown],
    ].filter(([, string]) => Boolean(string))

    for (const [where, string] of strings) {
      try {
        getLiquidTokens(string)
      } catch (error) {
        if (error instanceof TokenizationError) {
          countError(error, where)
        } else {
          // Anything other than a tokenization failure is unexpected.
          throw error
        }
      }
    }
  }

  for (const [relativePath, englishContent] of Array.from(englishReusables.entries())) {
    try {
      const filePath = path.join(language.dir, relativePath)
      const rawContent = fs.readFileSync(filePath, 'utf8')
      // Apply the same corrections the site applies at runtime, so we only
      // count corruptions that would actually reach rendering.
      const correctedContent = correctTranslatedContentStrings(rawContent, englishContent, {
        code: languageCode,
        relativePath,
      })
      getLiquidTokens(correctedContent)
    } catch (error) {
      if (error instanceof TokenizationError) {
        countError(error, 'reusable')
      } else if (error instanceof Error && error.message.startsWith('ENOENT')) {
        // The translation simply doesn't have this reusable (yet); skip it.
        continue
      } else {
        throw error
      }
    }
  }

  // Sort each tally by descending count for display.
  const flat = Array.from(errors.entries()).sort((a, b) => b[1] - a[1])
  const sumTotal = flat.reduce((acc, [, count]) => acc + count, 0)

  console.log('\nMost common errors')
  flat.forEach(([error, count], i) => {
    console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
  })
  // 3 + 1 + PADDING aligns "TOTAL:" with the count column above.
  console.log(`${'TOTAL:'.padEnd(3 + 1 + PADDING)}`, sumTotal)

  if (sumTotal) {
    const whereFlat = Array.from(wheres.entries()).sort((a, b) => b[1] - a[1])
    console.log('\nMost common places')
    whereFlat.forEach(([error, count], i) => {
      console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
    })

    const illegalTagsFlat = Array.from(illegalTags.entries()).sort((a, b) => b[1] - a[1])
    if (illegalTagsFlat.reduce((acc, [, count]) => acc + count, 0)) {
      // NOTE(review): console.log's comma separator plus the leading space in
      // ' (Top 10)' renders a double space in the heading — harmless, but
      // worth tidying in a follow-up.
      console.log('\nMost common illegal tags', illegalTagsFlat.length > 10 ? ' (Top 10)' : '')
      illegalTagsFlat.slice(0, 10).forEach(([error, count], i) => {
        console.log(`${i + 1}.`.padEnd(3), error.padEnd(PADDING), count)
      })
    }
  }
  console.log('\n')

  return { errors }
}

0 comments on commit ae168ac

Please sign in to comment.