From 43ecce79be3b158ef8ee66e72062b8c4750118af Mon Sep 17 00:00:00 2001 From: Anthony Fu Date: Thu, 12 Sep 2024 20:45:20 +0200 Subject: [PATCH] feat(engine-js): supports contiguous anchor simulation --- docs/references/engine-js-compat.md | 70 ++++++++++++------------- packages/engine-javascript/src/index.ts | 33 ++++++++---- scripts/report-engine-js-compat.ts | 34 +++++++----- 3 files changed, 79 insertions(+), 58 deletions(-) diff --git a/docs/references/engine-js-compat.md b/docs/references/engine-js-compat.md index 752d36bc1..0a0b13301 100644 --- a/docs/references/engine-js-compat.md +++ b/docs/references/engine-js-compat.md @@ -200,30 +200,30 @@ Languages that does not throw with the JavaScript RegExp engine, but will produc | Language | Highlight Match | Patterns Parsable | Patterns Failed | Diff | | ------------- | :--------------------------------------------------------------------------------- | ----------------: | --------------: | ---: | -| angular-html | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=angular-html) | 2 | - | 6 | -| bash | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=bash) | 148 | - | 13 | -| beancount | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=beancount) | 39 | - | 4 | -| c | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=c) | 178 | - | 35 | -| crystal | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=crystal) | 143 | - | 2 | -| elixir | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=elixir) | 105 | - | 43 | -| erlang | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=erlang) | 147 | - | 50 | -| glsl | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=glsl) | 9 | - | 74 | -| haml | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=haml) | 66 | - | 6 | -| kusto | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=kusto) | 60 | - | 1 | -| latex | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=latex) | 183 | - | 5 | -| mermaid | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=mermaid) | 129 | - | 2 | -| nginx | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=nginx) | 104 | - | 3 | -| objective-cpp | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=objective-cpp) | 309 | - | 22 | -| php | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=php) | 342 | - | 37 | -| po | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=po) | 23 | - | 11 | -| pug | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=pug) | 91 | - | 6 | -| rst | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=rst) | 64 | - | 4 | +| angular-html | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=angular-html) | 2 | - | 330 | +| bash | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=bash) | 148 | - | 56 | +| beancount | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=beancount) | 39 | - | 171 | +| c | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=c) | 178 | - | 209 | +| crystal | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=crystal) | 143 | - | 40 | +| elixir | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=elixir) | 105 | - | 179 | +| erlang | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=erlang) | 147 | - | 470 | +| glsl | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=glsl) | 9 | - | 306 | +| haml | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=haml) | 66 | - | 48 | +| kusto | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=kusto) | 60 | - | 40 | +| latex | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=latex) | 183 | - | 25 | +| mermaid | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=mermaid) | 129 | - | 38 | +| nginx | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=nginx) | 104 | - | 4 | +| objective-cpp | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=objective-cpp) | 309 | - | 172 | +| php | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=php) | 342 | - | 605 | +| po | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=po) | 23 | - | 336 | +| pug | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=pug) | 91 | - | 164 | +| rst | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=rst) | 64 | - | 62 | | ruby | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=ruby) | 154 | - | 1 | -| shellscript | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=shellscript) | 148 | - | 13 | -| smalltalk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=smalltalk) | 35 | - | 8 | -| splunk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=splunk) | 17 | - | 4 | -| stata | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=stata) | 194 | - | 4 | -| zsh | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=zsh) | 148 | - | 26 | +| shellscript | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=shellscript) | 148 | - | 56 | +| smalltalk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=smalltalk) | 35 | - | 40 | +| splunk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=splunk) | 17 | - | 8 | +| stata | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=stata) | 194 | - | 32 | +| zsh | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=zsh) | 148 | - | 117 | ## Unsupported Languages @@ -234,17 +234,17 @@ Languages that throws with the JavaScript RegExp engine (contains syntaxes that | ada | ✅ OK | 201 | 1 | | | sass | ✅ OK | 67 | 2 | | | blade | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=blade) | 336 | 2 | | -| mdc | ❌ Error | 37 | - | 22 | +| mdc | ❌ Error | 37 | - | 377 | | powershell | ❌ Error | 87 | 1 | | -| wolfram | ❌ Error | 500 | 1 | 2 | -| razor | ❌ Error | 82 | 3 | 7 | +| wolfram | ❌ Error | 500 | 1 | 12 | +| razor | ❌ Error | 82 | 3 | 26 | | mdx | ❌ Error | 193 | 4 | | -| swift | ❌ Error | 325 | 4 | 4 | -| julia | ❌ Error | 90 | 5 | 5 | -| kotlin | ❌ Error | 52 | 6 | 81 | -| purescript | ❌ Error | 67 | 6 | 169 | -| markdown | ❌ Error | 111 | 7 | 41 | -| apex | ❌ Error | 173 | 14 | 44 | -| haskell | ❌ Error | 136 | 21 | 3 | -| cpp | ❌ Error | 238 | 22 | 5 | -| csharp | ❌ Error | 278 | 33 | 34 | +| swift | ❌ Error | 325 | 4 | 18 | +| julia | ❌ Error | 90 | 5 | 49 | +| kotlin | ❌ Error | 52 | 6 | 2986 | +| purescript | ❌ Error | 67 | 6 | 1488 | +| markdown | ❌ Error | 111 | 7 | 584 | +| apex | ❌ Error | 173 | 14 | 242 | +| haskell | ❌ Error | 136 | 21 | 12 | +| cpp | ❌ Error | 238 | 22 | 25 | +| csharp | ❌ Error | 278 | 33 | 232 | diff --git a/packages/engine-javascript/src/index.ts b/packages/engine-javascript/src/index.ts index e7a340ab8..77df016e3 100644 --- a/packages/engine-javascript/src/index.ts +++ b/packages/engine-javascript/src/index.ts @@ -41,6 +41,7 @@ export function defaultJavaScriptRegexConstructor(pattern: string): RegExp { export class JavaScriptScanner implements PatternScanner { regexps: (RegExp | null)[] + contiguousAnchorSimulation: boolean[] constructor( public patterns: string[], @@ -48,7 +49,10 @@ export class JavaScriptScanner implements PatternScanner { public forgiving: boolean, public regexConstructor: (pattern: string) => RegExp = defaultJavaScriptRegexConstructor, ) { - this.regexps = patterns.map((p) => { + this.contiguousAnchorSimulation = Array.from({ length: patterns.length }, () => false) + this.regexps = patterns.map((p, idx) => { + if (p.startsWith('(^|\\G)') || p.startsWith('(\\G|^)')) + this.contiguousAnchorSimulation[idx] = true const cached = cache?.get(p) if (cached) { if (cached instanceof RegExp) { @@ -77,9 +81,9 @@ export class JavaScriptScanner implements PatternScanner { const str = typeof string === 'string' ? string : string.content - const pending: [index: number, match: RegExpExecArray][] = [] + const pending: [index: number, match: RegExpExecArray, offset: number][] = [] - function toResult(index: number, match: RegExpExecArray) { + function toResult(index: number, match: RegExpExecArray, offset = 0) { return { index, captureIndices: match.indices!.map((indice) => { @@ -91,9 +95,9 @@ export class JavaScriptScanner implements PatternScanner { } } return { - start: indice[0], + start: indice[0] + offset, length: indice[1] - indice[0], - end: indice[1], + end: indice[1] + offset, } }), } @@ -104,16 +108,25 @@ export class JavaScriptScanner implements PatternScanner { if (!regexp) continue try { + let offset = 0 regexp.lastIndex = startPosition - const match = regexp.exec(str) + let match = regexp.exec(str) + + // If a regex starts with `(^|\\G)` or `(\\G|^)`, we simulate the behavior by cutting the string + if (!match && this.contiguousAnchorSimulation[i]) { + offset = startPosition + regexp.lastIndex = 0 + match = regexp.exec(str.slice(startPosition)) + } if (!match) continue + // If the match is at the start position, return it immediately if (match.index === startPosition) { - return toResult(i, match) + return toResult(i, match, offset) } // Otherwise, store it for later - pending.push([i, match]) + pending.push([i, match, offset]) } catch (e) { if (this.forgiving) @@ -125,9 +138,9 @@ export class JavaScriptScanner implements PatternScanner { // Find the closest match to the start position if (pending.length) { const minIndex = Math.min(...pending.map(m => m[1].index)) - for (const [i, match] of pending) { + for (const [i, match, offset] of pending) { if (match.index === minIndex) { - return toResult(i, match) + return toResult(i, match, offset) } } } diff --git a/scripts/report-engine-js-compat.ts b/scripts/report-engine-js-compat.ts index 3c54f41a9..080c43841 100644 --- a/scripts/report-engine-js-compat.ts +++ b/scripts/report-engine-js-compat.ts @@ -66,7 +66,7 @@ async function run() { } const highlightA = serializeTokens(shikiWasm, sample, lang) - let highlightB: string | undefined + let highlightB: { tokens: string, html: string } | undefined let highlightDiff: Diff[] = [] try { @@ -103,13 +103,13 @@ async function run() { } if (highlightMatch !== 'error') - highlightMatch = highlightA === highlightB + highlightMatch = highlightA.html === highlightB?.html highlightDiff = highlightB && highlightA !== highlightB - ? diffMain(highlightA, highlightB) + ? diffMain(highlightA.tokens, highlightB.tokens) : [] diffCleanupSemantic(highlightDiff) - if (!highlightMatch) { + if (highlightB && highlightMatch !== true) { console.log(c.yellow(`[${lang}] Mismatch`)) await fs.mkdir(new URL('./compares', import.meta.url), { recursive: true }) @@ -122,10 +122,10 @@ async function run() { 'pre { flex: 1; margin: 0; padding: 0; }', '', '
',
-          highlightA,
+          highlightA.html,
           '
', '
',
-          highlightB,
+          highlightB?.html,
           '
', ].join('\n'), 'utf-8', @@ -143,8 +143,8 @@ async function run() { ...highlightMatch === true ? {} : { - highlightA, - highlightB, + highlightA: highlightA.html, + highlightB: highlightB?.html, }, diff: highlightDiff, }) @@ -177,13 +177,13 @@ async function run() { ['---', ':---', '---:', '---:', '---:'], ...report .map((item) => { - const diffCount = item.diff.filter(diff => diff[0] === 1).length + const diffChars = item.diff.map(diff => diff[0] === 1 ? diff[1].length : 0).reduce((a, b) => a + b, 0) return [ item.lang, item.highlightMatch === true ? '✅ OK' : item.highlightMatch === 'error' ? '❌ Error' : `[🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=${item.lang})`, item.patternsParsable === 0 ? '-' : item.patternsParsable.toString(), item.patternsFailed.length === 0 ? '-' : item.patternsFailed.length.toString(), - diffCount ? diffCount.toString() : '', + diffChars ? diffChars.toString() : '', ] as [string, string, string, string, string] }), ] @@ -250,9 +250,17 @@ async function run() { } function serializeTokens(shiki: HighlighterGeneric, sample: string, lang: string) { - const tokens = shiki.codeToTokensBase(sample, { lang: lang as any, theme: 'vitesse-dark' }) - const str = tokens.flat(1).map(t => t.color?.padEnd(18, ' ') + t.content).join('\n') - return str + const tokens = shiki + .codeToTokensBase(sample, { lang: lang as any, theme: 'vitesse-dark' }) + .flat(1) + .map(t => t.color?.padEnd(18, ' ') + t.content) + .join('\n') + const html = shiki + .codeToHtml(sample, { lang: lang as any, theme: 'vitesse-dark' }) + return { + tokens, + html, + } } function getPatternsOfGrammar(grammar: any): Set {