Skip to content

Commit

Permalink
feat(engine-js): supports contiguous anchor simulation
Browse files Browse the repository at this point in the history
  • Loading branch information
antfu committed Sep 12, 2024
1 parent b57415f commit 43ecce7
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 58 deletions.
70 changes: 35 additions & 35 deletions docs/references/engine-js-compat.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,30 +200,30 @@ Languages that do not throw with the JavaScript RegExp engine, but will produc

| Language | Highlight Match | Patterns Parsable | Patterns Failed | Diff |
| ------------- | :--------------------------------------------------------------------------------- | ----------------: | --------------: | ---: |
| angular-html | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=angular-html) | 2 | - | 6 |
| bash | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=bash) | 148 | - | 13 |
| beancount | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=beancount) | 39 | - | 4 |
| c | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=c) | 178 | - | 35 |
| crystal | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=crystal) | 143 | - | 2 |
| elixir | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=elixir) | 105 | - | 43 |
| erlang | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=erlang) | 147 | - | 50 |
| glsl | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=glsl) | 9 | - | 74 |
| haml | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=haml) | 66 | - | 6 |
| kusto | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=kusto) | 60 | - | 1 |
| latex | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=latex) | 183 | - | 5 |
| mermaid | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=mermaid) | 129 | - | 2 |
| nginx | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=nginx) | 104 | - | 3 |
| objective-cpp | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=objective-cpp) | 309 | - | 22 |
| php | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=php) | 342 | - | 37 |
| po | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=po) | 23 | - | 11 |
| pug | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=pug) | 91 | - | 6 |
| rst | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=rst) | 64 | - | 4 |
| angular-html | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=angular-html) | 2 | - | 330 |
| bash | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=bash) | 148 | - | 56 |
| beancount | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=beancount) | 39 | - | 171 |
| c | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=c) | 178 | - | 209 |
| crystal | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=crystal) | 143 | - | 40 |
| elixir | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=elixir) | 105 | - | 179 |
| erlang | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=erlang) | 147 | - | 470 |
| glsl | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=glsl) | 9 | - | 306 |
| haml | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=haml) | 66 | - | 48 |
| kusto | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=kusto) | 60 | - | 40 |
| latex | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=latex) | 183 | - | 25 |
| mermaid | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=mermaid) | 129 | - | 38 |
| nginx | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=nginx) | 104 | - | 4 |
| objective-cpp | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=objective-cpp) | 309 | - | 172 |
| php | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=php) | 342 | - | 605 |
| po | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=po) | 23 | - | 336 |
| pug | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=pug) | 91 | - | 164 |
| rst | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=rst) | 64 | - | 62 |
| ruby | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=ruby) | 154 | - | 1 |
| shellscript | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=shellscript) | 148 | - | 13 |
| smalltalk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=smalltalk) | 35 | - | 8 |
| splunk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=splunk) | 17 | - | 4 |
| stata | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=stata) | 194 | - | 4 |
| zsh | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=zsh) | 148 | - | 26 |
| shellscript | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=shellscript) | 148 | - | 56 |
| smalltalk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=smalltalk) | 35 | - | 40 |
| splunk | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=splunk) | 17 | - | 8 |
| stata | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=stata) | 194 | - | 32 |
| zsh | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=zsh) | 148 | - | 117 |

## Unsupported Languages

Expand All @@ -234,17 +234,17 @@ Languages that throw with the JavaScript RegExp engine (contain syntaxes that
| ada | ✅ OK | 201 | 1 | |
| sass | ✅ OK | 67 | 2 | |
| blade | [🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=blade) | 336 | 2 | |
| mdc | ❌ Error | 37 | - | 22 |
| mdc | ❌ Error | 37 | - | 377 |
| powershell | ❌ Error | 87 | 1 | |
| wolfram | ❌ Error | 500 | 1 | 2 |
| razor | ❌ Error | 82 | 3 | 7 |
| wolfram | ❌ Error | 500 | 1 | 12 |
| razor | ❌ Error | 82 | 3 | 26 |
| mdx | ❌ Error | 193 | 4 | |
| swift | ❌ Error | 325 | 4 | 4 |
| julia | ❌ Error | 90 | 5 | 5 |
| kotlin | ❌ Error | 52 | 6 | 81 |
| purescript | ❌ Error | 67 | 6 | 169 |
| markdown | ❌ Error | 111 | 7 | 41 |
| apex | ❌ Error | 173 | 14 | 44 |
| haskell | ❌ Error | 136 | 21 | 3 |
| cpp | ❌ Error | 238 | 22 | 5 |
| csharp | ❌ Error | 278 | 33 | 34 |
| swift | ❌ Error | 325 | 4 | 18 |
| julia | ❌ Error | 90 | 5 | 49 |
| kotlin | ❌ Error | 52 | 6 | 2986 |
| purescript | ❌ Error | 67 | 6 | 1488 |
| markdown | ❌ Error | 111 | 7 | 584 |
| apex | ❌ Error | 173 | 14 | 242 |
| haskell | ❌ Error | 136 | 21 | 12 |
| cpp | ❌ Error | 238 | 22 | 25 |
| csharp | ❌ Error | 278 | 33 | 232 |
33 changes: 23 additions & 10 deletions packages/engine-javascript/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,18 @@ export function defaultJavaScriptRegexConstructor(pattern: string): RegExp {

export class JavaScriptScanner implements PatternScanner {
regexps: (RegExp | null)[]
contiguousAnchorSimulation: boolean[]

constructor(
public patterns: string[],
public cache: Map<string, RegExp | Error>,
public forgiving: boolean,
public regexConstructor: (pattern: string) => RegExp = defaultJavaScriptRegexConstructor,
) {
this.regexps = patterns.map((p) => {
this.contiguousAnchorSimulation = Array.from({ length: patterns.length }, () => false)
this.regexps = patterns.map((p, idx) => {
if (p.startsWith('(^|\\G)') || p.startsWith('(\\G|^)'))
this.contiguousAnchorSimulation[idx] = true
const cached = cache?.get(p)
if (cached) {
if (cached instanceof RegExp) {
Expand Down Expand Up @@ -77,9 +81,9 @@ export class JavaScriptScanner implements PatternScanner {
const str = typeof string === 'string'
? string
: string.content
const pending: [index: number, match: RegExpExecArray][] = []
const pending: [index: number, match: RegExpExecArray, offset: number][] = []

function toResult(index: number, match: RegExpExecArray) {
function toResult(index: number, match: RegExpExecArray, offset = 0) {
return {
index,
captureIndices: match.indices!.map((indice) => {
Expand All @@ -91,9 +95,9 @@ export class JavaScriptScanner implements PatternScanner {
}
}
return {
start: indice[0],
start: indice[0] + offset,
length: indice[1] - indice[0],
end: indice[1],
end: indice[1] + offset,
}
}),
}
Expand All @@ -104,16 +108,25 @@ export class JavaScriptScanner implements PatternScanner {
if (!regexp)
continue
try {
let offset = 0
regexp.lastIndex = startPosition
const match = regexp.exec(str)
let match = regexp.exec(str)

// If a regex starts with `(^|\\G)` or `(\\G|^)`, we simulate the behavior by cutting the string
if (!match && this.contiguousAnchorSimulation[i]) {
offset = startPosition
regexp.lastIndex = 0
match = regexp.exec(str.slice(startPosition))
}
if (!match)
continue

// If the match is at the start position, return it immediately
if (match.index === startPosition) {
return toResult(i, match)
return toResult(i, match, offset)
}
// Otherwise, store it for later
pending.push([i, match])
pending.push([i, match, offset])
}
catch (e) {
if (this.forgiving)
Expand All @@ -125,9 +138,9 @@ export class JavaScriptScanner implements PatternScanner {
// Find the closest match to the start position
if (pending.length) {
const minIndex = Math.min(...pending.map(m => m[1].index))
for (const [i, match] of pending) {
for (const [i, match, offset] of pending) {
if (match.index === minIndex) {
return toResult(i, match)
return toResult(i, match, offset)
}
}
}
Expand Down
34 changes: 21 additions & 13 deletions scripts/report-engine-js-compat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ async function run() {
}

const highlightA = serializeTokens(shikiWasm, sample, lang)
let highlightB: string | undefined
let highlightB: { tokens: string, html: string } | undefined
let highlightDiff: Diff[] = []

try {
Expand Down Expand Up @@ -103,13 +103,13 @@ async function run() {
}

if (highlightMatch !== 'error')
highlightMatch = highlightA === highlightB
highlightMatch = highlightA.html === highlightB?.html
highlightDiff = highlightB && highlightA !== highlightB
? diffMain(highlightA, highlightB)
? diffMain(highlightA.tokens, highlightB.tokens)
: []
diffCleanupSemantic(highlightDiff)

if (!highlightMatch) {
if (highlightB && highlightMatch !== true) {
console.log(c.yellow(`[${lang}] Mismatch`))

await fs.mkdir(new URL('./compares', import.meta.url), { recursive: true })
Expand All @@ -122,10 +122,10 @@ async function run() {
'pre { flex: 1; margin: 0; padding: 0; }',
'</style>',
'<pre>',
highlightA,
highlightA.html,
'</pre>',
'<pre>',
highlightB,
highlightB?.html,
'</pre>',
].join('\n'),
'utf-8',
Expand All @@ -143,8 +143,8 @@ async function run() {
...highlightMatch === true
? {}
: {
highlightA,
highlightB,
highlightA: highlightA.html,
highlightB: highlightB?.html,
},
diff: highlightDiff,
})
Expand Down Expand Up @@ -177,13 +177,13 @@ async function run() {
['---', ':---', '---:', '---:', '---:'],
...report
.map((item) => {
const diffCount = item.diff.filter(diff => diff[0] === 1).length
const diffChars = item.diff.map(diff => diff[0] === 1 ? diff[1].length : 0).reduce((a, b) => a + b, 0)
return [
item.lang,
item.highlightMatch === true ? '✅ OK' : item.highlightMatch === 'error' ? '❌ Error' : `[🚧 Mismatch](https://textmate-grammars-themes.netlify.app/?grammar=${item.lang})`,
item.patternsParsable === 0 ? '-' : item.patternsParsable.toString(),
item.patternsFailed.length === 0 ? '-' : item.patternsFailed.length.toString(),
diffCount ? diffCount.toString() : '',
diffChars ? diffChars.toString() : '',
] as [string, string, string, string, string]
}),
]
Expand Down Expand Up @@ -250,9 +250,17 @@ async function run() {
}

function serializeTokens(shiki: HighlighterGeneric<BundledLanguage, BundledTheme>, sample: string, lang: string) {
const tokens = shiki.codeToTokensBase(sample, { lang: lang as any, theme: 'vitesse-dark' })
const str = tokens.flat(1).map(t => t.color?.padEnd(18, ' ') + t.content).join('\n')
return str
const tokens = shiki
.codeToTokensBase(sample, { lang: lang as any, theme: 'vitesse-dark' })
.flat(1)
.map(t => t.color?.padEnd(18, ' ') + t.content)
.join('\n')
const html = shiki
.codeToHtml(sample, { lang: lang as any, theme: 'vitesse-dark' })
return {
tokens,
html,
}
}

function getPatternsOfGrammar(grammar: any): Set<string> {
Expand Down

0 comments on commit 43ecce7

Please sign in to comment.