From 413c48bc61e8efda8dad3968b26cedad363302ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Thu, 7 Apr 2022 10:24:49 +0100 Subject: [PATCH] fix(encode): Handle edge-case with surrogate pairs --- src/encode-trie.ts | 42 ++++++++++++++++++++++-------------------- src/encode.spec.ts | 3 +++ 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/encode-trie.ts b/src/encode-trie.ts index e95b24cd..ba0ea83f 100644 --- a/src/encode-trie.ts +++ b/src/encode-trie.ts @@ -30,36 +30,38 @@ export function encodeHTMLTrieRe(regExp: RegExp, str: string): string { while ((match = regExp.exec(str)) !== null) { const i = match.index; + ret += str.substring(lastIdx, i); const char = str.charCodeAt(i); let next = htmlTrie.get(char); - if (next != null) { - if (typeof next !== "string") { - // We are in a branch. Try to match the next char. - if (i + 1 < str.length) { - const value = - typeof next.n === "number" - ? next.n === str.charCodeAt(i + 1) - ? next.o - : undefined - : next.n.get(str.charCodeAt(i + 1)); + if (typeof next === "object") { + // We are in a branch. Try to match the next char. + if (i + 1 < str.length) { + const nextChar = str.charCodeAt(i + 1); + const value = + typeof next.n === "number" + ? next.n === nextChar + ? next.o + : undefined + : next.n.get(nextChar); - if (value !== undefined) { - ret += str.substring(lastIdx, i) + value; - lastIdx = regExp.lastIndex += 1; - continue; - } + if (value !== undefined) { + ret += value; + lastIdx = regExp.lastIndex += 1; + continue; } - - // If we have a character without a value, use a numeric entitiy. - next = next.v ?? `&#x${char.toString(16)};`; } - ret += str.substring(lastIdx, i) + next; + next = next.v; + } + + // We might have a tree node without a value; skip and use a numeric entity. 
+ if (next !== undefined) { + ret += next; lastIdx = i + 1; } else { const cp = getCodePoint(str, i); - ret += `${str.substring(lastIdx, i)}&#x${cp.toString(16)};`; + ret += `&#x${cp.toString(16)};`; // Increase by 1 if we have a surrogate pair lastIdx = regExp.lastIndex += Number(cp !== char); } diff --git a/src/encode.spec.ts b/src/encode.spec.ts index a0e13432..dd334a98 100644 --- a/src/encode.spec.ts +++ b/src/encode.spec.ts @@ -54,6 +54,9 @@ describe("Encode->decode test", () => { it("should encode trailing parts of entities", () => expect(entities.encodeHTML("\ud835")).toBe("&#xd835;")); + + it("should encode surrogate pair with first surrogate equivalent of entity, without corresponding entity", () => + expect(entities.encodeHTML("\u{1d4a4}")).toBe("&#x1d4a4;")); }); describe("encodeNonAsciiHTML", () => {