diff --git a/scripts/trie/decode-trie.spec.ts b/scripts/trie/decode-trie.spec.ts
new file mode 100644
index 00000000..d491fb14
--- /dev/null
+++ b/scripts/trie/decode-trie.spec.ts
@@ -0,0 +1,104 @@
+import { encodeTrie } from "./encode-trie.js";
+import { decodeNode } from "./decode-trie.js";
+
+import { getTrie } from "./trie.js";
+
+import xmlMap from "../../maps/xml.json";
+import entityMap from "../../maps/entities.json";
+import legacyMap from "../../maps/legacy.json";
+
+/**
+ * Decodes an encoded trie, starting at its root node, into a flat
+ * `key -> value` object.
+ */
+function decode(decodeMap: number[]): Record<string, string> {
+    const map: Record<string, string> = {};
+    decodeNode(decodeMap, map, "", 0);
+
+    return map;
+}
+
+/**
+ * Builds the object the decoder is expected to produce: every key of `map`
+ * with a `;` appended, plus the entries of `legacy` under their own keys.
+ * On a key collision, the entry derived from `map` wins.
+ */
+function mergeMaps(
+    map: Record<string, string>,
+    legacy: Record<string, string>
+): Record<string, string> {
+    return {
+        ...legacy,
+        ...Object.fromEntries(
+            Object.entries(map).map(([key, value]) => [`${key};`, value])
+        ),
+    };
+}
+
+describe("decode_trie", () => {
+    it("should decode an empty node", () =>
+        expect(decode([0b0000_0000_0000_0000])).toStrictEqual({}));
+
+    it("should decode an empty encode", () =>
+        expect(decode(encodeTrie({}))).toStrictEqual({}));
+
+    it("should decode a node with a value", () =>
+        expect(decode(encodeTrie({ value: "a" }))).toStrictEqual({ "": "a" }));
+
+    it("should decode a node with a multi-byte value", () =>
+        expect(decode(encodeTrie({ value: "ab" }))).toStrictEqual({
+            "": "ab",
+        }));
+
+    it("should decode a branch of size 1", () =>
+        expect(
+            decode(
+                encodeTrie({
+                    next: new Map([["b".charCodeAt(0), { value: "a" }]]),
+                })
+            )
+        ).toStrictEqual({
+            b: "a",
+        }));
+
+    it("should decode a dictionary of size 2", () =>
+        expect(
+            decode(
+                encodeTrie({
+                    next: new Map([
+                        ["A".charCodeAt(0), { value: "a" }],
+                        ["b".charCodeAt(0), { value: "B" }],
+                    ]),
+                })
+            )
+        ).toStrictEqual({
+            A: "a",
+            b: "B",
+        }));
+
+    it("should decode a jump table of size 2", () =>
+        expect(
+            decode(
+                encodeTrie({
+                    next: new Map([
+                        ["a".charCodeAt(0), { value: "a" }],
+                        ["b".charCodeAt(0), { value: "B" }],
+                    ]),
+                })
+            )
+        ).toStrictEqual({
+            a: "a",
+            b: "B",
+        }));
+
+    it("should decode the XML map", () =>
+        expect(decode(encodeTrie(getTrie(xmlMap, {})))).toStrictEqual(
+            mergeMaps(xmlMap, {})
+        ));
+
+    // Test takes a long time — skipped by default
+    it.skip("should decode the HTML map", () =>
+        expect(decode(encodeTrie(getTrie(entityMap, legacyMap)))).toStrictEqual(
+            mergeMaps(entityMap, legacyMap)
+        ));
+});
diff --git a/scripts/trie/decode-trie.ts b/scripts/trie/decode-trie.ts
new file mode 100644
index 00000000..ccf6b0c6
--- /dev/null
+++ b/scripts/trie/decode-trie.ts
@@ -0,0 +1,94 @@
+/** Bit masks for the header entry of a node in the encoded trie. */
+enum BinTrieFlags {
+    /** 0: no value; 1: value inlined in the header; 2 or 3: value follows. */
+    VALUE_LENGTH = 0b1100_0000_0000_0000,
+    /** Number of branches; 0 means a leaf or a single inlined branch. */
+    BRANCH_LENGTH = 0b0011_1111_1000_0000,
+    /** Char code of a single branch, or first char code of a jump table. */
+    JUMP_TABLE = 0b0000_0000_0111_1111,
+}
+
+/**
+ * Recursively decodes the node at `startIndex` of an encoded trie, storing
+ * every value reachable from it in `resultMap` under the string of
+ * characters that leads to it.
+ *
+ * @param decodeMap Encoded trie.
+ * @param resultMap Receives the decoded `key -> value` pairs.
+ * @param prefix Characters consumed on the way to this node.
+ * @param startIndex Index of the node's header entry in `decodeMap`.
+ */
+export function decodeNode(
+    decodeMap: number[],
+    resultMap: Record<string, string>,
+    prefix: string,
+    startIndex: number
+): void {
+    const current = decodeMap[startIndex];
+    const valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
+
+    if (valueLength > 0) {
+        /*
+         * Length 1: the value is the header with the length bits cleared.
+         * Length 2 or 3: the value is the one or two entries that follow.
+         */
+        resultMap[prefix] =
+            valueLength === 1
+                ? String.fromCharCode(
+                      decodeMap[startIndex] & ~BinTrieFlags.VALUE_LENGTH
+                  )
+                : valueLength === 2
+                ? String.fromCharCode(decodeMap[startIndex + 1])
+                : String.fromCharCode(
+                      decodeMap[startIndex + 1],
+                      decodeMap[startIndex + 2]
+                  );
+    }
+
+    const branchLength = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
+    const jumpOffset = current & BinTrieFlags.JUMP_TABLE;
+
+    // Leaf: an inlined value reuses the branch bits, or no branch bits are set.
+    if (valueLength === 1 || (branchLength === 0 && jumpOffset === 0)) {
+        return;
+    }
+
+    // Branch data starts after the header and any value entries.
+    const branchIdx = startIndex + Math.max(valueLength, 1);
+
+    if (branchLength === 0) {
+        // Single branch: `jumpOffset` is its char code, the child follows.
+        return decodeNode(
+            decodeMap,
+            resultMap,
+            prefix + String.fromCharCode(jumpOffset),
+            branchIdx
+        );
+    }
+
+    if (jumpOffset !== 0) {
+        // Jump table: slot `i` maps char code `jumpOffset + i` to the
+        // child's index plus one; a slot holding 0 is empty.
+        for (let i = 0; i < branchLength; i++) {
+            const childIdx = decodeMap[branchIdx + i] - 1;
+            if (childIdx !== -1) {
+                decodeNode(
+                    decodeMap,
+                    resultMap,
+                    prefix + String.fromCharCode(jumpOffset + i),
+                    childIdx
+                );
+            }
+        }
+    } else {
+        // Dictionary: `branchLength` char codes, then as many child indices.
+        for (let i = 0; i < branchLength; i++) {
+            decodeNode(
+                decodeMap,
+                resultMap,
+                prefix + String.fromCharCode(decodeMap[branchIdx + i]),
+                decodeMap[branchIdx + branchLength + i]
+            );
+        }
+    }
+}
diff --git a/scripts/trie/utils.ts b/scripts/trie/utils.ts
deleted file mode 100644
index d679e8af..00000000
--- a/scripts/trie/utils.ts
+++ /dev/null
@@ -1,79 +0,0 @@
-import { getTrie, TrieNode } from "./trie.js";
-import { encodeTrie } from "./encode-trie.js";
-import { BinTrieFlags } from "../../src/decode.js";
-import xmlMap from "../../maps/xml.json";
-
-/**
- * Utils for analzying the encoded trie.
- */
-
-const decodeXMLMap = encodeTrie(getTrie(xmlMap, {}));
-const parseCache = new Map();
-
-function parseNode(decodeMap: number[], startIndex: number): TrieNode {
-    const cached = parseCache.get(startIndex);
-    if (cached != null) return cached;
-    let index = startIndex;
-    const value = decodeMap[index++];
-    const hasValue = value & BinTrieFlags.HAS_VALUE;
-    const node: TrieNode = {
-        value: hasValue
-            ? value & BinTrieFlags.MULTI_BYTE
-                ? String.fromCharCode(decodeMap[index++], decodeMap[index++])
-                : String.fromCharCode(decodeMap[index++])
-            : undefined,
-        next: undefined,
-    };
-
-    parseCache.set(startIndex, node);
-
-    const branchLength = (value & BinTrieFlags.BRANCH_LENGTH) >>> 8;
-
-    if (branchLength) {
-        const next = (node.next = new Map());
-        if (branchLength === 1) {
-            next.set(decodeMap[index++], parseNode(decodeMap, index));
-        } else if (value & BinTrieFlags.JUMP_TABLE) {
-            const offset = decodeMap[index++];
-
-            for (let i = 0; i < branchLength; i++) {
-                if (decodeMap[index] !== 0) {
-                    const code = offset + i;
-                    next.set(code, parseNode(decodeMap, decodeMap[index + i]));
-                }
-            }
-        } else {
-            for (let i = 0; i < branchLength; i++) {
-                const char = decodeMap[index + i];
-                const nextNode = parseNode(
-                    decodeMap,
-                    decodeMap[index + branchLength + i]
-                );
-                next.set(char, nextNode);
-            }
-        }
-    }
-    return node;
-}
-
-const printed = new Set();
-function printTrie(trie: TrieNode, prefix = "") {
-    if (printed.has(trie)) return;
-    printed.add(trie);
-    console.log(
-        "prefix",
-        prefix,
-        "value",
-        trie.value,
-        "next size",
-        trie.next?.size
-    );
-    if (trie.next) {
-        trie.next.forEach((node, char) =>
-            printTrie(node, prefix + String.fromCharCode(char))
-        );
-    }
-}
-
-const parsedXMLDecodedTrie = parseNode(decodeXMLMap, 0);
-printTrie(parsedXMLDecodedTrie);