fb55 · fb55 · Mar 26, 2022 · Mar 26, 2022 · Mar 26, 2022 · Mar 26, 2022
diff --git a/package.json b/package.json
@@ -50,6 +50,7 @@
         "prettier": "prettier '**/*.{ts,md,json,yml}'",
         "build": "tsc && cp -r src/maps lib",
         "build:docs": "typedoc --hideGenerator src/index.ts",
+        "build:trie": "ts-node scripts/write-decode-map.ts",
         "prepare": "npm run build"
     },
     "repository": {

diff --git a/scripts/trie/README.md b/scripts/trie/README.md
@@ -0,0 +1,94 @@
+# Named entity array-mapped trie generator
+
+In `v3.0.0`, `entities` adopted a version of the radix tree from
+[`parse5`](https://github.com/inikulin/parse5). The below is adapted from
+@inikulin's explanation of this structure.
+
+Prior to `parse5@3.0.0`, the library used a simple pre-generated
+[trie data structure](https://en.wikipedia.org/wiki/Trie) for
+[named character references](https://html.spec.whatwg.org/multipage/syntax.html#named-character-references)
+in the tokenizer. This approach suffered from huge constant memory consumption:
+the in-memory size of the structure was ~8.5 MB. The new approach reduces the
+size of the character reference data to ~250 KB, at equivalent performance.
+
+## Radix tree
+
+All entities are encoded as a trie, which contains _nodes_. Nodes contain data
+and branches.
+
+E.g. for the words `test`, `tester` and `testing`, we'll receive the following
+trie:
+
+Legend: `[a, ...]` - node, `*` - data.
+
+```
+              [t]
+               |
+              [e]
+               |
+              [s]
+               |
+              [t]
+               |
+           [e, i, *]
+           /   |
+         [r]  [n]
+          |    |
+         [*]  [g]
+               |
+              [*]
+```
+
+## Mapping the trie to an array
+
+Since we need to allocate an object for each node, the trie consumes a lot of
+memory. Therefore, we map our trie to an array, so we'll end up with just a
+single object. Since none of our indices or code points exceed `MAX_UINT16`
+(which is `0xFFFF`), we can use a `Uint16Array` for this.
+
+The only exception is
+[surrogate pairs](https://en.wikipedia.org/wiki/UTF-16#U.2B10000_to_U.2B10FFFF),
+which appear in named character reference results. They can be split across two
+`uint16` code units. The advantage of typed arrays is that they consume less
+memory and are extremely fast to traverse.
+
+### Node layout
+
+A node may contain one or two bytes of data and/or branch data. The layout of a
+node is as follows:
+
+```
+1 bit |  7 bit  |  1 bit  |  7 bit
+ \        \         \         \
+  \        \         \         \
+   \        \         \         jump table offset
+    \        \         flag if the value uses two bytes (for surrogate pairs)
+     \        number of branches
+      has value flag
+```
+
+If the _has value_ flag is set, the node will immediately be followed by the
+value. If it has any branch data (indicated by the _number of branches_ or the
+_jump table offset_ being set), it will then be followed by the branch data.
+
+### Branch data
+
+Branches can be represented in three different ways:
+
+1.  If we only have a single branch, and this branch wasn't encoded earlier in
+    the tree, we set the number of branches to 0 and the jump table offset to the
+    branch value. The node will be followed by the serialized branch.
+2.  If the branch values are close to one another, we use a jump table. This is
+    indicated by the jump table offset not being 0. The jump table is an array
+    of destination indices.
+3.  If the branch values are far apart, we use a dictionary. Branch data is
+    represented by two arrays, following one after another. The first array
+    contains sorted transition code points, the second one the corresponding
+    next edge/node indices. The traversing algorithm will use binary search to
+    find the key, and will then use the corresponding value as the jump target.
+
+The original `parse5` implementation used a radix tree as the basis for the
+encoded structure. It used a dictionary (see (3) above), as well as a variation
+of (1) for edges of the radix tree. The implementation in `entities` allowed us
+to use a trie when starting to decode, and gave us some space savings in the
+output.
diff --git a/scripts/trie/encode-trie.spec.ts b/scripts/trie/encode-trie.spec.ts
@@ -1,5 +1,6 @@
 import { BinTrieFlags } from "../../src/decode";
 import { encodeTrie } from "./encode-trie";
+import type { TrieNode } from "./trie";
 
 describe("encode_trie", () => {
     it("should encode an empty node", () => {
@@ -21,28 +22,40 @@ describe("encode_trie", () => {
         ]);
     });
 
-    it("should encode a node with a value and a postfix", () => {
-        expect(encodeTrie({ value: "a", postfix: "bc" })).toStrictEqual([
-            "b".charCodeAt(0),
-            "c".charCodeAt(0),
-            BinTrieFlags.HAS_VALUE,
-            "a".charCodeAt(0),
-        ]);
-    });
-
     it("should encode a branch of size 1", () => {
         expect(
             encodeTrie({
                 next: new Map([["b".charCodeAt(0), { value: "a" }]]),
             })
         ).toStrictEqual([
-            0b0000_0001_0000_0000,
             "b".charCodeAt(0),
             BinTrieFlags.HAS_VALUE,
             "a".charCodeAt(0),
         ]);
     });
 
+    it("should encode a branch of size 1 with a value that's already encoded", () => {
+        const nodeA: TrieNode = { value: "a" };
+        const nodeC = { next: new Map([["c".charCodeAt(0), nodeA]]) };
+        const trie = {
+            next: new Map<number, TrieNode>([
+                ["A".charCodeAt(0), nodeA],
+                ["b".charCodeAt(0), nodeC],
+            ]),
+        };
+        expect(encodeTrie(trie)).toStrictEqual([
+            0b0000_0010_0000_0000,
+            "A".charCodeAt(0),
+            "b".charCodeAt(0),
+            0b101,
+            0b111,
+            BinTrieFlags.HAS_VALUE,
+            "a".charCodeAt(0),
+            0b0000_0001_0000_0000 | "c".charCodeAt(0),
+            0b110, // Index plus one
+        ]);
+    });
+
     it("should encode a disjoint recursive branch", () => {
         const recursiveTrie = { next: new Map() };
         recursiveTrie.next.set("a".charCodeAt(0), { value: "a" });
@@ -64,7 +77,7 @@ describe("encode_trie", () => {
             jumpRecursiveTrie.next.set(val, jumpRecursiveTrie)
         );
         expect(encodeTrie(jumpRecursiveTrie)).toStrictEqual([
-            0b0000_1010_0000_0001, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
+            0b0000_1010_0011_0000, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
         ]);
     });
 });
diff --git a/scripts/trie/encode-trie.ts b/scripts/trie/encode-trie.ts
@@ -1,7 +1,7 @@
 /* eslint-disable node/no-unsupported-features/es-builtins */
 
 import * as assert from "assert";
-import { BinTrieFlags, JUMP_OFFSET_BASE } from "../../src/decode";
+import { BinTrieFlags } from "../../src/decode";
 import { TrieNode } from "./trie";
 
 function binaryLength(num: number) {
@@ -11,8 +11,7 @@ function binaryLength(num: number) {
 /**
  * Encodes the trie in binary form.
  *
- * We have four different types of nodes:
- * - Postfixes are ASCII values that match a particular character
+ * We have three different types of nodes:
  * - Values are UNICODE values that an entity resolves to
  * - Branches can be:
  *      1. If size is 1, then a matching character followed by the destination
@@ -32,7 +31,7 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
     const encodeCache = new Map<TrieNode, number>();
     const enc: number[] = [];
 
-    function encodeNode(node: TrieNode, depth: number): number {
+    function encodeNode(node: TrieNode): number {
         // Cache nodes, as we can have loops
         const cached = encodeCache.get(node);
         if (cached != null) return cached;
@@ -41,17 +40,6 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
 
         encodeCache.set(node, startIndex);
 
-        if (node.postfix != null) {
-            for (let i = 0; i < node.postfix.length; i++) {
-                const char = node.postfix.charCodeAt(i);
-
-                assert.ok(char < 128, "Char not in range");
-
-                // Start record with the postfix, as we have to match this first.
-                enc.push(char);
-            }
-        }
-
         const nodeIdx = enc.push(0) - 1;
 
         if (node.value != null) {
@@ -65,22 +53,14 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
                 enc.push(node.value.charCodeAt(i));
         }
 
-        if (node.next) addBranches(node.next, nodeIdx, depth + 1);
+        if (node.next) addBranches(node.next, nodeIdx);
 
-        assert.strictEqual(
-            nodeIdx,
-            startIndex + (node.postfix?.length ?? 0),
-            "Has expected location"
-        );
+        assert.strictEqual(nodeIdx, startIndex, "Has expected location");
 
         return startIndex;
     }
 
-    function addBranches(
-        next: Map<number, TrieNode>,
-        nodeIdx: number,
-        depth: number
-    ) {
+    function addBranches(next: Map<number, TrieNode>, nodeIdx: number) {
         const branches = Array.from(next.entries());
 
         // Sort branches ASC by key
@@ -93,11 +73,12 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
 
         // If we only have a single branch, we can write the next value directly
         if (branches.length === 1 && !encodeCache.has(branches[0][1])) {
-            enc[nodeIdx] |= branches.length << 8; // Write the length of the branch
+            const [char, next] = branches[0];
+
+            assert.ok(binaryLength(char) <= 7, "Too many bits for single char");
 
-            const [[char, next]] = branches;
-            enc.push(char);
-            encodeNode(next, depth);
+            enc[nodeIdx] |= char;
+            encodeNode(next);
             return;
         }
 
@@ -106,10 +87,10 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
         // If we have consecutive branches, we can write the next value as a jump table
 
         /*
-         * First, we determine how much overhead adding the jump table adds.
-         * If it is more than 2.5x, skip it.
+         * First, we determine how much space adding the jump table adds.
          *
-         * TODO: Determine best value
+         * If it is more than 2x the number of branches (which is equivalent
+         * to the size of the dictionary), skip it.
          */
 
         const jumpStartValue = branches[0][0];
@@ -120,7 +101,7 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
         const jumpTableOverhead = jumpTableLength / branches.length;
 
         if (jumpTableOverhead <= maxJumpTableOverhead) {
-            const jumpOffset = jumpStartValue - JUMP_OFFSET_BASE;
+            const jumpOffset = jumpStartValue;
 
             assert.ok(
                 binaryLength(jumpOffset) <= 16,
@@ -139,11 +120,10 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
             for (let i = 0; i < jumpTableLength; i++) enc.push(0);
 
             // Write the jump table
-            for (let i = 0; i < branches.length; i++) {
-                const [char, next] = branches[i];
+            for (const [char, next] of branches) {
                 const index = char - jumpStartValue;
                 // Write all values + 1, so 0 will result in a -1 when decoding
-                enc[branchIndex + index] = encodeNode(next, depth) + 1;
+                enc[branchIndex + index] = encodeNode(next) + 1;
             }
 
             return;
@@ -178,14 +158,14 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
                 Number.MAX_SAFE_INTEGER,
                 "Should have the placeholder as the second element"
             );
-            const offset = encodeNode(next, depth);
+            const offset = encodeNode(next);
 
             assert.ok(binaryLength(offset) <= 16, "Too many bits for offset");
             enc[currentIndex] = offset;
         });
     }
 
-    encodeNode(trie, 0);
+    encodeNode(trie);
 
     // Make sure that every value fits in a UInt16
     assert.ok(

diff --git a/scripts/trie/trie.ts b/scripts/trie/trie.ts
@@ -1,7 +1,5 @@
 export interface TrieNode {
     value?: string;
-    postfix?: string;
-    offset?: number;
     next?: Map<number, TrieNode>;
 }
 
@@ -10,6 +8,7 @@ export function getTrie(
     legacy: Record<string, string>
 ): TrieNode {
     const trie = new Map<number, TrieNode>();
+    const root = { next: trie };
 
     for (const key of Object.keys(map)) {
         // Resolve the key
@@ -27,27 +26,53 @@ export function getTrie(
         lastMap.set(";".charCodeAt(0), { value: map[key] });
     }
 
-    // Combine chains of nodes with a single branch to a postfix
-    function addPostfixes(node: TrieNode, offset: number) {
-        if (node.next) {
-            node.next.forEach((next) => addPostfixes(next, offset + 1));
-
-            if (node.value == null && node.next.size === 1) {
-                node.next.forEach((next, char) => {
-                    node.postfix =
-                        String.fromCharCode(char) + (next.postfix ?? "");
-                    node.value = next.value;
-                    node.next = next.next;
-                });
-            }
+    function isEqual(node1: TrieNode, node2: TrieNode): boolean { // Deep structural equality: same `value` and equal branches, compared recursively.
+        if (node1 === node2) return true; // Same object, nothing to compare.
+
+        if (node1.value !== node2.value) {
+            return false;
+        }
+
+        // Same `next` reference: either both are undefined or both are the very same Map.
+        if (node1.next === node2.next) return true;
+        if (
+            node1.next == null ||
+            node2.next == null ||
+            node1.next.size !== node2.next.size
+        ) {
+            return false; // Only one side has branches, or the branch counts differ.
         }
 
+        const next1 = [...node1.next]; // `[char, node]` entries, in each Map's iteration order.
+        const next2 = [...node2.next];
+
+        return next1.every(([char1, node1], idx) => { // Entries are paired by position, so both Maps must yield their keys in the same order.
+            const [char2, node2] = next2[idx];
+            return char1 === char2 && isEqual(node1, node2); // Same branch character and structurally equal subtree.
+        });
+    }
+
+    function mergeDuplicates(node: TrieNode) { // Rewrites the branches below `node` in place so that structurally equal nodes share a single object.
+        const nodes = [node]; // Work list that doubles as the list of distinct nodes seen so far.
+
+        for (let nodeIdx = 0; nodeIdx < nodes.length; nodeIdx++) { // Index loop: `nodes` grows while we iterate.
+            const { next } = nodes[nodeIdx];
+
+            if (!next) continue; // No branches to rewrite.
+
+            for (const [char, node] of next) {
+                const idx = nodes.findIndex((n) => isEqual(n, node)); // Linear scan for an already-seen node equal to this child.
+
+                if (idx >= 0) {
+                    next.set(char, nodes[idx]); // Point this branch at the node we saw first.
+                } else {
+                    nodes.push(node); // First occurrence: keep it and queue it so its own branches get visited.
+                }
+            }
         }
     }
 
-    trie.forEach((node) => addPostfixes(node, 0));
+    mergeDuplicates(root);
 
-    return { next: trie };
+    return root;
 }