Skip to content

Commit

Permalink
refactor(trie): Store short values in nodes (#773)
Browse files Browse the repository at this point in the history
  • Loading branch information
fb55 authored Mar 31, 2022
1 parent 2a2bc5f commit 7347f3a
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 53 deletions.
26 changes: 15 additions & 11 deletions scripts/trie/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,26 +58,30 @@ A node may contain one or two bytes of data and/or branch data. The layout of a
node is as follows:

```
1 bit | 7 bit | 1 bit | 7 bit
\ \ \ \
\ \ \ \
\ \ \ jump table offset
\ \ flag if the value uses two bytes (for surrogate pairs)
2 bit | 7 bit | 7 bit
\ \ \
\ \ \
\ \ \
\ \ jump table offset
\ number of branches
has value flag
value length
```

If the _has value_ flag is set, the node will immediately be followed by the
value. If it has any branch data (indicated by the _number of branches_ or the
_jump table offset_ being set), it will then be followed by the branch data.
The _value length_ is the number of bytes in the value. If the length is 1, the
node does not have any branches and the value will be stored inside the lower 14
bits of the node. Otherwise, the value will be stored in the next bytes of the
array.

If it has any branch data (indicated by the _number of branches_ or the _jump
table offset_ being set), the node will be followed by the branch data.

### Branch data

Branches can be represented in three different ways:

1. If we only have a single branch, and this branch wasn't encoded earlier in
the tree, we set the number of branch to 0 and the jump table offset to the
branch value. The node will be followed by the serialized branch.
the tree, we set the number of branches to 0 and the jump table offset to
the branch value. The node will be followed by the serialized branch.
2. If the branch values are close to one another, we use a jump table. This is
indicated by the jump table offset not being 0. The jump table is an array
of destination indices.
Expand Down
25 changes: 10 additions & 15 deletions scripts/trie/encode-trie.spec.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { BinTrieFlags } from "../../src/decode";
import { encodeTrie } from "./encode-trie";
import type { TrieNode } from "./trie";

Expand All @@ -9,14 +8,13 @@ describe("encode_trie", () => {

it("should encode a node with a value", () => {
expect(encodeTrie({ value: "a" })).toStrictEqual([
BinTrieFlags.HAS_VALUE,
"a".charCodeAt(0),
0b0100_0000_0000_0000 | "a".charCodeAt(0),
]);
});

it("should encode a node with a multi-byte value", () => {
expect(encodeTrie({ value: "ab" })).toStrictEqual([
BinTrieFlags.HAS_VALUE | BinTrieFlags.MULTI_BYTE,
0b1100_0000_0000_0000,
"a".charCodeAt(0),
"b".charCodeAt(0),
]);
Expand All @@ -29,8 +27,7 @@ describe("encode_trie", () => {
})
).toStrictEqual([
"b".charCodeAt(0),
BinTrieFlags.HAS_VALUE,
"a".charCodeAt(0),
0b0100_0000_0000_0000 | "a".charCodeAt(0),
]);
});

Expand All @@ -44,14 +41,13 @@ describe("encode_trie", () => {
]),
};
expect(encodeTrie(trie)).toStrictEqual([
0b0000_0010_0000_0000,
0b0000_0001_0000_0000,
"A".charCodeAt(0),
"b".charCodeAt(0),
0b101,
0b111,
BinTrieFlags.HAS_VALUE,
"a".charCodeAt(0),
0b0000_0001_0000_0000 | "c".charCodeAt(0),
0b110,
0b0100_0000_0000_0000 | "a".charCodeAt(0),
0b0000_0000_1000_0000 | "c".charCodeAt(0),
0b110, // Index plus one
]);
});
Expand All @@ -61,13 +57,12 @@ describe("encode_trie", () => {
recursiveTrie.next.set("a".charCodeAt(0), { value: "a" });
recursiveTrie.next.set("0".charCodeAt(0), recursiveTrie);
expect(encodeTrie(recursiveTrie)).toStrictEqual([
0b0000_0010_0000_0000,
0b0000_0001_0000_0000,
"0".charCodeAt(0),
"a".charCodeAt(0),
0,
5,
BinTrieFlags.HAS_VALUE,
"a".charCodeAt(0),
0b0100_0000_0000_0000 | "a".charCodeAt(0),
]);
});

Expand All @@ -77,7 +72,7 @@ describe("encode_trie", () => {
jumpRecursiveTrie.next.set(val, jumpRecursiveTrie)
);
expect(encodeTrie(jumpRecursiveTrie)).toStrictEqual([
0b0000_1010_0011_0000, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
0b0000_0101_0011_0000, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
]);
});
});
40 changes: 31 additions & 9 deletions scripts/trie/encode-trie.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
/* eslint-disable node/no-unsupported-features/es-builtins */

import * as assert from "assert";
import { BinTrieFlags } from "../../src/decode";
import { TrieNode } from "./trie";

function binaryLength(num: number) {
Expand Down Expand Up @@ -43,14 +42,37 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
const nodeIdx = enc.push(0) - 1;

if (node.value != null) {
enc[nodeIdx] |= BinTrieFlags.HAS_VALUE;

if (node.value.length === 2) {
enc[nodeIdx] |= BinTrieFlags.MULTI_BYTE;
let valueLength = 0;

/*
* If we don't have a branch and the value is short, we can
* store the value in the node.
*/
if (
node.next ||
node.value.length > 1 ||
binaryLength(node.value.charCodeAt(0)) > 14
) {
valueLength = node.value.length;
}

for (let i = 0; i < node.value.length; i++)
enc.push(node.value.charCodeAt(i));
// Add 1 to the value length, to signal that we have a value.
valueLength += 1;

assert.ok(
binaryLength(valueLength) <= 2,
"Too many bits for value length"
);

enc[nodeIdx] |= valueLength << 14;

if (valueLength === 1) {
enc[nodeIdx] |= node.value.charCodeAt(0);
} else {
for (let i = 0; i < node.value.length; i++) {
enc.push(node.value.charCodeAt(i));
}
}
}

if (node.next) addBranches(node.next, nodeIdx);
Expand Down Expand Up @@ -109,7 +131,7 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
);

// Write the length of the adjusted table, plus jump offset
enc[nodeIdx] |= (jumpTableLength << 8) | jumpOffset;
enc[nodeIdx] |= (jumpTableLength << 7) | jumpOffset;

assert.ok(
binaryLength(jumpTableLength) <= 7,
Expand All @@ -129,7 +151,7 @@ export function encodeTrie(trie: TrieNode, maxJumpTableOverhead = 2): number[] {
return;
}

enc[nodeIdx] |= branches.length << 8;
enc[nodeIdx] |= branches.length << 7;

enc.push(
...branches.map(([char]) => char),
Expand Down
43 changes: 27 additions & 16 deletions src/decode.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ const enum CharCodes {
}

export enum BinTrieFlags {
HAS_VALUE = 0b1000_0000_0000_0000,
BRANCH_LENGTH = 0b0111_1111_0000_0000,
MULTI_BYTE = 0b0000_0000_1000_0000,
VALUE_LENGTH = 0b1100_0000_0000_0000,
BRANCH_LENGTH = 0b0011_1111_1000_0000,
JUMP_TABLE = 0b0000_0000_0111_1111,
}

Expand Down Expand Up @@ -91,31 +90,39 @@ function getDecoder(decodeTree: Uint16Array) {

current = decodeTree[treeIdx];

const masked = current & BinTrieFlags.VALUE_LENGTH;

// If the branch is a value, store it and continue
if (current & BinTrieFlags.HAS_VALUE) {
if (masked) {
// If we have a legacy entity while parsing strictly, just skip the number of bytes
if (strict && str.charCodeAt(strIdx) !== CharCodes.SEMI) {
// No need to consider multi-byte values, as the legacy entity is always a single byte
treeIdx += 1;
} else {
// If this is a surrogate pair, combine the higher bits from the node with the next byte
if (!strict || str.charCodeAt(strIdx) === CharCodes.SEMI) {
resultIdx = treeIdx;
treeIdx +=
1 +
Number((current & BinTrieFlags.MULTI_BYTE) !== 0);
excess = 0;
}

// The mask is the number of bytes of the value, including the current byte.
const valueLength = (masked >> 14) - 1;

if (valueLength === 0) break;

treeIdx += valueLength;
}
}

if (resultIdx !== 0) {
const valueLength =
(decodeTree[resultIdx] & BinTrieFlags.VALUE_LENGTH) >> 14;
ret +=
decodeTree[resultIdx] & BinTrieFlags.MULTI_BYTE
valueLength === 1
? String.fromCharCode(
decodeTree[resultIdx] & ~BinTrieFlags.VALUE_LENGTH
)
: valueLength === 2
? String.fromCharCode(decodeTree[resultIdx + 1])
: String.fromCharCode(
decodeTree[resultIdx + 1],
decodeTree[resultIdx + 2]
)
: String.fromCharCode(decodeTree[resultIdx + 1]);
);
lastIdx = strIdx - excess + 1;
}
}
Expand All @@ -130,13 +137,15 @@ export function determineBranch(
nodeIdx: number,
char: number
): number {
const branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 8;
const branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
const jumpOffset = current & BinTrieFlags.JUMP_TABLE;

// Case 1: Single branch encoded in jump offset
if (branchCount === 0) {
return jumpOffset !== 0 && char === jumpOffset ? nodeIdx : -1;
}

// Case 2: Multiple branches encoded in jump table
if (jumpOffset) {
const value = char - jumpOffset;

Expand All @@ -145,6 +154,8 @@ export function determineBranch(
: decodeTree[nodeIdx + value] - 1;
}

// Case 3: Multiple branches encoded in dictionary

// Binary search for the character.
let lo = nodeIdx;
let hi = lo + branchCount - 1;
Expand Down
2 changes: 1 addition & 1 deletion src/generated/decode-data-html.ts

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/generated/decode-data-xml.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
// Generated using scripts/write-decode-map.ts
// prettier-ignore
export default new Uint16Array([1024,97,103,108,113,9,23,27,31,1133,15,0,0,19,112,59,32768,38,111,115,59,32768,39,116,59,32768,62,116,59,32768,60,117,111,116,59,32768,34]);
export default new Uint16Array([512,97,103,108,113,9,21,24,27,621,15,0,0,18,112,59,16422,111,115,59,16423,116,59,16446,116,59,16444,117,111,116,59,16418]);

0 comments on commit 7347f3a

Please sign in to comment.