Skip to content

Commit

Permalink
Auto merge of #33098 - raphlinus:master, r=alexcrichton
Browse files Browse the repository at this point in the history
Efficient trie lookup for boolean Unicode properties

Replace binary search of ranges with trie lookup using leaves of
64-bit bitmap chunks. Benchmarks suggest this is approximately 10x
faster than the bsearch approach.
  • Loading branch information
bors committed May 23, 2016
2 parents 6e45564 + cfaf66c commit 4c6b6c2
Show file tree
Hide file tree
Showing 2 changed files with 1,383 additions and 1,134 deletions.
138 changes: 134 additions & 4 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@

import fileinput, re, os, sys, operator

bytes_old = 0
bytes_new = 0

preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
Expand Down Expand Up @@ -307,12 +310,137 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
format_table_content(f, data, 8)
f.write("\n ];\n\n")

def emit_trie_lookup_range_table(f):
f.write("""
// BoolTrie is a trie for representing a set of Unicode codepoints. It is
// implemented with postfix compression (sharing of identical child nodes),
// which gives both compact size and fast lookup.
//
// The space of Unicode codepoints is divided into 3 subareas, each
// represented by a trie with different depth. In the first (0..0x800), there
// is no trie structure at all; each u64 entry corresponds to a bitvector
// effectively holding 64 bool values.
//
// In the second (0x800..0x10000), each child of the root node represents a
// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
// the trie stores an 8-bit index into a shared table of leaf values. This
// exploits the fact that in reasonable sets, many such leaves can be shared.
//
// In the third (0x10000..0x110000), each child of the root node represents a
// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
// of a child tree. Each of these 64 bytes represents an index into the table
// of shared 64-bit leaf values. This exploits the sparse structure in the
// non-BMP range of most Unicode sets.
pub struct BoolTrie {
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
r1: [u64; 32], // leaves
// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
r2: [u8; 992], // first level
r3: &'static [u64], // leaves
// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
r4: [u8; 256], // first level
r5: &'static [u8], // second level
r6: &'static [u64], // leaves
}
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
((bitmap_chunk >> (c & 63)) & 1) != 0
}
fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
let c = c as usize;
if c < 0x800 {
trie_range_leaf(c, r.r1[c >> 6])
} else if c < 0x10000 {
let child = r.r2[(c >> 6) - 0x20];
trie_range_leaf(c, r.r3[child as usize])
} else {
let child = r.r4[(c >> 12) - 0x10];
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
trie_range_leaf(c, r.r6[leaf as usize])
}
}\n
""")

def compute_trie(rawdata, chunksize):
root = []
childmap = {}
child_data = []
for i in range(len(rawdata) / chunksize):
data = rawdata[i * chunksize: (i + 1) * chunksize]
child = '|'.join(map(str, data))
if child not in childmap:
childmap[child] = len(childmap)
child_data.extend(data)
root.append(childmap[child])
return (root, child_data)

def emit_bool_trie(f, name, t_data, is_pub=True):
global bytes_old, bytes_new
bytes_old += 8 * len(t_data)
CHUNK = 64
rawdata = [False] * 0x110000;
for (lo, hi) in t_data:
for cp in range(lo, hi + 1):
rawdata[cp] = True

# convert to bitmap chunks of 64 bits each
chunks = []
for i in range(0x110000 / CHUNK):
chunk = 0
for j in range(64):
if rawdata[i * 64 + j]:
chunk |= 1 << j
chunks.append(chunk)

pub_string = ""
if is_pub:
pub_string = "pub "
f.write(" %sconst %s: &'static super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
f.write(" r1: [\n")
data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 / CHUNK])
format_table_content(f, data, 12)
f.write("\n ],\n")

# 0x800..0x10000 trie
(r2, r3) = compute_trie(chunks[0x800 / CHUNK : 0x10000 / CHUNK], 64 / CHUNK)
f.write(" r2: [\n")
data = ','.join(str(node) for node in r2)
format_table_content(f, data, 12)
f.write("\n ],\n")
f.write(" r3: &[\n")
data = ','.join('0x%016x' % chunk for chunk in r3)
format_table_content(f, data, 12)
f.write("\n ],\n")

# 0x10000..0x110000 trie
(mid, r6) = compute_trie(chunks[0x10000 / CHUNK : 0x110000 / CHUNK], 64 / CHUNK)
(r4, r5) = compute_trie(mid, 64)
f.write(" r4: [\n")
data = ','.join(str(node) for node in r4)
format_table_content(f, data, 12)
f.write("\n ],\n")
f.write(" r5: &[\n")
data = ','.join(str(node) for node in r5)
format_table_content(f, data, 12)
f.write("\n ],\n")
f.write(" r6: &[\n")
data = ','.join('0x%016x' % chunk for chunk in r6)
format_table_content(f, data, 12)
f.write("\n ],\n")

f.write(" };\n\n")
bytes_new += 256 + 992 + 256 + 8 * len(r3) + len(r5) + 8 * len(r6)

def emit_property_module(f, mod, tbl, emit):
f.write("pub mod %s {\n" % mod)
for cat in sorted(emit):
emit_table(f, "%s_table" % cat, tbl[cat])
emit_bool_trie(f, "%s_table" % cat, tbl[cat])
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n\n")

Expand Down Expand Up @@ -402,8 +530,9 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
norm_props = load_properties("DerivedNormalizationProps.txt",
["Full_Composition_Exclusion"])

# bsearch_range_table is used in all the property modules below
emit_bsearch_range_table(rf)
# trie_lookup_table is used in all the property modules below
emit_trie_lookup_range_table(rf)
# emit_bsearch_range_table(rf)

# category tables
for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
Expand All @@ -414,3 +543,4 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
# normalizations and conversions module
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
emit_conversions_module(rf, to_upper, to_lower, to_title)
#print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)
Loading

0 comments on commit 4c6b6c2

Please sign in to comment.