From 35d49f3bdb089c6afe5071068da6aca3ef922118 Mon Sep 17 00:00:00 2001 From: Charles Lew Date: Mon, 4 May 2020 18:00:08 +0800 Subject: [PATCH] Implement rustc_mixed_script_confusable_detection. --- scripts/unicode.py | 332 +++++++++++++++++- src/lib.rs | 2 + ...rustc_mixed_script_confusable_detection.rs | 17 + src/tables.rs | 155 ++++++++ 4 files changed, 497 insertions(+), 9 deletions(-) create mode 100644 src/rustc_mixed_script_confusable_detection.rs diff --git a/scripts/unicode.py b/scripts/unicode.py index 1ef28ec..fbcdd0a 100644 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -47,6 +47,15 @@ def fetch(f): sys.stderr.write("cannot load %s\n" % f) exit(1) +def fetch_unidata(f): + if not os.path.exists(os.path.basename(f)): + os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s" + % (UNICODE_VERSION_NUMBER, f)) + + if not os.path.exists(os.path.basename(f)): + sys.stderr.write("cannot load %s" % f) + exit(1) + # Implementation from unicode-segmentation def load_properties(f, interestingprops = None): fetch(f) @@ -81,6 +90,41 @@ def load_properties(f, interestingprops = None): return props +def load_script_properties(f, interestingprops): + fetch_unidata(f) + props = {} + # Note: these regexes are different from those in unicode-segmentation, + # becase we need to handle spaces here + re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#") + re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#") + + for line in fileinput.input(os.path.basename(f)): + prop = None + d_lo = 0 + d_hi = 0 + m = re1.match(line) + if m: + d_lo = m.group(1) + d_hi = m.group(1) + prop = m.group(2).strip() + else: + m = re2.match(line) + if m: + d_lo = m.group(1) + d_hi = m.group(2) + prop = m.group(3).strip() + else: + continue + if interestingprops and prop not in interestingprops: + continue + d_lo = int(d_lo, 16) + d_hi = int(d_hi, 16) + if prop not in props: + props[prop] = [] + props[prop].append((d_lo, d_hi)) + + return props + def load_confusables(f): fetch(f) confusables = [] @@ -97,12 +141,244 @@ def load_confusables(f): raise Exception('More than one code point in first column') d_input = int(d_inputs[0].strip(), 16) for d_output in m.group(2).split(): - d_outputitem = int(d_output, 16); - d_outputs.append(d_outputitem); + d_outputitem = int(d_output, 16) + d_outputs.append(d_outputitem) confusables.append((d_input, d_outputs)) return confusables +def aliases(): + """ + Fetch the shorthand aliases for each longhand Script name + """ + fetch_unidata("PropertyValueAliases.txt") + longforms = {} + shortforms = {} + re1 = re.compile(r"^ *sc *; *(\w+) *; *(\w+)") + for line in fileinput.input(os.path.basename("PropertyValueAliases.txt")): + m = re1.match(line) + if m: + l = m.group(2).strip() + s = m.group(1).strip() + assert(s not in longforms) + assert(l not in shortforms) + longforms[s] = l + shortforms[l] = s + else: + continue + + return (longforms, shortforms) + +def load_scripts(f): + (longforms, shortforms) = aliases() + scripts = load_script_properties(f, []) + + script_table = [] + script_list = [] + + for script in scripts: + if script not in ["Common", "Unknown", "Inherited"]: + script_list.append(shortforms[script]) + script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]]) + script_list.sort() + script_table.sort(key=lambda w: w[0]) + return (longforms, script_table) + +def is_script_ignored_in_mixedscript(source): + return source == 'Zinh' or source == 'Zyyy' or source == 'Zzzz' + +def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts): + script_lst = script_list(proto_lst, scripts) + script_lst.sort() + # here's a few rules to process current version of Unicode data (13.0 at this time) + script_lst_len = len(script_lst) + assert(script_lst_len > 0) + # Rule: A - A -> Processed, DontAdd + if script_lst_len == 1 and script_lst[0] == script_i: + return True, False + # Rule: A(not in (Zinh, Zyyy, Zzzz)) - B(not in (Zinh, Zyyy, Zzzz)) -> Processed, Add + if (script_lst_len == 1 and not is_script_ignored_in_mixedscript(script_lst[0]) + and not is_script_ignored_in_mixedscript(script_i) + and script_lst[0] != script_i): + return True, True + # Rule: (Zinh | Zyyy | Zzzz) - A(not in (Zinh, Zyyy, Zzzz)) -> Processed, Add + if (script_lst_len == 1 and is_script_ignored_in_mixedscript(script_lst[0]) + and not is_script_ignored_in_mixedscript(script_i)): + return True, True + # Rule: A ... - A -> Processed, DontAdd + if script_lst_len > 1 and script_i in script_lst: + return True, False + # Rule: (Zinh | Zyyy | Zzzz) A(not in (Zinh, Zyyy, Zzzz)) - B(not in (Zinh, Zyyy, Zzzz)) -> Processed, Add + if (script_lst_len == 2 and is_script_ignored_in_mixedscript(script_lst[0]) + and not is_script_ignored_in_mixedscript(script_lst[1]) + and not is_script_ignored_in_mixedscript(script_i) + and script_lst[1] != script_i): + return True, True + if (script_lst_len == 2 and is_script_ignored_in_mixedscript(script_lst[1]) + and not is_script_ignored_in_mixedscript(script_lst[0]) + and not is_script_ignored_in_mixedscript(script_i) + and script_lst[0] != script_i): + return True, True + # Rule: (Zinh | Zyyy | Zzzz) (Zinh | Zyyy | Zzzz) - A(not in (Zinh, Zyyy, Zzzz)) -> Processed, Add + if (script_lst_len == 2 and is_script_ignored_in_mixedscript(script_lst[0]) + and is_script_ignored_in_mixedscript(script_lst[1]) + and not is_script_ignored_in_mixedscript(script_i)): + return True, True + + # NotProcessed, DontAdd + return False, False + +def is_codepoint_identifier_allowed(c, identifier_allowed): + for data in identifier_allowed: + if c >= data[0] and c <= data[1]: + return True + return False + +def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts): + confusables = load_confusables(f) + seekup_map = {} + for item in confusables: + d_proto_list = item[1] + d_source = item[0] + assert(len(d_proto_list) > 0) + if len(d_proto_list) == 1: + seekup_map[escape_char(d_source)] = d_proto_list + # collect prototypes + codepoint_map = {} + multicodepoint_map = {} + for item in confusables: + d_source = item[0] + if not is_codepoint_identifier_allowed(d_source, identifier_allowed): + continue + d_proto_list = item[1] + if len(d_proto_list) == 1: + d_proto = escape_char(d_proto_list[0]) + if d_proto not in codepoint_map: + codepoint_map[d_proto] = [] + if d_proto not in seekup_map and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed): + codepoint_map[d_proto].append(d_proto_list[0]) + codepoint_map[d_proto].append(d_source) + else: + d_protos = escape_char_list(d_proto_list) + if d_protos not in multicodepoint_map: + multicodepoint_map[d_protos] = (d_proto_list, []) + multicodepoint_map[d_protos][1].append(d_source) + + mixedscript_confusable = {} + + def confusable_entry_item(confusable, script, item_text, item): + if script not in confusable: + confusable[script] = {} + script_entry = confusable[script] + if item_text not in script_entry: + script_entry[item_text] = (item, []) + return script_entry[item_text][1] + + # between single charpoint that has single charpoint prototype + for _, source in codepoint_map.items(): + source_len = len(source) + for i in range(0, source_len - 1): + for j in range(i + 1, source_len): + item_i, item_j = source[i], source[j] + script_i, script_j = codepoint_script(item_i, scripts), codepoint_script(item_j, scripts) + if script_i == script_j: + continue + if not is_script_ignored_in_mixedscript(script_i): + confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j) + if not is_script_ignored_in_mixedscript(script_j): + confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i) + + # between single charpoint that has multi charpoint prototype + for _, proto_lst_and_source in multicodepoint_map.items(): + source = proto_lst_and_source[1] + source_len = len(source) + for i in range(0, source_len - 1): + for j in range(i + 1, source_len): + item_i, item_j = source[i], source[j] + script_i, script_j = codepoint_script(item_i, scripts), codepoint_script(item_j, scripts) + if script_i == script_j: + continue + if not is_script_ignored_in_mixedscript(script_i): + confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j) + if not is_script_ignored_in_mixedscript(script_j): + confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i) + + mixedscript_confusable_unresolved = {} + # single charpoint that has multi charpoint prototype and its prototype + for _, proto_lst_and_source in multicodepoint_map.items(): + proto_lst = proto_lst_and_source[0] + proto_lst_can_be_part_of_identifier = True + for c in proto_lst: + if not is_codepoint_identifier_allowed(c, identifier_allowed): + proto_lst_can_be_part_of_identifier = False + break + if not proto_lst_can_be_part_of_identifier: + continue + source = proto_lst_and_source[1] + source_len = len(source) + for i in range(0, source_len): + item_i = source[i] + script_i = codepoint_script(item_i, scripts) + if is_script_ignored_in_mixedscript(script_i): + continue + processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts) + if should_add: + assert(processed) + confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append('multi') + if processed: + continue + proto_lst_text = escape_char_list(proto_lst) + if not proto_lst_text in mixedscript_confusable_unresolved: + mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, []) + mixedscript_confusable_unresolved[proto_lst_text][1].append(item_i) + return (mixedscript_confusable, mixedscript_confusable_unresolved) + +def codepoint_script(c, scripts): + for x, y, script in scripts: + if c >= x and c <= y: + return script + raise Exception("Not in scripts: " + escape_char(c)) + +def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts): + f.write("/* " + text + "\n") + for script, lst in mixedscript_confusable.items(): + f.write("/// Script - " + script + "\n") + source_lst = [v[0] for (_, v) in lst.items()] + source_lst.sort() + for source in source_lst: + source_text = escape_char(source) + source_item_and_target_lst = lst[source_text] + target_lst = source_item_and_target_lst[1] + f.write(source_text + " => " + escape_char_list(target_lst) + " // " + escape_script_list(target_lst, scripts)+ "\n") + f.write("*/\n") + + +def script_list(char_lst, scripts): + script_lst = [] + for c in char_lst: + if c == 'multi': + script = 'Z~multi' + else: + script = codepoint_script(c, scripts) + if script not in script_lst: + script_lst.append(script) + return script_lst + +def escape_script_list(char_lst, scripts): + script_lst = script_list(char_lst, scripts) + script_lst.sort() + return str(script_lst) + +def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts): + if len(map) == 0: + return + print("// " + text + "\n") + for prototype_text, pair in map.items(): + prototype = pair[0] + source = pair[1] + print(prototype_text + " => " + escape_char_list(source) + " // " + escape_script_list(prototype, scripts) + " => " + escape_script_list(source, scripts) + "\n") + raise Exception("update the python script to add new rules for new data") + def format_table_content(f, content, indent): line = " "*indent first = True @@ -119,18 +395,20 @@ def format_table_content(f, content, indent): f.write(line) def escape_char(c): + if c == 'multi': + return "\"\"" return "'\\u{%x}'" % c def escape_char_list(l): - line = "["; - first = True; + line = "[" + first = True for c in l: if first: - line += escape_char(c); + line += escape_char(c) else: - line += ", " + escape_char(c); - first = False; - line += "]"; + line += ", " + escape_char(c) + first = False + line += "]" return line def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True, @@ -226,7 +504,7 @@ def emit_confusable_detection_module(f): confusable_table.sort(key=lambda w: w[0]) last_key = None - for (k, v) in confusable_table: + for (k, _) in confusable_table: if k == last_key: raise Exception("duplicate keys in confusables table: %s" % k) last_key = k @@ -235,6 +513,40 @@ def emit_confusable_detection_module(f): pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1]))) f.write("}\n\n") +def escape_script_constant(name, longforms): + return "Script::" + longforms[name].strip() + +def emit_rustc_mixed_script_confusable_detection(f): + f.write("pub mod rustc_mixed_script_confusable_detection {") + f.write(""" + use unicode_script::Script; + + #[inline] + pub fn is_rustc_mixed_script_confusable(c: char) -> Option