diff options
Diffstat (limited to 'third_party/rust/unicode-normalization/scripts/unicode.py')
-rwxr-xr-x | third_party/rust/unicode-normalization/scripts/unicode.py | 372 |
1 files changed, 372 insertions, 0 deletions
#!/usr/bin/env python
#
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - ReadMe.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated tables.rs file into git.
#
# The code below is written to run unchanged on both Python 2 and Python 3.

import fileinput
import os
import re
import subprocess
import sys

preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# these are the surrogate codepoints, which are not valid rust characters
surrogate_codepoints = (0xd800, 0xdfff)


def fetch(f):
    """Make sure the Unicode data file `f` is present in the current directory.

    If no file named `basename(f)` exists, `curl -O` is used to download it
    from the UNIDATA directory on unicode.org. If the file is still missing
    afterwards, an error is written to stderr and the process exits with
    status 1.
    """
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        url = "http://www.unicode.org/Public/UNIDATA/%s" % f
        try:
            # Argument list instead of a shell string: nothing is re-parsed
            # by a shell.
            subprocess.call(["curl", "-O", url])
        except OSError:
            # curl could not be started; the existence check below reports it.
            pass

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)


def is_surrogate(n):
    """Return True if codepoint `n` lies in the surrogate range."""
    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]


def load_unicode_data(f):
    """Parse the UnicodeData-format file `f`.

    Returns a tuple `(canon_decomp, compat_decomp, combines,
    general_category_mark)`:

    - `canon_decomp`: codepoint -> list of codepoints, for decomposition
      fields without a `<tag>` prefix.
    - `compat_decomp`: codepoint -> list of codepoints, for decomposition
      fields that start with a `<tag>`.
    - `combines`: list of `(lo, hi, combining_class)` ranges sorted by `lo`,
      covering codepoints whose combining class field is not "0". The class
      is kept as the string read from the file.
    - `general_category_mark`: list of `(lo, hi)` ranges of codepoints whose
      general category is, or expands to, `M`.
    """
    fetch(f)
    combines = {}
    canon_decomp = {}
    compat_decomp = {}
    general_category_mark = []

    udict = {}
    range_start = -1
    # Read the same path that fetch() checked for.
    for line in fileinput.input(os.path.basename(f)):
        data = line.split(';')
        if len(data) != 15:
            continue
        cp = int(data[0], 16)
        if is_surrogate(cp):
            continue
        if range_start >= 0:
            # This line closes a "<..., First>" range: every codepoint in
            # between gets this line's data.
            for i in range(range_start, cp):
                udict[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            range_start = cp
            continue
        udict[cp] = data

    for code in udict:
        fields = udict[code]
        gencat = fields[2]
        combine = fields[3]
        decomp = fields[5]

        # store decomposition, if given
        if decomp != "":
            if decomp.startswith('<'):
                # compatibility mapping: drop the leading <tag>
                compat_decomp[code] = [int(i, 16) for i in decomp.split()[1:]]
            else:
                canon_decomp[code] = [int(i, 16) for i in decomp.split()]

        # record combining class, if any
        if combine != "0":
            combines.setdefault(combine, []).append(code)

        if 'M' in [gencat] + expanded_categories.get(gencat, []):
            general_category_mark.append(code)

    general_category_mark = group_cat(general_category_mark)
    combines = to_combines(group_cats(combines))

    return (canon_decomp, compat_decomp, combines, general_category_mark)


def group_cats(cats):
    """Apply `group_cat` to every value of the dict `cats`."""
    return dict((cat, group_cat(cats[cat])) for cat in cats)


def group_cat(cat):
    """Collapse an iterable of codepoints into sorted inclusive `(lo, hi)` runs.

    Duplicates are ignored. An empty input yields an empty list.
    """
    letters = sorted(set(cat))
    if not letters:
        return []
    cat_out = []
    cur_start = cur_end = letters[0]
    for letter in letters[1:]:
        assert letter > cur_end, \
            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
        if letter == cur_end + 1:
            cur_end = letter
        else:
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out


def ungroup_cat(cat):
    """Expand a list of inclusive `(lo, hi)` ranges into individual codepoints."""
    cat_out = []
    for (lo, hi) in cat:
        cat_out.extend(range(lo, hi + 1))
    return cat_out


def to_combines(combs):
    """Flatten `{class: [(lo, hi), ...]}` into `(lo, hi, class)` tuples sorted by `lo`."""
    combs_out = []
    for comb in combs:
        for (lo, hi) in combs[comb]:
            combs_out.append((lo, hi, comb))
    combs_out.sort(key=lambda comb: comb[0])
    return combs_out


def format_table_content(f, content, indent):
    """Write comma-separated `content` to `f`, wrapped to lines under 98 columns.

    `content` is split on every comma and the pieces are re-joined with ", ".
    Each output line starts with `indent` spaces. No trailing newline is
    written after the last line.
    """
    line = " " * indent
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) < 98:
            if first:
                line += chunk
            else:
                line += ", " + chunk
            first = False
        else:
            f.write(line + ",\n")
            line = " " * indent + chunk
    f.write(line)


def load_properties(f, interestingprops):
    """Parse a property file made of `XXXX ; Prop` and `XXXX..YYYY ; Prop` lines.

    Returns a dict mapping each property name to a list of inclusive
    `(lo, hi)` ranges, merged where adjacent. If `interestingprops` is
    non-empty, only the properties it contains are kept.
    """
    fetch(f)
    props = {}
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    for line in fileinput.input(os.path.basename(f)):
        m = re1.match(line)
        if m:
            d_lo = m.group(1)
            d_hi = m.group(1)
            prop = m.group(2)
        else:
            m = re2.match(line)
            if not m:
                continue
            d_lo = m.group(1)
            d_hi = m.group(2)
            prop = m.group(3)
        if interestingprops and prop not in interestingprops:
            continue
        props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))

    # optimize if possible
    for prop in props:
        props[prop] = group_cat(ungroup_cat(props[prop]))

    return props


def escape_char(c):
    """Format codepoint `c` as a single-quoted Rust char literal in hex-escape form."""
    return "'\\u{%x}'" % c


def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    """Write a Rust `const` slice named `name` of type `t_type` to `f`.

    Each element of `t_data` is rendered with `pfun`. The declaration is
    prefixed with `pub` when `is_pub` is true.
    """
    pub_string = "pub " if is_pub else ""
    f.write("    %sconst %s: %s = &[\n" % (pub_string, name, t_type))
    data = ",".join(pfun(dat) for dat in t_data)
    format_table_content(f, data, 8)
    f.write("\n    ];\n\n")


def emit_norm_module(f, canon, compat, combine, norm_props, general_category_mark):
    """Write the Rust `normalization` module to `f`.

    Emits the canonical and compatibility decomposition tables, the canonical
    composition table, the combining class table with its lookup function,
    and the `general_category_mark` table with `is_combining_mark`. The
    composition table is derived from two-codepoint canonical decompositions
    that are not in `norm_props["Full_Composition_Exclusion"]`.
    """
    canon_keys = sorted(canon)
    compat_keys = sorted(compat)

    canon_comp = {}
    comp_exclusions = norm_props["Full_Composition_Exclusion"]
    for char in canon_keys:
        if any(lo <= char <= hi for (lo, hi) in comp_exclusions):
            continue
        decomp = canon[char]
        if len(decomp) == 2:
            canon_comp.setdefault(decomp[0], []).append((decomp[1], char))
    canon_comp_keys = sorted(canon_comp)

    f.write("pub mod normalization {\n")

    def mkdata_fun(table):
        # Build a formatter that renders `(char, &[decomposition...])`.
        def fmt_entry(char):
            parts = ",".join(escape_char(d) for d in table[char])
            return "(%s,&[%s])" % (escape_char(char), parts)
        return fmt_entry

    f.write("    // Canonical decompositions\n")
    emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
        pfun=mkdata_fun(canon))

    f.write("    // Compatibility decompositions\n")
    emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
        pfun=mkdata_fun(compat))

    def comp_pfun(char):
        # Order the (second, composed) pairs by the second codepoint.
        canon_comp[char].sort(key=lambda pair: pair[0])
        parts = ",".join("(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
                         for pair in canon_comp[char])
        return "(%s,&[%s])" % (escape_char(char), parts)

    f.write("    // Canonical compositions\n")
    emit_table(f, "composition_table", canon_comp_keys,
        "&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)

    f.write("""
    fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
        use std::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, result) = r[idx];
                result
            }
            Err(_) => 0
        }
    }\n
""")

    emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
        pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))

    f.write("    pub fn canonical_combining_class(c: char) -> u8 {\n"
        + "        bsearch_range_value_table(c, combining_class_table)\n"
        + "    }\n")

    f.write("""
    fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool {
        use std::cmp::Ordering::{Equal, Less, Greater};
        r.binary_search_by(|&(lo, hi)| {
            if lo <= c && c <= hi {
                Equal
            } else if hi < c {
                Less
            } else {
                Greater
            }
        })
        .is_ok()
    }

    /// Return whether the given character is a combining mark (`General_Category=Mark`)
    pub fn is_combining_mark(c: char) -> bool {
        bsearch_range_table(c, general_category_mark)
    }

""")

    # This table must come from the General_Category=Mark ranges, not from
    # the combining class ranges, or is_combining_mark answers the wrong
    # question.
    emit_table(f, "general_category_mark", general_category_mark,
        "&'static [(char, char)]", is_pub=False,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))

    f.write("""
}

""")


if __name__ == "__main__":
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        fetch("ReadMe.txt")
        with open("ReadMe.txt") as readme:
            pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
            version_match = re.search(pattern, readme.read())
        if version_match is None:
            sys.stderr.write("cannot find the Unicode version in ReadMe.txt\n")
            sys.exit(1)
        unicode_version = version_match.groups()
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-normalization is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

""" % unicode_version)
        (canon_decomp, compat_decomp, combines, general_category_mark) = \
            load_unicode_data("UnicodeData.txt")
        norm_props = load_properties("DerivedNormalizationProps.txt",
                                     ["Full_Composition_Exclusion"])

        # normalizations and conversions module
        emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props,
                         general_category_mark)