Diffstat (limited to 'third_party/rust/unicode-normalization/scripts/unicode.py')
-rwxr-xr-x | third_party/rust/unicode-normalization/scripts/unicode.py | 372
1 file changed, 0 insertions, 372 deletions
diff --git a/third_party/rust/unicode-normalization/scripts/unicode.py b/third_party/rust/unicode-normalization/scripts/unicode.py
deleted file mode 100755
index 1b0ef9f52..000000000
--- a/third_party/rust/unicode-normalization/scripts/unicode.py
+++ /dev/null
@@ -1,372 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
-# file at the top-level directory of this distribution and at
-# http://rust-lang.org/COPYRIGHT.
-#
-# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-# option. This file may not be copied, modified, or distributed
-# except according to those terms.
-
-# This script uses the following Unicode tables:
-# - DerivedNormalizationProps.txt
-# - ReadMe.txt
-# - UnicodeData.txt
-#
-# Since this should not require frequent updates, we just store this
-# out-of-line and check the unicode.rs file into git.
-
-import fileinput, re, os, sys
-
-preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
-
-#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
-'''
-
-# Mapping taken from Table 12 from:
-# http://www.unicode.org/reports/tr44/#General_Category_Values
-expanded_categories = {
-    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
-    'Lm': ['L'], 'Lo': ['L'],
-    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
-    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
-    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
-    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
-    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
-    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
-    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
-}
-
-# these are the surrogate codepoints, which are not valid rust characters
-surrogate_codepoints = (0xd800, 0xdfff)
-
-def fetch(f):
-    if not os.path.exists(os.path.basename(f)):
-        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
-                  % f)
-
-    if not os.path.exists(os.path.basename(f)):
-        sys.stderr.write("cannot load %s" % f)
-        exit(1)
-
-def is_surrogate(n):
-    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
-
-def load_unicode_data(f):
-    fetch(f)
-    combines = {}
-    canon_decomp = {}
-    compat_decomp = {}
-    general_category_mark = []
-
-    udict = {};
-    range_start = -1;
-    for line in fileinput.input(f):
-        data = line.split(';');
-        if len(data) != 15:
-            continue
-        cp = int(data[0], 16);
-        if is_surrogate(cp):
-            continue
-        if range_start >= 0:
-            for i in xrange(range_start, cp):
-                udict[i] = data;
-            range_start = -1;
-        if data[1].endswith(", First>"):
-            range_start = cp;
-            continue;
-        udict[cp] = data;
-
-    for code in udict:
-        [code_org, name, gencat, combine, bidi,
-         decomp, deci, digit, num, mirror,
-         old, iso, upcase, lowcase, titlecase ] = udict[code];
-
-        # store decomposition, if given
-        if decomp != "":
-            if decomp.startswith('<'):
-                seq = []
-                for i in decomp.split()[1:]:
-                    seq.append(int(i, 16))
-                compat_decomp[code] = seq
-            else:
-                seq = []
-                for i in decomp.split():
-                    seq.append(int(i, 16))
-                canon_decomp[code] = seq
-
-        # record combining class, if any
-        if combine != "0":
-            if combine not in combines:
-                combines[combine] = []
-            combines[combine].append(code)
-
-        if 'M' in [gencat] + expanded_categories.get(gencat, []):
-            general_category_mark.append(code)
-    general_category_mark = group_cat(general_category_mark)
-
-    combines = to_combines(group_cats(combines))
-
-    return (canon_decomp, compat_decomp, combines, general_category_mark)
-
-def group_cats(cats):
-    cats_out = {}
-    for cat in cats:
-        cats_out[cat] = group_cat(cats[cat])
-    return cats_out
-
-def group_cat(cat):
-    cat_out = []
-    letters = sorted(set(cat))
-    cur_start = letters.pop(0)
-    cur_end = cur_start
-    for letter in letters:
-        assert letter > cur_end, \
-            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
-        if letter == cur_end + 1:
-            cur_end = letter
-        else:
-            cat_out.append((cur_start, cur_end))
-            cur_start = cur_end = letter
-    cat_out.append((cur_start, cur_end))
-    return cat_out
-
-def ungroup_cat(cat):
-    cat_out = []
-    for (lo, hi) in cat:
-        while lo <= hi:
-            cat_out.append(lo)
-            lo += 1
-    return cat_out
-
-def to_combines(combs):
-    combs_out = []
-    for comb in combs:
-        for (lo, hi) in combs[comb]:
-            combs_out.append((lo, hi, comb))
-    combs_out.sort(key=lambda comb: comb[0])
-    return combs_out
-
-def format_table_content(f, content, indent):
-    line = " "*indent
-    first = True
-    for chunk in content.split(","):
-        if len(line) + len(chunk) < 98:
-            if first:
-                line += chunk
-            else:
-                line += ", " + chunk
-            first = False
-        else:
-            f.write(line + ",\n")
-            line = " "*indent + chunk
-    f.write(line)
-
-def load_properties(f, interestingprops):
-    fetch(f)
-    props = {}
-    re1 = re.compile("^ *([0-9A-F]+) *; *(\w+)")
-    re2 = re.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
-
-    for line in fileinput.input(os.path.basename(f)):
-        prop = None
-        d_lo = 0
-        d_hi = 0
-        m = re1.match(line)
-        if m:
-            d_lo = m.group(1)
-            d_hi = m.group(1)
-            prop = m.group(2)
-        else:
-            m = re2.match(line)
-            if m:
-                d_lo = m.group(1)
-                d_hi = m.group(2)
-                prop = m.group(3)
-            else:
-                continue
-        if interestingprops and prop not in interestingprops:
-            continue
-        d_lo = int(d_lo, 16)
-        d_hi = int(d_hi, 16)
-        if prop not in props:
-            props[prop] = []
-        props[prop].append((d_lo, d_hi))
-
-    # optimize if possible
-    for prop in props:
-        props[prop] = group_cat(ungroup_cat(props[prop]))
-
-    return props
-
-def escape_char(c):
-    return "'\\u{%x}'" % c
-
-def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
-        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
-    pub_string = ""
-    if is_pub:
-        pub_string = "pub "
-    f.write("    %sconst %s: %s = &[\n" % (pub_string, name, t_type))
-    data = ""
-    first = True
-    for dat in t_data:
-        if not first:
-            data += ","
-        first = False
-        data += pfun(dat)
-    format_table_content(f, data, 8)
-    f.write("\n    ];\n\n")
-
-def emit_norm_module(f, canon, compat, combine, norm_props, general_category_mark):
-    canon_keys = canon.keys()
-    canon_keys.sort()
-
-    compat_keys = compat.keys()
-    compat_keys.sort()
-
-    canon_comp = {}
-    comp_exclusions = norm_props["Full_Composition_Exclusion"]
-    for char in canon_keys:
-        if True in map(lambda (lo, hi): lo <= char <= hi, comp_exclusions):
-            continue
-        decomp = canon[char]
-        if len(decomp) == 2:
-            if not canon_comp.has_key(decomp[0]):
-                canon_comp[decomp[0]] = []
-            canon_comp[decomp[0]].append( (decomp[1], char) )
-    canon_comp_keys = canon_comp.keys()
-    canon_comp_keys.sort()
-
-    f.write("pub mod normalization {\n")
-
-    def mkdata_fun(table):
-        def f(char):
-            data = "(%s,&[" % escape_char(char)
-            first = True
-            for d in table[char]:
-                if not first:
-                    data += ","
-                first = False
-                data += escape_char(d)
-            data += "])"
-            return data
-        return f
-
-    f.write("    // Canonical decompositions\n")
-    emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
-        pfun=mkdata_fun(canon))
-
-    f.write("    // Compatibility decompositions\n")
-    emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
-        pfun=mkdata_fun(compat))
-
-    def comp_pfun(char):
-        data = "(%s,&[" % escape_char(char)
-        canon_comp[char].sort(lambda x, y: x[0] - y[0])
-        first = True
-        for pair in canon_comp[char]:
-            if not first:
-                data += ","
-            first = False
-            data += "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
-        data += "])"
-        return data
-
-    f.write("    // Canonical compositions\n")
-    emit_table(f, "composition_table", canon_comp_keys,
-        "&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)
-
-    f.write("""
-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
-        use std::cmp::Ordering::{Equal, Less, Greater};
-        match r.binary_search_by(|&(lo, hi, _)| {
-            if lo <= c && c <= hi { Equal }
-            else if hi < c { Less }
-            else { Greater }
-        }) {
-            Ok(idx) => {
-                let (_, _, result) = r[idx];
-                result
-            }
-            Err(_) => 0
-        }
-    }\n
-""")
-
-    emit_table(f, "combining_class_table", combine, "&'static [(char, char, u8)]", is_pub=False,
-            pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
-
-    f.write("    pub fn canonical_combining_class(c: char) -> u8 {\n"
-        + "        bsearch_range_value_table(c, combining_class_table)\n"
-        + "    }\n")
-
-    f.write("""
-    fn bsearch_range_table(c: char, r: &'static [(char, char)]) -> bool {
-        use std::cmp::Ordering::{Equal, Less, Greater};
-        r.binary_search_by(|&(lo, hi)| {
-            if lo <= c && c <= hi {
-                Equal
-            } else if hi < c {
-                Less
-            } else {
-                Greater
-            }
-        })
-        .is_ok()
-    }
-
-    /// Return whether the given character is a combining mark (`General_Category=Mark`)
-    pub fn is_combining_mark(c: char) -> bool {
-        bsearch_range_table(c, general_category_mark)
-    }
-
-""")
-
-    emit_table(f, "general_category_mark", combine, "&'static [(char, char)]", is_pub=False,
-            pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
-
-    f.write("""
-}
-
-""")
-
-if __name__ == "__main__":
-    r = "tables.rs"
-    if os.path.exists(r):
-        os.remove(r)
-    with open(r, "w") as rf:
-        # write the file's preamble
-        rf.write(preamble)
-
-        # download and parse all the data
-        fetch("ReadMe.txt")
-        with open("ReadMe.txt") as readme:
-            pattern = "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
-            unicode_version = re.search(pattern, readme.read()).groups()
-        rf.write("""
-/// The version of [Unicode](http://www.unicode.org/)
-/// that this version of unicode-normalization is based on.
-pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
-
-""" % unicode_version)
-        (canon_decomp, compat_decomp, combines, general_category_mark) = \
-            load_unicode_data("UnicodeData.txt")
-        norm_props = load_properties("DerivedNormalizationProps.txt",
-                     ["Full_Composition_Exclusion"])
-
-        # normalizations and conversions module
-        emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props,
-                         general_category_mark)
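
The core technique in the deleted script is worth noting: `group_cat` collapses sorted codepoints into contiguous `(lo, hi)` ranges, and the generated Rust answers membership queries over those ranges by binary search (`bsearch_range_table`). Below is a minimal, self-contained Python 3 sketch of that same idea; the names `group_ranges` and `in_ranges` are illustrative, not from the original file:

```python
import bisect

def group_ranges(codepoints):
    """Collapse a sorted set of codepoints into (lo, hi) runs,
    mirroring what group_cat() does in the deleted script."""
    out = []
    for cp in sorted(set(codepoints)):
        if out and cp == out[-1][1] + 1:
            out[-1] = (out[-1][0], cp)  # extend the current run
        else:
            out.append((cp, cp))        # start a new run
    return out

def in_ranges(cp, ranges):
    """Binary-search the (lo, hi) runs for cp, the same lookup the
    generated bsearch_range_table performs in Rust."""
    i = bisect.bisect_right(ranges, (cp, 0x10FFFF)) - 1
    return i >= 0 and ranges[i][0] <= cp <= ranges[i][1]

if __name__ == "__main__":
    marks = group_ranges([0x0300, 0x0301, 0x0302, 0x0610])
    assert marks == [(0x0300, 0x0302), (0x0610, 0x0610)]
    assert in_ranges(0x0301, marks)
    assert not in_ranges(0x0041, marks)
```

Because Unicode properties cluster heavily into contiguous blocks, storing ranges instead of individual codepoints keeps the emitted `tables.rs` small while leaving lookups at O(log n).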