diff options
Diffstat (limited to 'third_party/rust/unicode-normalization/scripts/unicode_gen_normtests.py')
-rwxr-xr-x | third_party/rust/unicode-normalization/scripts/unicode_gen_normtests.py | 81 |
1 files changed, 81 insertions, 0 deletions
diff --git a/third_party/rust/unicode-normalization/scripts/unicode_gen_normtests.py b/third_party/rust/unicode-normalization/scripts/unicode_gen_normtests.py new file mode 100755 index 000000000..2c77ac584 --- /dev/null +++ b/third_party/rust/unicode-normalization/scripts/unicode_gen_normtests.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# +# Copyright 2015 The Rust Project Developers. See the COPYRIGHT +# file at the top-level directory of this distribution and at +# http://rust-lang.org/COPYRIGHT. +# +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# This script uses the following Unicode tables: +# - NormalizationTest.txt +# +# Since this should not require frequent updates, we just store this +# out-of-line and check the unicode.rs file into git. + +import unicode, re, os, fileinput + +def load_test_data(f): + outls = [] + testRe = re.compile("^(.*?);(.*?);(.*?);(.*?);(.*?);\s+#.*$") + + unicode.fetch(f) + for line in fileinput.input(os.path.basename(f)): + # comment and header lines start with # and @ respectively + if len(line) < 1 or line[0:1] == '#' or line[0:1] == '@': + continue + + m = testRe.match(line) + groups = [] + if not m: + print "error: no match on line where test was expected: %s" % line + continue + + has_surrogates = False + for i in range(1, 6): + group = [] + chs = m.group(i).split() + for ch in chs: + intch = int(ch,16) + if unicode.is_surrogate(intch): + has_surrogates = True + break + group.append(intch) + + if has_surrogates: + break + groups.append(group) + + if has_surrogates: + continue + outls.append(groups) + + return outls + +def showfun(gs): + outstr = '(' + gfirst = True + for g in gs: + if not gfirst: + outstr += ',' + gfirst = False + + outstr += '"' + for ch in g: + outstr += "\\u{%x}" % ch + outstr += '"' + outstr += ')' + return outstr + +if __name__ == "__main__": + d = load_test_data("NormalizationTest.txt") + ntype = "&'static [(&'static str, &'static str, &'static str, &'static str, &'static str)]" + with open("testdata.rs", "w") as nf: + nf.write(unicode.preamble) + nf.write("\n") + nf.write(" // official Unicode test data\n") + nf.write(" // http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt\n") + unicode.emit_table(nf, "TEST_NORM", d, ntype, True, showfun) |