third_party/rust/unicode-normalization/scripts/unicode_gen_normtests.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81

#!/usr/bin/env python
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - NormalizationTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.

import unicode, re, os, fileinput

def load_test_data(f):
    outls = []
    testRe = re.compile("^(.*?);(.*?);(.*?);(.*?);(.*?);\s+#.*$")

    unicode.fetch(f)
    for line in fileinput.input(os.path.basename(f)):
        # comment and header lines start with # and @ respectively
        if len(line) < 1 or line[0:1] == '#' or line[0:1] == '@':
            continue

        m = testRe.match(line)
        groups = []
        if not m:
            print "error: no match on line where test was expected: %s" % line
            continue

        has_surrogates = False
        for i in range(1, 6):
            group = []
            chs = m.group(i).split()
            for ch in chs:
                intch = int(ch,16)
                if unicode.is_surrogate(intch):
                    has_surrogates = True
                    break
                group.append(intch)

            if has_surrogates:
                break
            groups.append(group)

        if has_surrogates:
            continue
        outls.append(groups)

    return outls

def showfun(gs):
    outstr = '('
    gfirst = True
    for g in gs:
        if not gfirst:
            outstr += ','
        gfirst = False

        outstr += '"'
        for ch in g:
            outstr += "\\u{%x}" % ch
        outstr += '"'
    outstr += ')'
    return outstr

if __name__ == "__main__":
    d = load_test_data("NormalizationTest.txt")
    ntype = "&'static [(&'static str, &'static str, &'static str, &'static str, &'static str)]"
    with open("testdata.rs", "w") as nf:
        nf.write(unicode.preamble)
        nf.write("\n")
        nf.write("    // official Unicode test data\n")
        nf.write("    // http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt\n")
        unicode.emit_table(nf, "TEST_NORM", d, ntype, True, showfun)