summaryrefslogtreecommitdiffstats
path: root/intl/unicharutil/tests/genNormalizationData.pl
blob: 816ab94e7add1ab2dd33ad47cc98b47026f987f4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/perl 
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Input: the Unicode normalization test data, read from the current directory.
# The three-argument form of open keeps the mode separate from the file name,
# and $! is reported so that e.g. a permission problem is not mistaken for a
# missing file.
open(TEXTFILE, '<', 'NormalizationTest.txt')
    or die "Cannot open NormalizationTest.txt: $!\n"
         . "The latest version should be available from\n"
         . " http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt\n";

# Output: the generated header, written to the current directory.
# (Swap in the commented-out line to write to a scratch file instead.)
#open(OUT, '>', 'test.txt')
open(OUT, '>', 'NormalizationData.h')
    or die "Cannot create output file NormalizationData.h: $!\n";

# Start the generated file with its banner: editor modeline, MPL 2.0 notice
# and a do-not-edit warning.  The heredoc is single-quoted because nothing in
# it is meant to be interpolated.
# NOTE(review): the banner names .../unicharutil/tools/genNormalizationData.pl
# as the generator -- confirm that this still matches where the script lives.
print OUT <<'END_OF_MPL';
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* 
    DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
    mozilla/intl/unicharutil/tools/genNormalizationData.pl
 */
END_OF_MPL

# XXX This code assumes that wchar_t is 16-bit unsigned, which is currently
#      true on Windows, Linux and Mac (with |g++ -fshort-wchar|).
#      To make it work where that assumption doesn't hold, one could generate
#      one huge array containing all the strings as 16-bit units (including
#      the 0 terminator) and initialize the array of testcaseLine with pointers
#      into the huge array.

# Convert one test-case column (space-separated hexadecimal code points, e.g.
# "0044 0307") into the body of a C wide-string literal: one \xXXXX escape per
# UTF-16 code unit.  Returns the empty string for a missing or empty column.
sub _utf16_escapes {
    my ($column) = @_;
    return '' unless defined $column;

    my $escapes = '';
    # split ' ' skips leading and repeated whitespace, so a stray extra space
    # never produces an empty token (which hex() would turn into \x0000).
    foreach my $token (split ' ', $column) {
        my $cp = hex($token);
        if ($cp < 0x10000) {
            # BMP code point: a single 16-bit unit.
            $escapes .= sprintf "\\x%04X", $cp;
        } else {
            # Non-BMP code point: convert to a surrogate pair.  0xD7C0 is
            # 0xD800 - (0x10000 >> 10), i.e. the high-surrogate base with the
            # 0x10000 offset already folded in.
            $escapes .= sprintf "\\x%04X\\x%04X",
                                ($cp >> 10) + 0xD7C0,
                                ($cp & 0x03FF) | 0xDC00;
        }
    }
    return $escapes;
}

# Extract the human-readable description from the comment part of a test
# line, i.e. the text that follows the parenthesised list of sample
# characters:  " (S1; S2; S3; S4; S5; ) DESCRIPTION".
# The match is anchored on the LAST "; ) " so that sample characters which
# are themselves semicolons do not shift the result.  Returns the empty
# string when the comment is missing or does not have that shape.
sub _case_description {
    my ($comment) = @_;
    return '' unless defined $comment;

    my $description = $comment =~ /\A.*; \) (.*)\z/s ? $1 : '';
    # The text ends up inside a C string literal, so protect the two
    # characters that would break it.
    $description =~ s/(["\\])/\\$1/g;
    return $description;
}

# Translate the test file line by line:
#   "# NormalizationTest-<version>.txt"  -> versionText[] definition
#   "@Part<n> ..."                       -> close the previous table (if any)
#                                           and open Part<n>TestData[]
#   other comment lines / blank lines    -> ignored
#   data lines                           -> one testcaseLine initializer
while (my $line = <TEXTFILE>) {
    # Remove only the line terminator (LF or CRLF).  Unlike chop, this does
    # not eat a real character when the final line has no newline.
    $line =~ s/[\r\n]+\z//;

    if ($line =~ /^# NormalizationTest-(.+)\.txt/) {
        print OUT "static char versionText[] = \"$1\";\n";
        next;
    }

    if ($line =~ /^\@Part(.)/) {
        my $part = $1;
        # Every part except the first must first close the table opened by
        # the previous part: an all-empty record followed by "};".
        # String comparison, so a non-numeric part identifier is not
        # silently treated as 0.
        if ($part ne "0") {
            print OUT "  {\n"
                    . "    L\"\",\n" x 5
                    . "    \"\",\n"
                    . "  },\n"
                    . "};\n";
        }
        print OUT "\n";
        print OUT "static testcaseLine Part${part}TestData[] = \n";
        print OUT "{\n";
        next;
    }

    # Remaining comment lines carry no data.
    next if $line =~ /^\#/;
    # A blank line would otherwise be written as an all-empty record, which
    # is exactly what the table-closing record looks like.
    next if $line !~ /\S/;

    # Data line: "c1;c2;c3;c4;c5; # (samples) description".  Only the part
    # before the '#' is split on ';', because the samples may contain ';'.
    my ($data, $comment) = split /#/, $line, 2;
    my @columns = split /;/, $data;

    print OUT "  {\n";
    for my $index (0 .. 4) {
        print OUT "    L\"" . _utf16_escapes($columns[$index]) . "\",\n";
    }
    print OUT "    \"" . _case_description($comment) . "\"\n";
    print OUT "  },\n";
}
 
# Close the table that is still open at end of input: an all-empty record
# (presumably the end-of-table marker for the code consuming the header --
# confirm against the consumer) followed by the closing brace of the array.
print OUT <<'END_OF_TABLE';
  {
    L"",
    L"",
    L"",
    L"",
    L"",
    "",
  },
};
END_OF_TABLE

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the close of the output file has to be checked; otherwise a
# truncated header would be left behind without any diagnostic.
close(OUT)
    or die "Cannot finish writing NormalizationData.h: $!\n";
close(TEXTFILE);