summaryrefslogtreecommitdiffstats
path: root/intl/lwbrk/tools/anzx4051.pl
diff options
context:
space:
mode:
Diffstat (limited to 'intl/lwbrk/tools/anzx4051.pl')
-rw-r--r--intl/lwbrk/tools/anzx4051.pl356
1 files changed, 356 insertions, 0 deletions
diff --git a/intl/lwbrk/tools/anzx4051.pl b/intl/lwbrk/tools/anzx4051.pl
new file mode 100644
index 000000000..b13315b38
--- /dev/null
+++ b/intl/lwbrk/tools/anzx4051.pl
@@ -0,0 +1,356 @@
+#!/usr/bin/perl
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+######################################################################
+#
+# Initial global variable
+#
+######################################################################
+%utot = ();
+$ui=0;
+$li=0;
+
+######################################################################
+#
+# Open the unicode database file
+#
+######################################################################
+open ( UNICODATA , "< ../../unicharutil/tools/UnicodeData-Latest.txt")
+ || die "cannot find UnicodeData-Latest.txt";
+
+######################################################################
+#
+# Open the JIS X 4051 Class file
+#
+######################################################################
+open ( CLASS , "< jisx4051class.txt")
+ || die "cannot find jisx4051class.txt";
+
+######################################################################
+#
+# Open the JIS X 4051 Class simplified mapping
+#
+######################################################################
+open ( SIMP , "< jisx4051simp.txt")
+ || die "cannot find jisx4051simp.txt";
+
+######################################################################
+#
+# Open the output file
+#
+######################################################################
+open ( OUT , "> anzx4051.html")
+ || die "cannot open output anzx4051.html file";
+
+######################################################################
+#
+# Open the output file
+#
+######################################################################
+open ( HEADER , "> ../src/jisx4051class.h")
+ || die "cannot open output ../src/jisx4051class.h file";
+
+######################################################################
+#
+# Generate license and header
+#
+######################################################################
+$hthmlheader = <<END_OF_HTML;
+<!-- This Source Code Form is subject to the terms of the Mozilla Public
+ - License, v. 2.0. If a copy of the MPL was not distributed with this
+ - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
+
+<HTML>
+<HEAD>
+<TITLE>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</TITLE>
+</HEAD>
+<BODY>
+<H1>
+Analysis of JIS X 4051 to Unicode General Category Mapping
+</H1>
+END_OF_HTML
+print OUT $hthmlheader;
+
+######################################################################
+#
+# Generate license and header
+#
+######################################################################
+$npl = <<END_OF_NPL;
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+/*
+ DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
+ mozilla/intl/lwbrk/tools/anzx4051.pl
+ */
+END_OF_NPL
+print HEADER $npl;
+
+%occ = ();
+%gcat = ();
+%dcat = ();
+%simp = ();
+%gcount = ();
+%dcount = ();
+%sccount = ();
+%rangecount = ();
+
+######################################################################
+#
+# Process the file line by line
+#
+######################################################################
+while(<UNICODATA>) {
+ chop;
+ ######################################################################
+ #
+ # Get value from fields
+ #
+ ######################################################################
+ @f = split(/;/ , $_);
+ $c = $f[0]; # The unicode value
+ $g = $f[2];
+ $d = substr($g, 0, 1);
+
+ $gcat{$c} = $g;
+ $dcat{$c} = $d;
+ $gcount{$g}++;
+ $dcount{$d}++;
+}
+close(UNIDATA);
+
+while(<SIMP>) {
+ chop;
+ ######################################################################
+ #
+ # Get value from fields
+ #
+ ######################################################################
+ @f = split(/;/ , $_);
+
+ $simp{$f[0]} = $f[1];
+ $sccount{$f[1]}++;
+}
+close(SIMP);
+
+sub GetClass{
+ my ($u) = @_;
+ my $hex = DecToHex($u);
+ $g = $gcat{$hex};
+ if($g ne "") {
+ return $g;
+ } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
+ return "Han";
+ } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
+ return "Lo";
+ } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) {
+ return "Cs";
+ } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) {
+ return "Cs";
+ } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) {
+ return "Cs";
+ } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
+ return "Co";
+ } else {
+ printf "WARNING !!!! Cannot find General Category for U+%s \n" , $hex;
+ }
+}
+sub GetDClass{
+ my ($u) = @_;
+ my $hex = DecToHex($u);
+ $g = $dcat{$hex};
+ if($g ne "") {
+ return $g;
+ } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) {
+ return "Han";
+ } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) {
+ return "L";
+ } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) {
+ return "C";
+ } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) {
+ return "C";
+ } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) {
+ return "C";
+ } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) {
+ return "C";
+ } else {
+ printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex;
+ }
+}
+sub DecToHex{
+ my ($d) = @_;
+ return sprintf("%04X", $d);
+}
+%gtotal = ();
+%dtotal = ();
+while(<CLASS>) {
+ chop;
+ ######################################################################
+ #
+ # Get value from fields
+ #
+ ######################################################################
+ @f = split(/;/ , $_);
+
+ if( substr($f[2], 0, 1) ne "a")
+ {
+ $sc = $simp{$f[2]};
+ $l = hex($f[0]);
+ if($f[1] eq "")
+ {
+ $h = $l;
+ } else {
+ $h = hex($f[1]);
+ }
+ for($k = $l; $k <= $h ; $k++)
+ {
+ if( exists($occ{$k}))
+ {
+ # printf "WARNING !! Conflict defination!!! U+%s -> [%s] [%s | %s]\n",
+ # DecToHex($k), $occ{$k} , $f[2] , $sc;
+ }
+ else
+ {
+ $occ{$k} = $sc . " | " . $f[2];
+ $gclass = GetClass($k);
+ $dclass = GetDClass($k);
+ $gtotal{$sc . $gclass}++;
+ $dtotal{$sc . $dclass}++;
+ $u = DecToHex($k);
+ $rk = " " . substr($u,0,2) . ":" . $sc;
+ $rangecount{$rk}++;
+ }
+ }
+ }
+}
+
+#print %gtotal;
+#print %dtotal;
+
+sub printreport
+{
+ print OUT "<TABLE BORDER=3>\n";
+ print OUT "<TR BGCOLOR=blue><TH><TH>\n";
+
+ foreach $d (sort(keys %dcount)) {
+ print OUT "<TD BGCOLOR=red>$d</TD>\n";
+ }
+
+ print OUT "<TD BGCOLOR=white>Total</TD>\n";
+ foreach $g (sort(keys %gcount)) {
+ print OUT "<TD BGCOLOR=yellow>$g</TD>\n";
+ }
+ print OUT "</TR>\n";
+ foreach $sc (sort(keys %sccount)) {
+
+ print OUT "<TR><TH>$sc<TH>\n";
+
+ $total = 0;
+ foreach $d (sort (keys %dcount)) {
+ $count = $dtotal{$sc . $d};
+ $total += $count;
+ print OUT "<TD>$count</TD>\n";
+ }
+
+ print OUT "<TD BGCOLOR=white>$total</TD>\n";
+
+ foreach $g (sort(keys %gcount)) {
+ $count = $gtotal{$sc . $g};
+ print OUT "<TD>$count</TD>\n";
+ }
+
+
+ print OUT "</TR>\n";
+ }
+ print OUT "</TABLE>\n";
+
+
+ print OUT "<TABLE BORDER=3>\n";
+ print OUT "<TR BGCOLOR=blue><TH><TH>\n";
+
+ foreach $sc (sort(keys %sccount))
+ {
+ print OUT "<TD BGCOLOR=red>$sc</TD>\n";
+ }
+
+ print OUT "</TR>\n";
+
+
+ for($rr = 0; $rr < 0x4f; $rr++)
+ {
+ $empty = 0;
+ $r = sprintf("%02X" , $rr) ;
+ $tmp = "<TR><TH>" . $r . "<TH>\n";
+
+ foreach $sc (sort(keys %sccount)) {
+ $count = $rangecount{ " " .$r . ":" .$sc};
+ $tmp .= sprintf("<TD>%s</TD>\n", $count);
+ $empty += $count;
+ }
+
+ $tmp .= "</TR>\n";
+
+ if($empty ne 0)
+ {
+ print OUT $tmp;
+ }
+ }
+ print OUT "</TABLE>\n";
+
+}
+printreport();
+
+sub printarray
+{
+ my($r, $def) = @_;
+printf "[%s || %s]\n", $r, $def;
+ $k = hex($r) * 256;
+ printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r;
+ for($i = 0 ; $i < 256; $i+= 8)
+ {
+ for($j = 7 ; $j >= 0; $j-- )
+ {
+ $v = $k + $i + $j;
+ if( exists($occ{$v}))
+ {
+ $p = substr($occ{$v}, 1,1);
+ } else {
+ $p = $def;
+ }
+
+ if($j eq 7 )
+ {
+ printf HEADER "0x%s" , $p;
+ } else {
+ printf HEADER "%s", $p ;
+ }
+ }
+ printf HEADER ", // U+%04X - U+%04X\n", $k + $i ,( $k + $i + 7);
+ }
+ print HEADER "};\n\n";
+}
+printarray("00", "7");
+printarray("20", "7");
+printarray("21", "7");
+printarray("30", "5");
+printarray("0E", "8");
+printarray("17", "7");
+
+#print %rangecount;
+
+######################################################################
+#
+# Close files
+#
+######################################################################
+close(HEADER);
+close(CLASS);
+close(OUT);
+