summary | refs | log | tree | commit | diff | stats
path: root/intl/uconv/tools/gen-big5-data.py
diff options
context:
space:
mode:
Diffstat (limited to 'intl/uconv/tools/gen-big5-data.py')
-rw-r--r-- intl/uconv/tools/gen-big5-data.py 253
1 files changed, 253 insertions, 0 deletions
diff --git a/intl/uconv/tools/gen-big5-data.py b/intl/uconv/tools/gen-big5-data.py
new file mode 100644
index 000000000..1d0f59bb4
--- /dev/null
+++ b/intl/uconv/tools/gen-big5-data.py
@@ -0,0 +1,253 @@
+#!/usr/bin/python
+
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Adapted from
+# https://hg.mozilla.org/projects/htmlparser/file/0d906fb1ab90/generate-encoding-data.py
+
+# indexes.json comes from
+# https://encoding.spec.whatwg.org/indexes.json
+# i.e.
+# https://github.com/whatwg/encoding/blob/ce4e83d0df5b5efec0697fc76e66699737e033a3/indexes.json
+
+import json
+
# Parse the encoding index tables from indexes.json.  The "with" block
# closes the file handle as soon as the JSON has been read instead of
# leaving it open for the lifetime of the script.
with open("indexes.json", "r") as indexesFile:
    indexes = json.load(indexesFile)
+
def nullToZero(codePoint):
    """Return codePoint unchanged, or 0 when it is falsy (e.g. None)."""
    return codePoint if codePoint else 0
+
# Flat copy of the "big5" table in which every falsy entry (a null in the
# JSON) has been replaced by 0, so 0 marks a pointer with no code point.
index = [nullToZero(entry) for entry in indexes["big5"]]
+
# There are four major gaps consisting of more than 4 consecutive invalid pointers
#
# Each gap is stored as a half-open (start, end) pair of positions in
# index.  A run of zero entries is only recorded once a non-zero entry
# terminates it.
gaps = []
runStart = 0
runLength = 0
for pointer, codePoint in enumerate(index):
    if codePoint == 0:
        if runLength == 0:
            # First zero of a new run: remember where it began.
            runStart = pointer
        runLength += 1
        continue
    if runLength > 4:
        gaps.append((runStart, runStart + runLength))
    runLength = 0
+
def invertRanges(ranges, cap):
    """Return the complement of ``ranges`` within ``[0, cap)``.

    ``ranges`` is an iterable of half-open ``(start, end)`` pairs, expected
    in ascending, non-overlapping order.  The result is the list of
    half-open pairs that lie between them, the last one ending at ``cap``.

    Empty pairs are never emitted: a range that starts at 0, ends at
    ``cap`` or directly touches its predecessor contributes nothing.
    """
    inverted = []
    invertStart = 0
    for (start, end) in ranges:
        # Only keep the stretch before this range when it is non-empty.
        if start > invertStart:
            inverted.append((invertStart, start))
        invertStart = end
    # Likewise, skip the tail when the last range already reaches cap.
    if invertStart < cap:
        inverted.append((invertStart, cap))
    return inverted
+
# Pointer ranges left over once the gaps of invalid pointers are removed.
cap = len(index)
ranges = invertRanges(gaps, cap)

# Now compute a compressed lookup table for astralness

# Record runs of more than 40 consecutive entries that fit in 16 bits
# (<= 0xFFFF) as half-open (start, end) pairs.  A run is only recorded
# once an entry above 0xFFFF terminates it.
gaps = []
runStart = 0
runLength = 0
for pointer, codePoint in enumerate(index):
    if codePoint > 0xFFFF:
        if runLength > 40:
            gaps.append((runStart, runStart + runLength))
        runLength = 0
    else:
        if runLength == 0:
            runStart = pointer
        runLength += 1

# Pointer ranges that remain after removing those long 16-bit-only runs.
astralRanges = invertRanges(gaps, cap)
+
+
# Text that starts the generated C++ file: license notice, the
# "generated file" warning, the include, and the opening line of the
# kBig5LowBitsTable initializer.
lowBitsTablePrologue = '''/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
 */

#include "nsBIG5Data.h"

static const char16_t kBig5LowBitsTable[] = {
'''

# Output handle for the generated source file.
classFile = open("../ucvtw/nsBIG5Data.cpp", "w")
classFile.write(lowBitsTablePrologue)
+
# Emit one table entry per pointer inside the kept ranges; only the low
# 16 bits of each code point are stored.  Iterating over slices of index
# avoids the Python 2-only xrange() builtin, so this section runs
# unchanged under Python 2 and Python 3.
for (low, high) in ranges:
    for codePoint in index[low:high]:
        classFile.write(' 0x%04X,\n' % (codePoint & 0xFFFF))

# Close the low-bits table and open the astralness bit table.
classFile.write('''};

static const uint32_t kBig5AstralnessTable[] = {
''')
+
# An array of bool is inefficient per
# http://stackoverflow.com/questions/4049156/1-bit-per-bool-in-array-c

# One bit per pointer inside the astral ranges: 1 when the code point is
# above 0xFFFF, 0 otherwise.
bits = []
for (low, high) in astralRanges:
    for codePoint in index[low:high]:
        bits.append(1 if codePoint > 0xFFFF else 0)

# Pad length to a multiple of 32.  Nothing is appended when the length is
# already a multiple of 32, so no superfluous all-zero word is emitted.
bits.extend([0] * ((-len(bits)) % 32))

# Pack each group of 32 bits into one word, lowest bit first, and write it.
for wordStart in range(0, len(bits), 32):
    accu = 0
    for j, bit in enumerate(bits[wordStart:wordStart + 32]):
        accu |= bit << j
    classFile.write(' 0x%08X,\n' % accu)
+
# Closes the astralness table and opens the body of nsBIG5Data::LowBits.
lowBitsPrologue = '''};

// static
char16_t
nsBIG5Data::LowBits(size_t aPointer)
{
'''

# Per-range checks: pointers below the range yield 0, pointers inside it
# are looked up in kBig5LowBitsTable.
# Format arguments: (low, high, base, low).
lowBitsBranch = ''' if (aPointer < %d) {
 return 0;
 }
 if (aPointer < %d) {
 return kBig5LowBitsTable[%d + (aPointer - %d)];
 }
'''

classFile.write(lowBitsPrologue)

# base is the table position at which the current range begins; it grows
# by the size of every range already emitted.
base = 0
for (low, high) in ranges:
    classFile.write(lowBitsBranch % (low, high, base, low))
    base += high - low
+
# Closes nsBIG5Data::LowBits and opens the body of nsBIG5Data::IsAstral.
isAstralPrologue = ''' return 0;
}

// static
bool
nsBIG5Data::IsAstral(size_t aPointer)
{
'''

# Used for a range holding exactly one pointer: that pointer is reported
# as astral without consulting the bit table.  Format arguments: (low, low).
singlePointerBranch = ''' if (aPointer < %d) {
 return false;
 }
 if (aPointer == %d) {
 return true;
 }
'''

# Used for every other range: the answer is read from the bit table.
# Format arguments: (low, high, base, low).
bitLookupBranch = ''' if (aPointer < %d) {
 return false;
 }
 if (aPointer < %d) {
 size_t index = %d + (aPointer - %d);
 return kBig5AstralnessTable[index >> 5] & (1 << (index & 0x1F));
 }
'''

classFile.write(isAstralPrologue)

# base is the bit position at which the current range begins; it advances
# for single-pointer ranges as well.
base = 0
for (low, high) in astralRanges:
    size = high - low
    if size == 1:
        classFile.write(singlePointerBranch % (low, low))
    else:
        classFile.write(bitLookupBranch % (low, high, base, low))
    base += size
+
# First non-HKSCS pointer; pointers below it are skipped when searching.
hkscsBound = (0xA1 - 0x81) * 157

# Code points that get an explicit switch case returning the last
# pointer in the index that maps to them.
preferLast = [
    0x2550,
    0x255E,
    0x2561,
    0x256A,
    0x5341,
    0x5345,
]

# Closes nsBIG5Data::IsAstral and opens nsBIG5Data::FindPointer up to the
# start of its switch statement.
findPointerPrologue = ''' return false;
}

//static
size_t
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
{
 if (!aIsAstral) {
 switch (aLowBits) {
'''
classFile.write(findPointerPrologue)
+
# Emit one switch case per preferLast code point, returning the highest
# pointer whose index entry equals that code point.  Code points that do
# not occur in the index produce no case.  range() replaces the
# Python 2-only xrange() so this runs under Python 2 and Python 3.
for codePoint in preferLast:
    # Python lists don't have .rindex() :-(
    for i in range(len(index) - 1, -1, -1):
        if index[i] == codePoint:
            classFile.write(''' case 0x%04X:
 return %d;
''' % (codePoint, i))
            break

# Close the switch and the enclosing "if (!aIsAstral)" block.
classFile.write(''' default:
 break;
 }
 }''')
+
# Work out the kBig5LowBitsTable position that corresponds to hkscsBound.
# start stays 0 when no range contains hkscsBound.
base = 0
start = 0
for (low, high) in ranges:
    if low <= hkscsBound < high:
        # This is the first range we don't ignore and the
        # range that contains the first non-HKSCS pointer.
        # Avoid searching HKSCS.
        start = base + (hkscsBound - low)
        break
    base += high - low

# Opening of the linear search over the table, beginning at start.
searchLoopOpening = '''
 for (size_t i = %d; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {
 if (kBig5LowBitsTable[i] == aLowBits) {
 size_t pointer;
 '''
classFile.write(searchLoopOpening % start)
+
# Emit the if/else chain that converts a table position i back into a
# pointer.  A branch for a range is written one iteration late, once its
# successor is known, so the last range can become the final bare block.
# Format arguments: (table position just past the range, low - base).
mappingBranch = '''if (i < %d) {
 pointer = i + %d;
 } else '''

base = 0
prevLow = 0
prevBase = 0
prevTableEnd = 0
writing = False
for (low, high) in ranges:
    if writing:
        classFile.write(mappingBranch % (prevTableEnd, prevLow - prevBase))
    size = high - low
    prevLow = low
    prevBase = base
    prevTableEnd = base + size
    # Branches start being emitted after the first range that extends
    # past hkscsBound; once switched on, writing stays on.
    writing = writing or high > hkscsBound
    base += size

# Unconditional mapping for the last range seen by the loop.
classFile.write('''{
 pointer = i + %d;
 }''' % (prevLow - prevBase))
+
# Tail of nsBIG5Data::FindPointer: accept the candidate pointer only when
# its astralness matches the request, otherwise keep searching; return 0
# when the search finds nothing.
findPointerEpilogue = '''
 if (aIsAstral == IsAstral(pointer)) {
 return pointer;
 }
 }
 }
 return 0;
}
'''
classFile.write(findPointerEpilogue)

# The generated file is complete.
classFile.close()