summary | refs | log | tree | commit | diff | stats
path: root/parser/html/java/htmlparser/generate-encoding-data.py
diff options
context:
space:
mode:
author: Matt A. Tobin <email@mattatobin.com>, 2020-01-15 14:56:04 -0500
committer: Matt A. Tobin <email@mattatobin.com>, 2020-01-15 14:56:04 -0500
commit: 6168dbe21f5f83b906e562ea0ab232d499b275a6 (patch)
tree: 658a4b27554c85ebcaad655fc83f2c2bb99e8e80 /parser/html/java/htmlparser/generate-encoding-data.py
parent: 09314667a692fedff8564fc347c8a3663474faa6 (diff)
download: UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar
UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.gz
UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.lz
UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.xz
UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.zip
Add java htmlparser sources that match the original 52-level state
https://hg.mozilla.org/projects/htmlparser/ Commit: abe62ab2a9b69ccb3b5d8a231ec1ae11154c571d
Diffstat (limited to 'parser/html/java/htmlparser/generate-encoding-data.py')
-rw-r--r-- parser/html/java/htmlparser/generate-encoding-data.py 745
1 files changed, 745 insertions, 0 deletions
diff --git a/parser/html/java/htmlparser/generate-encoding-data.py b/parser/html/java/htmlparser/generate-encoding-data.py
new file mode 100644
index 000000000..69b2fdc30
--- /dev/null
+++ b/parser/html/java/htmlparser/generate-encoding-data.py
@@ -0,0 +1,745 @@
+#!/usr/bin/python
+
+# Copyright (c) 2013-2015 Mozilla Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+import json
+
+class Label:
+    """Pairs one encoding label with the name of the encoding it selects.
+
+    Instances order by ``label`` alone, so sorting a list of them puts
+    the labels in ascending string order.
+    """
+
+    def __init__(self, label, preferred):
+        # label: one alias string of an encoding.
+        # preferred: the name of the encoding that alias resolves to.
+        self.label = label
+        self.preferred = preferred
+
+    def __lt__(self, other):
+        # list.sort() compares with "<". Python 3 never consults __cmp__,
+        # so without this method sorting Label objects raises TypeError.
+        return self.label < other.label
+
+    def __cmp__(self, other):
+        # Kept for Python 2 callers. Written without the cmp() builtin,
+        # which does not exist on Python 3.
+        return (self.label > other.label) - (self.label < other.label)
+
+# If a multi-byte encoding is on this list, it is assumed to have a
+# non-generated decoder implementation class. Otherwise, the JDK default
+# decoder is used as a placeholder.
+MULTI_BYTE_DECODER_IMPLEMENTED = [
+ u"x-user-defined",
+ u"replacement",
+ u"big5",
+]
+
+# Same idea for encoders: a multi-byte encoding on this list gets
+# "return new <ClassName>Encoder(this);" in its generated newEncoder();
+# every other multi-byte encoding gets
+# "return Charset.forName(NAME).newEncoder();".
+MULTI_BYTE_ENCODER_IMPLEMENTED = [
+ u"big5",
+]
+
+# Names of all encodings; filled and sorted by the grouping loop below.
+preferred = []
+
+# Label objects (label plus encoding name); filled by the grouping loop below.
+labels = []
+
+# Parse both JSON inputs up front. The "with" blocks close the file handles
+# as soon as parsing is done, also when json.load() raises.
+with open("../encoding/encodings.json", "r") as encodings_file:
+    data = json.load(encodings_file)
+
+with open("../encoding/indexes.json", "r") as indexes_file:
+    indexes = json.load(indexes_file)
+
+# Encodings of the "Legacy single-byte encodings" group.
+single_byte = []
+
+# Encodings of every other group.
+multi_byte = []
+
+def to_camel_name(name):
+    """Return the CamelCase form of encoding name *name*.
+
+    The result is what this script uses as the generated Java class name.
+    """
+    iso_prefix = u"iso-8859-"
+    # iso-8859-8-i has its own spelling and must be tested ahead of the
+    # generic iso-8859-* rule, which it would otherwise match.
+    if name == u"iso-8859-8-i":
+        return u"Iso8I"
+    if name.startswith(iso_prefix):
+        return name.replace(iso_prefix, u"Iso")
+    # Title-case the name, then drop "X-" and the "-" / "_" separators.
+    camel = name.title()
+    for fragment in (u"X-", u"-", u"_"):
+        camel = camel.replace(fragment, u"")
+    return camel
+
+def to_constant_name(name):
+    """Return *name* in upper case with every "-" replaced by "_"."""
+    upper = name.upper()
+    return upper.replace(u"-", u"_")
+
+# Encoding.java
+
+# Walk the groups read from encodings.json. The group whose heading is
+# "Legacy single-byte encodings" becomes single_byte; the encodings of any
+# other group are appended to multi_byte. Each encoding name is collected in
+# preferred, and each label is wrapped in a Label together with the name of
+# its encoding.
+# NOTE(review): this view has lost the original indentation. The loop over
+# group["encodings"] presumably runs for every group (not only inside the
+# else branch) -- confirm against the upstream file.
+for group in data:
+ if group["heading"] == "Legacy single-byte encodings":
+ single_byte = group["encodings"]
+ else:
+ multi_byte.extend(group["encodings"])
+ for encoding in group["encodings"]:
+ preferred.append(encoding["name"])
+ for label in encoding["labels"]:
+ labels.append(Label(label, encoding["name"]))
+
+# labels must be sorted because the generated Java code looks labels up with
+# Arrays.binarySearch(LABELS, ...).
+preferred.sort()
+labels.sort()
+
+label_file = open("src/nu/validator/encoding/Encoding.java", "w")
+
+label_file.write("""/*
+ * Copyright (c) 2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.encoding;
+
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.nio.charset.spi.CharsetProvider;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+/**
+ * Represents an <a href="https://encoding.spec.whatwg.org/#encoding">encoding</a>
+ * as defined in the <a href="https://encoding.spec.whatwg.org/">Encoding
+ * Standard</a>, provides access to each encoding defined in the Encoding
+ * Standard via a static constant and provides the
+ * "<a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an
+ * encoding</a>" algorithm defined in the Encoding Standard.
+ *
+ * <p>This class inherits from {@link Charset} to allow the Encoding
+ * Standard-compliant encodings to be used in contexts that support
+ * <code>Charset</code> instances. However, by design, the Encoding
+ * Standard-compliant encodings are not supplied via a {@link CharsetProvider}
+ * and, therefore, are not available via and do not interfere with the static
+ * methods provided by <code>Charset</code>. (This class provides methods of
+ * the same name to hide each static method of <code>Charset</code> to help
+ * avoid accidental calls to the static methods of the superclass when working
+ * with Encoding Standard-compliant encodings.)
+ *
+ * <p>When an application needs to use a particular encoding, such as utf-8
+ * or windows-1252, the corresponding constant, i.e.
+ * {@link #UTF_8 Encoding.UTF_8} and {@link #WINDOWS_1252 Encoding.WINDOWS_1252}
+ * respectively, should be used. However, when the application receives an
+ * encoding label from external input, the method {@link #forName(String)
+ * forName()} should be used to obtain the object representing the encoding
+ * identified by the label. In contexts where labels that map to the
+ * <a href="https://encoding.spec.whatwg.org/#replacement">replacement
+ * encoding</a> should be treated as unknown, the method {@link
+ * #forNameNoReplacement(String) forNameNoReplacement()} should be used instead.
+ *
+ *
+ * @author hsivonen
+ */
+public abstract class Encoding extends Charset {
+
+ private static final String[] LABELS = {
+""")
+
+for label in labels:
+ label_file.write(" \"%s\",\n" % label.label)
+
+label_file.write(""" };
+
+ private static final Encoding[] ENCODINGS_FOR_LABELS = {
+""")
+
+for label in labels:
+ label_file.write(" %s.INSTANCE,\n" % to_camel_name(label.preferred))
+
+label_file.write(""" };
+
+ private static final Encoding[] ENCODINGS = {
+""")
+
+for label in preferred:
+ label_file.write(" %s.INSTANCE,\n" % to_camel_name(label))
+
+label_file.write(""" };
+
+""")
+
+for label in preferred:
+ label_file.write(""" /**
+ * The %s encoding.
+ */
+ public static final Encoding %s = %s.INSTANCE;
+
+""" % (label, to_constant_name(label), to_camel_name(label)))
+
+label_file.write("""
+private static SortedMap<String, Charset> encodings = null;
+
+ protected Encoding(String canonicalName, String[] aliases) {
+ super(canonicalName, aliases);
+ }
+
+ private enum State {
+ HEAD, LABEL, TAIL
+ };
+
+ public static Encoding forName(String label) {
+ if (label == null) {
+ throw new IllegalArgumentException("Label must not be null.");
+ }
+ if (label.length() == 0) {
+ throw new IllegalCharsetNameException(label);
+ }
+ // First try the fast path
+ int index = Arrays.binarySearch(LABELS, label);
+ if (index >= 0) {
+ return ENCODINGS_FOR_LABELS[index];
+ }
+ // Else, slow path
+ StringBuilder sb = new StringBuilder();
+ State state = State.HEAD;
+ for (int i = 0; i < label.length(); i++) {
+ char c = label.charAt(i);
+ if ((c == ' ') || (c == '\\n') || (c == '\\r') || (c == '\\t')
+ || (c == '\\u000C')) {
+ if (state == State.LABEL) {
+ state = State.TAIL;
+ }
+ continue;
+ }
+ if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
+ switch (state) {
+ case HEAD:
+ state = State.LABEL;
+ // Fall through
+ case LABEL:
+ sb.append(c);
+ continue;
+ case TAIL:
+ throw new IllegalCharsetNameException(label);
+ }
+ }
+ if (c >= 'A' && c <= 'Z') {
+ c += 0x20;
+ switch (state) {
+ case HEAD:
+ state = State.LABEL;
+ // Fall through
+ case LABEL:
+ sb.append(c);
+ continue;
+ case TAIL:
+ throw new IllegalCharsetNameException(label);
+ }
+ }
+ if ((c == '-') || (c == '+') || (c == '.') || (c == ':')
+ || (c == '_')) {
+ switch (state) {
+ case LABEL:
+ sb.append(c);
+ continue;
+ case HEAD:
+ case TAIL:
+ throw new IllegalCharsetNameException(label);
+ }
+ }
+ throw new IllegalCharsetNameException(label);
+ }
+ index = Arrays.binarySearch(LABELS, sb.toString());
+ if (index >= 0) {
+ return ENCODINGS_FOR_LABELS[index];
+ }
+ throw new UnsupportedCharsetException(label);
+ }
+
+ public static Encoding forNameNoReplacement(String label) {
+ Encoding encoding = Encoding.forName(label);
+ if (encoding == Encoding.REPLACEMENT) {
+ throw new UnsupportedCharsetException(label);
+ }
+ return encoding;
+ }
+
+ public static boolean isSupported(String label) {
+ try {
+ Encoding.forName(label);
+ } catch (UnsupportedCharsetException e) {
+ return false;
+ }
+ return true;
+ }
+
+ public static boolean isSupportedNoReplacement(String label) {
+ try {
+ Encoding.forNameNoReplacement(label);
+ } catch (UnsupportedCharsetException e) {
+ return false;
+ }
+ return true;
+ }
+
+ public static SortedMap<String, Charset> availableCharsets() {
+ if (encodings == null) {
+ TreeMap<String, Charset> map = new TreeMap<String, Charset>();
+ for (Encoding encoding : ENCODINGS) {
+ map.put(encoding.name(), encoding);
+ }
+ encodings = Collections.unmodifiableSortedMap(map);
+ }
+ return encodings;
+ }
+
+ public static Encoding defaultCharset() {
+ return WINDOWS_1252;
+ }
+
+ @Override public boolean canEncode() {
+ return false;
+ }
+
+ @Override public boolean contains(Charset cs) {
+ return false;
+ }
+
+ @Override public CharsetEncoder newEncoder() {
+ throw new UnsupportedOperationException("Encoder not implemented.");
+ }
+}
+""")
+
+label_file.close()
+
+# Single-byte encodings
+
+for encoding in single_byte:
+ name = encoding["name"]
+ labels = encoding["labels"]
+ labels.sort()
+ class_name = to_camel_name(name)
+ mapping_name = name
+ if mapping_name == u"iso-8859-8-i":
+ mapping_name = u"iso-8859-8"
+ mapping = indexes[mapping_name]
+ class_file = open("src/nu/validator/encoding/%s.java" % class_name, "w")
+ class_file.write('''/*
+ * Copyright (c) 2013-2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+ * Instead, please regenerate using generate-encoding-data.py
+ */
+
+package nu.validator.encoding;
+
+import java.nio.charset.CharsetDecoder;
+
+class ''')
+ class_file.write(class_name)
+ class_file.write(''' extends Encoding {
+
+ private static final char[] TABLE = {''')
+ fallible = False
+ comma = False
+ for code_point in mapping:
+ # XXX should we have error reporting?
+ if not code_point:
+ code_point = 0xFFFD
+ fallible = True
+ if comma:
+ class_file.write(",")
+ class_file.write("\n '\u%04x'" % code_point);
+ comma = True
+ class_file.write('''
+ };
+
+ private static final String[] LABELS = {''')
+
+ comma = False
+ for label in labels:
+ if comma:
+ class_file.write(",")
+ class_file.write("\n \"%s\"" % label);
+ comma = True
+ class_file.write('''
+ };
+
+ private static final String NAME = "''')
+ class_file.write(name)
+ class_file.write('''";
+
+ static final Encoding INSTANCE = new ''')
+ class_file.write(class_name)
+ class_file.write('''();
+
+ private ''')
+ class_file.write(class_name)
+ class_file.write('''() {
+ super(NAME, LABELS);
+ }
+
+ @Override public CharsetDecoder newDecoder() {
+ return new ''')
+ class_file.write("Fallible" if fallible else "Infallible")
+ class_file.write('''SingleByteDecoder(this, TABLE);
+ }
+
+}
+''')
+ class_file.close()
+
+# Multi-byte encodings
+
+for encoding in multi_byte:
+ name = encoding["name"]
+ labels = encoding["labels"]
+ labels.sort()
+ class_name = to_camel_name(name)
+ class_file = open("src/nu/validator/encoding/%s.java" % class_name, "w")
+ class_file.write('''/*
+ * Copyright (c) 2013-2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+ * Instead, please regenerate using generate-encoding-data.py
+ */
+
+package nu.validator.encoding;
+
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+
+class ''')
+ class_file.write(class_name)
+ class_file.write(''' extends Encoding {
+
+ private static final String[] LABELS = {''')
+
+ comma = False
+ for label in labels:
+ if comma:
+ class_file.write(",")
+ class_file.write("\n \"%s\"" % label);
+ comma = True
+ class_file.write('''
+ };
+
+ private static final String NAME = "''')
+ class_file.write(name)
+ class_file.write('''";
+
+ static final ''')
+ class_file.write(class_name)
+ class_file.write(''' INSTANCE = new ''')
+ class_file.write(class_name)
+ class_file.write('''();
+
+ private ''')
+ class_file.write(class_name)
+ class_file.write('''() {
+ super(NAME, LABELS);
+ }
+
+ @Override public CharsetDecoder newDecoder() {
+ ''')
+ if name == "gbk":
+ class_file.write('''return Charset.forName("gb18030").newDecoder();''')
+ elif name in MULTI_BYTE_DECODER_IMPLEMENTED:
+ class_file.write("return new %sDecoder(this);" % class_name)
+ else:
+ class_file.write('''return Charset.forName(NAME).newDecoder();''')
+ class_file.write('''
+ }
+
+ @Override public CharsetEncoder newEncoder() {
+ ''')
+ if name in MULTI_BYTE_ENCODER_IMPLEMENTED:
+ class_file.write("return new %sEncoder(this);" % class_name)
+ else:
+ class_file.write('''return Charset.forName(NAME).newEncoder();''')
+ class_file.write('''
+ }
+}
+''')
+ class_file.close()
+
+# Big5
+
+def null_to_zero(code_point):
+    """Return *code_point* unchanged, or 0 when it is falsy (None or 0)."""
+    return code_point if code_point else 0
+
+# Copy of indexes["big5"] in which every falsy entry (presumably JSON null
+# for an unmapped pointer) is stored as 0.
+index = []
+
+for code_point in indexes["big5"]:
+ index.append(null_to_zero(code_point))
+
+# There are four major gaps consisting of more than 4 consecutive invalid pointers
+# Each gap is stored as a half-open (start, end) pair of pointers.
+# consecutive counts the current run of zero entries and consecutive_start
+# is the pointer at which that run began; offset is the current pointer.
+# NOTE(review): a run is only recorded when a non-zero entry follows it, so
+# a run of zeros at the very end of the index would not be added. The
+# nesting is inferred, since this view has lost the indentation -- confirm.
+gaps = []
+consecutive = 0
+consecutive_start = 0
+offset = 0
+for code_point in index:
+ if code_point == 0:
+ if consecutive == 0:
+ consecutive_start = offset
+ consecutive +=1
+ else:
+ if consecutive > 4:
+ gaps.append((consecutive_start, consecutive_start + consecutive))
+ consecutive = 0
+ offset += 1
+
+def invert_ranges(ranges, cap):
+    """Return the (start, end) pairs lying between the pairs in *ranges*.
+
+    *ranges* is a sequence of (start, end) pairs; it is assumed to be
+    sorted and non-overlapping, which is not checked here. The result
+    holds the stretch in front of each pair (skipped when the pair starts
+    at 0) and always ends with the stretch from the last pair's end up to
+    *cap*, even if that stretch is empty.
+    """
+    complement = []
+    cursor = 0
+    for span in ranges:
+        span_start, span_end = span
+        # A span that starts at 0 has nothing in front of it.
+        if span_start != 0:
+            complement.append((cursor, span_start))
+        cursor = span_end
+    complement.append((cursor, cap))
+    return complement
+
+# ranges: the pointer ranges that remain once the gaps found above are
+# removed. Each of them is later written out as one TABLE<n> string.
+cap = len(index)
+ranges = invert_ranges(gaps, cap)
+
+# Now compute a compressed lookup table for astralness
+
+# Same run detection as for the invalid-pointer gaps, with two differences:
+# a run consists of entries whose value is <= 0xFFFF, and it must be longer
+# than 40 entries to be recorded. astral_ranges is what is left over; only
+# pointers inside these ranges get a bit in the ASTRALNESS string.
+# NOTE(review): nesting inferred, since this view has lost the indentation.
+gaps = []
+consecutive = 0
+consecutive_start = 0
+offset = 0
+for code_point in index:
+ if code_point <= 0xFFFF:
+ if consecutive == 0:
+ consecutive_start = offset
+ consecutive +=1
+ else:
+ if consecutive > 40:
+ gaps.append((consecutive_start, consecutive_start + consecutive))
+ consecutive = 0
+ offset += 1
+
+astral_ranges = invert_ranges(gaps, cap)
+
+class_file = open("src/nu/validator/encoding/Big5Data.java", "w")
+class_file.write('''/*
+ * Copyright (c) 2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+ * Instead, please regenerate using generate-encoding-data.py
+ */
+
+package nu.validator.encoding;
+
+final class Big5Data {
+
+ private static final String ASTRALNESS = "''')
+
+# Java translates \uXXXX escapes before it tokenizes the source file, so a
+# word whose value is a double quote, a backslash or a line terminator must
+# be written as an ordinary escape sequence; as \uXXXX it would end or
+# corrupt the string literal being generated.
+java_string_escapes = {
+    0x0A: '\\n',
+    0x0D: '\\r',
+    0x22: '\\"',
+    0x5C: '\\\\',
+}
+
+# One bit per pointer inside astral_ranges: 1 when the code point at that
+# pointer is above 0xFFFF, otherwise 0.
+bits = []
+for (low, high) in astral_ranges:
+    for i in xrange(low, high):
+        bits.append(1 if index[i] > 0xFFFF else 0)
+# pad length to multiple of 16
+# NOTE(review): when len(bits) already is a multiple of 16 this appends one
+# whole extra word of zeros. Left as is so the generated file does not change.
+for j in xrange(16 - (len(bits) % 16)):
+    bits.append(0)
+
+# Pack 16 bits into each char of the Java string, least significant bit
+# first, which is the layout the generated readBit() expects.
+i = 0
+while i < len(bits):
+    accu = 0
+    for j in xrange(16):
+        accu |= bits[i + j] << j
+    if accu in java_string_escapes:
+        class_file.write(java_string_escapes[accu])
+    else:
+        class_file.write('\\u%04X' % accu)
+    i += 16
+
+class_file.write('''";
+
+''')
+
+j = 0
+for (low, high) in ranges:
+ class_file.write(''' private static final String TABLE%d = "''' % j)
+ for i in xrange(low, high):
+ class_file.write('\\u%04X' % (index[i] & 0xFFFF))
+ class_file.write('''";
+
+''')
+ j += 1
+
+class_file.write(''' private static boolean readBit(int i) {
+ return (ASTRALNESS.charAt(i >> 4) & (1 << (i & 0xF))) != 0;
+ }
+
+ static char lowBits(int pointer) {
+''')
+
+j = 0
+for (low, high) in ranges:
+ class_file.write(''' if (pointer < %d) {
+ return '\\u0000';
+ }
+ if (pointer < %d) {
+ return TABLE%d.charAt(pointer - %d);
+ }
+''' % (low, high, j, low))
+ j += 1
+
+class_file.write(''' return '\\u0000';
+ }
+
+ static boolean isAstral(int pointer) {
+''')
+
+base = 0
+for (low, high) in astral_ranges:
+ if high - low == 1:
+ class_file.write(''' if (pointer < %d) {
+ return false;
+ }
+ if (pointer == %d) {
+ return true;
+ }
+''' % (low, low))
+ else:
+ class_file.write(''' if (pointer < %d) {
+ return false;
+ }
+ if (pointer < %d) {
+ return readBit(%d + (pointer - %d));
+ }
+''' % (low, high, base, low))
+ base += (high - low)
+
+class_file.write(''' return false;
+ }
+
+ public static int findPointer(char lowBits, boolean isAstral) {
+ if (!isAstral) {
+ switch (lowBits) {
+''')
+
+# (0xA1 - 0x81) * 157 = 5024. The code further down that emits the table
+# searches treats this value as the first non-HKSCS pointer.
+hkscs_bound = (0xA1 - 0x81) * 157
+
+# Code points for which the generated findPointer() returns a fixed pointer
+# from a switch statement: the last pointer in index that maps to the code
+# point, found by the backwards search below.
+prefer_last = [
+ 0x2550,
+ 0x255E,
+ 0x2561,
+ 0x256A,
+ 0x5341,
+ 0x5345,
+]
+
+# Emit one "case" per entry of prefer_last. A code point that does not occur
+# in index produces no case at all.
+for code_point in prefer_last:
+ # Python lists don't have .rindex() :-(
+ for i in xrange(len(index) - 1, -1, -1):
+ candidate = index[i]
+ if candidate == code_point:
+ class_file.write(''' case 0x%04X:
+ return %d;
+''' % (code_point, i))
+ break
+
+class_file.write(''' default:
+ break;
+ }
+ }''')
+
+j = 0
+for (low, high) in ranges:
+ if high > hkscs_bound:
+ start = 0
+ if low <= hkscs_bound and hkscs_bound < high:
+ # This is the first range we don't ignore and the
+ # range that contains the first non-HKSCS pointer.
+ # Avoid searching HKSCS.
+ start = hkscs_bound - low
+ class_file.write('''
+ for (int i = %d; i < TABLE%d.length(); i++) {
+ if (TABLE%d.charAt(i) == lowBits) {
+ int pointer = i + %d;
+ if (isAstral == isAstral(pointer)) {
+ return pointer;
+ }
+ }
+ }''' % (start, j, j, low))
+ j += 1
+
+class_file.write('''
+ return 0;
+ }
+}
+''')
+class_file.close()