Add java htmlparser sources that match the original 52-level state

https://hg.mozilla.org/projects/htmlparser/ Commit: abe62ab2a9b69ccb3b5d8a231ec1ae11154c571d
author: Matt A. Tobin <email@mattatobin.com> 2020-01-15 14:56:04 -0500
committer: Matt A. Tobin <email@mattatobin.com> 2020-01-15 14:56:04 -0500
commit: 6168dbe21f5f83b906e562ea0ab232d499b275a6 (patch)
tree: 658a4b27554c85ebcaad655fc83f2c2bb99e8e80 /parser/html/java/htmlparser/generate-encoding-data.py
parent: 09314667a692fedff8564fc347c8a3663474faa6 (diff)
download: UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar
UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.gz
UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.lz
UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.xz
UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.zip
1 files changed, 745 insertions, 0 deletions
diff --git a/parser/html/java/htmlparser/generate-encoding-data.py b/parser/html/java/htmlparser/generate-encoding-data.py
new file mode 100644
index 000000000..69b2fdc30
--- /dev/null
+++ b/parser/html/java/htmlparser/generate-encoding-data.py
@@ -0,0 +1,745 @@
+#!/usr/bin/python
+
+# Copyright (c) 2013-2015 Mozilla Foundation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a 
+# copy of this software and associated documentation files (the "Software"), 
+# to deal in the Software without restriction, including without limitation 
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+# and/or sell copies of the Software, and to permit persons to whom the 
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in 
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+# DEALINGS IN THE SOFTWARE.
+
+import json
+
+class Label:  # One encoding label paired with the name of the encoding it selects.
+  def __init__(self, label, preferred):
+    # Stored exactly as passed in; nothing is validated or normalized here.
+    self.label, self.preferred = label, preferred
+  def __cmp__(self, other):  # Python 2 comparison hook: order by label text alone.
+    return (self.label > other.label) - (self.label < other.label)
+
+# Multi-byte encodings whose generated class returns a non-generated
+# <ClassName>Decoder from newDecoder(). Any other multi-byte encoding falls
+# back to the JDK decoder of the same name as a placeholder (gbk: gb18030).
+MULTI_BYTE_DECODER_IMPLEMENTED = [
+  u"x-user-defined",
+  u"replacement",
+  u"big5",
+]
+# Same idea for newEncoder(): listed names get <ClassName>Encoder, others the JDK one.
+MULTI_BYTE_ENCODER_IMPLEMENTED = [
+  u"big5",
+]
+
+# Accumulators filled by the grouping loop further down in this script.
+preferred = []    # names of all encodings
+labels = []       # one Label per (label, encoding name) pair
+single_byte = []  # entries of the "Legacy single-byte encodings" group
+multi_byte = []   # entries of every other group
+
+with open("../encoding/encodings.json", "r") as encodings_file:
+  data = json.load(encodings_file)  # handle is closed as soon as parsing ends
+
+with open("../encoding/indexes.json", "r") as indexes_file:
+  indexes = json.load(indexes_file)
+
+def to_camel_name(name):  # Derive the Java class name used for an encoding name.
+  iso_prefix = u"iso-8859-"
+  if name.startswith(iso_prefix):
+    # The "-i" variant is hard-coded; plain prefix replacement would give "Iso8-i".
+    return u"Iso8I" if name == u"iso-8859-8-i" else name.replace(iso_prefix, u"Iso")
+  return name.title().replace(u"X-", u"").replace(u"-", u"").replace(u"_", u"")
+
+def to_constant_name(name):  # e.g. u"utf-8" -> u"UTF_8", used as the Java constant name
+  return name.upper().replace(u"-", u"_")
+
+# Encoding.java
+# Split the groups into single-/multi-byte lists and collect all names and labels.
+for group in data:
+  if group["heading"] == "Legacy single-byte encodings":
+    single_byte = group["encodings"]  # rebinds the name to this group's list
+  else:
+    multi_byte.extend(group["encodings"])
+  for encoding in group["encodings"]:
+    preferred.append(encoding["name"])
+    for label in encoding["labels"]:
+      labels.append(Label(label, encoding["name"]))
+# Sorted label order is what lets the generated Java binary-search LABELS.
+preferred.sort()
+labels.sort()
+
+label_file = open("src/nu/validator/encoding/Encoding.java", "w")
+
+label_file.write("""/*
+ * Copyright (c) 2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.encoding;
+
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.nio.charset.spi.CharsetProvider;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+/**
+ * Represents an <a href="https://encoding.spec.whatwg.org/#encoding">encoding</a>
+ * as defined in the <a href="https://encoding.spec.whatwg.org/">Encoding
+ * Standard</a>, provides access to each encoding defined in the Encoding
+ * Standard via a static constant and provides the 
+ * "<a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an 
+ * encoding</a>" algorithm defined in the Encoding Standard.
+ * 
+ * <p>This class inherits from {@link Charset} to allow the Encoding 
+ * Standard-compliant encodings to be used in contexts that support
+ * <code>Charset</code> instances. However, by design, the Encoding 
+ * Standard-compliant encodings are not supplied via a {@link CharsetProvider}
+ * and, therefore, are not available via and do not interfere with the static
+ * methods provided by <code>Charset</code>. (This class provides methods of
+ * the same name to hide each static method of <code>Charset</code> to help
+ * avoid accidental calls to the static methods of the superclass when working
+ * with Encoding Standard-compliant encodings.)
+ * 
+ * <p>When an application needs to use a particular encoding, such as utf-8
+ * or windows-1252, the corresponding constant, i.e.
+ * {@link #UTF_8 Encoding.UTF_8} and {@link #WINDOWS_1252 Encoding.WINDOWS_1252}
+ * respectively, should be used. However, when the application receives an
+ * encoding label from external input, the method {@link #forName(String) 
+ * forName()} should be used to obtain the object representing the encoding 
+ * identified by the label. In contexts where labels that map to the 
+ * <a href="https://encoding.spec.whatwg.org/#replacement">replacement
+ * encoding</a> should be treated as unknown, the method {@link
+ * #forNameNoReplacement(String) forNameNoReplacement()} should be used instead.
+ * 
+ * 
+ * @author hsivonen
+ */
+public abstract class Encoding extends Charset {
+
+    private static final String[] LABELS = {
+""")
+
+for label in labels:
+  label_file.write("        \"%s\",\n" % label.label)
+
+label_file.write("""    };
+    
+    private static final Encoding[] ENCODINGS_FOR_LABELS = {
+""")
+
+for label in labels:
+  label_file.write("        %s.INSTANCE,\n" % to_camel_name(label.preferred))
+
+label_file.write("""    };
+
+    private static final Encoding[] ENCODINGS = {
+""")
+
+for label in preferred:
+  label_file.write("        %s.INSTANCE,\n" % to_camel_name(label))
+        
+label_file.write("""    };
+
+""")
+
+for label in preferred:
+  label_file.write("""    /**
+     * The %s encoding.
+     */
+    public static final Encoding %s = %s.INSTANCE;
+
+""" % (label, to_constant_name(label), to_camel_name(label)))
+        
+label_file.write("""
+private static SortedMap<String, Charset> encodings = null;
+
+    protected Encoding(String canonicalName, String[] aliases) {
+        super(canonicalName, aliases);
+    }
+
+    private enum State {
+        HEAD, LABEL, TAIL
+    };
+
+    public static Encoding forName(String label) {
+        if (label == null) {
+            throw new IllegalArgumentException("Label must not be null.");
+        }
+        if (label.length() == 0) {
+            throw new IllegalCharsetNameException(label);
+        }
+        // First try the fast path
+        int index = Arrays.binarySearch(LABELS, label);
+        if (index >= 0) {
+            return ENCODINGS_FOR_LABELS[index];
+        }
+        // Else, slow path
+        StringBuilder sb = new StringBuilder();
+        State state = State.HEAD;
+        for (int i = 0; i < label.length(); i++) {
+            char c = label.charAt(i);
+            if ((c == ' ') || (c == '\\n') || (c == '\\r') || (c == '\\t')
+                    || (c == '\\u000C')) {
+                if (state == State.LABEL) {
+                    state = State.TAIL;
+                }
+                continue;
+            }
+            if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
+                switch (state) {
+                    case HEAD:
+                        state = State.LABEL;
+                        // Fall through
+                    case LABEL:
+                        sb.append(c);
+                        continue;
+                    case TAIL:
+                        throw new IllegalCharsetNameException(label);
+                }
+            }
+            if (c >= 'A' && c <= 'Z') {
+                c += 0x20;
+                switch (state) {
+                    case HEAD:
+                        state = State.LABEL;
+                        // Fall through
+                    case LABEL:
+                        sb.append(c);
+                        continue;
+                    case TAIL:
+                        throw new IllegalCharsetNameException(label);
+                }
+            }
+            if ((c == '-') || (c == '+') || (c == '.') || (c == ':')
+                    || (c == '_')) {
+                switch (state) {
+                    case LABEL:
+                        sb.append(c);
+                        continue;
+                    case HEAD:
+                    case TAIL:
+                        throw new IllegalCharsetNameException(label);
+                }
+            }
+            throw new IllegalCharsetNameException(label);
+        }
+        index = Arrays.binarySearch(LABELS, sb.toString());
+        if (index >= 0) {
+            return ENCODINGS_FOR_LABELS[index];
+        }
+        throw new UnsupportedCharsetException(label);
+    }
+
+    public static Encoding forNameNoReplacement(String label) {
+        Encoding encoding = Encoding.forName(label);
+        if (encoding == Encoding.REPLACEMENT) {
+            throw new UnsupportedCharsetException(label);            
+        }
+        return encoding;
+    }
+
+    public static boolean isSupported(String label) {
+        try {
+            Encoding.forName(label);
+        } catch (UnsupportedCharsetException e) {
+            return false;
+        }
+        return true;
+    }
+
+    public static boolean isSupportedNoReplacement(String label) {
+        try {
+            Encoding.forNameNoReplacement(label);
+        } catch (UnsupportedCharsetException e) {
+            return false;
+        }
+        return true;
+    }
+
+    public static SortedMap<String, Charset> availableCharsets() {
+        if (encodings == null) {
+            TreeMap<String, Charset> map = new TreeMap<String, Charset>();
+            for (Encoding encoding : ENCODINGS) {
+                map.put(encoding.name(), encoding);
+            }
+            encodings = Collections.unmodifiableSortedMap(map);
+        }
+        return encodings;
+    }
+
+    public static Encoding defaultCharset() {
+        return WINDOWS_1252;
+    }
+
+    @Override public boolean canEncode() {
+        return false;
+    }
+
+    @Override public boolean contains(Charset cs) {
+        return false;
+    }
+
+    @Override public CharsetEncoder newEncoder() {
+        throw new UnsupportedOperationException("Encoder not implemented.");
+    }
+}
+""")
+
+label_file.close()
+
+# Single-byte encodings
+
+for encoding in single_byte:
+  name = encoding["name"]
+  labels = encoding["labels"]
+  labels.sort()
+  class_name = to_camel_name(name)
+  mapping_name = name
+  if mapping_name == u"iso-8859-8-i":
+    mapping_name = u"iso-8859-8"
+  mapping = indexes[mapping_name]
+  class_file = open("src/nu/validator/encoding/%s.java" % class_name, "w")
+  class_file.write('''/*
+ * Copyright (c) 2013-2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+ * Instead, please regenerate using generate-encoding-data.py
+ */
+
+package nu.validator.encoding;
+
+import java.nio.charset.CharsetDecoder;
+
+class ''')
+  class_file.write(class_name)
+  class_file.write(''' extends Encoding {
+
+    private static final char[] TABLE = {''')
+  fallible = False
+  comma = False
+  for code_point in mapping:
+    # XXX should we have error reporting?
+    if not code_point:
+      code_point = 0xFFFD
+      fallible = True
+    if comma:
+      class_file.write(",")
+    class_file.write("\n        '\u%04x'" % code_point);
+    comma = True    
+  class_file.write('''
+    };
+    
+    private static final String[] LABELS = {''')
+
+  comma = False
+  for label in labels:
+    if comma:
+      class_file.write(",")
+    class_file.write("\n        \"%s\"" % label);
+    comma = True    
+  class_file.write('''
+    };
+    
+    private static final String NAME = "''')
+  class_file.write(name)
+  class_file.write('''";
+    
+    static final Encoding INSTANCE = new ''')
+  class_file.write(class_name)
+  class_file.write('''();
+    
+    private ''')
+  class_file.write(class_name)
+  class_file.write('''() {
+        super(NAME, LABELS);
+    }
+
+    @Override public CharsetDecoder newDecoder() {
+        return new ''')
+  class_file.write("Fallible" if fallible else "Infallible")
+  class_file.write('''SingleByteDecoder(this, TABLE);
+    }
+
+}
+''')
+  class_file.close()
+
+# Multi-byte encodings
+
+for encoding in multi_byte:
+  name = encoding["name"]
+  labels = encoding["labels"]
+  labels.sort()
+  class_name = to_camel_name(name)
+  class_file = open("src/nu/validator/encoding/%s.java" % class_name, "w")
+  class_file.write('''/*
+ * Copyright (c) 2013-2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+ * Instead, please regenerate using generate-encoding-data.py
+ */
+
+package nu.validator.encoding;
+
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+
+class ''')
+  class_file.write(class_name)
+  class_file.write(''' extends Encoding {
+
+    private static final String[] LABELS = {''')
+
+  comma = False
+  for label in labels:
+    if comma:
+      class_file.write(",")
+    class_file.write("\n        \"%s\"" % label);
+    comma = True    
+  class_file.write('''
+    };
+    
+    private static final String NAME = "''')
+  class_file.write(name)
+  class_file.write('''";
+    
+    static final ''')
+  class_file.write(class_name)
+  class_file.write(''' INSTANCE = new ''')
+  class_file.write(class_name)
+  class_file.write('''();
+    
+    private ''')
+  class_file.write(class_name)
+  class_file.write('''() {
+        super(NAME, LABELS);
+    }
+
+    @Override public CharsetDecoder newDecoder() {
+        ''')
+  if name == "gbk":
+    class_file.write('''return Charset.forName("gb18030").newDecoder();''')    
+  elif name in MULTI_BYTE_DECODER_IMPLEMENTED:
+    class_file.write("return new %sDecoder(this);" % class_name)
+  else:
+    class_file.write('''return Charset.forName(NAME).newDecoder();''')
+  class_file.write('''
+    }
+
+    @Override public CharsetEncoder newEncoder() {
+        ''')
+  if name in MULTI_BYTE_ENCODER_IMPLEMENTED:
+    class_file.write("return new %sEncoder(this);" % class_name)
+  else:
+    class_file.write('''return Charset.forName(NAME).newEncoder();''')
+  class_file.write('''
+    }
+}
+''')
+  class_file.close()
+
+# Big5
+
+def null_to_zero(code_point):
+  # Replace a falsy entry (e.g. None loaded from a JSON null) with 0
+  # and hand every other value back untouched.
+  return code_point if code_point else 0
+
+index = []
+# Copy of the big5 index in which every falsy entry has become 0.
+for code_point in indexes["big5"]:
+  index.append(null_to_zero(code_point))  
+
+# There are four major gaps consisting of more than 4 consecutive invalid pointers
+gaps = []
+consecutive = 0  # length of the run of zero entries currently being scanned
+consecutive_start = 0  # offset at which that run began
+offset = 0  # position of code_point within index
+for code_point in index:
+  if code_point == 0:
+    if consecutive == 0:
+      consecutive_start = offset
+    consecutive +=1
+  else:  # a run is only recorded once a non-zero entry ends it
+    if consecutive > 4:  # shorter runs are not treated as gaps
+      gaps.append((consecutive_start, consecutive_start + consecutive))  # half-open
+    consecutive = 0
+  offset += 1
+
+def invert_ranges(ranges, cap):
+  # Return the half-open (start, end) spans of [0, cap) left between the given
+  # half-open gaps; the gaps are expected in ascending order, as this script builds them.
+  kept, cursor = [], 0
+  for gap_start, gap_end in ranges:
+    if gap_start:  # a gap that begins at pointer 0 has nothing before it
+      kept.append((cursor, gap_start))
+    cursor = gap_end
+  return kept + [(cursor, cap)]
+
+cap = len(index)
+ranges = invert_ranges(gaps, cap)  # pointer spans that get a TABLE<n> string
+# The scan below reuses the gap variables for a second, unrelated pass.
+# Now compute a compressed lookup table for astralness
+# Here a "gap" is a run of more than 40 entries that all fit in 16 bits.
+gaps = []
+consecutive = 0
+consecutive_start = 0
+offset = 0
+for code_point in index:
+  if code_point <= 0xFFFF:
+    if consecutive == 0:
+      consecutive_start = offset
+    consecutive +=1
+  else:
+    if consecutive > 40:
+      gaps.append((consecutive_start, consecutive_start + consecutive))
+    consecutive = 0
+  offset += 1
+# Spans outside those runs; only these get bits in the ASTRALNESS string.
+astral_ranges = invert_ranges(gaps, cap)
+
+class_file = open("src/nu/validator/encoding/Big5Data.java", "w")
+class_file.write('''/*
+ * Copyright (c) 2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a 
+ * copy of this software and associated documentation files (the "Software"), 
+ * to deal in the Software without restriction, including without limitation 
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+ * and/or sell copies of the Software, and to permit persons to whom the 
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in 
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
+ * Instead, please regenerate using generate-encoding-data.py
+ */
+
+package nu.validator.encoding;
+
+final class Big5Data {
+
+    private static final String ASTRALNESS = "''')
+
+# Java translates \uXXXX escapes before it tokenizes a string literal, so a
+# line feed, carriage return, double quote or backslash written as \uXXXX
+# would corrupt the literal; those four code units get ordinary escapes.
+JAVA_STRING_ESCAPES = {0x0A: '\\n', 0x0D: '\\r', 0x22: '\\"', 0x5C: '\\\\'}
+def java_string_char(unit):
+  # Return Java string-literal source text for one UTF-16 code unit.
+  return JAVA_STRING_ESCAPES.get(unit, '\\u%04X' % unit)
+
+bits = []
+for (low, high) in astral_ranges:
+  for i in xrange(low, high):
+    bits.append(1 if index[i] > 0xFFFF else 0)
+# Pad to a multiple of 16. An already aligned list still gets 16 more zero
+# bits; that is harmless and keeps the generated output unchanged.
+bits.extend([0] * (16 - (len(bits) % 16)))
+
+# Pack 16 bits per Java char, least significant bit first, as readBit expects.
+for i in xrange(0, len(bits), 16):
+  accu = sum(bits[i + j] << j for j in xrange(16))
+  class_file.write(java_string_char(accu))
+
+class_file.write('''";
+
+''')
+
+# One string constant per non-gap range; only the low 16 bits are stored.
+for (j, (low, high)) in enumerate(ranges):
+  class_file.write('''    private static final String TABLE%d = "''' % j)
+  class_file.write(''.join(java_string_char(index[i] & 0xFFFF) for i in xrange(low, high)))
+  class_file.write('''";
+
+''')
+
+class_file.write('''    private static boolean readBit(int i) {
+        return (ASTRALNESS.charAt(i >> 4) & (1 << (i & 0xF))) != 0;
+    }
+
+    static char lowBits(int pointer) {
+''')
+
+j = 0  # number of the TABLE<j> constant that belongs to the current range
+for (low, high) in ranges:  # emits two checks per range, in ascending pointer order
+  class_file.write('''        if (pointer < %d) {
+            return '\\u0000';
+        }
+        if (pointer < %d) {
+            return TABLE%d.charAt(pointer - %d);
+        }
+''' % (low, high, j, low))
+  j += 1
+
+class_file.write('''        return '\\u0000';
+    }
+
+    static boolean isAstral(int pointer) {
+''')
+
+base = 0  # bit offset of the current range within the ASTRALNESS bit sequence
+for (low, high) in astral_ranges:
+  if high - low == 1:  # single pointer: generated code answers true without readBit
+    class_file.write('''        if (pointer < %d) {
+            return false;
+        }
+        if (pointer == %d) {
+            return true;
+        }
+''' % (low, low))
+  else:  # longer span: generated code looks the bit up at base + (pointer - low)
+    class_file.write('''        if (pointer < %d) {
+            return false;
+        }
+        if (pointer < %d) {
+            return readBit(%d + (pointer - %d));
+        }
+''' % (low, high, base, low))
+  base += (high - low)  # every range contributed one bit per pointer, even length 1
+
+class_file.write('''        return false;
+    }
+
+    public static int findPointer(char lowBits, boolean isAstral) {
+        if (!isAstral) {
+            switch (lowBits) {
+''')
+
+hkscs_bound = (0xA1 - 0x81) * 157  # used further down as the first non-HKSCS pointer
+# Code points for which findPointer gets a hard-coded case returning the last match.
+prefer_last = [
+  0x2550,
+  0x255E,
+  0x2561,
+  0x256A,
+  0x5341,
+  0x5345,
+]
+# Emit one switch case per listed code point that occurs in index.
+for code_point in prefer_last:
+  # Python lists don't have .rindex() :-(
+  for i in xrange(len(index) - 1, -1, -1):  # scan from the end to find the last hit
+    candidate = index[i]
+    if candidate == code_point:
+       class_file.write('''                case 0x%04X:
+                    return %d;
+''' % (code_point, i))
+       break
+
+class_file.write('''                default:
+                    break;
+            }
+        }''')
+
+j = 0  # number of the TABLE<j> constant that belongs to the current range
+for (low, high) in ranges:
+  if high > hkscs_bound:  # ranges lying entirely below the bound get no search loop
+    start = 0  # first TABLE<j> position the generated loop inspects
+    if low <= hkscs_bound and hkscs_bound < high:
+      # This is the first range we don't ignore and the
+      # range that contains the first non-HKSCS pointer.
+      # Avoid searching HKSCS.
+      start = hkscs_bound - low
+    class_file.write('''
+        for (int i = %d; i < TABLE%d.length(); i++) {
+            if (TABLE%d.charAt(i) == lowBits) {
+                int pointer = i + %d;
+                if (isAstral == isAstral(pointer)) {
+                    return pointer;
+                }
+            }
+        }''' % (start, j, j, low))
+  j += 1  # advanced for skipped ranges too, so j keeps matching the TABLE numbering
+
+class_file.write('''
+        return 0;
+    }
+}
+''')
+class_file.close()
author	Matt A. Tobin <email@mattatobin.com>	2020-01-15 14:56:04 -0500
committer	Matt A. Tobin <email@mattatobin.com>	2020-01-15 14:56:04 -0500
commit	6168dbe21f5f83b906e562ea0ab232d499b275a6 (patch)
tree	658a4b27554c85ebcaad655fc83f2c2bb99e8e80 /parser/html/java/htmlparser/generate-encoding-data.py
parent	09314667a692fedff8564fc347c8a3663474faa6 (diff)
download	UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.gz UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.lz UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.xz UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.zip