diff options
Diffstat (limited to 'parser/html/java/htmlparser/generate-encoding-data.py')
-rw-r--r-- | parser/html/java/htmlparser/generate-encoding-data.py | 745 |
1 files changed, 745 insertions, 0 deletions
#!/usr/bin/python

# Copyright (c) 2013-2015 Mozilla Foundation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

import json


class Label:
    """An encoding label paired with the name of the encoding it selects.

    Instances compare and sort by ``label`` alone; ``preferred`` is carried
    along as payload and never takes part in comparisons.
    """

    def __init__(self, label, preferred):
        self.label = label          # the label string itself
        self.preferred = preferred  # name of the encoding the label maps to

    # Rich comparisons: list.sort() only needs __lt__, and these work on
    # both Python 2 and Python 3 (where __cmp__/cmp() no longer exist).
    def __eq__(self, other):
        return self.label == other.label

    def __ne__(self, other):
        return self.label != other.label

    def __lt__(self, other):
        return self.label < other.label

    def __cmp__(self, other):
        # Kept for Python 2 callers; written without the cmp() builtin so
        # that calling it directly also works on Python 3.
        return (self.label > other.label) - (self.label < other.label)


# If a multi-byte encoding is on this list, it is assumed to have a
# non-generated decoder implementation class. Otherwise, the JDK default
# decoder is used as a placeholder.
MULTI_BYTE_DECODER_IMPLEMENTED = [
    u"x-user-defined",
    u"replacement",
    u"big5",
]

# Multi-byte encodings listed here get "new <Class>Encoder(this)" in their
# generated class; every other one delegates to
# Charset.forName(NAME).newEncoder().
MULTI_BYTE_ENCODER_IMPLEMENTED = [
    u"big5",
]

# Encoding names, in sorted order once the input has been read.
preferred = []

# Label objects for every label of every encoding, sorted by label.
labels = []

# Both inputs are read relative to the current working directory.
with open("../encoding/encodings.json", "r") as json_file:
    data = json.load(json_file)

with open("../encoding/indexes.json", "r") as json_file:
    indexes = json.load(json_file)

single_byte = []

multi_byte = []


def to_camel_name(name):
    """Return the Java class name used for the encoding called *name*.

    "iso-8859-8-i" becomes "Iso8I", other "iso-8859-N" names become "IsoN",
    and everything else is title-cased with "X-", "-" and "_" removed
    (e.g. "x-user-defined" -> "UserDefined", "utf-16le" -> "Utf16Le").
    """
    if name == u"iso-8859-8-i":
        return u"Iso8I"
    if name.startswith(u"iso-8859-"):
        return name.replace(u"iso-8859-", u"Iso")
    return name.title().replace(u"X-", u"").replace(u"-", u"").replace(u"_", u"")


def to_constant_name(name):
    """Return the Java constant name for *name* (e.g. "utf-8" -> "UTF_8")."""
    return name.replace(u"-", u"_").upper()


def _java_string_escape(code_unit):
    """Return Java source text for one UTF-16 code unit inside a "..." literal.

    Java expands \\uXXXX escapes before it tokenizes the source, so the
    escapes for LF, CR, the double quote and the backslash would respectively
    break the line, terminate the literal or start a bogus escape sequence.
    Those four values use the ordinary backslash escapes instead; everything
    else is written as an upper-case \\uXXXX escape.
    """
    if code_unit == 0x0A:
        return '\\n'
    if code_unit == 0x0D:
        return '\\r'
    if code_unit == 0x22:
        return '\\"'
    if code_unit == 0x5C:
        return '\\\\'
    return '\\u%04X' % code_unit


def _long_runs(values, in_run, threshold):
    """Return (start, end) for runs of more than *threshold* matching items.

    A run is a maximal stretch of consecutive items for which *in_run* is
    true; *end* is exclusive. A run is only recorded when a non-matching
    item follows it, so a run that extends to the end of *values* is NOT
    reported (this mirrors the generator's long-standing output).
    """
    runs = []
    run_length = 0
    run_start = 0
    for offset, value in enumerate(values):
        if in_run(value):
            if run_length == 0:
                run_start = offset
            run_length += 1
        else:
            if run_length > threshold:
                runs.append((run_start, run_start + run_length))
            run_length = 0
    return runs


# Encoding.java

for group in data:
    if group["heading"] == "Legacy single-byte encodings":
        single_byte = group["encodings"]
    else:
        multi_byte.extend(group["encodings"])
    for encoding in group["encodings"]:
        preferred.append(encoding["name"])
        for label in encoding["labels"]:
            labels.append(Label(label, encoding["name"]))

# The generated Java looks labels up with Arrays.binarySearch(), so LABELS
# (and the parallel ENCODINGS_FOR_LABELS array) must be emitted sorted.
preferred.sort()
labels.sort()

with open("src/nu/validator/encoding/Encoding.java", "w") as label_file:
    label_file.write("""/*
 * Copyright (c) 2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.encoding;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.Arrays;
import java.util.Collections;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * Represents an <a href="https://encoding.spec.whatwg.org/#encoding">encoding</a>
 * as defined in the <a href="https://encoding.spec.whatwg.org/">Encoding
 * Standard</a>, provides access to each encoding defined in the Encoding
 * Standard via a static constant and provides the
 * "<a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an
 * encoding</a>" algorithm defined in the Encoding Standard.
 *
 * <p>This class inherits from {@link Charset} to allow the Encoding
 * Standard-compliant encodings to be used in contexts that support
 * <code>Charset</code> instances. However, by design, the Encoding
 * Standard-compliant encodings are not supplied via a {@link CharsetProvider}
 * and, therefore, are not available via and do not interfere with the static
 * methods provided by <code>Charset</code>. (This class provides methods of
 * the same name to hide each static method of <code>Charset</code> to help
 * avoid accidental calls to the static methods of the superclass when working
 * with Encoding Standard-compliant encodings.)
 *
 * <p>When an application needs to use a particular encoding, such as utf-8
 * or windows-1252, the corresponding constant, i.e.
 * {@link #UTF_8 Encoding.UTF_8} and {@link #WINDOWS_1252 Encoding.WINDOWS_1252}
 * respectively, should be used. However, when the application receives an
 * encoding label from external input, the method {@link #forName(String)
 * forName()} should be used to obtain the object representing the encoding
 * identified by the label. In contexts where labels that map to the
 * <a href="https://encoding.spec.whatwg.org/#replacement">replacement
 * encoding</a> should be treated as unknown, the method {@link
 * #forNameNoReplacement(String) forNameNoReplacement()} should be used instead.
 *
 *
 * @author hsivonen
 */
public abstract class Encoding extends Charset {

    private static final String[] LABELS = {
""")

    for label in labels:
        label_file.write("        \"%s\",\n" % label.label)

    label_file.write("""    };

    private static final Encoding[] ENCODINGS_FOR_LABELS = {
""")

    for label in labels:
        label_file.write("        %s.INSTANCE,\n" % to_camel_name(label.preferred))

    label_file.write("""    };

    private static final Encoding[] ENCODINGS = {
""")

    for encoding_name in preferred:
        label_file.write("        %s.INSTANCE,\n" % to_camel_name(encoding_name))

    label_file.write("""    };

""")

    for encoding_name in preferred:
        label_file.write("""    /**
     * The %s encoding.
     */
    public static final Encoding %s = %s.INSTANCE;

""" % (encoding_name, to_constant_name(encoding_name),
       to_camel_name(encoding_name)))

    label_file.write("""
private static SortedMap<String, Charset> encodings = null;

    protected Encoding(String canonicalName, String[] aliases) {
        super(canonicalName, aliases);
    }

    private enum State {
        HEAD, LABEL, TAIL
    };

    public static Encoding forName(String label) {
        if (label == null) {
            throw new IllegalArgumentException("Label must not be null.");
        }
        if (label.length() == 0) {
            throw new IllegalCharsetNameException(label);
        }
        // First try the fast path
        int index = Arrays.binarySearch(LABELS, label);
        if (index >= 0) {
            return ENCODINGS_FOR_LABELS[index];
        }
        // Else, slow path
        StringBuilder sb = new StringBuilder();
        State state = State.HEAD;
        for (int i = 0; i < label.length(); i++) {
            char c = label.charAt(i);
            if ((c == ' ') || (c == '\\n') || (c == '\\r') || (c == '\\t')
                    || (c == '\\u000C')) {
                if (state == State.LABEL) {
                    state = State.TAIL;
                }
                continue;
            }
            if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
                switch (state) {
                    case HEAD:
                        state = State.LABEL;
                        // Fall through
                    case LABEL:
                        sb.append(c);
                        continue;
                    case TAIL:
                        throw new IllegalCharsetNameException(label);
                }
            }
            if (c >= 'A' && c <= 'Z') {
                c += 0x20;
                switch (state) {
                    case HEAD:
                        state = State.LABEL;
                        // Fall through
                    case LABEL:
                        sb.append(c);
                        continue;
                    case TAIL:
                        throw new IllegalCharsetNameException(label);
                }
            }
            if ((c == '-') || (c == '+') || (c == '.') || (c == ':')
                    || (c == '_')) {
                switch (state) {
                    case LABEL:
                        sb.append(c);
                        continue;
                    case HEAD:
                    case TAIL:
                        throw new IllegalCharsetNameException(label);
                }
            }
            throw new IllegalCharsetNameException(label);
        }
        index = Arrays.binarySearch(LABELS, sb.toString());
        if (index >= 0) {
            return ENCODINGS_FOR_LABELS[index];
        }
        throw new UnsupportedCharsetException(label);
    }

    public static Encoding forNameNoReplacement(String label) {
        Encoding encoding = Encoding.forName(label);
        if (encoding == Encoding.REPLACEMENT) {
            throw new UnsupportedCharsetException(label);
        }
        return encoding;
    }

    public static boolean isSupported(String label) {
        try {
            Encoding.forName(label);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    public static boolean isSupportedNoReplacement(String label) {
        try {
            Encoding.forNameNoReplacement(label);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    public static SortedMap<String, Charset> availableCharsets() {
        if (encodings == null) {
            TreeMap<String, Charset> map = new TreeMap<String, Charset>();
            for (Encoding encoding : ENCODINGS) {
                map.put(encoding.name(), encoding);
            }
            encodings = Collections.unmodifiableSortedMap(map);
        }
        return encodings;
    }

    public static Encoding defaultCharset() {
        return WINDOWS_1252;
    }

    @Override public boolean canEncode() {
        return false;
    }

    @Override public boolean contains(Charset cs) {
        return false;
    }

    @Override public CharsetEncoder newEncoder() {
        throw new UnsupportedOperationException("Encoder not implemented.");
    }
}
""")

# Single-byte encodings

for encoding in single_byte:
    name = encoding["name"]
    encoding_labels = sorted(encoding["labels"])
    class_name = to_camel_name(name)
    # iso-8859-8-i has no index of its own; it is generated from the
    # iso-8859-8 index.
    mapping_name = name
    if mapping_name == u"iso-8859-8-i":
        mapping_name = u"iso-8859-8"
    mapping = indexes[mapping_name]

    # XXX should we have error reporting?
    # JSON null marks an unmapped byte. Such entries are emitted as U+FFFD
    # and switch the class to the fallible decoder. Only None counts as
    # unmapped, so a genuine mapping to U+0000 would be kept as-is.
    fallible = any(code_point is None for code_point in mapping)
    table = [0xFFFD if code_point is None else code_point
             for code_point in mapping]

    with open("src/nu/validator/encoding/%s.java" % class_name, "w") as class_file:
        class_file.write('''/*
 * Copyright (c) 2013-2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Instead, please regenerate using generate-encoding-data.py
 */

package nu.validator.encoding;

import java.nio.charset.CharsetDecoder;

class ''')
        class_file.write(class_name)
        class_file.write(''' extends Encoding {

    private static final char[] TABLE = {''')
        # The backslash is escaped so the literal is valid on Python 3 too;
        # the emitted text is the Java char literal '\uXXXX' either way.
        class_file.write(",".join(
            "\n        '\\u%04x'" % code_unit for code_unit in table))
        class_file.write('''
    };

    private static final String[] LABELS = {''')
        class_file.write(",".join(
            "\n        \"%s\"" % label for label in encoding_labels))
        class_file.write('''
    };

    private static final String NAME = "''')
        class_file.write(name)
        class_file.write('''";

    static final Encoding INSTANCE = new ''')
        class_file.write(class_name)
        class_file.write('''();

    private ''')
        class_file.write(class_name)
        class_file.write('''() {
        super(NAME, LABELS);
    }

    @Override public CharsetDecoder newDecoder() {
        return new ''')
        class_file.write("Fallible" if fallible else "Infallible")
        class_file.write('''SingleByteDecoder(this, TABLE);
    }

}
''')

# Multi-byte encodings
# (every encoding whose group heading is not "Legacy single-byte encodings")

for encoding in multi_byte:
    name = encoding["name"]
    encoding_labels = sorted(encoding["labels"])
    class_name = to_camel_name(name)
    with open("src/nu/validator/encoding/%s.java" % class_name, "w") as class_file:
        class_file.write('''/*
 * Copyright (c) 2013-2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Instead, please regenerate using generate-encoding-data.py
 */

package nu.validator.encoding;

import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

class ''')
        class_file.write(class_name)
        class_file.write(''' extends Encoding {

    private static final String[] LABELS = {''')
        class_file.write(",".join(
            "\n        \"%s\"" % label for label in encoding_labels))
        class_file.write('''
    };

    private static final String NAME = "''')
        class_file.write(name)
        class_file.write('''";

    static final ''')
        class_file.write(class_name)
        class_file.write(''' INSTANCE = new ''')
        class_file.write(class_name)
        class_file.write('''();

    private ''')
        class_file.write(class_name)
        class_file.write('''() {
        super(NAME, LABELS);
    }

    @Override public CharsetDecoder newDecoder() {
        ''')
        if name == "gbk":
            # gbk is decoded with the JDK's gb18030 decoder.
            class_file.write('''return Charset.forName("gb18030").newDecoder();''')
        elif name in MULTI_BYTE_DECODER_IMPLEMENTED:
            class_file.write("return new %sDecoder(this);" % class_name)
        else:
            class_file.write('''return Charset.forName(NAME).newDecoder();''')
        class_file.write('''
    }

    @Override public CharsetEncoder newEncoder() {
        ''')
        if name in MULTI_BYTE_ENCODER_IMPLEMENTED:
            class_file.write("return new %sEncoder(this);" % class_name)
        else:
            class_file.write('''return Charset.forName(NAME).newEncoder();''')
        class_file.write('''
    }
}
''')

# Big5


def null_to_zero(code_point):
    """Return *code_point*, or 0 when it is None (JSON null)."""
    if code_point is None:
        return 0
    return code_point


def invert_ranges(ranges, cap):
    """Return the (start, end) ranges of [0, cap) NOT covered by *ranges*.

    *ranges* must be sorted, non-overlapping, end-exclusive pairs. A range
    starting at 0 contributes no leading piece; the final piece always runs
    up to *cap* (and is empty when the last range already ends at *cap*).
    """
    inverted = []
    invert_start = 0
    for (start, end) in ranges:
        if start != 0:
            inverted.append((invert_start, start))
        invert_start = end
    inverted.append((invert_start, cap))
    return inverted


# Pointer -> code point, with unmapped pointers stored as 0.
index = [null_to_zero(code_point) for code_point in indexes["big5"]]

# There are four major gaps consisting of more than 4 consecutive invalid pointers
gaps = _long_runs(index, lambda code_point: code_point == 0, 4)

cap = len(index)
ranges = invert_ranges(gaps, cap)

# Now compute a compressed lookup table for astralness: long stretches
# (more than 40 pointers) without any astral code point are left out of the
# bit table altogether.
gaps = _long_runs(index, lambda code_point: code_point <= 0xFFFF, 40)

astral_ranges = invert_ranges(gaps, cap)

# One bit per pointer inside astral_ranges, in range order: 1 = astral.
bits = []
for (low, high) in astral_ranges:
    for i in range(low, high):
        bits.append(1 if index[i] > 0xFFFF else 0)
# Pad the length to a multiple of 16. This appends between 1 and 16 zero
# bits, i.e. a full extra code unit when the length already was a multiple.
for j in range(16 - (len(bits) % 16)):
    bits.append(0)

# The first pointer that is not in the HKSCS part of the index.
hkscs_bound = (0xA1 - 0x81) * 157

# Code points for which findPointer() must return the LAST pointer that
# maps to them instead of the first match.
prefer_last = [
    0x2550,
    0x255E,
    0x2561,
    0x256A,
    0x5341,
    0x5345,
]

with open("src/nu/validator/encoding/Big5Data.java", "w") as class_file:
    class_file.write('''/*
 * Copyright (c) 2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Instead, please regenerate using generate-encoding-data.py
 */

package nu.validator.encoding;

final class Big5Data {

    private static final String ASTRALNESS = "''')

    # Pack the bits 16 per UTF-16 code unit, least significant bit first
    # (matching readBit() in the generated class).
    for i in range(0, len(bits), 16):
        accu = 0
        for j in range(16):
            accu |= bits[i + j] << j
        class_file.write(_java_string_escape(accu))

    class_file.write('''";

''')

    # One string per contiguous range of pointers; only the low 16 bits of
    # each code point are stored, astralness lives in ASTRALNESS.
    for j, (low, high) in enumerate(ranges):
        class_file.write('''    private static final String TABLE%d = "''' % j)
        for i in range(low, high):
            class_file.write(_java_string_escape(index[i] & 0xFFFF))
        class_file.write('''";

''')

    class_file.write('''    private static boolean readBit(int i) {
        return (ASTRALNESS.charAt(i >> 4) & (1 << (i & 0xF))) != 0;
    }

    static char lowBits(int pointer) {
''')

    for j, (low, high) in enumerate(ranges):
        class_file.write('''        if (pointer < %d) {
            return '\\u0000';
        }
        if (pointer < %d) {
            return TABLE%d.charAt(pointer - %d);
        }
''' % (low, high, j, low))

    class_file.write('''        return '\\u0000';
    }

    static boolean isAstral(int pointer) {
''')

    # base = bit offset of the current range inside ASTRALNESS. It advances
    # for every range, including one-pointer ranges, because those
    # contributed a bit to `bits` as well.
    base = 0
    for (low, high) in astral_ranges:
        if high - low == 1:
            class_file.write('''        if (pointer < %d) {
            return false;
        }
        if (pointer == %d) {
            return true;
        }
''' % (low, low))
        else:
            class_file.write('''        if (pointer < %d) {
            return false;
        }
        if (pointer < %d) {
            return readBit(%d + (pointer - %d));
        }
''' % (low, high, base, low))
        base += (high - low)

    class_file.write('''        return false;
    }

    public static int findPointer(char lowBits, boolean isAstral) {
        if (!isAstral) {
            switch (lowBits) {
''')

    for code_point in prefer_last:
        # Python lists don't have .rindex() :-(
        for i in range(len(index) - 1, -1, -1):
            candidate = index[i]
            if candidate == code_point:
                class_file.write('''                case 0x%04X:
                    return %d;
''' % (code_point, i))
                break

    class_file.write('''                default:
                    break;
            }
        }''')

    for j, (low, high) in enumerate(ranges):
        # Ranges that lie entirely below hkscs_bound are never searched.
        if high > hkscs_bound:
            start = 0
            if low <= hkscs_bound and hkscs_bound < high:
                # This is the first range we don't ignore and the
                # range that contains the first non-HKSCS pointer.
                # Avoid searching HKSCS.
                start = hkscs_bound - low
            class_file.write('''
        for (int i = %d; i < TABLE%d.length(); i++) {
            if (TABLE%d.charAt(i) == lowBits) {
                int pointer = i + %d;
                if (isAstral == isAstral(pointer)) {
                    return pointer;
                }
            }
        }''' % (start, j, j, low))

    class_file.write('''
        return 0;
    }
}
''')