diff options
author | wolfbeast <mcwerewolf@wolfbeast.com> | 2020-01-13 09:29:30 +0100 |
---|---|---|
committer | wolfbeast <mcwerewolf@wolfbeast.com> | 2020-01-13 09:32:00 +0100 |
commit | aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347 (patch) | |
tree | f04b844c58d310e47578bf1fc75cf5e24453dc3b /parser/html/javasrc/MetaScanner.java | |
parent | 60dc9eaa95b96abbe881063b62304a58eadd6b8e (diff) | |
download | UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.tar UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.tar.gz UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.tar.lz UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.tar.xz UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.zip |
Reinstate the java->c++ source, generator code + documentation.
We've kept the java source up-to-date until its removal, so there should
be very little additional java mangling needed to have it back up to
speed and usable again.
This reverts commit c6446f1126232935c85397aac493113dd38496cd.
Diffstat (limited to 'parser/html/javasrc/MetaScanner.java')
-rw-r--r-- | parser/html/javasrc/MetaScanner.java | 854 |
1 files changed, 854 insertions, 0 deletions
diff --git a/parser/html/javasrc/MetaScanner.java b/parser/html/javasrc/MetaScanner.java new file mode 100644 index 000000000..be9aabfe3 --- /dev/null +++ b/parser/html/javasrc/MetaScanner.java @@ -0,0 +1,854 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * Copyright (c) 2008-2015 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.impl; + +import java.io.IOException; + +import nu.validator.htmlparser.annotation.Auto; +import nu.validator.htmlparser.annotation.Inline; +import nu.validator.htmlparser.common.ByteReadable; + +import org.xml.sax.SAXException; + +public abstract class MetaScanner { + + /** + * Constant for "charset". + */ + private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' }; + + /** + * Constant for "content". + */ + private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' }; + + /** + * Constant for "http-equiv". + */ + private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q', + 'u', 'i', 'v' }; + + /** + * Constant for "content-type". + */ + private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n', + 't', '-', 't', 'y', 'p', 'e' }; + + private static final int NO = 0; + + private static final int M = 1; + + private static final int E = 2; + + private static final int T = 3; + + private static final int A = 4; + + private static final int DATA = 0; + + private static final int TAG_OPEN = 1; + + private static final int SCAN_UNTIL_GT = 2; + + private static final int TAG_NAME = 3; + + private static final int BEFORE_ATTRIBUTE_NAME = 4; + + private static final int ATTRIBUTE_NAME = 5; + + private static final int AFTER_ATTRIBUTE_NAME = 6; + + private static final int BEFORE_ATTRIBUTE_VALUE = 7; + + private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8; + + private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9; + + private static final int ATTRIBUTE_VALUE_UNQUOTED = 10; + + private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11; + + private static final int MARKUP_DECLARATION_OPEN = 13; + + private static final int MARKUP_DECLARATION_HYPHEN = 14; + + private static final int COMMENT_START = 15; + + private static final int COMMENT_START_DASH = 16; + + private static final int COMMENT = 17; + + private static final int COMMENT_END_DASH = 18; + + private static final int COMMENT_END = 19; + + private static final int SELF_CLOSING_START_TAG = 20; + + private static final int HTTP_EQUIV_NOT_SEEN = 0; + + private static final int HTTP_EQUIV_CONTENT_TYPE = 1; + + private static final int HTTP_EQUIV_OTHER = 2; + + /** + * The data source. + */ + protected ByteReadable readable; + + /** + * The state of the state machine that recognizes the tag name "meta". + */ + private int metaState = NO; + + /** + * The current position in recognizing the attribute name "content". + */ + private int contentIndex = Integer.MAX_VALUE; + + /** + * The current position in recognizing the attribute name "charset". + */ + private int charsetIndex = Integer.MAX_VALUE; + + /** + * The current position in recognizing the attribute name "http-equive". + */ + private int httpEquivIndex = Integer.MAX_VALUE; + + /** + * The current position in recognizing the attribute value "content-type". + */ + private int contentTypeIndex = Integer.MAX_VALUE; + + /** + * The tokenizer state. + */ + protected int stateSave = DATA; + + /** + * The currently filled length of strBuf. + */ + private int strBufLen; + + /** + * Accumulation buffer for attribute values. + */ + private @Auto char[] strBuf; + + private String content; + + private String charset; + + private int httpEquivState; + + // CPPONLY: private TreeBuilder treeBuilder; + + public MetaScanner( + // CPPONLY: TreeBuilder tb + ) { + this.readable = null; + this.metaState = NO; + this.contentIndex = Integer.MAX_VALUE; + this.charsetIndex = Integer.MAX_VALUE; + this.httpEquivIndex = Integer.MAX_VALUE; + this.contentTypeIndex = Integer.MAX_VALUE; + this.stateSave = DATA; + this.strBufLen = 0; + this.strBuf = new char[36]; + this.content = null; + this.charset = null; + this.httpEquivState = HTTP_EQUIV_NOT_SEEN; + // CPPONLY: this.treeBuilder = tb; + } + + @SuppressWarnings("unused") private void destructor() { + Portability.releaseString(content); + Portability.releaseString(charset); + } + + // [NOCPP[ + + /** + * Reads a byte from the data source. + * + * -1 means end. + * @return + * @throws IOException + */ + protected int read() throws IOException { + return readable.readByte(); + } + + // ]NOCPP] + + // WARNING When editing this, makes sure the bytecode length shown by javap + // stays under 8000 bytes! + /** + * The runs the meta scanning algorithm. + */ + protected final void stateLoop(int state) + throws SAXException, IOException { + int c = -1; + boolean reconsume = false; + stateloop: for (;;) { + switch (state) { + case DATA: + dataloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case '<': + state = MetaScanner.TAG_OPEN; + break dataloop; // FALL THROUGH continue + // stateloop; + default: + continue; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case TAG_OPEN: + tagopenloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case 'm': + case 'M': + metaState = M; + state = MetaScanner.TAG_NAME; + break tagopenloop; + // continue stateloop; + case '!': + state = MetaScanner.MARKUP_DECLARATION_OPEN; + continue stateloop; + case '?': + case '/': + state = MetaScanner.SCAN_UNTIL_GT; + continue stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + default: + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { + metaState = NO; + state = MetaScanner.TAG_NAME; + break tagopenloop; + // continue stateloop; + } + state = MetaScanner.DATA; + reconsume = true; + continue stateloop; + } + } + // FALL THROUGH DON'T REORDER + case TAG_NAME: + tagnameloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + break tagnameloop; + // continue stateloop; + case '/': + state = MetaScanner.SELF_CLOSING_START_TAG; + continue stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + case 'e': + case 'E': + if (metaState == M) { + metaState = E; + } else { + metaState = NO; + } + continue; + case 't': + case 'T': + if (metaState == E) { + metaState = T; + } else { + metaState = NO; + } + continue; + case 'a': + case 'A': + if (metaState == T) { + metaState = A; + } else { + metaState = NO; + } + continue; + default: + metaState = NO; + continue; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_ATTRIBUTE_NAME: + beforeattributenameloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + /* + * Consume the next input character: + */ + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + continue; + case '/': + state = MetaScanner.SELF_CLOSING_START_TAG; + continue stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = DATA; + continue stateloop; + case 'c': + case 'C': + contentIndex = 0; + charsetIndex = 0; + httpEquivIndex = Integer.MAX_VALUE; + contentTypeIndex = Integer.MAX_VALUE; + state = MetaScanner.ATTRIBUTE_NAME; + break beforeattributenameloop; + case 'h': + case 'H': + contentIndex = Integer.MAX_VALUE; + charsetIndex = Integer.MAX_VALUE; + httpEquivIndex = 0; + contentTypeIndex = Integer.MAX_VALUE; + state = MetaScanner.ATTRIBUTE_NAME; + break beforeattributenameloop; + default: + contentIndex = Integer.MAX_VALUE; + charsetIndex = Integer.MAX_VALUE; + httpEquivIndex = Integer.MAX_VALUE; + contentTypeIndex = Integer.MAX_VALUE; + state = MetaScanner.ATTRIBUTE_NAME; + break beforeattributenameloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case ATTRIBUTE_NAME: + attributenameloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + state = MetaScanner.AFTER_ATTRIBUTE_NAME; + continue stateloop; + case '/': + state = MetaScanner.SELF_CLOSING_START_TAG; + continue stateloop; + case '=': + strBufLen = 0; + contentTypeIndex = 0; + state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; + break attributenameloop; + // continue stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + if (metaState == A) { + if (c >= 'A' && c <= 'Z') { + c += 0x20; + } + if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) { + ++contentIndex; + } else { + contentIndex = Integer.MAX_VALUE; + } + if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) { + ++charsetIndex; + } else { + charsetIndex = Integer.MAX_VALUE; + } + if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) { + ++httpEquivIndex; + } else { + httpEquivIndex = Integer.MAX_VALUE; + } + } + continue; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_ATTRIBUTE_VALUE: + beforeattributevalueloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + continue; + case '"': + state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED; + break beforeattributevalueloop; + // continue stateloop; + case '\'': + state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED; + continue stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + handleCharInAttributeValue(c); + state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case ATTRIBUTE_VALUE_DOUBLE_QUOTED: + attributevaluedoublequotedloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case '"': + handleAttributeValue(); + state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; + break attributevaluedoublequotedloop; + // continue stateloop; + default: + handleCharInAttributeValue(c); + continue; + } + } + // FALLTHRU DON'T REORDER + case AFTER_ATTRIBUTE_VALUE_QUOTED: + afterattributevaluequotedloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + continue stateloop; + case '/': + state = MetaScanner.SELF_CLOSING_START_TAG; + break afterattributevaluequotedloop; + // continue stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + reconsume = true; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case SELF_CLOSING_START_TAG: + c = read(); + switch (c) { + case -1: + break stateloop; + case '>': + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + reconsume = true; + continue stateloop; + } + // XXX reorder point + case ATTRIBUTE_VALUE_UNQUOTED: + for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + + case '\u000C': + handleAttributeValue(); + state = MetaScanner.BEFORE_ATTRIBUTE_NAME; + continue stateloop; + case '>': + handleAttributeValue(); + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + default: + handleCharInAttributeValue(c); + continue; + } + } + // XXX reorder point + case AFTER_ATTRIBUTE_NAME: + for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case ' ': + case '\t': + case '\n': + case '\u000C': + continue; + case '/': + handleAttributeValue(); + state = MetaScanner.SELF_CLOSING_START_TAG; + continue stateloop; + case '=': + strBufLen = 0; + contentTypeIndex = 0; + state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; + continue stateloop; + case '>': + handleAttributeValue(); + if (handleTag()) { + break stateloop; + } + state = MetaScanner.DATA; + continue stateloop; + case 'c': + case 'C': + contentIndex = 0; + charsetIndex = 0; + state = MetaScanner.ATTRIBUTE_NAME; + continue stateloop; + default: + contentIndex = Integer.MAX_VALUE; + charsetIndex = Integer.MAX_VALUE; + state = MetaScanner.ATTRIBUTE_NAME; + continue stateloop; + } + } + // XXX reorder point + case MARKUP_DECLARATION_OPEN: + markupdeclarationopenloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.MARKUP_DECLARATION_HYPHEN; + break markupdeclarationopenloop; + // continue stateloop; + default: + state = MetaScanner.SCAN_UNTIL_GT; + reconsume = true; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case MARKUP_DECLARATION_HYPHEN: + markupdeclarationhyphenloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_START; + break markupdeclarationhyphenloop; + // continue stateloop; + default: + state = MetaScanner.SCAN_UNTIL_GT; + reconsume = true; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_START: + commentstartloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_START_DASH; + continue stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + default: + state = MetaScanner.COMMENT; + break commentstartloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT: + commentloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_END_DASH; + break commentloop; + // continue stateloop; + default: + continue; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_END_DASH: + commentenddashloop: for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_END; + break commentenddashloop; + // continue stateloop; + default: + state = MetaScanner.COMMENT; + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_END: + for (;;) { + c = read(); + switch (c) { + case -1: + break stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + case '-': + continue; + default: + state = MetaScanner.COMMENT; + continue stateloop; + } + } + // XXX reorder point + case COMMENT_START_DASH: + c = read(); + switch (c) { + case -1: + break stateloop; + case '-': + state = MetaScanner.COMMENT_END; + continue stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + default: + state = MetaScanner.COMMENT; + continue stateloop; + } + // XXX reorder point + case ATTRIBUTE_VALUE_SINGLE_QUOTED: + for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case '\'': + handleAttributeValue(); + state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; + continue stateloop; + default: + handleCharInAttributeValue(c); + continue; + } + } + // XXX reorder point + case SCAN_UNTIL_GT: + for (;;) { + if (reconsume) { + reconsume = false; + } else { + c = read(); + } + switch (c) { + case -1: + break stateloop; + case '>': + state = MetaScanner.DATA; + continue stateloop; + default: + continue; + } + } + } + } + stateSave = state; + } + + private void handleCharInAttributeValue(int c) { + if (metaState == A) { + if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) { + addToBuffer(c); + } else if (httpEquivIndex == HTTP_EQUIV.length) { + if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) { + ++contentTypeIndex; + } else { + contentTypeIndex = Integer.MAX_VALUE; + } + } + } + } + + @Inline private int toAsciiLowerCase(int c) { + if (c >= 'A' && c <= 'Z') { + return c + 0x20; + } + return c; + } + + /** + * Adds a character to the accumulation buffer. + * @param c the character to add + */ + private void addToBuffer(int c) { + if (strBufLen == strBuf.length) { + char[] newBuf = new char[strBuf.length + (strBuf.length << 1)]; + System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); + strBuf = newBuf; + } + strBuf[strBufLen++] = (char)c; + } + + /** + * Attempts to extract a charset name from the accumulation buffer. + * @return <code>true</code> if successful + * @throws SAXException + */ + private void handleAttributeValue() throws SAXException { + if (metaState != A) { + return; + } + if (contentIndex == CONTENT.length && content == null) { + content = Portability.newStringFromBuffer(strBuf, 0, strBufLen + // CPPONLY: , treeBuilder + ); + return; + } + if (charsetIndex == CHARSET.length && charset == null) { + charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen + // CPPONLY: , treeBuilder + ); + return; + } + if (httpEquivIndex == HTTP_EQUIV.length + && httpEquivState == HTTP_EQUIV_NOT_SEEN) { + httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE + : HTTP_EQUIV_OTHER; + return; + } + } + + private boolean handleTag() throws SAXException { + boolean stop = handleTagInner(); + Portability.releaseString(content); + content = null; + Portability.releaseString(charset); + charset = null; + httpEquivState = HTTP_EQUIV_NOT_SEEN; + return stop; + } + + private boolean handleTagInner() throws SAXException { + if (charset != null && tryCharset(charset)) { + return true; + } + if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) { + String extract = TreeBuilder.extractCharsetFromContent(content + // CPPONLY: , treeBuilder + ); + if (extract == null) { + return false; + } + boolean success = tryCharset(extract); + Portability.releaseString(extract); + return success; + } + return false; + } + + /** + * Tries to switch to an encoding. + * + * @param encoding + * @return <code>true</code> if successful + * @throws SAXException + */ + protected abstract boolean tryCharset(String encoding) throws SAXException; + +} |