summaryrefslogtreecommitdiffstats
path: root/parser/html/javasrc/MetaScanner.java
diff options
context:
space:
mode:
Diffstat (limited to 'parser/html/javasrc/MetaScanner.java')
-rw-r--r--parser/html/javasrc/MetaScanner.java854
1 files changed, 0 insertions, 854 deletions
diff --git a/parser/html/javasrc/MetaScanner.java b/parser/html/javasrc/MetaScanner.java
deleted file mode 100644
index be9aabfe3..000000000
--- a/parser/html/javasrc/MetaScanner.java
+++ /dev/null
@@ -1,854 +0,0 @@
-/*
- * Copyright (c) 2007 Henri Sivonen
- * Copyright (c) 2008-2015 Mozilla Foundation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-package nu.validator.htmlparser.impl;
-
-import java.io.IOException;
-
-import nu.validator.htmlparser.annotation.Auto;
-import nu.validator.htmlparser.annotation.Inline;
-import nu.validator.htmlparser.common.ByteReadable;
-
-import org.xml.sax.SAXException;
-
-public abstract class MetaScanner {
-
- /**
- * Constant for "charset".
- */
- private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' };
-
- /**
- * Constant for "content".
- */
- private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' };
-
- /**
- * Constant for "http-equiv".
- */
- private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q',
- 'u', 'i', 'v' };
-
- /**
- * Constant for "content-type".
- */
- private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n',
- 't', '-', 't', 'y', 'p', 'e' };
-
- private static final int NO = 0;
-
- private static final int M = 1;
-
- private static final int E = 2;
-
- private static final int T = 3;
-
- private static final int A = 4;
-
- private static final int DATA = 0;
-
- private static final int TAG_OPEN = 1;
-
- private static final int SCAN_UNTIL_GT = 2;
-
- private static final int TAG_NAME = 3;
-
- private static final int BEFORE_ATTRIBUTE_NAME = 4;
-
- private static final int ATTRIBUTE_NAME = 5;
-
- private static final int AFTER_ATTRIBUTE_NAME = 6;
-
- private static final int BEFORE_ATTRIBUTE_VALUE = 7;
-
- private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8;
-
- private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9;
-
- private static final int ATTRIBUTE_VALUE_UNQUOTED = 10;
-
- private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11;
-
- private static final int MARKUP_DECLARATION_OPEN = 13;
-
- private static final int MARKUP_DECLARATION_HYPHEN = 14;
-
- private static final int COMMENT_START = 15;
-
- private static final int COMMENT_START_DASH = 16;
-
- private static final int COMMENT = 17;
-
- private static final int COMMENT_END_DASH = 18;
-
- private static final int COMMENT_END = 19;
-
- private static final int SELF_CLOSING_START_TAG = 20;
-
- private static final int HTTP_EQUIV_NOT_SEEN = 0;
-
- private static final int HTTP_EQUIV_CONTENT_TYPE = 1;
-
- private static final int HTTP_EQUIV_OTHER = 2;
-
- /**
- * The data source.
- */
- protected ByteReadable readable;
-
- /**
- * The state of the state machine that recognizes the tag name "meta".
- */
- private int metaState = NO;
-
- /**
- * The current position in recognizing the attribute name "content".
- */
- private int contentIndex = Integer.MAX_VALUE;
-
- /**
- * The current position in recognizing the attribute name "charset".
- */
- private int charsetIndex = Integer.MAX_VALUE;
-
- /**
- * The current position in recognizing the attribute name "http-equive".
- */
- private int httpEquivIndex = Integer.MAX_VALUE;
-
- /**
- * The current position in recognizing the attribute value "content-type".
- */
- private int contentTypeIndex = Integer.MAX_VALUE;
-
- /**
- * The tokenizer state.
- */
- protected int stateSave = DATA;
-
- /**
- * The currently filled length of strBuf.
- */
- private int strBufLen;
-
- /**
- * Accumulation buffer for attribute values.
- */
- private @Auto char[] strBuf;
-
- private String content;
-
- private String charset;
-
- private int httpEquivState;
-
- // CPPONLY: private TreeBuilder treeBuilder;
-
- public MetaScanner(
- // CPPONLY: TreeBuilder tb
- ) {
- this.readable = null;
- this.metaState = NO;
- this.contentIndex = Integer.MAX_VALUE;
- this.charsetIndex = Integer.MAX_VALUE;
- this.httpEquivIndex = Integer.MAX_VALUE;
- this.contentTypeIndex = Integer.MAX_VALUE;
- this.stateSave = DATA;
- this.strBufLen = 0;
- this.strBuf = new char[36];
- this.content = null;
- this.charset = null;
- this.httpEquivState = HTTP_EQUIV_NOT_SEEN;
- // CPPONLY: this.treeBuilder = tb;
- }
-
- @SuppressWarnings("unused") private void destructor() {
- Portability.releaseString(content);
- Portability.releaseString(charset);
- }
-
- // [NOCPP[
-
- /**
- * Reads a byte from the data source.
- *
- * -1 means end.
- * @return
- * @throws IOException
- */
- protected int read() throws IOException {
- return readable.readByte();
- }
-
- // ]NOCPP]
-
- // WARNING When editing this, makes sure the bytecode length shown by javap
- // stays under 8000 bytes!
- /**
- * The runs the meta scanning algorithm.
- */
- protected final void stateLoop(int state)
- throws SAXException, IOException {
- int c = -1;
- boolean reconsume = false;
- stateloop: for (;;) {
- switch (state) {
- case DATA:
- dataloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- c = read();
- }
- switch (c) {
- case -1:
- break stateloop;
- case '<':
- state = MetaScanner.TAG_OPEN;
- break dataloop; // FALL THROUGH continue
- // stateloop;
- default:
- continue;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case TAG_OPEN:
- tagopenloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case 'm':
- case 'M':
- metaState = M;
- state = MetaScanner.TAG_NAME;
- break tagopenloop;
- // continue stateloop;
- case '!':
- state = MetaScanner.MARKUP_DECLARATION_OPEN;
- continue stateloop;
- case '?':
- case '/':
- state = MetaScanner.SCAN_UNTIL_GT;
- continue stateloop;
- case '>':
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
- metaState = NO;
- state = MetaScanner.TAG_NAME;
- break tagopenloop;
- // continue stateloop;
- }
- state = MetaScanner.DATA;
- reconsume = true;
- continue stateloop;
- }
- }
- // FALL THROUGH DON'T REORDER
- case TAG_NAME:
- tagnameloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case ' ':
- case '\t':
- case '\n':
- case '\u000C':
- state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
- break tagnameloop;
- // continue stateloop;
- case '/':
- state = MetaScanner.SELF_CLOSING_START_TAG;
- continue stateloop;
- case '>':
- state = MetaScanner.DATA;
- continue stateloop;
- case 'e':
- case 'E':
- if (metaState == M) {
- metaState = E;
- } else {
- metaState = NO;
- }
- continue;
- case 't':
- case 'T':
- if (metaState == E) {
- metaState = T;
- } else {
- metaState = NO;
- }
- continue;
- case 'a':
- case 'A':
- if (metaState == T) {
- metaState = A;
- } else {
- metaState = NO;
- }
- continue;
- default:
- metaState = NO;
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case BEFORE_ATTRIBUTE_NAME:
- beforeattributenameloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- c = read();
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case -1:
- break stateloop;
- case ' ':
- case '\t':
- case '\n':
- case '\u000C':
- continue;
- case '/':
- state = MetaScanner.SELF_CLOSING_START_TAG;
- continue stateloop;
- case '>':
- if (handleTag()) {
- break stateloop;
- }
- state = DATA;
- continue stateloop;
- case 'c':
- case 'C':
- contentIndex = 0;
- charsetIndex = 0;
- httpEquivIndex = Integer.MAX_VALUE;
- contentTypeIndex = Integer.MAX_VALUE;
- state = MetaScanner.ATTRIBUTE_NAME;
- break beforeattributenameloop;
- case 'h':
- case 'H':
- contentIndex = Integer.MAX_VALUE;
- charsetIndex = Integer.MAX_VALUE;
- httpEquivIndex = 0;
- contentTypeIndex = Integer.MAX_VALUE;
- state = MetaScanner.ATTRIBUTE_NAME;
- break beforeattributenameloop;
- default:
- contentIndex = Integer.MAX_VALUE;
- charsetIndex = Integer.MAX_VALUE;
- httpEquivIndex = Integer.MAX_VALUE;
- contentTypeIndex = Integer.MAX_VALUE;
- state = MetaScanner.ATTRIBUTE_NAME;
- break beforeattributenameloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case ATTRIBUTE_NAME:
- attributenameloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case ' ':
- case '\t':
- case '\n':
- case '\u000C':
- state = MetaScanner.AFTER_ATTRIBUTE_NAME;
- continue stateloop;
- case '/':
- state = MetaScanner.SELF_CLOSING_START_TAG;
- continue stateloop;
- case '=':
- strBufLen = 0;
- contentTypeIndex = 0;
- state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
- break attributenameloop;
- // continue stateloop;
- case '>':
- if (handleTag()) {
- break stateloop;
- }
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- if (metaState == A) {
- if (c >= 'A' && c <= 'Z') {
- c += 0x20;
- }
- if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) {
- ++contentIndex;
- } else {
- contentIndex = Integer.MAX_VALUE;
- }
- if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) {
- ++charsetIndex;
- } else {
- charsetIndex = Integer.MAX_VALUE;
- }
- if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) {
- ++httpEquivIndex;
- } else {
- httpEquivIndex = Integer.MAX_VALUE;
- }
- }
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case BEFORE_ATTRIBUTE_VALUE:
- beforeattributevalueloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case ' ':
- case '\t':
- case '\n':
- case '\u000C':
- continue;
- case '"':
- state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED;
- break beforeattributevalueloop;
- // continue stateloop;
- case '\'':
- state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED;
- continue stateloop;
- case '>':
- if (handleTag()) {
- break stateloop;
- }
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- handleCharInAttributeValue(c);
- state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED;
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
- attributevaluedoublequotedloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- c = read();
- }
- switch (c) {
- case -1:
- break stateloop;
- case '"':
- handleAttributeValue();
- state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
- break attributevaluedoublequotedloop;
- // continue stateloop;
- default:
- handleCharInAttributeValue(c);
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case AFTER_ATTRIBUTE_VALUE_QUOTED:
- afterattributevaluequotedloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case ' ':
- case '\t':
- case '\n':
- case '\u000C':
- state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
- continue stateloop;
- case '/':
- state = MetaScanner.SELF_CLOSING_START_TAG;
- break afterattributevaluequotedloop;
- // continue stateloop;
- case '>':
- if (handleTag()) {
- break stateloop;
- }
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
- reconsume = true;
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case SELF_CLOSING_START_TAG:
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case '>':
- if (handleTag()) {
- break stateloop;
- }
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
- reconsume = true;
- continue stateloop;
- }
- // XXX reorder point
- case ATTRIBUTE_VALUE_UNQUOTED:
- for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- c = read();
- }
- switch (c) {
- case -1:
- break stateloop;
- case ' ':
- case '\t':
- case '\n':
-
- case '\u000C':
- handleAttributeValue();
- state = MetaScanner.BEFORE_ATTRIBUTE_NAME;
- continue stateloop;
- case '>':
- handleAttributeValue();
- if (handleTag()) {
- break stateloop;
- }
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- handleCharInAttributeValue(c);
- continue;
- }
- }
- // XXX reorder point
- case AFTER_ATTRIBUTE_NAME:
- for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case ' ':
- case '\t':
- case '\n':
- case '\u000C':
- continue;
- case '/':
- handleAttributeValue();
- state = MetaScanner.SELF_CLOSING_START_TAG;
- continue stateloop;
- case '=':
- strBufLen = 0;
- contentTypeIndex = 0;
- state = MetaScanner.BEFORE_ATTRIBUTE_VALUE;
- continue stateloop;
- case '>':
- handleAttributeValue();
- if (handleTag()) {
- break stateloop;
- }
- state = MetaScanner.DATA;
- continue stateloop;
- case 'c':
- case 'C':
- contentIndex = 0;
- charsetIndex = 0;
- state = MetaScanner.ATTRIBUTE_NAME;
- continue stateloop;
- default:
- contentIndex = Integer.MAX_VALUE;
- charsetIndex = Integer.MAX_VALUE;
- state = MetaScanner.ATTRIBUTE_NAME;
- continue stateloop;
- }
- }
- // XXX reorder point
- case MARKUP_DECLARATION_OPEN:
- markupdeclarationopenloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case '-':
- state = MetaScanner.MARKUP_DECLARATION_HYPHEN;
- break markupdeclarationopenloop;
- // continue stateloop;
- default:
- state = MetaScanner.SCAN_UNTIL_GT;
- reconsume = true;
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case MARKUP_DECLARATION_HYPHEN:
- markupdeclarationhyphenloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case '-':
- state = MetaScanner.COMMENT_START;
- break markupdeclarationhyphenloop;
- // continue stateloop;
- default:
- state = MetaScanner.SCAN_UNTIL_GT;
- reconsume = true;
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case COMMENT_START:
- commentstartloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case '-':
- state = MetaScanner.COMMENT_START_DASH;
- continue stateloop;
- case '>':
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- state = MetaScanner.COMMENT;
- break commentstartloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case COMMENT:
- commentloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case '-':
- state = MetaScanner.COMMENT_END_DASH;
- break commentloop;
- // continue stateloop;
- default:
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case COMMENT_END_DASH:
- commentenddashloop: for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case '-':
- state = MetaScanner.COMMENT_END;
- break commentenddashloop;
- // continue stateloop;
- default:
- state = MetaScanner.COMMENT;
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case COMMENT_END:
- for (;;) {
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case '>':
- state = MetaScanner.DATA;
- continue stateloop;
- case '-':
- continue;
- default:
- state = MetaScanner.COMMENT;
- continue stateloop;
- }
- }
- // XXX reorder point
- case COMMENT_START_DASH:
- c = read();
- switch (c) {
- case -1:
- break stateloop;
- case '-':
- state = MetaScanner.COMMENT_END;
- continue stateloop;
- case '>':
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- state = MetaScanner.COMMENT;
- continue stateloop;
- }
- // XXX reorder point
- case ATTRIBUTE_VALUE_SINGLE_QUOTED:
- for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- c = read();
- }
- switch (c) {
- case -1:
- break stateloop;
- case '\'':
- handleAttributeValue();
- state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED;
- continue stateloop;
- default:
- handleCharInAttributeValue(c);
- continue;
- }
- }
- // XXX reorder point
- case SCAN_UNTIL_GT:
- for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- c = read();
- }
- switch (c) {
- case -1:
- break stateloop;
- case '>':
- state = MetaScanner.DATA;
- continue stateloop;
- default:
- continue;
- }
- }
- }
- }
- stateSave = state;
- }
-
- private void handleCharInAttributeValue(int c) {
- if (metaState == A) {
- if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) {
- addToBuffer(c);
- } else if (httpEquivIndex == HTTP_EQUIV.length) {
- if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) {
- ++contentTypeIndex;
- } else {
- contentTypeIndex = Integer.MAX_VALUE;
- }
- }
- }
- }
-
- @Inline private int toAsciiLowerCase(int c) {
- if (c >= 'A' && c <= 'Z') {
- return c + 0x20;
- }
- return c;
- }
-
- /**
- * Adds a character to the accumulation buffer.
- * @param c the character to add
- */
- private void addToBuffer(int c) {
- if (strBufLen == strBuf.length) {
- char[] newBuf = new char[strBuf.length + (strBuf.length << 1)];
- System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
- strBuf = newBuf;
- }
- strBuf[strBufLen++] = (char)c;
- }
-
- /**
- * Attempts to extract a charset name from the accumulation buffer.
- * @return <code>true</code> if successful
- * @throws SAXException
- */
- private void handleAttributeValue() throws SAXException {
- if (metaState != A) {
- return;
- }
- if (contentIndex == CONTENT.length && content == null) {
- content = Portability.newStringFromBuffer(strBuf, 0, strBufLen
- // CPPONLY: , treeBuilder
- );
- return;
- }
- if (charsetIndex == CHARSET.length && charset == null) {
- charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen
- // CPPONLY: , treeBuilder
- );
- return;
- }
- if (httpEquivIndex == HTTP_EQUIV.length
- && httpEquivState == HTTP_EQUIV_NOT_SEEN) {
- httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE
- : HTTP_EQUIV_OTHER;
- return;
- }
- }
-
- private boolean handleTag() throws SAXException {
- boolean stop = handleTagInner();
- Portability.releaseString(content);
- content = null;
- Portability.releaseString(charset);
- charset = null;
- httpEquivState = HTTP_EQUIV_NOT_SEEN;
- return stop;
- }
-
- private boolean handleTagInner() throws SAXException {
- if (charset != null && tryCharset(charset)) {
- return true;
- }
- if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) {
- String extract = TreeBuilder.extractCharsetFromContent(content
- // CPPONLY: , treeBuilder
- );
- if (extract == null) {
- return false;
- }
- boolean success = tryCharset(extract);
- Portability.releaseString(extract);
- return success;
- }
- return false;
- }
-
- /**
- * Tries to switch to an encoding.
- *
- * @param encoding
- * @return <code>true</code> if successful
- * @throws SAXException
- */
- protected abstract boolean tryCharset(String encoding) throws SAXException;
-
-}