diff options
Diffstat (limited to 'parser/html/javasrc/Tokenizer.java')
-rw-r--r-- | parser/html/javasrc/Tokenizer.java | 7067 |
1 files changed, 7067 insertions, 0 deletions
diff --git a/parser/html/javasrc/Tokenizer.java b/parser/html/javasrc/Tokenizer.java new file mode 100644 index 000000000..d9eaafeb3 --- /dev/null +++ b/parser/html/javasrc/Tokenizer.java @@ -0,0 +1,7067 @@ +/* + * Copyright (c) 2005-2007 Henri Sivonen + * Copyright (c) 2007-2015 Mozilla Foundation + * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla + * Foundation, and Opera Software ASA. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/* + * The comments following this one that use the same comment syntax as this + * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 + * amended as of June 18 2008 and May 31 2010. + * That document came with this statement: + * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and + * Opera Software ASA. You are granted a license to use, reproduce and + * create derivative works of this document." + */ + +package nu.validator.htmlparser.impl; + +import org.xml.sax.ErrorHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + +import nu.validator.htmlparser.annotation.Auto; +import nu.validator.htmlparser.annotation.CharacterName; +import nu.validator.htmlparser.annotation.Const; +import nu.validator.htmlparser.annotation.Inline; +import nu.validator.htmlparser.annotation.Local; +import nu.validator.htmlparser.annotation.NoLength; +import nu.validator.htmlparser.common.EncodingDeclarationHandler; +import nu.validator.htmlparser.common.Interner; +import nu.validator.htmlparser.common.TokenHandler; +import nu.validator.htmlparser.common.XmlViolationPolicy; + +/** + * An implementation of + * https://html.spec.whatwg.org/multipage/syntax.html#tokenization + * + * This class implements the <code>Locator</code> interface. This is not an + * incidental implementation detail: Users of this class are encouraged to make + * use of the <code>Locator</code> nature. + * + * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer + * can be configured to treat these conditions as fatal or to coerce the infoset + * to something that XML 1.0 allows. + * + * @version $Id$ + * @author hsivonen + */ +public class Tokenizer implements Locator { + + private static final int DATA_AND_RCDATA_MASK = ~1; + + public static final int DATA = 0; + + public static final int RCDATA = 1; + + public static final int SCRIPT_DATA = 2; + + public static final int RAWTEXT = 3; + + public static final int SCRIPT_DATA_ESCAPED = 4; + + public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5; + + public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6; + + public static final int ATTRIBUTE_VALUE_UNQUOTED = 7; + + public static final int PLAINTEXT = 8; + + public static final int TAG_OPEN = 9; + + public static final int CLOSE_TAG_OPEN = 10; + + public static final int TAG_NAME = 11; + + public static final int BEFORE_ATTRIBUTE_NAME = 12; + + public static final int ATTRIBUTE_NAME = 13; + + public static final int AFTER_ATTRIBUTE_NAME = 14; + + public static final int BEFORE_ATTRIBUTE_VALUE = 15; + + public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16; + + public static final int BOGUS_COMMENT = 17; + + public static final int MARKUP_DECLARATION_OPEN = 18; + + public static final int DOCTYPE = 19; + + public static final int BEFORE_DOCTYPE_NAME = 20; + + public static final int DOCTYPE_NAME = 21; + + public static final int AFTER_DOCTYPE_NAME = 22; + + public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23; + + public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24; + + public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25; + + public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26; + + public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27; + + public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28; + + public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29; + + public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30; + + public static final int BOGUS_DOCTYPE = 31; + + public static final int COMMENT_START = 32; + + public static final int COMMENT_START_DASH = 33; + + public static final int COMMENT = 34; + + public static final int COMMENT_END_DASH = 35; + + public static final int COMMENT_END = 36; + + public static final int COMMENT_END_BANG = 37; + + public static final int NON_DATA_END_TAG_NAME = 38; + + public static final int MARKUP_DECLARATION_HYPHEN = 39; + + public static final int MARKUP_DECLARATION_OCTYPE = 40; + + public static final int DOCTYPE_UBLIC = 41; + + public static final int DOCTYPE_YSTEM = 42; + + public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43; + + public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44; + + public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45; + + public static final int CONSUME_CHARACTER_REFERENCE = 46; + + public static final int CONSUME_NCR = 47; + + public static final int CHARACTER_REFERENCE_TAIL = 48; + + public static final int HEX_NCR_LOOP = 49; + + public static final int DECIMAL_NRC_LOOP = 50; + + public static final int HANDLE_NCR_VALUE = 51; + + public static final int HANDLE_NCR_VALUE_RECONSUME = 52; + + public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53; + + public static final int SELF_CLOSING_START_TAG = 54; + + public static final int CDATA_START = 55; + + public static final int CDATA_SECTION = 56; + + public static final int CDATA_RSQB = 57; + + public static final int CDATA_RSQB_RSQB = 58; + + public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59; + + public static final int SCRIPT_DATA_ESCAPE_START = 60; + + public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61; + + public static final int SCRIPT_DATA_ESCAPED_DASH = 62; + + public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63; + + public static final int BOGUS_COMMENT_HYPHEN = 64; + + public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65; + + public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66; + + public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67; + + public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68; + + public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69; + + public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70; + + public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71; + + public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72; + + public static final int PROCESSING_INSTRUCTION = 73; + + public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74; + + /** + * Magic value for UTF-16 operations. + */ + private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10)); + + /** + * UTF-16 code unit array containing less than and greater than for emitting + * those characters on certain parse errors. + */ + private static final @NoLength char[] LT_GT = { '<', '>' }; + + /** + * UTF-16 code unit array containing less than and solidus for emitting + * those characters on certain parse errors. + */ + private static final @NoLength char[] LT_SOLIDUS = { '<', '/' }; + + /** + * UTF-16 code unit array containing ]] for emitting those characters on + * state transitions. + */ + private static final @NoLength char[] RSQB_RSQB = { ']', ']' }; + + /** + * Array version of U+FFFD. + */ + private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; + + // [NOCPP[ + + /** + * Array version of space. + */ + private static final @NoLength char[] SPACE = { ' ' }; + + // ]NOCPP] + + /** + * Array version of line feed. + */ + private static final @NoLength char[] LF = { '\n' }; + + /** + * "CDATA[" as <code>char[]</code> + */ + private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T', + 'A', '[' }; + + /** + * "octype" as <code>char[]</code> + */ + private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p', + 'e' }; + + /** + * "ublic" as <code>char[]</code> + */ + private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' }; + + /** + * "ystem" as <code>char[]</code> + */ + private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' }; + + private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' }; + + private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' }; + + private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' }; + + private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't', + 'e', 'x', 't' }; + + private static final char[] XMP_ARR = { 'x', 'm', 'p' }; + + private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r', + 'e', 'a' }; + + private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' }; + + private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e', + 'd' }; + + private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i', + 'p', 't' }; + + private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm', + 'e', 's' }; + + /** + * The token handler. + */ + protected final TokenHandler tokenHandler; + + protected EncodingDeclarationHandler encodingDeclarationHandler; + + // [NOCPP[ + + /** + * The error handler. + */ + protected ErrorHandler errorHandler; + + // ]NOCPP] + + /** + * Whether the previous char read was CR. + */ + protected boolean lastCR; + + protected int stateSave; + + private int returnStateSave; + + protected int index; + + private boolean forceQuirks; + + private char additional; + + private int entCol; + + private int firstCharKey; + + private int lo; + + private int hi; + + private int candidate; + + private int charRefBufMark; + + protected int value; + + private boolean seenDigits; + + protected int cstart; + + /** + * The SAX public id for the resource being tokenized. (Only passed to back + * as part of locator data.) + */ + private String publicId; + + /** + * The SAX system id for the resource being tokenized. (Only passed to back + * as part of locator data.) + */ + private String systemId; + + /** + * Buffer for bufferable things other than those that fit the description + * of <code>charRefBuf</code>. + */ + private @Auto char[] strBuf; + + /** + * Number of significant <code>char</code>s in <code>strBuf</code>. + */ + private int strBufLen; + + /** + * Buffer for characters that might form a character reference but may + * end up not forming one. + */ + private final @Auto char[] charRefBuf; + + /** + * Number of significant <code>char</code>s in <code>charRefBuf</code>. + */ + private int charRefBufLen; + + /** + * Buffer for expanding NCRs falling into the Basic Multilingual Plane. + */ + private final @Auto char[] bmpChar; + + /** + * Buffer for expanding astral NCRs. + */ + private final @Auto char[] astralChar; + + /** + * The element whose end tag closes the current CDATA or RCDATA element. + */ + protected ElementName endTagExpectation = null; + + private char[] endTagExpectationAsArray; // not @Auto! + + /** + * <code>true</code> if tokenizing an end tag + */ + protected boolean endTag; + + /** + * The current tag token name. + */ + private ElementName tagName = null; + + /** + * The current attribute name. + */ + protected AttributeName attributeName = null; + + // [NOCPP[ + + /** + * Whether comment tokens are emitted. + */ + private boolean wantsComments = false; + + /** + * <code>true</code> when HTML4-specific additional errors are requested. + */ + protected boolean html4; + + /** + * Whether the stream is past the first 1024 bytes. + */ + private boolean metaBoundaryPassed; + + // ]NOCPP] + + /** + * The name of the current doctype token. + */ + private @Local String doctypeName; + + /** + * The public id of the current doctype token. + */ + private String publicIdentifier; + + /** + * The system id of the current doctype token. + */ + private String systemIdentifier; + + /** + * The attribute holder. + */ + private HtmlAttributes attributes; + + // [NOCPP[ + + /** + * The policy for vertical tab and form feed. + */ + private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET; + + /** + * The policy for comments. + */ + private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET; + + private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET; + + private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET; + + private boolean html4ModeCompatibleWithXhtml1Schemata; + + private int mappingLangToXmlLang; + + // ]NOCPP] + + private final boolean newAttributesEachTime; + + private boolean shouldSuspend; + + protected boolean confident; + + private int line; + + /* + * The line number of the current attribute. First set to the line of the + * attribute name and if there is a value, set to the line the value + * started on. + */ + // CPPONLY: private int attributeLine; + + private Interner interner; + + // CPPONLY: private boolean viewingXmlSource; + + // [NOCPP[ + + protected LocatorImpl ampersandLocation; + + public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) { + this.tokenHandler = tokenHandler; + this.encodingDeclarationHandler = null; + this.newAttributesEachTime = newAttributesEachTime; + // ∳ is the longest valid char ref and + // the semicolon never gets appended to the buffer. + this.charRefBuf = new char[32]; + this.bmpChar = new char[1]; + this.astralChar = new char[2]; + this.tagName = null; + this.attributeName = null; + this.doctypeName = null; + this.publicIdentifier = null; + this.systemIdentifier = null; + this.attributes = null; + } + + // ]NOCPP] + + /** + * The constructor. + * + * @param tokenHandler + * the handler for receiving tokens + */ + public Tokenizer(TokenHandler tokenHandler + // CPPONLY: , boolean viewingXmlSource + ) { + this.tokenHandler = tokenHandler; + this.encodingDeclarationHandler = null; + // [NOCPP[ + this.newAttributesEachTime = false; + // ]NOCPP] + // ∳ is the longest valid char ref and + // the semicolon never gets appended to the buffer. + this.charRefBuf = new char[32]; + this.bmpChar = new char[1]; + this.astralChar = new char[2]; + this.tagName = null; + this.attributeName = null; + this.doctypeName = null; + this.publicIdentifier = null; + this.systemIdentifier = null; + // [NOCPP[ + this.attributes = null; + // ]NOCPP] + // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null; + // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder(); + // CPPONLY: this.viewingXmlSource = viewingXmlSource; + } + + public void setInterner(Interner interner) { + this.interner = interner; + } + + public void initLocation(String newPublicId, String newSystemId) { + this.systemId = newSystemId; + this.publicId = newPublicId; + + } + + // CPPONLY: boolean isViewingXmlSource() { + // CPPONLY: return viewingXmlSource; + // CPPONLY: } + + // [NOCPP[ + + /** + * Returns the mappingLangToXmlLang. + * + * @return the mappingLangToXmlLang + */ + public boolean isMappingLangToXmlLang() { + return mappingLangToXmlLang == AttributeName.HTML_LANG; + } + + /** + * Sets the mappingLangToXmlLang. + * + * @param mappingLangToXmlLang + * the mappingLangToXmlLang to set + */ + public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { + this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG + : AttributeName.HTML; + } + + /** + * Sets the error handler. + * + * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) + */ + public void setErrorHandler(ErrorHandler eh) { + this.errorHandler = eh; + } + + public ErrorHandler getErrorHandler() { + return this.errorHandler; + } + + /** + * Sets the commentPolicy. + * + * @param commentPolicy + * the commentPolicy to set + */ + public void setCommentPolicy(XmlViolationPolicy commentPolicy) { + this.commentPolicy = commentPolicy; + } + + /** + * Sets the contentNonXmlCharPolicy. + * + * @param contentNonXmlCharPolicy + * the contentNonXmlCharPolicy to set + */ + public void setContentNonXmlCharPolicy( + XmlViolationPolicy contentNonXmlCharPolicy) { + if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) { + throw new IllegalArgumentException( + "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW."); + } + } + + /** + * Sets the contentSpacePolicy. + * + * @param contentSpacePolicy + * the contentSpacePolicy to set + */ + public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { + this.contentSpacePolicy = contentSpacePolicy; + } + + /** + * Sets the xmlnsPolicy. + * + * @param xmlnsPolicy + * the xmlnsPolicy to set + */ + public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { + if (xmlnsPolicy == XmlViolationPolicy.FATAL) { + throw new IllegalArgumentException("Can't use FATAL here."); + } + this.xmlnsPolicy = xmlnsPolicy; + } + + public void setNamePolicy(XmlViolationPolicy namePolicy) { + this.namePolicy = namePolicy; + } + + /** + * Sets the html4ModeCompatibleWithXhtml1Schemata. + * + * @param html4ModeCompatibleWithXhtml1Schemata + * the html4ModeCompatibleWithXhtml1Schemata to set + */ + public void setHtml4ModeCompatibleWithXhtml1Schemata( + boolean html4ModeCompatibleWithXhtml1Schemata) { + this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata; + } + + // ]NOCPP] + + // For the token handler to call + /** + * Sets the tokenizer state and the associated element name. This should + * only ever used to put the tokenizer into one of the states that have + * a special end tag expectation. + * + * @param specialTokenizerState + * the tokenizer state to set + * @param endTagExpectation + * the expected end tag for transitioning back to normal + */ + public void setStateAndEndTagExpectation(int specialTokenizerState, + @Local String endTagExpectation) { + this.stateSave = specialTokenizerState; + if (specialTokenizerState == Tokenizer.DATA) { + return; + } + @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation); + this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0, + asArray.length, interner); + endTagExpectationToArray(); + } + + /** + * Sets the tokenizer state and the associated element name. This should + * only ever used to put the tokenizer into one of the states that have + * a special end tag expectation. + * + * @param specialTokenizerState + * the tokenizer state to set + * @param endTagExpectation + * the expected end tag for transitioning back to normal + */ + public void setStateAndEndTagExpectation(int specialTokenizerState, + ElementName endTagExpectation) { + this.stateSave = specialTokenizerState; + this.endTagExpectation = endTagExpectation; + endTagExpectationToArray(); + } + + private void endTagExpectationToArray() { + switch (endTagExpectation.getGroup()) { + case TreeBuilder.TITLE: + endTagExpectationAsArray = TITLE_ARR; + return; + case TreeBuilder.SCRIPT: + endTagExpectationAsArray = SCRIPT_ARR; + return; + case TreeBuilder.STYLE: + endTagExpectationAsArray = STYLE_ARR; + return; + case TreeBuilder.PLAINTEXT: + endTagExpectationAsArray = PLAINTEXT_ARR; + return; + case TreeBuilder.XMP: + endTagExpectationAsArray = XMP_ARR; + return; + case TreeBuilder.TEXTAREA: + endTagExpectationAsArray = TEXTAREA_ARR; + return; + case TreeBuilder.IFRAME: + endTagExpectationAsArray = IFRAME_ARR; + return; + case TreeBuilder.NOEMBED: + endTagExpectationAsArray = NOEMBED_ARR; + return; + case TreeBuilder.NOSCRIPT: + endTagExpectationAsArray = NOSCRIPT_ARR; + return; + case TreeBuilder.NOFRAMES: + endTagExpectationAsArray = NOFRAMES_ARR; + return; + default: + assert false: "Bad end tag expectation."; + return; + } + } + + /** + * For C++ use only. + */ + public void setLineNumber(int line) { + // CPPONLY: this.attributeLine = line; // XXX is this needed? + this.line = line; + } + + // start Locator impl + + /** + * @see org.xml.sax.Locator#getLineNumber() + */ + @Inline public int getLineNumber() { + return line; + } + + // [NOCPP[ + + /** + * @see org.xml.sax.Locator#getColumnNumber() + */ + @Inline public int getColumnNumber() { + return -1; + } + + /** + * @see org.xml.sax.Locator#getPublicId() + */ + public String getPublicId() { + return publicId; + } + + /** + * @see org.xml.sax.Locator#getSystemId() + */ + public String getSystemId() { + return systemId; + } + + // end Locator impl + + // end public API + + public void notifyAboutMetaBoundary() { + metaBoundaryPassed = true; + } + + void turnOnAdditionalHtml4Errors() { + html4 = true; + } + + // ]NOCPP] + + HtmlAttributes emptyAttributes() { + // [NOCPP[ + if (newAttributesEachTime) { + return new HtmlAttributes(mappingLangToXmlLang); + } else { + // ]NOCPP] + return HtmlAttributes.EMPTY_ATTRIBUTES; + // [NOCPP[ + } + // ]NOCPP] + } + + @Inline private void appendCharRefBuf(char c) { + // CPPONLY: assert charRefBufLen < charRefBuf.length: + // CPPONLY: "RELEASE: Attempted to overrun charRefBuf!"; + charRefBuf[charRefBufLen++] = c; + } + + private void emitOrAppendCharRefBuf(int returnState) throws SAXException { + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + appendCharRefBufToStrBuf(); + } else { + if (charRefBufLen > 0) { + tokenHandler.characters(charRefBuf, 0, charRefBufLen); + charRefBufLen = 0; + } + } + } + + @Inline private void clearStrBufAfterUse() { + strBufLen = 0; + } + + @Inline private void clearStrBufBeforeUse() { + assert strBufLen == 0: "strBufLen not reset after previous use!"; + strBufLen = 0; // no-op in the absence of bugs + } + + @Inline private void clearStrBufAfterOneHyphen() { + assert strBufLen == 1: "strBufLen length not one!"; + assert strBuf[0] == '-': "strBuf does not start with a hyphen!"; + strBufLen = 0; + } + + /** + * Appends to the buffer. + * + * @param c + * the UTF-16 code unit to append + */ + @Inline private void appendStrBuf(char c) { + // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient."; + // CPPONLY: if (strBufLen == strBuf.length) { + // CPPONLY: if (!EnsureBufferSpace(1)) { + // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure"; + // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not + // CPPONLY: } + strBuf[strBufLen++] = c; + } + + /** + * The buffer as a String. Currently only used for error reporting. + * + * <p> + * C++ memory note: The return value must be released. + * + * @return the buffer as a string + */ + protected String strBufToString() { + String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen + // CPPONLY: , tokenHandler + ); + clearStrBufAfterUse(); + return str; + } + + /** + * Returns the buffer as a local name. The return value is released in + * emitDoctypeToken(). + * + * @return the buffer as local name + */ + private void strBufToDoctypeName() { + doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen, + interner); + clearStrBufAfterUse(); + } + + /** + * Emits the buffer as character tokens. + * + * @throws SAXException + * if the token handler threw + */ + private void emitStrBuf() throws SAXException { + if (strBufLen > 0) { + tokenHandler.characters(strBuf, 0, strBufLen); + clearStrBufAfterUse(); + } + } + + @Inline private void appendSecondHyphenToBogusComment() throws SAXException { + // [NOCPP[ + switch (commentPolicy) { + case ALTER_INFOSET: + appendStrBuf(' '); + // FALLTHROUGH + case ALLOW: + warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); + // ]NOCPP] + appendStrBuf('-'); + // [NOCPP[ + break; + case FATAL: + fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); + break; + } + // ]NOCPP] + } + + // [NOCPP[ + private void maybeAppendSpaceToBogusComment() throws SAXException { + switch (commentPolicy) { + case ALTER_INFOSET: + appendStrBuf(' '); + // FALLTHROUGH + case ALLOW: + warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); + break; + case FATAL: + fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment."); + break; + } + } + + // ]NOCPP] + + @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c) + throws SAXException { + errConsecutiveHyphens(); + // [NOCPP[ + switch (commentPolicy) { + case ALTER_INFOSET: + strBufLen--; + // WARNING!!! This expands the worst case of the buffer length + // given the length of input! + appendStrBuf(' '); + appendStrBuf('-'); + // FALLTHROUGH + case ALLOW: + warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); + // ]NOCPP] + appendStrBuf(c); + // [NOCPP[ + break; + case FATAL: + fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment."); + break; + } + // ]NOCPP] + } + + private void appendStrBuf(@NoLength char[] buffer, int offset, int length) { + int newLen = strBufLen + length; + // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient."; + // CPPONLY: if (strBuf.length < newLen) { + // CPPONLY: if (!EnsureBufferSpace(length)) { + // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure"; + // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not + // CPPONLY: } + System.arraycopy(buffer, offset, strBuf, strBufLen, length); + strBufLen = newLen; + } + + /** + * Append the contents of the char reference buffer to the main one. + */ + @Inline private void appendCharRefBufToStrBuf() { + appendStrBuf(charRefBuf, 0, charRefBufLen); + charRefBufLen = 0; + } + + /** + * Emits the current comment token. + * + * @param pos + * TODO + * + * @throws SAXException + */ + private void emitComment(int provisionalHyphens, int pos) + throws SAXException { + // [NOCPP[ + if (wantsComments) { + // ]NOCPP] + tokenHandler.comment(strBuf, 0, strBufLen + - provisionalHyphens); + // [NOCPP[ + } + // ]NOCPP] + clearStrBufAfterUse(); + cstart = pos + 1; + } + + /** + * Flushes coalesced character tokens. + * + * @param buf + * TODO + * @param pos + * TODO + * + * @throws SAXException + */ + protected void flushChars(@NoLength char[] buf, int pos) + throws SAXException { + if (pos > cstart) { + tokenHandler.characters(buf, cstart, pos - cstart); + } + cstart = Integer.MAX_VALUE; + } + + /** + * Reports an condition that would make the infoset incompatible with XML + * 1.0 as fatal. + * + * @param message + * the message + * @throws SAXException + * @throws SAXParseException + */ + public void fatal(String message) throws SAXException { + SAXParseException spe = new SAXParseException(message, this); + if (errorHandler != null) { + errorHandler.fatalError(spe); + } + throw spe; + } + + /** + * Reports a Parse Error. + * + * @param message + * the message + * @throws SAXException + */ + public void err(String message) throws SAXException { + if (errorHandler == null) { + return; + } + SAXParseException spe = new SAXParseException(message, this); + errorHandler.error(spe); + } + + public void errTreeBuilder(String message) throws SAXException { + ErrorHandler eh = null; + if (tokenHandler instanceof TreeBuilder<?>) { + TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler; + eh = treeBuilder.getErrorHandler(); + } + if (eh == null) { + eh = errorHandler; + } + if (eh == null) { + return; + } + SAXParseException spe = new SAXParseException(message, this); + eh.error(spe); + } + + /** + * Reports a warning + * + * @param message + * the message + * @throws SAXException + */ + public void warn(String message) throws SAXException { + if (errorHandler == null) { + return; + } + SAXParseException spe = new SAXParseException(message, this); + errorHandler.warning(spe); + } + + private void strBufToElementNameString() { + tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen, + interner); + clearStrBufAfterUse(); + } + + private int emitCurrentTagToken(boolean selfClosing, int pos) + throws SAXException { + cstart = pos + 1; + maybeErrSlashInEndTag(selfClosing); + stateSave = Tokenizer.DATA; + HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES + : attributes); + if (endTag) { + /* + * When an end tag token is emitted, the content model flag must be + * switched to the PCDATA state. + */ + maybeErrAttributesOnEndTag(attrs); + // CPPONLY: if (!viewingXmlSource) { + tokenHandler.endTag(tagName); + // CPPONLY: } + // CPPONLY: if (newAttributesEachTime) { + // CPPONLY: Portability.delete(attributes); + // CPPONLY: attributes = null; + // CPPONLY: } + } else { + // CPPONLY: if (viewingXmlSource) { + // CPPONLY: assert newAttributesEachTime; + // CPPONLY: Portability.delete(attributes); + // CPPONLY: attributes = null; + // CPPONLY: } else { + tokenHandler.startTag(tagName, attrs, selfClosing); + // CPPONLY: } + } + tagName.release(); + tagName = null; + if (newAttributesEachTime) { + attributes = null; + } else { + attributes.clear(mappingLangToXmlLang); + } + /* + * The token handler may have called setStateAndEndTagExpectation + * and changed stateSave since the start of this method. + */ + return stateSave; + } + + private void attributeNameComplete() throws SAXException { + attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen + // [NOCPP[ + , namePolicy != XmlViolationPolicy.ALLOW + // ]NOCPP] + , interner); + clearStrBufAfterUse(); + + if (attributes == null) { + attributes = new HtmlAttributes(mappingLangToXmlLang); + } + + /* + * When the user agent leaves the attribute name state (and before + * emitting the tag token, if appropriate), the complete attribute's + * name must be compared to the other attributes on the same token; if + * there is already an attribute on the token with the exact same name, + * then this is a parse error and the new attribute must be dropped, + * along with the value that gets associated with it (if any). + */ + if (attributes.contains(attributeName)) { + errDuplicateAttribute(); + attributeName.release(); + attributeName = null; + } + } + + private void addAttributeWithoutValue() throws SAXException { + noteAttributeWithoutValue(); + + // [NOCPP[ + if (metaBoundaryPassed && AttributeName.CHARSET == attributeName + && ElementName.META == tagName) { + err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); + } + // ]NOCPP] + if (attributeName != null) { + // [NOCPP[ + if (html4) { + if (attributeName.isBoolean()) { + if (html4ModeCompatibleWithXhtml1Schemata) { + attributes.addAttribute(attributeName, + attributeName.getLocal(AttributeName.HTML), + xmlnsPolicy); + } else { + attributes.addAttribute(attributeName, "", xmlnsPolicy); + } + } else { + if (AttributeName.BORDER != attributeName) { + err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)"); + attributes.addAttribute(attributeName, "", xmlnsPolicy); + } + } + } else { + if (AttributeName.SRC == attributeName + || AttributeName.HREF == attributeName) { + warn("Attribute \u201C" + + attributeName.getLocal(AttributeName.HTML) + + "\u201D without an explicit value seen. The attribute may be dropped by IE7."); + } + // ]NOCPP] + attributes.addAttribute(attributeName, + Portability.newEmptyString() + // [NOCPP[ + , xmlnsPolicy + // ]NOCPP] + // CPPONLY: , attributeLine + ); + // [NOCPP[ + } + // ]NOCPP] + attributeName = null; // attributeName has been adopted by the + // |attributes| object + } else { + clearStrBufAfterUse(); + } + } + + private void addAttributeWithValue() throws SAXException { + // [NOCPP[ + if (metaBoundaryPassed && ElementName.META == tagName + && AttributeName.CHARSET == attributeName) { + err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes."); + } + // ]NOCPP] + if (attributeName != null) { + String val = strBufToString(); // Ownership transferred to + // HtmlAttributes + // CPPONLY: if (mViewSource) { + // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val); + // CPPONLY: } + // [NOCPP[ + if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata + && attributeName.isCaseFolded()) { + val = newAsciiLowerCaseStringFromString(val); + } + // ]NOCPP] + attributes.addAttribute(attributeName, val + // [NOCPP[ + , xmlnsPolicy + // ]NOCPP] + // CPPONLY: , attributeLine + ); + attributeName = null; // attributeName has been adopted by the + // |attributes| object + } else { + // We have a duplicate attribute. Explicitly discard its value. + clearStrBufAfterUse(); + } + } + + // [NOCPP[ + + private static String newAsciiLowerCaseStringFromString(String str) { + if (str == null) { + return null; + } + char[] buf = new char[str.length()]; + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if (c >= 'A' && c <= 'Z') { + c += 0x20; + } + buf[i] = c; + } + return new String(buf); + } + + protected void startErrorReporting() throws SAXException { + + } + + // ]NOCPP] + + public void start() throws SAXException { + initializeWithoutStarting(); + tokenHandler.startTokenization(this); + // [NOCPP[ + startErrorReporting(); + // ]NOCPP] + } + + public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException { + int state = stateSave; + int returnState = returnStateSave; + char c = '\u0000'; + shouldSuspend = false; + lastCR = false; + + int start = buffer.getStart(); + int end = buffer.getEnd(); + + // In C++, the caller of tokenizeBuffer needs to do this explicitly. + // [NOCPP[ + ensureBufferSpace(end - start); + // ]NOCPP] + + /** + * The index of the last <code>char</code> read from <code>buf</code>. + */ + int pos = start - 1; + + /** + * The index of the first <code>char</code> in <code>buf</code> that is + * part of a coalesced run of character tokens or + * <code>Integer.MAX_VALUE</code> if there is not a current run being + * coalesced. + */ + switch (state) { + case DATA: + case RCDATA: + case SCRIPT_DATA: + case PLAINTEXT: + case RAWTEXT: + case CDATA_SECTION: + case SCRIPT_DATA_ESCAPED: + case SCRIPT_DATA_ESCAPE_START: + case SCRIPT_DATA_ESCAPE_START_DASH: + case SCRIPT_DATA_ESCAPED_DASH: + case SCRIPT_DATA_ESCAPED_DASH_DASH: + case SCRIPT_DATA_DOUBLE_ESCAPE_START: + case SCRIPT_DATA_DOUBLE_ESCAPED: + case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: + case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: + case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: + case SCRIPT_DATA_DOUBLE_ESCAPE_END: + cstart = start; + break; + default: + cstart = Integer.MAX_VALUE; + break; + } + + /** + * The number of <code>char</code>s in <code>buf</code> that have + * meaning. (The rest of the array is garbage and should not be + * examined.) + */ + // CPPONLY: if (mViewSource) { + // CPPONLY: mViewSource.SetBuffer(buffer); + // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1); + // CPPONLY: } else { + // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd()); + // CPPONLY: } + // [NOCPP[ + pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, + end); + // ]NOCPP] + if (pos == end) { + // exiting due to end of buffer + buffer.setStart(pos); + } else { + buffer.setStart(pos + 1); + } + return lastCR; + } + + // [NOCPP[ + private void ensureBufferSpace(int inputLength) throws SAXException { + // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB. + // Adding to the general worst case instead of only the + // TreeBuilder-exposed worst case to avoid re-introducing a bug when + // unifying the tokenizer and tree builder buffers in the future. + int worstCase = strBufLen + inputLength + charRefBufLen + 2; + tokenHandler.ensureBufferSpace(worstCase); + if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) { + // When altering infoset, if the comment contents are consecutive + // hyphens, each hyphen generates a space, too. These buffer + // contents never get emitted as characters() to the tokenHandler, + // which is why this calculation happens after the call to + // ensureBufferSpace on tokenHandler. + worstCase *= 2; + } + if (strBuf == null) { + // Add an arbitrary small value to avoid immediate reallocation + // once there are a few characters in the buffer. + strBuf = new char[worstCase + 128]; + } else if (worstCase > strBuf.length) { + // HotSpot reportedly allocates memory with 8-byte accuracy, so + // there's no point in trying to do math here to avoid slop. + // Maybe we should add some small constant to worstCase here + // but not doing that without profiling. In C++ with jemalloc, + // the corresponding method should do math to round up here + // to avoid slop. + char[] newBuf = new char[worstCase]; + System.arraycopy(strBuf, 0, newBuf, 0, strBufLen); + strBuf = newBuf; + } + } + // ]NOCPP] + + @SuppressWarnings("unused") private int stateLoop(int state, char c, + int pos, @NoLength char[] buf, boolean reconsume, int returnState, + int endPos) throws SAXException { + /* + * Idioms used in this code: + * + * + * Consuming the next input character + * + * To consume the next input character, the code does this: if (++pos == + * endPos) { break stateloop; } c = checkChar(buf, pos); + * + * + * Staying in a state + * + * When there's a state that the tokenizer may stay in over multiple + * input characters, the state has a wrapper |for(;;)| loop and staying + * in the state continues the loop. + * + * + * Switching to another state + * + * To switch to another state, the code sets the state variable to the + * magic number of the new state. Then it either continues stateloop or + * breaks out of the state's own wrapper loop if the target state is + * right after the current state in source order. (This is a partial + * workaround for Java's lack of goto.) + * + * + * Reconsume support + * + * The spec sometimes says that an input character is reconsumed in + * another state. If a state can ever be entered so that an input + * character can be reconsumed in it, the state's code starts with an + * |if (reconsume)| that sets reconsume to false and skips over the + * normal code for consuming a new character. + * + * To reconsume the current character in another state, the code sets + * |reconsume| to true and then switches to the other state. + * + * + * Emitting character tokens + * + * This method emits character tokens lazily. Whenever a new range of + * character tokens starts, the field cstart must be set to the start + * index of the range. The flushChars() method must be called at the end + * of a range to flush it. + * + * + * U+0000 handling + * + * The various states have to handle the replacement of U+0000 with + * U+FFFD. However, if U+0000 would be reconsumed in another state, the + * replacement doesn't need to happen, because it's handled by the + * reconsuming state. + * + * + * LF handling + * + * Every state needs to increment the line number upon LF unless the LF + * gets reconsumed by another state which increments the line number. + * + * + * CR handling + * + * Every state needs to handle CR unless the CR gets reconsumed and is + * handled by the reconsuming state. The CR needs to be handled as if it + * were and LF, the lastCR field must be set to true and then this + * method must return. The IO driver will then swallow the next + * character if it is an LF to coalesce CRLF. + */ + stateloop: for (;;) { + switch (state) { + case DATA: + dataloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + switch (c) { + case '&': + /* + * U+0026 AMPERSAND (&) Switch to the character + * reference in data state. + */ + flushChars(buf, pos); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); + setAdditionalAndRememberAmpersandLocation('\u0000'); + returnState = state; + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); + continue stateloop; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the tag + * open state. + */ + flushChars(buf, pos); + + state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); + break dataloop; // FALL THROUGH continue + // stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + continue; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the input character as a + * character token. + * + * Stay in the data state. + */ + continue; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case TAG_OPEN: + tagopenloop: for (;;) { + /* + * The behavior of this state depends on the content + * model flag. + */ + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * If the content model flag is set to the PCDATA state + * Consume the next input character: + */ + if (c >= 'A' && c <= 'Z') { + /* + * U+0041 LATIN CAPITAL LETTER A through to U+005A + * LATIN CAPITAL LETTER Z Create a new start tag + * token, + */ + endTag = false; + /* + * set its tag name to the lowercase version of the + * input character (add 0x0020 to the character's + * code point), + */ + clearStrBufBeforeUse(); + appendStrBuf((char) (c + 0x20)); + /* then switch to the tag name state. */ + state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); + /* + * (Don't emit the token yet; further details will + * be filled in before it is emitted.) + */ + break tagopenloop; + // continue stateloop; + } else if (c >= 'a' && c <= 'z') { + /* + * U+0061 LATIN SMALL LETTER A through to U+007A + * LATIN SMALL LETTER Z Create a new start tag + * token, + */ + endTag = false; + /* + * set its tag name to the input character, + */ + clearStrBufBeforeUse(); + appendStrBuf(c); + /* then switch to the tag name state. */ + state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); + /* + * (Don't emit the token yet; further details will + * be filled in before it is emitted.) + */ + break tagopenloop; + // continue stateloop; + } + switch (c) { + case '!': + /* + * U+0021 EXCLAMATION MARK (!) Switch to the + * markup declaration open state. + */ + state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); + continue stateloop; + case '/': + /* + * U+002F SOLIDUS (/) Switch to the close tag + * open state. + */ + state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); + continue stateloop; + case '?': + // CPPONLY: if (viewingXmlSource) { + // CPPONLY: state = transition(state, + // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION, + // CPPONLY: reconsume, + // CPPONLY: pos); + // CPPONLY: continue stateloop; + // CPPONLY: } + /* + * U+003F QUESTION MARK (?) Parse error. + */ + errProcessingInstruction(); + /* + * Switch to the bogus comment state. + */ + clearStrBufBeforeUse(); + appendStrBuf(c); + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Parse error. + */ + errLtGt(); + /* + * Emit a U+003C LESS-THAN SIGN character token + * and a U+003E GREATER-THAN SIGN character + * token. + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 2); + /* Switch to the data state. */ + cstart = pos + 1; + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + default: + /* + * Anything else Parse error. + */ + errBadCharAfterLt(c); + /* + * Emit a U+003C LESS-THAN SIGN character token + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + /* + * and reconsume the current input character in + * the data state. + */ + cstart = pos; + reconsume = true; + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + } + } + // FALL THROUGH DON'T REORDER + case TAG_NAME: + tagnameloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + strBufToElementNameString(); + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the before attribute name state. + */ + strBufToElementNameString(); + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + break tagnameloop; + // continue stateloop; + case '/': + /* + * U+002F SOLIDUS (/) Switch to the self-closing + * start tag state. + */ + strBufToElementNameString(); + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * tag token. + */ + strBufToElementNameString(); + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + /* + * Switch to the data state. + */ + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + if (c >= 'A' && c <= 'Z') { + /* + * U+0041 LATIN CAPITAL LETTER A through to + * U+005A LATIN CAPITAL LETTER Z Append the + * lowercase version of the current input + * character (add 0x0020 to the character's + * code point) to the current tag token's + * tag name. + */ + c += 0x20; + } + /* + * Anything else Append the current input + * character to the current tag token's tag + * name. + */ + appendStrBuf(c); + /* + * Stay in the tag name state. + */ + continue; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_ATTRIBUTE_NAME: + beforeattributenameloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the before attribute name state. + */ + continue; + case '/': + /* + * U+002F SOLIDUS (/) Switch to the self-closing + * start tag state. + */ + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * tag token. + */ + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + /* + * Switch to the data state. + */ + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + case '\"': + case '\'': + case '<': + case '=': + /* + * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE + * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS + * SIGN (=) Parse error. + */ + errBadCharBeforeAttributeNameOrNull(c); + /* + * Treat it as per the "anything else" entry + * below. + */ + default: + /* + * Anything else Start a new attribute in the + * current tag token. + */ + if (c >= 'A' && c <= 'Z') { + /* + * U+0041 LATIN CAPITAL LETTER A through to + * U+005A LATIN CAPITAL LETTER Z Set that + * attribute's name to the lowercase version + * of the current input character (add + * 0x0020 to the character's code point) + */ + c += 0x20; + } + // CPPONLY: attributeLine = line; + /* + * Set that attribute's name to the current + * input character, + */ + clearStrBufBeforeUse(); + appendStrBuf(c); + /* + * and its value to the empty string. + */ + // Will do later. + /* + * Switch to the attribute name state. + */ + state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); + break beforeattributenameloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case ATTRIBUTE_NAME: + attributenameloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + attributeNameComplete(); + state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the after attribute name state. + */ + attributeNameComplete(); + state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); + continue stateloop; + case '/': + /* + * U+002F SOLIDUS (/) Switch to the self-closing + * start tag state. + */ + attributeNameComplete(); + addAttributeWithoutValue(); + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); + continue stateloop; + case '=': + /* + * U+003D EQUALS SIGN (=) Switch to the before + * attribute value state. + */ + attributeNameComplete(); + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); + break attributenameloop; + // continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * tag token. + */ + attributeNameComplete(); + addAttributeWithoutValue(); + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + /* + * Switch to the data state. + */ + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + case '\"': + case '\'': + case '<': + /* + * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE + * (') U+003C LESS-THAN SIGN (<) Parse error. + */ + errQuoteOrLtInAttributeNameOrNull(c); + /* + * Treat it as per the "anything else" entry + * below. + */ + default: + if (c >= 'A' && c <= 'Z') { + /* + * U+0041 LATIN CAPITAL LETTER A through to + * U+005A LATIN CAPITAL LETTER Z Append the + * lowercase version of the current input + * character (add 0x0020 to the character's + * code point) to the current attribute's + * name. + */ + c += 0x20; + } + /* + * Anything else Append the current input + * character to the current attribute's name. + */ + appendStrBuf(c); + /* + * Stay in the attribute name state. + */ + continue; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_ATTRIBUTE_VALUE: + beforeattributevalueloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the before attribute value state. + */ + continue; + case '"': + /* + * U+0022 QUOTATION MARK (") Switch to the + * attribute value (double-quoted) state. + */ + // CPPONLY: attributeLine = line; + clearStrBufBeforeUse(); + state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); + break beforeattributevalueloop; + // continue stateloop; + case '&': + /* + * U+0026 AMPERSAND (&) Switch to the attribute + * value (unquoted) state and reconsume this + * input character. + */ + // CPPONLY: attributeLine = line; + clearStrBufBeforeUse(); + reconsume = true; + state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); + noteUnquotedAttributeValue(); + continue stateloop; + case '\'': + /* + * U+0027 APOSTROPHE (') Switch to the attribute + * value (single-quoted) state. + */ + // CPPONLY: attributeLine = line; + clearStrBufBeforeUse(); + state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Parse error. + */ + errAttributeValueMissing(); + /* + * Emit the current tag token. + */ + addAttributeWithoutValue(); + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + /* + * Switch to the data state. + */ + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + case '<': + case '=': + case '`': + /* + * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN + * (=) U+0060 GRAVE ACCENT (`) + */ + errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); + /* + * Treat it as per the "anything else" entry + * below. + */ + default: + // [NOCPP[ + errHtml4NonNameInUnquotedAttribute(c); + // ]NOCPP] + /* + * Anything else Append the current input + * character to the current attribute's value. + */ + // CPPONLY: attributeLine = line; + clearStrBufBeforeUse(); + appendStrBuf(c); + /* + * Switch to the attribute value (unquoted) + * state. + */ + + state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); + noteUnquotedAttributeValue(); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case ATTRIBUTE_VALUE_DOUBLE_QUOTED: + attributevaluedoublequotedloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '"': + /* + * U+0022 QUOTATION MARK (") Switch to the after + * attribute value (quoted) state. + */ + addAttributeWithValue(); + + state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); + break attributevaluedoublequotedloop; + // continue stateloop; + case '&': + /* + * U+0026 AMPERSAND (&) Switch to the character + * reference in attribute value state, with the + * additional allowed character being U+0022 + * QUOTATION MARK ("). + */ + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); + setAdditionalAndRememberAmpersandLocation('\"'); + returnState = state; + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append the current input + * character to the current attribute's value. + */ + appendStrBuf(c); + /* + * Stay in the attribute value (double-quoted) + * state. + */ + continue; + } + } + // FALLTHRU DON'T REORDER + case AFTER_ATTRIBUTE_VALUE_QUOTED: + afterattributevaluequotedloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the before attribute name state. + */ + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + continue stateloop; + case '/': + /* + * U+002F SOLIDUS (/) Switch to the self-closing + * start tag state. + */ + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); + break afterattributevaluequotedloop; + // continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * tag token. + */ + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + /* + * Switch to the data state. + */ + continue stateloop; + default: + /* + * Anything else Parse error. + */ + errNoSpaceBetweenAttributes(); + /* + * Reconsume the character in the before + * attribute name state. + */ + reconsume = true; + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case SELF_CLOSING_START_TAG: + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Set the self-closing + * flag of the current tag token. Emit the current + * tag token. + */ + // [NOCPP[ + errHtml4XmlVoidSyntax(); + // ]NOCPP] + state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + /* + * Switch to the data state. + */ + continue stateloop; + default: + /* Anything else Parse error. */ + errSlashNotFollowedByGt(); + /* + * Reconsume the character in the before attribute + * name state. + */ + reconsume = true; + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + continue stateloop; + } + // XXX reorder point + case ATTRIBUTE_VALUE_UNQUOTED: + for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + addAttributeWithValue(); + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the before attribute name state. + */ + addAttributeWithValue(); + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + continue stateloop; + case '&': + /* + * U+0026 AMPERSAND (&) Switch to the character + * reference in attribute value state, with the + * additional allowed character being U+003E + * GREATER-THAN SIGN (>) + */ + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); + setAdditionalAndRememberAmpersandLocation('>'); + returnState = state; + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * tag token. + */ + addAttributeWithValue(); + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + /* + * Switch to the data state. + */ + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + case '<': + case '\"': + case '\'': + case '=': + case '`': + /* + * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE + * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS + * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. + */ + errUnquotedAttributeValOrNull(c); + /* + * Treat it as per the "anything else" entry + * below. + */ + // fall through + default: + // [NOCPP] + errHtml4NonNameInUnquotedAttribute(c); + // ]NOCPP] + /* + * Anything else Append the current input + * character to the current attribute's value. + */ + appendStrBuf(c); + /* + * Stay in the attribute value (unquoted) state. + */ + continue; + } + } + // XXX reorder point + case AFTER_ATTRIBUTE_NAME: + for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the after attribute name state. + */ + continue; + case '/': + /* + * U+002F SOLIDUS (/) Switch to the self-closing + * start tag state. + */ + addAttributeWithoutValue(); + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); + continue stateloop; + case '=': + /* + * U+003D EQUALS SIGN (=) Switch to the before + * attribute value state. + */ + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * tag token. + */ + addAttributeWithoutValue(); + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + /* + * Switch to the data state. + */ + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + case '\"': + case '\'': + case '<': + errQuoteOrLtInAttributeNameOrNull(c); + /* + * Treat it as per the "anything else" entry + * below. + */ + default: + addAttributeWithoutValue(); + /* + * Anything else Start a new attribute in the + * current tag token. + */ + if (c >= 'A' && c <= 'Z') { + /* + * U+0041 LATIN CAPITAL LETTER A through to + * U+005A LATIN CAPITAL LETTER Z Set that + * attribute's name to the lowercase version + * of the current input character (add + * 0x0020 to the character's code point) + */ + c += 0x20; + } + /* + * Set that attribute's name to the current + * input character, + */ + clearStrBufBeforeUse(); + appendStrBuf(c); + /* + * and its value to the empty string. + */ + // Will do later. + /* + * Switch to the attribute name state. + */ + state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); + continue stateloop; + } + } + // XXX reorder point + case MARKUP_DECLARATION_OPEN: + markupdeclarationopenloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * If the next two characters are both U+002D + * HYPHEN-MINUS characters (-), consume those two + * characters, create a comment token whose data is the + * empty string, and switch to the comment start state. + * + * Otherwise, if the next seven characters are an ASCII + * case-insensitive match for the word "DOCTYPE", then + * consume those characters and switch to the DOCTYPE + * state. + * + * Otherwise, if the insertion mode is + * "in foreign content" and the current node is not an + * element in the HTML namespace and the next seven + * characters are an case-sensitive match for the string + * "[CDATA[" (the five uppercase letters "CDATA" with a + * U+005B LEFT SQUARE BRACKET character before and + * after), then consume those characters and switch to + * the CDATA section state. + * + * Otherwise, is is a parse error. Switch to the bogus + * comment state. The next character that is consumed, + * if any, is the first character that will be in the + * comment. + */ + switch (c) { + case '-': + clearStrBufBeforeUse(); + appendStrBuf(c); + state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); + break markupdeclarationopenloop; + // continue stateloop; + case 'd': + case 'D': + clearStrBufBeforeUse(); + appendStrBuf(c); + index = 0; + state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); + continue stateloop; + case '[': + if (tokenHandler.cdataSectionAllowed()) { + clearStrBufBeforeUse(); + appendStrBuf(c); + index = 0; + state = transition(state, Tokenizer.CDATA_START, reconsume, pos); + continue stateloop; + } + // else fall through + default: + errBogusComment(); + clearStrBufBeforeUse(); + reconsume = true; + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case MARKUP_DECLARATION_HYPHEN: + markupdeclarationhyphenloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case '\u0000': + break stateloop; + case '-': + clearStrBufAfterOneHyphen(); + state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); + break markupdeclarationhyphenloop; + // continue stateloop; + default: + errBogusComment(); + reconsume = true; + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_START: + commentstartloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Comment start state + * + * + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Switch to the comment + * start dash state. + */ + appendStrBuf(c); + state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Parse error. + */ + errPrematureEndOfComment(); + /* Emit the comment token. */ + emitComment(0, pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + break stateloop; + case '\n': + appendStrBufLineFeed(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + break commentstartloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append the input character to + * the comment token's data. + */ + appendStrBuf(c); + /* + * Switch to the comment state. + */ + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + break commentstartloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT: + commentloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Comment state Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Switch to the comment + * end dash state + */ + appendStrBuf(c); + state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); + break commentloop; + // continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append the input character to + * the comment token's data. + */ + appendStrBuf(c); + /* + * Stay in the comment state. + */ + continue; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_END_DASH: + commentenddashloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Comment end dash state Consume the next input + * character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Switch to the comment + * end state + */ + appendStrBuf(c); + state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); + break commentenddashloop; + // continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + break stateloop; + case '\n': + appendStrBufLineFeed(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append a U+002D HYPHEN-MINUS + * (-) character and the input character to the + * comment token's data. + */ + appendStrBuf(c); + /* + * Switch to the comment state. + */ + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case COMMENT_END: + commentendloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Comment end dash state Consume the next input + * character: + */ + switch (c) { + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the comment + * token. + */ + emitComment(2, pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '-': + /* U+002D HYPHEN-MINUS (-) Parse error. */ + /* + * Append a U+002D HYPHEN-MINUS (-) character to + * the comment token's data. + */ + adjustDoubleHyphenAndAppendToStrBufAndErr(c); + /* + * Stay in the comment end state. + */ + continue; + case '\r': + adjustDoubleHyphenAndAppendToStrBufCarriageReturn(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + break stateloop; + case '\n': + adjustDoubleHyphenAndAppendToStrBufLineFeed(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; + case '!': + errHyphenHyphenBang(); + appendStrBuf(c); + state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Append two U+002D HYPHEN-MINUS (-) characters + * and the input character to the comment + * token's data. + */ + adjustDoubleHyphenAndAppendToStrBufAndErr(c); + /* + * Switch to the comment state. + */ + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; + } + } + // XXX reorder point + case COMMENT_END_BANG: + for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Comment end bang state + * + * Consume the next input character: + */ + switch (c) { + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the comment + * token. + */ + emitComment(3, pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '-': + /* + * Append two U+002D HYPHEN-MINUS (-) characters + * and a U+0021 EXCLAMATION MARK (!) character + * to the comment token's data. + */ + appendStrBuf(c); + /* + * Switch to the comment end dash state. + */ + state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append two U+002D HYPHEN-MINUS + * (-) characters, a U+0021 EXCLAMATION MARK (!) + * character, and the input character to the + * comment token's data. Switch to the comment + * state. + */ + appendStrBuf(c); + /* + * Switch to the comment state. + */ + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; + } + } + // XXX reorder point + case COMMENT_START_DASH: + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Comment start dash state + * + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Switch to the comment end + * state + */ + appendStrBuf(c); + state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); + continue stateloop; + case '>': + errPrematureEndOfComment(); + /* Emit the comment token. */ + emitComment(1, pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + break stateloop; + case '\n': + appendStrBufLineFeed(); + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Append a U+002D HYPHEN-MINUS character (-) and + * the current input character to the comment + * token's data. + */ + appendStrBuf(c); + /* + * Switch to the comment state. + */ + state = transition(state, Tokenizer.COMMENT, reconsume, pos); + continue stateloop; + } + // XXX reorder point + case CDATA_START: + for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + if (index < 6) { // CDATA_LSQB.length + if (c == Tokenizer.CDATA_LSQB[index]) { + appendStrBuf(c); + } else { + errBogusComment(); + reconsume = true; + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + } + index++; + continue; + } else { + clearStrBufAfterUse(); + cstart = pos; // start coalescing + reconsume = true; + state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); + break; // FALL THROUGH continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case CDATA_SECTION: + cdatasectionloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + switch (c) { + case ']': + flushChars(buf, pos); + state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); + break cdatasectionloop; // FALL THROUGH + case '\u0000': + emitReplacementCharacter(buf, pos); + continue; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + default: + continue; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case CDATA_RSQB: + cdatarsqb: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case ']': + state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); + break cdatarsqb; + default: + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, + 1); + cstart = pos; + reconsume = true; + state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case CDATA_RSQB_RSQB: + cdatarsqbrsqb: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case ']': + // Saw a third ]. Emit one ] (logically the + // first one) and stay in this state to + // remember that the last two characters seen + // have been ]]. + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); + continue; + case '>': + cstart = pos + 1; + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + default: + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); + cstart = pos; + reconsume = true; + state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); + continue stateloop; + } + } + // XXX reorder point + case ATTRIBUTE_VALUE_SINGLE_QUOTED: + attributevaluesinglequotedloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '\'': + /* + * U+0027 APOSTROPHE (') Switch to the after + * attribute value (quoted) state. + */ + addAttributeWithValue(); + + state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); + continue stateloop; + case '&': + /* + * U+0026 AMPERSAND (&) Switch to the character + * reference in attribute value state, with the + * + additional allowed character being U+0027 + * APOSTROPHE ('). + */ + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); + setAdditionalAndRememberAmpersandLocation('\''); + returnState = state; + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); + break attributevaluesinglequotedloop; + // continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append the current input + * character to the current attribute's value. + */ + appendStrBuf(c); + /* + * Stay in the attribute value (double-quoted) + * state. + */ + continue; + } + } + // FALLTHRU DON'T REORDER + case CONSUME_CHARACTER_REFERENCE: + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + if (c == '\u0000') { + break stateloop; + } + /* + * Unlike the definition is the spec, this state does not + * return a value and never requires the caller to + * backtrack. This state takes care of emitting characters + * or appending to the current attribute value. It also + * takes care of that in the case when consuming the + * character reference fails. + */ + /* + * This section defines how to consume a character + * reference. This definition is used when parsing character + * references in text and in attributes. + * + * The behavior depends on the identity of the next + * character (the one immediately after the U+0026 AMPERSAND + * character): + */ + switch (c) { + case ' ': + case '\t': + case '\n': + case '\r': // we'll reconsume! + case '\u000C': + case '<': + case '&': + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + case '#': + /* + * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER + * SIGN. + */ + appendCharRefBuf('#'); + state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); + continue stateloop; + default: + if (c == additional) { + emitOrAppendCharRefBuf(returnState); + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + if (c >= 'a' && c <= 'z') { + firstCharKey = c - 'a' + 26; + } else if (c >= 'A' && c <= 'Z') { + firstCharKey = c - 'A'; + } else { + // No match + /* + * If no match can be made, then this is a parse + * error. + */ + errNoNamedCharacterMatch(); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + // Didn't fail yet + appendCharRefBuf(c); + state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); + // FALL THROUGH continue stateloop; + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case CHARACTER_REFERENCE_HILO_LOOKUP: + { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + if (c == '\u0000') { + break stateloop; + } + /* + * The data structure is as follows: + * + * HILO_ACCEL is a two-dimensional int array whose major + * index corresponds to the second character of the + * character reference (code point as index) and the + * minor index corresponds to the first character of the + * character reference (packed so that A-Z runs from 0 + * to 25 and a-z runs from 26 to 51). This layout makes + * it easier to use the sparseness of the data structure + * to omit parts of it: The second dimension of the + * table is null when no character reference starts with + * the character corresponding to that row. + * + * The int value HILO_ACCEL (by these indeces) is zero + * if there exists no character reference starting with + * that two-letter prefix. Otherwise, the value is an + * int that packs two shorts so that the higher short is + * the index of the highest character reference name + * with that prefix in NAMES and the lower short + * corresponds to the index of the lowest character + * reference name with that prefix. (It happens that the + * first two character reference names share their + * prefix so the packed int cannot be 0 by packing the + * two shorts.) + * + * NAMES is an array of byte arrays where each byte + * array encodes the name of a character references as + * ASCII. The names omit the first two letters of the + * name. (Since storing the first two letters would be + * redundant with the data contained in HILO_ACCEL.) The + * entries are lexically sorted. + * + * For a given index in NAMES, the same index in VALUES + * contains the corresponding expansion as an array of + * two UTF-16 code units (either the character and + * U+0000 or a suggogate pair). + */ + int hilo = 0; + if (c <= 'z') { + @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; + if (row != null) { + hilo = row[firstCharKey]; + } + } + if (hilo == 0) { + /* + * If no match can be made, then this is a parse + * error. + */ + errNoNamedCharacterMatch(); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + // Didn't fail yet + appendCharRefBuf(c); + lo = hilo & 0xFFFF; + hi = hilo >> 16; + entCol = -1; + candidate = -1; + charRefBufMark = 0; + state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); + // FALL THROUGH continue stateloop; + } + case CHARACTER_REFERENCE_TAIL: + outer: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + if (c == '\u0000') { + break stateloop; + } + entCol++; + /* + * Consume the maximum number of characters possible, + * with the consumed characters matching one of the + * identifiers in the first column of the named + * character references table (in a case-sensitive + * manner). + */ + loloop: for (;;) { + if (hi < lo) { + break outer; + } + if (entCol == NamedCharacters.NAMES[lo].length()) { + candidate = lo; + charRefBufMark = charRefBufLen; + lo++; + } else if (entCol > NamedCharacters.NAMES[lo].length()) { + break outer; + } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { + lo++; + } else { + break loloop; + } + } + + hiloop: for (;;) { + if (hi < lo) { + break outer; + } + if (entCol == NamedCharacters.NAMES[hi].length()) { + break hiloop; + } + if (entCol > NamedCharacters.NAMES[hi].length()) { + break outer; + } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { + hi--; + } else { + break hiloop; + } + } + + if (c == ';') { + // If we see a semicolon, there cannot be a + // longer match. Break the loop. However, before + // breaking, take the longest match so far as the + // candidate, if we are just about to complete a + // match. + if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { + candidate = lo; + charRefBufMark = charRefBufLen; + } + break outer; + } + + if (hi < lo) { + break outer; + } + appendCharRefBuf(c); + continue; + } + + if (candidate == -1) { + // reconsume deals with CR, LF or nul + /* + * If no match can be made, then this is a parse error. + */ + errNoNamedCharacterMatch(); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } else { + // c can't be CR, LF or nul if we got here + @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; + if (candidateName.length() == 0 + || candidateName.charAt(candidateName.length() - 1) != ';') { + /* + * If the last character matched is not a U+003B + * SEMICOLON (;), there is a parse error. + */ + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + /* + * If the entity is being consumed as part of an + * attribute, and the last character matched is + * not a U+003B SEMICOLON (;), + */ + char ch; + if (charRefBufMark == charRefBufLen) { + ch = c; + } else { + ch = charRefBuf[charRefBufMark]; + } + if (ch == '=' || (ch >= '0' && ch <= '9') + || (ch >= 'A' && ch <= 'Z') + || (ch >= 'a' && ch <= 'z')) { + /* + * and the next character is either a U+003D + * EQUALS SIGN character (=) or in the range + * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, + * U+0041 LATIN CAPITAL LETTER A to U+005A + * LATIN CAPITAL LETTER Z, or U+0061 LATIN + * SMALL LETTER A to U+007A LATIN SMALL + * LETTER Z, then, for historical reasons, + * all the characters that were matched + * after the U+0026 AMPERSAND (&) must be + * unconsumed, and nothing is returned. + */ + errNoNamedCharacterMatch(); + appendCharRefBufToStrBuf(); + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + } + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + errUnescapedAmpersandInterpretedAsCharacterReference(); + } else { + errNotSemicolonTerminated(); + } + } + + /* + * Otherwise, return a character token for the character + * corresponding to the entity name (as given by the + * second column of the named character references + * table). + */ + // CPPONLY: completedNamedCharacterReference(); + @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; + if ( + // [NOCPP[ + val.length == 1 + // ]NOCPP] + // CPPONLY: val[1] == 0 + ) { + emitOrAppendOne(val, returnState); + } else { + emitOrAppendTwo(val, returnState); + } + // this is so complicated! + if (charRefBufMark < charRefBufLen) { + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + appendStrBuf(charRefBuf, charRefBufMark, + charRefBufLen - charRefBufMark); + } else { + tokenHandler.characters(charRefBuf, charRefBufMark, + charRefBufLen - charRefBufMark); + } + } + // charRefBufLen will be zeroed below! + + // Check if we broke out early with c being the last + // character that matched as opposed to being the + // first one that didn't match. In the case of an + // early break, the next run on text should start + // *after* the current character and the current + // character shouldn't be reconsumed. + boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen); + charRefBufLen = 0; + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = earlyBreak ? pos + 1 : pos; + } + reconsume = !earlyBreak; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + /* + * If the markup contains I'm ¬it; I tell you, the + * entity is parsed as "not", as in, I'm ¬it; I tell + * you. But if the markup was I'm ∉ I tell you, + * the entity would be parsed as "notin;", resulting in + * I'm ∉ I tell you. + */ + } + // XXX reorder point + case CONSUME_NCR: + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + value = 0; + seenDigits = false; + /* + * The behavior further depends on the character after the + * U+0023 NUMBER SIGN: + */ + switch (c) { + case 'x': + case 'X': + + /* + * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL + * LETTER X Consume the X. + * + * Follow the steps below, but using the range of + * characters U+0030 DIGIT ZERO through to U+0039 + * DIGIT NINE, U+0061 LATIN SMALL LETTER A through + * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN + * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL + * LETTER F (in other words, 0-9, A-F, a-f). + * + * When it comes to interpreting the number, + * interpret it as a hexadecimal number. + */ + appendCharRefBuf(c); + state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); + continue stateloop; + default: + /* + * Anything else Follow the steps below, but using + * the range of characters U+0030 DIGIT ZERO through + * to U+0039 DIGIT NINE (i.e. just 0-9). + * + * When it comes to interpreting the number, + * interpret it as a decimal number. + */ + reconsume = true; + state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); + // FALL THROUGH continue stateloop; + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case DECIMAL_NRC_LOOP: + decimalloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume as many characters as match the range of + * characters given above. + */ + assert value >= 0: "value must not become negative."; + if (c >= '0' && c <= '9') { + seenDigits = true; + // Avoid overflow + if (value <= 0x10FFFF) { + value *= 10; + value += c - '0'; + } + continue; + } else if (c == ';') { + if (seenDigits) { + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos + 1; + } + state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); + // FALL THROUGH continue stateloop; + break decimalloop; + } else { + errNoDigitsInNCR(); + appendCharRefBuf(';'); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos + 1; + } + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + } else { + /* + * If no characters match the range, then don't + * consume any characters (and unconsume the U+0023 + * NUMBER SIGN character and, if appropriate, the X + * character). This is a parse error; nothing is + * returned. + * + * Otherwise, if the next character is a U+003B + * SEMICOLON, consume that too. If it isn't, there + * is a parse error. + */ + if (!seenDigits) { + errNoDigitsInNCR(); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } else { + errCharRefLacksSemicolon(); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + reconsume = true; + state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); + // FALL THROUGH continue stateloop; + break decimalloop; + } + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case HANDLE_NCR_VALUE: + // WARNING previous state sets reconsume + // We are not going to emit the contents of charRefBuf. + charRefBufLen = 0; + // XXX inline this case if the method size can take it + handleNcrValue(returnState); + state = transition(state, returnState, reconsume, pos); + continue stateloop; + // XXX reorder point + case HEX_NCR_LOOP: + for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume as many characters as match the range of + * characters given above. + */ + assert value >= 0: "value must not become negative."; + if (c >= '0' && c <= '9') { + seenDigits = true; + // Avoid overflow + if (value <= 0x10FFFF) { + value *= 16; + value += c - '0'; + } + continue; + } else if (c >= 'A' && c <= 'F') { + seenDigits = true; + // Avoid overflow + if (value <= 0x10FFFF) { + value *= 16; + value += c - 'A' + 10; + } + continue; + } else if (c >= 'a' && c <= 'f') { + seenDigits = true; + // Avoid overflow + if (value <= 0x10FFFF) { + value *= 16; + value += c - 'a' + 10; + } + continue; + } else if (c == ';') { + if (seenDigits) { + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos + 1; + } + state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); + continue stateloop; + } else { + errNoDigitsInNCR(); + appendCharRefBuf(';'); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos + 1; + } + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + } else { + /* + * If no characters match the range, then don't + * consume any characters (and unconsume the U+0023 + * NUMBER SIGN character and, if appropriate, the X + * character). This is a parse error; nothing is + * returned. + * + * Otherwise, if the next character is a U+003B + * SEMICOLON, consume that too. If it isn't, there + * is a parse error. + */ + if (!seenDigits) { + errNoDigitsInNCR(); + emitOrAppendCharRefBuf(returnState); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } else { + errCharRefLacksSemicolon(); + if ((returnState & DATA_AND_RCDATA_MASK) == 0) { + cstart = pos; + } + reconsume = true; + state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); + continue stateloop; + } + } + } + // XXX reorder point + case PLAINTEXT: + plaintextloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + switch (c) { + case '\u0000': + emitPlaintextReplacementCharacter(buf, pos); + continue; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the current input + * character as a character token. Stay in the + * RAWTEXT state. + */ + continue; + } + } + // XXX reorder point + case CLOSE_TAG_OPEN: + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Otherwise, if the content model flag is set to the PCDATA + * state, or if the next few characters do match that tag + * name, consume the next input character: + */ + switch (c) { + case '>': + /* U+003E GREATER-THAN SIGN (>) Parse error. */ + errLtSlashGt(); + /* + * Switch to the data state. + */ + cstart = pos + 1; + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\r': + silentCarriageReturn(); + /* Anything else Parse error. */ + errGarbageAfterLtSlash(); + /* + * Switch to the bogus comment state. + */ + clearStrBufBeforeUse(); + appendStrBuf('\n'); + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + /* Anything else Parse error. */ + errGarbageAfterLtSlash(); + /* + * Switch to the bogus comment state. + */ + clearStrBufBeforeUse(); + appendStrBuf(c); + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + if (c >= 'A' && c <= 'Z') { + c += 0x20; + } + if (c >= 'a' && c <= 'z') { + /* + * U+0061 LATIN SMALL LETTER A through to U+007A + * LATIN SMALL LETTER Z Create a new end tag + * token, + */ + endTag = true; + /* + * set its tag name to the input character, + */ + clearStrBufBeforeUse(); + appendStrBuf(c); + /* + * then switch to the tag name state. (Don't + * emit the token yet; further details will be + * filled in before it is emitted.) + */ + state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); + continue stateloop; + } else { + /* Anything else Parse error. */ + errGarbageAfterLtSlash(); + /* + * Switch to the bogus comment state. + */ + clearStrBufBeforeUse(); + appendStrBuf(c); + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + } + } + // XXX reorder point + case RCDATA: + rcdataloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + switch (c) { + case '&': + /* + * U+0026 AMPERSAND (&) Switch to the character + * reference in RCDATA state. + */ + flushChars(buf, pos); + assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; + appendCharRefBuf(c); + setAdditionalAndRememberAmpersandLocation('\u0000'); + returnState = state; + state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); + continue stateloop; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the + * RCDATA less-than sign state. + */ + flushChars(buf, pos); + + returnState = state; + state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); + continue stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + continue; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Emit the current input character as a + * character token. Stay in the RCDATA state. + */ + continue; + } + } + // XXX reorder point + case RAWTEXT: + rawtextloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + switch (c) { + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the + * RAWTEXT less-than sign state. + */ + flushChars(buf, pos); + + returnState = state; + state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); + break rawtextloop; + // FALL THRU continue stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + continue; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Emit the current input character as a + * character token. Stay in the RAWTEXT state. + */ + continue; + } + } + // XXX fallthru don't reorder + case RAWTEXT_RCDATA_LESS_THAN_SIGN: + rawtextrcdatalessthansignloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case '/': + /* + * U+002F SOLIDUS (/) Set the temporary buffer + * to the empty string. Switch to the script + * data end tag open state. + */ + index = 0; + clearStrBufBeforeUse(); + state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); + break rawtextrcdatalessthansignloop; + // FALL THRU continue stateloop; + default: + /* + * Otherwise, emit a U+003C LESS-THAN SIGN + * character token + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + /* + * and reconsume the current input character in + * the data state. + */ + cstart = pos; + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + } + // XXX fall thru. don't reorder. + case NON_DATA_END_TAG_NAME: + for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * ASSERT! when entering this state, set index to 0 and + * call clearStrBufBeforeUse() assert (contentModelElement != + * null); Let's implement the above without lookahead. + * strBuf is the 'temporary buffer'. + */ + if (index < endTagExpectationAsArray.length) { + char e = endTagExpectationAsArray[index]; + char folded = c; + if (c >= 'A' && c <= 'Z') { + folded += 0x20; + } + if (folded != e) { + // [NOCPP[ + errHtml4LtSlashInRcdata(folded); + // ]NOCPP] + tokenHandler.characters(Tokenizer.LT_SOLIDUS, + 0, 2); + emitStrBuf(); + cstart = pos; + reconsume = true; + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + appendStrBuf(c); + index++; + continue; + } else { + endTag = true; + // XXX replace contentModelElement with different + // type + tagName = endTagExpectation; + switch (c) { + case '\r': + silentCarriageReturn(); + clearStrBufAfterUse(); // strBuf not used + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE + * FEED (LF) U+000C FORM FEED (FF) U+0020 + * SPACE If the current end tag token is an + * appropriate end tag token, then switch to + * the before attribute name state. + */ + clearStrBufAfterUse(); // strBuf not used + state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); + continue stateloop; + case '/': + /* + * U+002F SOLIDUS (/) If the current end tag + * token is an appropriate end tag token, + * then switch to the self-closing start tag + * state. + */ + clearStrBufAfterUse(); // strBuf not used + state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) If the + * current end tag token is an appropriate + * end tag token, then emit the current tag + * token and switch to the data state. + */ + clearStrBufAfterUse(); // strBuf not used + state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); + if (shouldSuspend) { + break stateloop; + } + continue stateloop; + default: + /* + * Emit a U+003C LESS-THAN SIGN character + * token, a U+002F SOLIDUS character token, + * a character token for each of the + * characters in the temporary buffer (in + * the order they were added to the buffer), + * and reconsume the current input character + * in the RAWTEXT state. + */ + // [NOCPP[ + errWarnLtSlashInRcdata(); + // ]NOCPP] + tokenHandler.characters( + Tokenizer.LT_SOLIDUS, 0, 2); + emitStrBuf(); + if (c == '\u0000') { + emitReplacementCharacter(buf, pos); + } else { + cstart = pos; // don't drop the + // character + } + state = transition(state, returnState, reconsume, pos); + continue stateloop; + } + } + } + // XXX reorder point + // BEGIN HOTSPOT WORKAROUND + case BOGUS_COMMENT: + boguscommentloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume every character up to and including the first + * U+003E GREATER-THAN SIGN character (>) or the end of + * the file (EOF), whichever comes first. Emit a comment + * token whose data is the concatenation of all the + * characters starting from and including the character + * that caused the state machine to switch into the + * bogus comment state, up to and including the + * character immediately before the last consumed + * character (i.e. up to the character just before the + * U+003E or EOF character). (If the comment was started + * by the end of the file (EOF), the token is empty.) + * + * Switch to the data state. + * + * If the end of the file was reached, reconsume the EOF + * character. + */ + switch (c) { + case '>': + emitComment(0, pos); + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '-': + appendStrBuf(c); + state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); + break boguscommentloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + appendStrBuf(c); + continue; + } + } + // FALLTHRU DON'T REORDER + case BOGUS_COMMENT_HYPHEN: + boguscommenthyphenloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case '>': + // [NOCPP[ + maybeAppendSpaceToBogusComment(); + // ]NOCPP] + emitComment(0, pos); + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '-': + appendSecondHyphenToBogusComment(); + continue boguscommenthyphenloop; + case '\r': + appendStrBufCarriageReturn(); + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + break stateloop; + case '\n': + appendStrBufLineFeed(); + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + appendStrBuf(c); + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + } + } + // XXX reorder point + case SCRIPT_DATA: + scriptdataloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + switch (c) { + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the + * script data less-than sign state. + */ + flushChars(buf, pos); + returnState = state; + state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); + break scriptdataloop; // FALL THRU continue + // stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + continue; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the current input + * character as a character token. Stay in the + * script data state. + */ + continue; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_LESS_THAN_SIGN: + scriptdatalessthansignloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case '/': + /* + * U+002F SOLIDUS (/) Set the temporary buffer + * to the empty string. Switch to the script + * data end tag open state. + */ + index = 0; + clearStrBufBeforeUse(); + state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); + continue stateloop; + case '!': + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + cstart = pos; + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); + break scriptdatalessthansignloop; // FALL THRU + // continue + // stateloop; + default: + /* + * Otherwise, emit a U+003C LESS-THAN SIGN + * character token + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + /* + * and reconsume the current input character in + * the data state. + */ + cstart = pos; + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_ESCAPE_START: + scriptdataescapestartloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Emit a U+002D + * HYPHEN-MINUS character token. Switch to the + * script data escape start dash state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); + break scriptdataescapestartloop; // FALL THRU + // continue + // stateloop; + default: + /* + * Anything else Reconsume the current input + * character in the script data state. + */ + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_ESCAPE_START_DASH: + scriptdataescapestartdashloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Emit a U+002D + * HYPHEN-MINUS character token. Switch to the + * script data escaped dash dash state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); + break scriptdataescapestartdashloop; + // continue stateloop; + default: + /* + * Anything else Reconsume the current input + * character in the script data state. + */ + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_ESCAPED_DASH_DASH: + scriptdataescapeddashdashloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Emit a U+002D + * HYPHEN-MINUS character token. Stay in the + * script data escaped dash dash state. + */ + continue; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the + * script data escaped less-than sign state. + */ + flushChars(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit a U+003E + * GREATER-THAN SIGN character token. Switch to + * the script data state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); + continue stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + break scriptdataescapeddashdashloop; + case '\r': + emitCarriageReturn(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the current input + * character as a character token. Switch to the + * script data escaped state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + break scriptdataescapeddashdashloop; + // continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_ESCAPED: + scriptdataescapedloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Emit a U+002D + * HYPHEN-MINUS character token. Switch to the + * script data escaped dash state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); + break scriptdataescapedloop; // FALL THRU + // continue + // stateloop; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the + * script data escaped less-than sign state. + */ + flushChars(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); + continue stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + continue; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the current input + * character as a character token. Stay in the + * script data escaped state. + */ + continue; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_ESCAPED_DASH: + scriptdataescapeddashloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Emit a U+002D + * HYPHEN-MINUS character token. Switch to the + * script data escaped dash dash state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); + continue stateloop; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Switch to the + * script data escaped less-than sign state. + */ + flushChars(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); + break scriptdataescapeddashloop; + // continue stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + continue stateloop; + case '\r': + emitCarriageReturn(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the current input + * character as a character token. Switch to the + * script data escaped state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: + scriptdataescapedlessthanloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '/': + /* + * U+002F SOLIDUS (/) Set the temporary buffer + * to the empty string. Switch to the script + * data escaped end tag open state. + */ + index = 0; + clearStrBufBeforeUse(); + returnState = Tokenizer.SCRIPT_DATA_ESCAPED; + state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); + continue stateloop; + case 'S': + case 's': + /* + * U+0041 LATIN CAPITAL LETTER A through to + * U+005A LATIN CAPITAL LETTER Z Emit a U+003C + * LESS-THAN SIGN character token and the + * current input character as a character token. + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + cstart = pos; + index = 1; + /* + * Set the temporary buffer to the empty string. + * Append the lowercase version of the current + * input character (add 0x0020 to the + * character's code point) to the temporary + * buffer. Switch to the script data double + * escape start state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); + break scriptdataescapedlessthanloop; + // continue stateloop; + default: + /* + * Anything else Emit a U+003C LESS-THAN SIGN + * character token and reconsume the current + * input character in the script data escaped + * state. + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + cstart = pos; + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_DOUBLE_ESCAPE_START: + scriptdatadoubleescapestartloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + assert index > 0; + if (index < 6) { // SCRIPT_ARR.length + char folded = c; + if (c >= 'A' && c <= 'Z') { + folded += 0x20; + } + if (folded != Tokenizer.SCRIPT_ARR[index]) { + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + continue stateloop; + } + index++; + continue; + } + switch (c) { + case '\r': + emitCarriageReturn(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + case ' ': + case '\t': + case '\u000C': + case '/': + case '>': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN + * (>) Emit the current input character as a + * character token. If the temporary buffer is + * the string "script", then switch to the + * script data double escaped state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + break scriptdatadoubleescapestartloop; + // continue stateloop; + default: + /* + * Anything else Reconsume the current input + * character in the script data escaped state. + */ + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_DOUBLE_ESCAPED: + scriptdatadoubleescapedloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Emit a U+002D + * HYPHEN-MINUS character token. Switch to the + * script data double escaped dash state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); + break scriptdatadoubleescapedloop; // FALL THRU + // continue + // stateloop; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Emit a U+003C + * LESS-THAN SIGN character token. Switch to the + * script data double escaped less-than sign + * state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); + continue stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + continue; + case '\r': + emitCarriageReturn(buf, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the current input + * character as a character token. Stay in the + * script data double escaped state. + */ + continue; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: + scriptdatadoubleescapeddashloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Emit a U+002D + * HYPHEN-MINUS character token. Switch to the + * script data double escaped dash dash state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); + break scriptdatadoubleescapeddashloop; + // continue stateloop; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Emit a U+003C + * LESS-THAN SIGN character token. Switch to the + * script data double escaped less-than sign + * state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); + continue stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + continue stateloop; + case '\r': + emitCarriageReturn(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the current input + * character as a character token. Switch to the + * script data double escaped state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: + scriptdatadoubleescapeddashdashloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '-': + /* + * U+002D HYPHEN-MINUS (-) Emit a U+002D + * HYPHEN-MINUS character token. Stay in the + * script data double escaped dash dash state. + */ + continue; + case '<': + /* + * U+003C LESS-THAN SIGN (<) Emit a U+003C + * LESS-THAN SIGN character token. Switch to the + * script data double escaped less-than sign + * state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); + break scriptdatadoubleescapeddashdashloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit a U+003E + * GREATER-THAN SIGN character token. Switch to + * the script data state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); + continue stateloop; + case '\u0000': + emitReplacementCharacter(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + continue stateloop; + case '\r': + emitCarriageReturn(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + default: + /* + * Anything else Emit the current input + * character as a character token. Switch to the + * script data double escaped state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: + scriptdatadoubleescapedlessthanloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '/': + /* + * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS + * character token. Set the temporary buffer to + * the empty string. Switch to the script data + * double escape end state. + */ + index = 0; + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); + break scriptdatadoubleescapedlessthanloop; + default: + /* + * Anything else Reconsume the current input + * character in the script data double escaped + * state. + */ + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + continue stateloop; + } + } + // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER + case SCRIPT_DATA_DOUBLE_ESCAPE_END: + scriptdatadoubleescapeendloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + if (index < 6) { // SCRIPT_ARR.length + char folded = c; + if (c >= 'A' && c <= 'Z') { + folded += 0x20; + } + if (folded != Tokenizer.SCRIPT_ARR[index]) { + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + continue stateloop; + } + index++; + continue; + } + switch (c) { + case '\r': + emitCarriageReturn(buf, pos); + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + case ' ': + case '\t': + case '\u000C': + case '/': + case '>': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN + * (>) Emit the current input character as a + * character token. If the temporary buffer is + * the string "script", then switch to the + * script data escaped state. + */ + state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); + continue stateloop; + default: + /* + * Reconsume the current input character in the + * script data double escaped state. + */ + reconsume = true; + state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); + continue stateloop; + } + } + // XXX reorder point + case MARKUP_DECLARATION_OCTYPE: + markupdeclarationdoctypeloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + if (index < 6) { // OCTYPE.length + char folded = c; + if (c >= 'A' && c <= 'Z') { + folded += 0x20; + } + if (folded == Tokenizer.OCTYPE[index]) { + appendStrBuf(c); + } else { + errBogusComment(); + reconsume = true; + state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); + continue stateloop; + } + index++; + continue; + } else { + reconsume = true; + state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); + break markupdeclarationdoctypeloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case DOCTYPE: + doctypeloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + initDoctypeFields(); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the before DOCTYPE name state. + */ + state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); + break doctypeloop; + // continue stateloop; + default: + /* + * Anything else Parse error. + */ + errMissingSpaceBeforeDoctypeName(); + /* + * Reconsume the current character in the before + * DOCTYPE name state. + */ + reconsume = true; + state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); + break doctypeloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_DOCTYPE_NAME: + beforedoctypenameloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the before DOCTYPE name state. + */ + continue; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Parse error. + */ + errNamelessDoctype(); + /* + * Create a new DOCTYPE token. Set its + * force-quirks flag to on. + */ + forceQuirks = true; + /* + * Emit the token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + if (c >= 'A' && c <= 'Z') { + /* + * U+0041 LATIN CAPITAL LETTER A through to + * U+005A LATIN CAPITAL LETTER Z Create a + * new DOCTYPE token. Set the token's name + * to the lowercase version of the input + * character (add 0x0020 to the character's + * code point). + */ + c += 0x20; + } + /* Anything else Create a new DOCTYPE token. */ + /* + * Set the token's name name to the current + * input character. + */ + clearStrBufBeforeUse(); + appendStrBuf(c); + /* + * Switch to the DOCTYPE name state. + */ + state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); + break beforedoctypenameloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case DOCTYPE_NAME: + doctypenameloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + strBufToDoctypeName(); + state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the after DOCTYPE name state. + */ + strBufToDoctypeName(); + state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); + break doctypenameloop; + // continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * DOCTYPE token. + */ + strBufToDoctypeName(); + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * U+0041 LATIN CAPITAL LETTER A through to + * U+005A LATIN CAPITAL LETTER Z Append the + * lowercase version of the input character (add + * 0x0020 to the character's code point) to the + * current DOCTYPE token's name. + */ + if (c >= 'A' && c <= 'Z') { + c += 0x0020; + } + /* + * Anything else Append the current input + * character to the current DOCTYPE token's + * name. + */ + appendStrBuf(c); + /* + * Stay in the DOCTYPE name state. + */ + continue; + } + } + // FALLTHRU DON'T REORDER + case AFTER_DOCTYPE_NAME: + afterdoctypenameloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the after DOCTYPE name state. + */ + continue; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case 'p': + case 'P': + index = 0; + state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); + break afterdoctypenameloop; + // continue stateloop; + case 's': + case 'S': + index = 0; + state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); + continue stateloop; + default: + /* + * Otherwise, this is the parse error. + */ + bogusDoctype(); + + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + // done by bogusDoctype(); + /* + * Switch to the bogus DOCTYPE state. + */ + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case DOCTYPE_UBLIC: + doctypeublicloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * If the six characters starting from the current input + * character are an ASCII case-insensitive match for the + * word "PUBLIC", then consume those characters and + * switch to the before DOCTYPE public identifier state. + */ + if (index < 5) { // UBLIC.length + char folded = c; + if (c >= 'A' && c <= 'Z') { + folded += 0x20; + } + if (folded != Tokenizer.UBLIC[index]) { + bogusDoctype(); + // forceQuirks = true; + reconsume = true; + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + index++; + continue; + } else { + reconsume = true; + state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); + break doctypeublicloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case AFTER_DOCTYPE_PUBLIC_KEYWORD: + afterdoctypepublickeywordloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the before DOCTYPE public + * identifier state. + */ + state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); + break afterdoctypepublickeywordloop; + // FALL THROUGH continue stateloop + case '"': + /* + * U+0022 QUOTATION MARK (") Parse Error. + */ + errNoSpaceBetweenDoctypePublicKeywordAndQuote(); + /* + * Set the DOCTYPE token's public identifier to + * the empty string (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE public identifier + * (double-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); + continue stateloop; + case '\'': + /* + * U+0027 APOSTROPHE (') Parse Error. + */ + errNoSpaceBetweenDoctypePublicKeywordAndQuote(); + /* + * Set the DOCTYPE token's public identifier to + * the empty string (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE public identifier + * (single-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); + continue stateloop; + case '>': + /* U+003E GREATER-THAN SIGN (>) Parse error. */ + errExpectedPublicId(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + default: + bogusDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + // done by bogusDoctype(); + /* + * Switch to the bogus DOCTYPE state. + */ + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: + beforedoctypepublicidentifierloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the before DOCTYPE public identifier + * state. + */ + continue; + case '"': + /* + * U+0022 QUOTATION MARK (") Set the DOCTYPE + * token's public identifier to the empty string + * (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE public identifier + * (double-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); + break beforedoctypepublicidentifierloop; + // continue stateloop; + case '\'': + /* + * U+0027 APOSTROPHE (') Set the DOCTYPE token's + * public identifier to the empty string (not + * missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE public identifier + * (single-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); + continue stateloop; + case '>': + /* U+003E GREATER-THAN SIGN (>) Parse error. */ + errExpectedPublicId(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + default: + bogusDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + // done by bogusDoctype(); + /* + * Switch to the bogus DOCTYPE state. + */ + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: + doctypepublicidentifierdoublequotedloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '"': + /* + * U+0022 QUOTATION MARK (") Switch to the after + * DOCTYPE public identifier state. + */ + publicIdentifier = strBufToString(); + state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); + break doctypepublicidentifierdoublequotedloop; + // continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Parse error. + */ + errGtInPublicId(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + publicIdentifier = strBufToString(); + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append the current input + * character to the current DOCTYPE token's + * public identifier. + */ + appendStrBuf(c); + /* + * Stay in the DOCTYPE public identifier + * (double-quoted) state. + */ + continue; + } + } + // FALLTHRU DON'T REORDER + case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: + afterdoctypepublicidentifierloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the between DOCTYPE public and + * system identifiers state. + */ + state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); + break afterdoctypepublicidentifierloop; + // continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '"': + /* + * U+0022 QUOTATION MARK (") Parse error. + */ + errNoSpaceBetweenPublicAndSystemIds(); + /* + * Set the DOCTYPE token's system identifier to + * the empty string (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE system identifier + * (double-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); + continue stateloop; + case '\'': + /* + * U+0027 APOSTROPHE (') Parse error. + */ + errNoSpaceBetweenPublicAndSystemIds(); + /* + * Set the DOCTYPE token's system identifier to + * the empty string (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE system identifier + * (single-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); + continue stateloop; + default: + bogusDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + // done by bogusDoctype(); + /* + * Switch to the bogus DOCTYPE state. + */ + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: + betweendoctypepublicandsystemidentifiersloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the between DOCTYPE public and system + * identifiers state. + */ + continue; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '"': + /* + * U+0022 QUOTATION MARK (") Set the DOCTYPE + * token's system identifier to the empty string + * (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE system identifier + * (double-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); + break betweendoctypepublicandsystemidentifiersloop; + // continue stateloop; + case '\'': + /* + * U+0027 APOSTROPHE (') Set the DOCTYPE token's + * system identifier to the empty string (not + * missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE system identifier + * (single-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); + continue stateloop; + default: + bogusDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + // done by bogusDoctype(); + /* + * Switch to the bogus DOCTYPE state. + */ + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: + doctypesystemidentifierdoublequotedloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '"': + /* + * U+0022 QUOTATION MARK (") Switch to the after + * DOCTYPE system identifier state. + */ + systemIdentifier = strBufToString(); + state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); + continue stateloop; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Parse error. + */ + errGtInSystemId(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + systemIdentifier = strBufToString(); + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append the current input + * character to the current DOCTYPE token's + * system identifier. + */ + appendStrBuf(c); + /* + * Stay in the DOCTYPE system identifier + * (double-quoted) state. + */ + continue; + } + } + // FALLTHRU DON'T REORDER + case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: + afterdoctypesystemidentifierloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the after DOCTYPE system identifier state. + */ + continue; + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit the current + * DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + default: + /* + * Switch to the bogus DOCTYPE state. (This does + * not set the DOCTYPE token's force-quirks flag + * to on.) + */ + bogusDoctypeWithoutQuirks(); + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + break afterdoctypesystemidentifierloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case BOGUS_DOCTYPE: + for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '>': + /* + * U+003E GREATER-THAN SIGN (>) Emit that + * DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + default: + /* + * Anything else Stay in the bogus DOCTYPE + * state. + */ + continue; + } + } + // XXX reorder point + case DOCTYPE_YSTEM: + doctypeystemloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Otherwise, if the six characters starting from the + * current input character are an ASCII case-insensitive + * match for the word "SYSTEM", then consume those + * characters and switch to the before DOCTYPE system + * identifier state. + */ + if (index < 5) { // YSTEM.length + char folded = c; + if (c >= 'A' && c <= 'Z') { + folded += 0x20; + } + if (folded != Tokenizer.YSTEM[index]) { + bogusDoctype(); + reconsume = true; + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + index++; + continue stateloop; + } else { + reconsume = true; + state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); + break doctypeystemloop; + // continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case AFTER_DOCTYPE_SYSTEM_KEYWORD: + afterdoctypesystemkeywordloop: for (;;) { + if (reconsume) { + reconsume = false; + } else { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + } + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE + * Switch to the before DOCTYPE public + * identifier state. + */ + state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); + break afterdoctypesystemkeywordloop; + // FALL THROUGH continue stateloop + case '"': + /* + * U+0022 QUOTATION MARK (") Parse Error. + */ + errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); + /* + * Set the DOCTYPE token's system identifier to + * the empty string (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE public identifier + * (double-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); + continue stateloop; + case '\'': + /* + * U+0027 APOSTROPHE (') Parse Error. + */ + errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); + /* + * Set the DOCTYPE token's public identifier to + * the empty string (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE public identifier + * (single-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); + continue stateloop; + case '>': + /* U+003E GREATER-THAN SIGN (>) Parse error. */ + errExpectedPublicId(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + default: + bogusDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + // done by bogusDoctype(); + /* + * Switch to the bogus DOCTYPE state. + */ + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: + beforedoctypesystemidentifierloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\r': + silentCarriageReturn(); + break stateloop; + case '\n': + silentLineFeed(); + // fall thru + case ' ': + case '\t': + case '\u000C': + /* + * U+0009 CHARACTER TABULATION U+000A LINE FEED + * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay + * in the before DOCTYPE system identifier + * state. + */ + continue; + case '"': + /* + * U+0022 QUOTATION MARK (") Set the DOCTYPE + * token's system identifier to the empty string + * (not missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE system identifier + * (double-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); + continue stateloop; + case '\'': + /* + * U+0027 APOSTROPHE (') Set the DOCTYPE token's + * system identifier to the empty string (not + * missing), + */ + clearStrBufBeforeUse(); + /* + * then switch to the DOCTYPE system identifier + * (single-quoted) state. + */ + state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); + break beforedoctypesystemidentifierloop; + // continue stateloop; + case '>': + /* U+003E GREATER-THAN SIGN (>) Parse error. */ + errExpectedSystemId(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + default: + bogusDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + // done by bogusDoctype(); + /* + * Switch to the bogus DOCTYPE state. + */ + state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); + continue stateloop; + } + } + // FALLTHRU DON'T REORDER + case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: + for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\'': + /* + * U+0027 APOSTROPHE (') Switch to the after + * DOCTYPE system identifier state. + */ + systemIdentifier = strBufToString(); + state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); + continue stateloop; + case '>': + errGtInSystemId(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + systemIdentifier = strBufToString(); + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append the current input + * character to the current DOCTYPE token's + * system identifier. + */ + appendStrBuf(c); + /* + * Stay in the DOCTYPE system identifier + * (double-quoted) state. + */ + continue; + } + } + // XXX reorder point + case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: + for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + /* + * Consume the next input character: + */ + switch (c) { + case '\'': + /* + * U+0027 APOSTROPHE (') Switch to the after + * DOCTYPE public identifier state. + */ + publicIdentifier = strBufToString(); + state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); + continue stateloop; + case '>': + errGtInPublicId(); + /* + * Set the DOCTYPE token's force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + publicIdentifier = strBufToString(); + emitDoctypeToken(pos); + /* + * Switch to the data state. + */ + state = transition(state, Tokenizer.DATA, reconsume, pos); + continue stateloop; + case '\r': + appendStrBufCarriageReturn(); + break stateloop; + case '\n': + appendStrBufLineFeed(); + continue; + case '\u0000': + c = '\uFFFD'; + // fall thru + default: + /* + * Anything else Append the current input + * character to the current DOCTYPE token's + * public identifier. + */ + appendStrBuf(c); + /* + * Stay in the DOCTYPE public identifier + * (single-quoted) state. + */ + continue; + } + } + // XXX reorder point + case PROCESSING_INSTRUCTION: + processinginstructionloop: for (;;) { + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case '?': + state = transition( + state, + Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK, + reconsume, pos); + break processinginstructionloop; + // continue stateloop; + default: + continue; + } + } + case PROCESSING_INSTRUCTION_QUESTION_MARK: + if (++pos == endPos) { + break stateloop; + } + c = checkChar(buf, pos); + switch (c) { + case '>': + state = transition(state, Tokenizer.DATA, + reconsume, pos); + continue stateloop; + default: + state = transition(state, + Tokenizer.PROCESSING_INSTRUCTION, + reconsume, pos); + continue stateloop; + } + // END HOTSPOT WORKAROUND + } + } + flushChars(buf, pos); + /* + * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; } + */ + // Save locals + stateSave = state; + returnStateSave = returnState; + return pos; + } + + // HOTSPOT WORKAROUND INSERTION POINT + + // [NOCPP[ + + protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException { + return to; + } + + // ]NOCPP] + + private void initDoctypeFields() { + // Discard the characters "DOCTYPE" accumulated as a potential bogus + // comment into strBuf. + clearStrBufAfterUse(); + doctypeName = ""; + if (systemIdentifier != null) { + Portability.releaseString(systemIdentifier); + systemIdentifier = null; + } + if (publicIdentifier != null) { + Portability.releaseString(publicIdentifier); + publicIdentifier = null; + } + forceQuirks = false; + } + + @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn() + throws SAXException { + silentCarriageReturn(); + adjustDoubleHyphenAndAppendToStrBufAndErr('\n'); + } + + @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed() + throws SAXException { + silentLineFeed(); + adjustDoubleHyphenAndAppendToStrBufAndErr('\n'); + } + + @Inline private void appendStrBufLineFeed() { + silentLineFeed(); + appendStrBuf('\n'); + } + + @Inline private void appendStrBufCarriageReturn() { + silentCarriageReturn(); + appendStrBuf('\n'); + } + + @Inline protected void silentCarriageReturn() { + ++line; + lastCR = true; + } + + @Inline protected void silentLineFeed() { + ++line; + } + + private void emitCarriageReturn(@NoLength char[] buf, int pos) + throws SAXException { + silentCarriageReturn(); + flushChars(buf, pos); + tokenHandler.characters(Tokenizer.LF, 0, 1); + cstart = Integer.MAX_VALUE; + } + + private void emitReplacementCharacter(@NoLength char[] buf, int pos) + throws SAXException { + flushChars(buf, pos); + tokenHandler.zeroOriginatingReplacementCharacter(); + cstart = pos + 1; + } + + private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos) + throws SAXException { + flushChars(buf, pos); + tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1); + cstart = pos + 1; + } + + private void setAdditionalAndRememberAmpersandLocation(char add) { + additional = add; + // [NOCPP[ + ampersandLocation = new LocatorImpl(this); + // ]NOCPP] + } + + private void bogusDoctype() throws SAXException { + errBogusDoctype(); + forceQuirks = true; + } + + private void bogusDoctypeWithoutQuirks() throws SAXException { + errBogusDoctype(); + forceQuirks = false; + } + + private void handleNcrValue(int returnState) throws SAXException { + /* + * If one or more characters match the range, then take them all and + * interpret the string of characters as a number (either hexadecimal or + * decimal as appropriate). + */ + if (value <= 0xFFFF) { + if (value >= 0x80 && value <= 0x9f) { + /* + * If that number is one of the numbers in the first column of + * the following table, then this is a parse error. + */ + errNcrInC1Range(); + /* + * Find the row with that number in the first column, and return + * a character token for the Unicode character given in the + * second column of that row. + */ + @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80]; + emitOrAppendOne(val, returnState); + // [NOCPP[ + } else if (value == 0xC + && contentSpacePolicy != XmlViolationPolicy.ALLOW) { + if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) { + emitOrAppendOne(Tokenizer.SPACE, returnState); + } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) { + fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space."); + } + // ]NOCPP] + } else if (value == 0x0) { + errNcrZero(); + emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState); + } else if ((value & 0xF800) == 0xD800) { + errNcrSurrogate(); + emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState); + } else { + /* + * Otherwise, return a character token for the Unicode character + * whose code point is that number. + */ + char ch = (char) value; + // [NOCPP[ + if (value == 0x0D) { + errNcrCr(); + } else if ((value <= 0x0008) || (value == 0x000B) + || (value >= 0x000E && value <= 0x001F)) { + ch = errNcrControlChar(ch); + } else if (value >= 0xFDD0 && value <= 0xFDEF) { + errNcrUnassigned(); + } else if ((value & 0xFFFE) == 0xFFFE) { + ch = errNcrNonCharacter(ch); + } else if (value >= 0x007F && value <= 0x009F) { + errNcrControlChar(); + } else { + maybeWarnPrivateUse(ch); + } + // ]NOCPP] + bmpChar[0] = ch; + emitOrAppendOne(bmpChar, returnState); + } + } else if (value <= 0x10FFFF) { + // [NOCPP[ + maybeWarnPrivateUseAstral(); + if ((value & 0xFFFE) == 0xFFFE) { + errAstralNonCharacter(value); + } + // ]NOCPP] + astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10)); + astralChar[1] = (char) (0xDC00 + (value & 0x3FF)); + emitOrAppendTwo(astralChar, returnState); + } else { + errNcrOutOfRange(); + emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState); + } + } + + public void eof() throws SAXException { + int state = stateSave; + int returnState = returnStateSave; + + eofloop: for (;;) { + switch (state) { + case SCRIPT_DATA_LESS_THAN_SIGN: + case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: + /* + * Otherwise, emit a U+003C LESS-THAN SIGN character token + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + /* + * and reconsume the current input character in the data + * state. + */ + break eofloop; + case TAG_OPEN: + /* + * The behavior of this state depends on the content model + * flag. + */ + /* + * Anything else Parse error. + */ + errEofAfterLt(); + /* + * Emit a U+003C LESS-THAN SIGN character token + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + /* + * and reconsume the current input character in the data + * state. + */ + break eofloop; + case RAWTEXT_RCDATA_LESS_THAN_SIGN: + /* + * Emit a U+003C LESS-THAN SIGN character token + */ + tokenHandler.characters(Tokenizer.LT_GT, 0, 1); + /* + * and reconsume the current input character in the RCDATA + * state. + */ + break eofloop; + case NON_DATA_END_TAG_NAME: + /* + * Emit a U+003C LESS-THAN SIGN character token, a U+002F + * SOLIDUS character token, + */ + tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); + /* + * a character token for each of the characters in the + * temporary buffer (in the order they were added to the + * buffer), + */ + emitStrBuf(); + /* + * and reconsume the current input character in the RCDATA + * state. + */ + break eofloop; + case CLOSE_TAG_OPEN: + /* EOF Parse error. */ + errEofAfterLt(); + /* + * Emit a U+003C LESS-THAN SIGN character token and a U+002F + * SOLIDUS character token. + */ + tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case TAG_NAME: + /* + * EOF Parse error. + */ + errEofInTagName(); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case BEFORE_ATTRIBUTE_NAME: + case AFTER_ATTRIBUTE_VALUE_QUOTED: + case SELF_CLOSING_START_TAG: + /* EOF Parse error. */ + errEofWithoutGt(); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case ATTRIBUTE_NAME: + /* + * EOF Parse error. + */ + errEofInAttributeName(); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case AFTER_ATTRIBUTE_NAME: + case BEFORE_ATTRIBUTE_VALUE: + /* EOF Parse error. */ + errEofWithoutGt(); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case ATTRIBUTE_VALUE_DOUBLE_QUOTED: + case ATTRIBUTE_VALUE_SINGLE_QUOTED: + case ATTRIBUTE_VALUE_UNQUOTED: + /* EOF Parse error. */ + errEofInAttributeValue(); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case BOGUS_COMMENT: + emitComment(0, 0); + break eofloop; + case BOGUS_COMMENT_HYPHEN: + // [NOCPP[ + maybeAppendSpaceToBogusComment(); + // ]NOCPP] + emitComment(0, 0); + break eofloop; + case MARKUP_DECLARATION_OPEN: + errBogusComment(); + emitComment(0, 0); + break eofloop; + case MARKUP_DECLARATION_HYPHEN: + errBogusComment(); + emitComment(0, 0); + break eofloop; + case MARKUP_DECLARATION_OCTYPE: + if (index < 6) { + errBogusComment(); + emitComment(0, 0); + } else { + /* EOF Parse error. */ + errEofInDoctype(); + /* + * Create a new DOCTYPE token. Set its force-quirks flag + * to on. + */ + doctypeName = ""; + if (systemIdentifier != null) { + Portability.releaseString(systemIdentifier); + systemIdentifier = null; + } + if (publicIdentifier != null) { + Portability.releaseString(publicIdentifier); + publicIdentifier = null; + } + forceQuirks = true; + /* + * Emit the token. + */ + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + } + break eofloop; + case COMMENT_START: + case COMMENT: + /* + * EOF Parse error. + */ + errEofInComment(); + /* Emit the comment token. */ + emitComment(0, 0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case COMMENT_END: + errEofInComment(); + /* Emit the comment token. */ + emitComment(2, 0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case COMMENT_END_DASH: + case COMMENT_START_DASH: + errEofInComment(); + /* Emit the comment token. */ + emitComment(1, 0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case COMMENT_END_BANG: + errEofInComment(); + /* Emit the comment token. */ + emitComment(3, 0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case DOCTYPE: + case BEFORE_DOCTYPE_NAME: + errEofInDoctype(); + /* + * Create a new DOCTYPE token. Set its force-quirks flag to + * on. + */ + forceQuirks = true; + /* + * Emit the token. + */ + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case DOCTYPE_NAME: + errEofInDoctype(); + strBufToDoctypeName(); + /* + * Set the DOCTYPE token's force-quirks flag to on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case DOCTYPE_UBLIC: + case DOCTYPE_YSTEM: + case AFTER_DOCTYPE_NAME: + case AFTER_DOCTYPE_PUBLIC_KEYWORD: + case AFTER_DOCTYPE_SYSTEM_KEYWORD: + case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: + errEofInDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: + case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: + /* EOF Parse error. */ + errEofInPublicId(); + /* + * Set the DOCTYPE token's force-quirks flag to on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + publicIdentifier = strBufToString(); + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: + case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: + case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: + errEofInDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: + case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: + /* EOF Parse error. */ + errEofInSystemId(); + /* + * Set the DOCTYPE token's force-quirks flag to on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + systemIdentifier = strBufToString(); + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: + errEofInDoctype(); + /* + * Set the DOCTYPE token's force-quirks flag to on. + */ + forceQuirks = true; + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case BOGUS_DOCTYPE: + /* + * Emit that DOCTYPE token. + */ + emitDoctypeToken(0); + /* + * Reconsume the EOF character in the data state. + */ + break eofloop; + case CONSUME_CHARACTER_REFERENCE: + /* + * Unlike the definition is the spec, this state does not + * return a value and never requires the caller to + * backtrack. This state takes care of emitting characters + * or appending to the current attribute value. It also + * takes care of that in the case when consuming the entity + * fails. + */ + /* + * This section defines how to consume an entity. This + * definition is used when parsing entities in text and in + * attributes. + * + * The behavior depends on the identity of the next + * character (the one immediately after the U+0026 AMPERSAND + * character): + */ + + emitOrAppendCharRefBuf(returnState); + state = returnState; + continue; + case CHARACTER_REFERENCE_HILO_LOOKUP: + errNoNamedCharacterMatch(); + emitOrAppendCharRefBuf(returnState); + state = returnState; + continue; + case CHARACTER_REFERENCE_TAIL: + outer: for (;;) { + char c = '\u0000'; + entCol++; + /* + * Consume the maximum number of characters possible, + * with the consumed characters matching one of the + * identifiers in the first column of the named + * character references table (in a case-sensitive + * manner). + */ + hiloop: for (;;) { + if (hi == -1) { + break hiloop; + } + if (entCol == NamedCharacters.NAMES[hi].length()) { + break hiloop; + } + if (entCol > NamedCharacters.NAMES[hi].length()) { + break outer; + } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { + hi--; + } else { + break hiloop; + } + } + + loloop: for (;;) { + if (hi < lo) { + break outer; + } + if (entCol == NamedCharacters.NAMES[lo].length()) { + candidate = lo; + charRefBufMark = charRefBufLen; + lo++; + } else if (entCol > NamedCharacters.NAMES[lo].length()) { + break outer; + } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { + lo++; + } else { + break loloop; + } + } + if (hi < lo) { + break outer; + } + continue; + } + + if (candidate == -1) { + /* + * If no match can be made, then this is a parse error. + */ + errNoNamedCharacterMatch(); + emitOrAppendCharRefBuf(returnState); + state = returnState; + continue eofloop; + } else { + @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; + if (candidateName.length() == 0 + || candidateName.charAt(candidateName.length() - 1) != ';') { + /* + * If the last character matched is not a U+003B + * SEMICOLON (;), there is a parse error. + */ + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + /* + * If the entity is being consumed as part of an + * attribute, and the last character matched is + * not a U+003B SEMICOLON (;), + */ + char ch; + if (charRefBufMark == charRefBufLen) { + ch = '\u0000'; + } else { + ch = charRefBuf[charRefBufMark]; + } + if ((ch >= '0' && ch <= '9') + || (ch >= 'A' && ch <= 'Z') + || (ch >= 'a' && ch <= 'z')) { + /* + * and the next character is in the range + * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, + * U+0041 LATIN CAPITAL LETTER A to U+005A + * LATIN CAPITAL LETTER Z, or U+0061 LATIN + * SMALL LETTER A to U+007A LATIN SMALL + * LETTER Z, then, for historical reasons, + * all the characters that were matched + * after the U+0026 AMPERSAND (&) must be + * unconsumed, and nothing is returned. + */ + errNoNamedCharacterMatch(); + appendCharRefBufToStrBuf(); + state = returnState; + continue eofloop; + } + } + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + errUnescapedAmpersandInterpretedAsCharacterReference(); + } else { + errNotSemicolonTerminated(); + } + } + + /* + * Otherwise, return a character token for the character + * corresponding to the entity name (as given by the + * second column of the named character references + * table). + */ + @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; + if ( + // [NOCPP[ + val.length == 1 + // ]NOCPP] + // CPPONLY: val[1] == 0 + ) { + emitOrAppendOne(val, returnState); + } else { + emitOrAppendTwo(val, returnState); + } + // this is so complicated! + if (charRefBufMark < charRefBufLen) { + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + appendStrBuf(charRefBuf, charRefBufMark, + charRefBufLen - charRefBufMark); + } else { + tokenHandler.characters(charRefBuf, charRefBufMark, + charRefBufLen - charRefBufMark); + } + } + charRefBufLen = 0; + state = returnState; + continue eofloop; + /* + * If the markup contains I'm ¬it; I tell you, the + * entity is parsed as "not", as in, I'm ¬it; I tell + * you. But if the markup was I'm ∉ I tell you, + * the entity would be parsed as "notin;", resulting in + * I'm ∉ I tell you. + */ + } + case CONSUME_NCR: + case DECIMAL_NRC_LOOP: + case HEX_NCR_LOOP: + /* + * If no characters match the range, then don't consume any + * characters (and unconsume the U+0023 NUMBER SIGN + * character and, if appropriate, the X character). This is + * a parse error; nothing is returned. + * + * Otherwise, if the next character is a U+003B SEMICOLON, + * consume that too. If it isn't, there is a parse error. + */ + if (!seenDigits) { + errNoDigitsInNCR(); + emitOrAppendCharRefBuf(returnState); + state = returnState; + continue; + } else { + errCharRefLacksSemicolon(); + } + // WARNING previous state sets reconsume + handleNcrValue(returnState); + state = returnState; + continue; + case CDATA_RSQB: + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); + break eofloop; + case CDATA_RSQB_RSQB: + tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); + break eofloop; + case DATA: + default: + break eofloop; + } + } + // case DATA: + /* + * EOF Emit an end-of-file token. + */ + tokenHandler.eof(); + return; + } + + private void emitDoctypeToken(int pos) throws SAXException { + cstart = pos + 1; + tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier, + forceQuirks); + // It is OK and sufficient to release these here, since + // there's no way out of the doctype states than through paths + // that call this method. + doctypeName = null; + Portability.releaseString(publicIdentifier); + publicIdentifier = null; + Portability.releaseString(systemIdentifier); + systemIdentifier = null; + } + + @Inline protected char checkChar(@NoLength char[] buf, int pos) + throws SAXException { + return buf[pos]; + } + + public boolean internalEncodingDeclaration(String internalCharset) + throws SAXException { + if (encodingDeclarationHandler != null) { + return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset); + } + return false; + } + + /** + * @param val + * @throws SAXException + */ + private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState) + throws SAXException { + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + appendStrBuf(val[0]); + appendStrBuf(val[1]); + } else { + tokenHandler.characters(val, 0, 2); + } + } + + private void emitOrAppendOne(@Const @NoLength char[] val, int returnState) + throws SAXException { + if ((returnState & DATA_AND_RCDATA_MASK) != 0) { + appendStrBuf(val[0]); + } else { + tokenHandler.characters(val, 0, 1); + } + } + + public void end() throws SAXException { + strBuf = null; + doctypeName = null; + if (systemIdentifier != null) { + Portability.releaseString(systemIdentifier); + systemIdentifier = null; + } + if (publicIdentifier != null) { + Portability.releaseString(publicIdentifier); + publicIdentifier = null; + } + if (tagName != null) { + tagName.release(); + tagName = null; + } + if (attributeName != null) { + attributeName.release(); + attributeName = null; + } + tokenHandler.endTokenization(); + if (attributes != null) { + // [NOCPP[ + attributes = null; + // ]NOCPP] + // CPPONLY: attributes.clear(mappingLangToXmlLang); + } + } + + public void requestSuspension() { + shouldSuspend = true; + } + + // [NOCPP[ + + public void becomeConfident() { + confident = true; + } + + /** + * Returns the nextCharOnNewLine. + * + * @return the nextCharOnNewLine + */ + public boolean isNextCharOnNewLine() { + return false; + } + + public boolean isPrevCR() { + return lastCR; + } + + /** + * Returns the line. + * + * @return the line + */ + public int getLine() { + return -1; + } + + /** + * Returns the col. + * + * @return the col + */ + public int getCol() { + return -1; + } + + // ]NOCPP] + + public boolean isInDataState() { + return (stateSave == DATA); + } + + public void resetToDataState() { + clearStrBufAfterUse(); + charRefBufLen = 0; + stateSave = Tokenizer.DATA; + // line = 1; XXX line numbers + lastCR = false; + index = 0; + forceQuirks = false; + additional = '\u0000'; + entCol = -1; + firstCharKey = -1; + lo = 0; + hi = 0; // will always be overwritten before use anyway + candidate = -1; + charRefBufMark = 0; + value = 0; + seenDigits = false; + endTag = false; + shouldSuspend = false; + initDoctypeFields(); + if (tagName != null) { + tagName.release(); + tagName = null; + } + if (attributeName != null) { + attributeName.release(); + attributeName = null; + } + if (newAttributesEachTime) { + if (attributes != null) { + Portability.delete(attributes); + attributes = null; + } + } + } + + public void loadState(Tokenizer other) throws SAXException { + strBufLen = other.strBufLen; + if (strBufLen > strBuf.length) { + strBuf = new char[strBufLen]; + } + System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen); + + charRefBufLen = other.charRefBufLen; + System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen); + + stateSave = other.stateSave; + returnStateSave = other.returnStateSave; + endTagExpectation = other.endTagExpectation; + endTagExpectationAsArray = other.endTagExpectationAsArray; + // line = 1; XXX line numbers + lastCR = other.lastCR; + index = other.index; + forceQuirks = other.forceQuirks; + additional = other.additional; + entCol = other.entCol; + firstCharKey = other.firstCharKey; + lo = other.lo; + hi = other.hi; + candidate = other.candidate; + charRefBufMark = other.charRefBufMark; + value = other.value; + seenDigits = other.seenDigits; + endTag = other.endTag; + shouldSuspend = false; + + if (other.doctypeName == null) { + doctypeName = null; + } else { + doctypeName = Portability.newLocalFromLocal(other.doctypeName, + interner); + } + + Portability.releaseString(systemIdentifier); + if (other.systemIdentifier == null) { + systemIdentifier = null; + } else { + systemIdentifier = Portability.newStringFromString(other.systemIdentifier); + } + + Portability.releaseString(publicIdentifier); + if (other.publicIdentifier == null) { + publicIdentifier = null; + } else { + publicIdentifier = Portability.newStringFromString(other.publicIdentifier); + } + + if (tagName != null) { + tagName.release(); + } + if (other.tagName == null) { + tagName = null; + } else { + tagName = other.tagName.cloneElementName(interner); + } + + if (attributeName != null) { + attributeName.release(); + } + if (other.attributeName == null) { + attributeName = null; + } else { + attributeName = other.attributeName.cloneAttributeName(interner); + } + + Portability.delete(attributes); + if (other.attributes == null) { + attributes = null; + } else { + attributes = other.attributes.cloneAttributes(interner); + } + } + + public void initializeWithoutStarting() throws SAXException { + confident = false; + strBuf = null; + line = 1; + // CPPONLY: attributeLine = 1; + // [NOCPP[ + html4 = false; + metaBoundaryPassed = false; + wantsComments = tokenHandler.wantsComments(); + if (!newAttributesEachTime) { + attributes = new HtmlAttributes(mappingLangToXmlLang); + } + // ]NOCPP] + resetToDataState(); + } + + protected void errGarbageAfterLtSlash() throws SAXException { + } + + protected void errLtSlashGt() throws SAXException { + } + + protected void errWarnLtSlashInRcdata() throws SAXException { + } + + protected void errHtml4LtSlashInRcdata(char folded) throws SAXException { + } + + protected void errCharRefLacksSemicolon() throws SAXException { + } + + protected void errNoDigitsInNCR() throws SAXException { + } + + protected void errGtInSystemId() throws SAXException { + } + + protected void errGtInPublicId() throws SAXException { + } + + protected void errNamelessDoctype() throws SAXException { + } + + protected void errConsecutiveHyphens() throws SAXException { + } + + protected void errPrematureEndOfComment() throws SAXException { + } + + protected void errBogusComment() throws SAXException { + } + + protected void errUnquotedAttributeValOrNull(char c) throws SAXException { + } + + protected void errSlashNotFollowedByGt() throws SAXException { + } + + protected void errHtml4XmlVoidSyntax() throws SAXException { + } + + protected void errNoSpaceBetweenAttributes() throws SAXException { + } + + protected void errHtml4NonNameInUnquotedAttribute(char c) + throws SAXException { + } + + protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c) + throws SAXException { + } + + protected void errAttributeValueMissing() throws SAXException { + } + + protected void errBadCharBeforeAttributeNameOrNull(char c) + throws SAXException { + } + + protected void errEqualsSignBeforeAttributeName() throws SAXException { + } + + protected void errBadCharAfterLt(char c) throws SAXException { + } + + protected void errLtGt() throws SAXException { + } + + protected void errProcessingInstruction() throws SAXException { + } + + protected void errUnescapedAmpersandInterpretedAsCharacterReference() + throws SAXException { + } + + protected void errNotSemicolonTerminated() throws SAXException { + } + + protected void errNoNamedCharacterMatch() throws SAXException { + } + + protected void errQuoteBeforeAttributeName(char c) throws SAXException { + } + + protected void errQuoteOrLtInAttributeNameOrNull(char c) + throws SAXException { + } + + protected void errExpectedPublicId() throws SAXException { + } + + protected void errBogusDoctype() throws SAXException { + } + + protected void maybeWarnPrivateUseAstral() throws SAXException { + } + + protected void maybeWarnPrivateUse(char ch) throws SAXException { + } + + protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs) + throws SAXException { + } + + protected void maybeErrSlashInEndTag(boolean selfClosing) + throws SAXException { + } + + protected char errNcrNonCharacter(char ch) throws SAXException { + return ch; + } + + protected void errAstralNonCharacter(int ch) throws SAXException { + } + + protected void errNcrSurrogate() throws SAXException { + } + + protected char errNcrControlChar(char ch) throws SAXException { + return ch; + } + + protected void errNcrCr() throws SAXException { + } + + protected void errNcrInC1Range() throws SAXException { + } + + protected void errEofInPublicId() throws SAXException { + } + + protected void errEofInComment() throws SAXException { + } + + protected void errEofInDoctype() throws SAXException { + } + + protected void errEofInAttributeValue() throws SAXException { + } + + protected void errEofInAttributeName() throws SAXException { + } + + protected void errEofWithoutGt() throws SAXException { + } + + protected void errEofInTagName() throws SAXException { + } + + protected void errEofInEndTag() throws SAXException { + } + + protected void errEofAfterLt() throws SAXException { + } + + protected void errNcrOutOfRange() throws SAXException { + } + + protected void errNcrUnassigned() throws SAXException { + } + + protected void errDuplicateAttribute() throws SAXException { + } + + protected void errEofInSystemId() throws SAXException { + } + + protected void errExpectedSystemId() throws SAXException { + } + + protected void errMissingSpaceBeforeDoctypeName() throws SAXException { + } + + protected void errHyphenHyphenBang() throws SAXException { + } + + protected void errNcrControlChar() throws SAXException { + } + + protected void errNcrZero() throws SAXException { + } + + protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote() + throws SAXException { + } + + protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException { + } + + protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote() + throws SAXException { + } + + protected void noteAttributeWithoutValue() throws SAXException { + } + + protected void noteUnquotedAttributeValue() throws SAXException { + } + + /** + * Sets the encodingDeclarationHandler. + * + * @param encodingDeclarationHandler + * the encodingDeclarationHandler to set + */ + public void setEncodingDeclarationHandler( + EncodingDeclarationHandler encodingDeclarationHandler) { + this.encodingDeclarationHandler = encodingDeclarationHandler; + } + + void destructor() { + // The translator will write refcount tracing stuff here + Portability.delete(attributes); + attributes = null; + } + + // [NOCPP[ + + /** + * Sets an offset to be added to the position reported to + * <code>TransitionHandler</code>. + * + * @param offset the offset + */ + public void setTransitionBaseOffset(int offset) { + + } + + // ]NOCPP] + +} |