summaryrefslogtreecommitdiffstats
path: root/parser/html/javasrc/Tokenizer.java
diff options
context:
space:
mode:
authorwolfbeast <mcwerewolf@wolfbeast.com>2020-01-13 09:29:30 +0100
committerwolfbeast <mcwerewolf@wolfbeast.com>2020-01-13 09:32:00 +0100
commitaa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347 (patch)
treef04b844c58d310e47578bf1fc75cf5e24453dc3b /parser/html/javasrc/Tokenizer.java
parent60dc9eaa95b96abbe881063b62304a58eadd6b8e (diff)
downloadUXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.tar
UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.tar.gz
UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.tar.lz
UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.tar.xz
UXP-aa2ac8ddedbfd9fc27a5cf8c3da41ad700ae5347.zip
Reinstate the java->c++ source, generator code + documentation.
We've kept the java source up-to-date until its removal, so there should be very little additional java mangling needed to have it back up to speed and usable again. This reverts commit c6446f1126232935c85397aac493113dd38496cd.
Diffstat (limited to 'parser/html/javasrc/Tokenizer.java')
-rw-r--r--parser/html/javasrc/Tokenizer.java7089
1 files changed, 7089 insertions, 0 deletions
diff --git a/parser/html/javasrc/Tokenizer.java b/parser/html/javasrc/Tokenizer.java
new file mode 100644
index 000000000..f141d94d7
--- /dev/null
+++ b/parser/html/javasrc/Tokenizer.java
@@ -0,0 +1,7089 @@
+/*
+ * Copyright (c) 2005-2007 Henri Sivonen
+ * Copyright (c) 2007-2015 Mozilla Foundation
+ * Copyright (c) 2019 Moonchild Productions
+ * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
+ * Foundation, and Opera Software ASA.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * The comments following this one that use the same comment syntax as this
+ * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
+ * amended as of June 18 2008 and May 31 2010.
+ * That document came with this statement:
+ * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
+ * Opera Software ASA. You are granted a license to use, reproduce and
+ * create derivative works of this document."
+ */
+
+package nu.validator.htmlparser.impl;
+
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+import nu.validator.htmlparser.annotation.Auto;
+import nu.validator.htmlparser.annotation.CharacterName;
+import nu.validator.htmlparser.annotation.Const;
+import nu.validator.htmlparser.annotation.Inline;
+import nu.validator.htmlparser.annotation.Local;
+import nu.validator.htmlparser.annotation.NoLength;
+import nu.validator.htmlparser.common.EncodingDeclarationHandler;
+import nu.validator.htmlparser.common.Interner;
+import nu.validator.htmlparser.common.TokenHandler;
+import nu.validator.htmlparser.common.XmlViolationPolicy;
+
+/**
+ * An implementation of
+ * https://html.spec.whatwg.org/multipage/syntax.html#tokenization
+ *
+ * This class implements the <code>Locator</code> interface. This is not an
+ * incidental implementation detail: Users of this class are encouraged to make
+ * use of the <code>Locator</code> nature.
+ *
+ * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
+ * can be configured to treat these conditions as fatal or to coerce the infoset
+ * to something that XML 1.0 allows.
+ *
+ * @version $Id$
+ * @author hsivonen
+ */
+public class Tokenizer implements Locator {
+
+ private static final int DATA_AND_RCDATA_MASK = ~1;
+
+ public static final int DATA = 0;
+
+ public static final int RCDATA = 1;
+
+ public static final int SCRIPT_DATA = 2;
+
+ public static final int RAWTEXT = 3;
+
+ public static final int SCRIPT_DATA_ESCAPED = 4;
+
+ public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
+
+ public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
+
+ public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
+
+ public static final int PLAINTEXT = 8;
+
+ public static final int TAG_OPEN = 9;
+
+ public static final int CLOSE_TAG_OPEN = 10;
+
+ public static final int TAG_NAME = 11;
+
+ public static final int BEFORE_ATTRIBUTE_NAME = 12;
+
+ public static final int ATTRIBUTE_NAME = 13;
+
+ public static final int AFTER_ATTRIBUTE_NAME = 14;
+
+ public static final int BEFORE_ATTRIBUTE_VALUE = 15;
+
+ public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
+
+ public static final int BOGUS_COMMENT = 17;
+
+ public static final int MARKUP_DECLARATION_OPEN = 18;
+
+ public static final int DOCTYPE = 19;
+
+ public static final int BEFORE_DOCTYPE_NAME = 20;
+
+ public static final int DOCTYPE_NAME = 21;
+
+ public static final int AFTER_DOCTYPE_NAME = 22;
+
+ public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
+
+ public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
+
+ public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
+
+ public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
+
+ public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
+
+ public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
+
+ public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
+
+ public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
+
+ public static final int BOGUS_DOCTYPE = 31;
+
+ public static final int COMMENT_START = 32;
+
+ public static final int COMMENT_START_DASH = 33;
+
+ public static final int COMMENT = 34;
+
+ public static final int COMMENT_END_DASH = 35;
+
+ public static final int COMMENT_END = 36;
+
+ public static final int COMMENT_END_BANG = 37;
+
+ public static final int NON_DATA_END_TAG_NAME = 38;
+
+ public static final int MARKUP_DECLARATION_HYPHEN = 39;
+
+ public static final int MARKUP_DECLARATION_OCTYPE = 40;
+
+ public static final int DOCTYPE_UBLIC = 41;
+
+ public static final int DOCTYPE_YSTEM = 42;
+
+ public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
+
+ public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
+
+ public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
+
+ public static final int CONSUME_CHARACTER_REFERENCE = 46;
+
+ public static final int CONSUME_NCR = 47;
+
+ public static final int CHARACTER_REFERENCE_TAIL = 48;
+
+ public static final int HEX_NCR_LOOP = 49;
+
+ public static final int DECIMAL_NRC_LOOP = 50;
+
+ public static final int HANDLE_NCR_VALUE = 51;
+
+ public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
+
+ public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
+
+ public static final int SELF_CLOSING_START_TAG = 54;
+
+ public static final int CDATA_START = 55;
+
+ public static final int CDATA_SECTION = 56;
+
+ public static final int CDATA_RSQB = 57;
+
+ public static final int CDATA_RSQB_RSQB = 58;
+
+ public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
+
+ public static final int SCRIPT_DATA_ESCAPE_START = 60;
+
+ public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
+
+ public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
+
+ public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
+
+ public static final int BOGUS_COMMENT_HYPHEN = 64;
+
+ public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
+
+ public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
+
+ public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
+
+ public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
+
+ public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
+
+ public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
+
+ public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
+
+ public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
+
+ public static final int PROCESSING_INSTRUCTION = 73;
+
+ public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
+
+ /**
+ * Magic value for UTF-16 operations.
+ */
+ private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
+
+ /**
+ * UTF-16 code unit array containing less than and greater than for emitting
+ * those characters on certain parse errors.
+ */
+ private static final @NoLength char[] LT_GT = { '<', '>' };
+
+ /**
+ * UTF-16 code unit array containing less than and solidus for emitting
+ * those characters on certain parse errors.
+ */
+ private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
+
+ /**
+ * UTF-16 code unit array containing ]] for emitting those characters on
+ * state transitions.
+ */
+ private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
+
+ /**
+ * Array version of U+FFFD.
+ */
+ private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
+
+ // [NOCPP[
+
+ /**
+ * Array version of space.
+ */
+ private static final @NoLength char[] SPACE = { ' ' };
+
+ // ]NOCPP]
+
+ /**
+ * Array version of line feed.
+ */
+ private static final @NoLength char[] LF = { '\n' };
+
+ /**
+ * "CDATA[" as <code>char[]</code>
+ */
+ private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
+ 'A', '[' };
+
+ /**
+ * "octype" as <code>char[]</code>
+ */
+ private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
+ 'e' };
+
+ /**
+ * "ublic" as <code>char[]</code>
+ */
+ private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
+
+ /**
+ * "ystem" as <code>char[]</code>
+ */
+ private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
+
+ private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
+
+ private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
+
+ private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
+
+ private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
+ 'e', 'x', 't' };
+
+ private static final char[] XMP_ARR = { 'x', 'm', 'p' };
+
+ private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
+ 'e', 'a' };
+
+ private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
+
+ private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
+ 'd' };
+
+ private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
+ 'p', 't' };
+
+ private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
+ 'e', 's' };
+
+ /**
+ * The token handler.
+ */
+ protected final TokenHandler tokenHandler;
+
+ protected EncodingDeclarationHandler encodingDeclarationHandler;
+
+ // [NOCPP[
+
+ /**
+ * The error handler.
+ */
+ protected ErrorHandler errorHandler;
+
+ // ]NOCPP]
+
+ /**
+ * Whether the previous char read was CR.
+ */
+ protected boolean lastCR;
+
+ protected int stateSave;
+
+ private int returnStateSave;
+
+ protected int index;
+
+ private boolean forceQuirks;
+
+ private char additional;
+
+ private int entCol;
+
+ private int firstCharKey;
+
+ private int lo;
+
+ private int hi;
+
+ private int candidate;
+
+ private int charRefBufMark;
+
+ protected int value;
+
+ private boolean seenDigits;
+
+ protected int cstart;
+
+ /**
+ * The SAX public id for the resource being tokenized. (Only passed to back
+ * as part of locator data.)
+ */
+ private String publicId;
+
+ /**
+ * The SAX system id for the resource being tokenized. (Only passed to back
+ * as part of locator data.)
+ */
+ private String systemId;
+
+ /**
+ * Buffer for bufferable things other than those that fit the description
+ * of <code>charRefBuf</code>.
+ */
+ private @Auto char[] strBuf;
+
+ /**
+ * Number of significant <code>char</code>s in <code>strBuf</code>.
+ */
+ private int strBufLen;
+
+ /**
+ * Buffer for characters that might form a character reference but may
+ * end up not forming one.
+ */
+ private final @Auto char[] charRefBuf;
+
+ /**
+ * Number of significant <code>char</code>s in <code>charRefBuf</code>.
+ */
+ private int charRefBufLen;
+
+ /**
+ * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
+ */
+ private final @Auto char[] bmpChar;
+
+ /**
+ * Buffer for expanding astral NCRs.
+ */
+ private final @Auto char[] astralChar;
+
+ /**
+ * The element whose end tag closes the current CDATA or RCDATA element.
+ */
+ protected ElementName endTagExpectation = null;
+
+ private char[] endTagExpectationAsArray; // not @Auto!
+
+ /**
+ * <code>true</code> if tokenizing an end tag
+ */
+ protected boolean endTag;
+
+ /**
+ * The current tag token name.
+ */
+ private ElementName tagName = null;
+
+ /**
+ * The current attribute name.
+ */
+ protected AttributeName attributeName = null;
+
+ // [NOCPP[
+
+ /**
+ * Whether comment tokens are emitted.
+ */
+ private boolean wantsComments = false;
+
+ /**
+ * <code>true</code> when HTML4-specific additional errors are requested.
+ */
+ protected boolean html4;
+
+ /**
+ * Whether the stream is past the first 1024 bytes.
+ */
+ private boolean metaBoundaryPassed;
+
+ // ]NOCPP]
+
+ /**
+ * The name of the current doctype token.
+ */
+ private @Local String doctypeName;
+
+ /**
+ * The public id of the current doctype token.
+ */
+ private String publicIdentifier;
+
+ /**
+ * The system id of the current doctype token.
+ */
+ private String systemIdentifier;
+
+ /**
+ * The attribute holder.
+ */
+ private HtmlAttributes attributes;
+
+ // [NOCPP[
+
+ /**
+ * The policy for vertical tab and form feed.
+ */
+ private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
+
+ /**
+ * The policy for comments.
+ */
+ private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
+
+ private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
+
+ private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
+
+ private boolean html4ModeCompatibleWithXhtml1Schemata;
+
+ private int mappingLangToXmlLang;
+
+ // ]NOCPP]
+
+ private final boolean newAttributesEachTime;
+
+ private boolean shouldSuspend;
+
+ protected boolean confident;
+
+ private int line;
+
+ /*
+ * The line number of the current attribute. First set to the line of the
+ * attribute name and if there is a value, set to the line the value
+ * started on.
+ */
+ // CPPONLY: private int attributeLine;
+
+ private Interner interner;
+
+ // CPPONLY: private boolean viewingXmlSource;
+
+ // [NOCPP[
+
+ protected LocatorImpl ampersandLocation;
+
+ public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
+ this.tokenHandler = tokenHandler;
+ this.encodingDeclarationHandler = null;
+ this.newAttributesEachTime = newAttributesEachTime;
+ // &CounterClockwiseContourIntegral; is the longest valid char ref and
+ // the semicolon never gets appended to the buffer.
+ this.charRefBuf = new char[32];
+ this.bmpChar = new char[1];
+ this.astralChar = new char[2];
+ this.tagName = null;
+ this.attributeName = null;
+ this.doctypeName = null;
+ this.publicIdentifier = null;
+ this.systemIdentifier = null;
+ this.attributes = null;
+ }
+
+ // ]NOCPP]
+
+ /**
+ * The constructor.
+ *
+ * @param tokenHandler
+ * the handler for receiving tokens
+ */
+ public Tokenizer(TokenHandler tokenHandler
+ // CPPONLY: , boolean viewingXmlSource
+ ) {
+ this.tokenHandler = tokenHandler;
+ this.encodingDeclarationHandler = null;
+ // [NOCPP[
+ this.newAttributesEachTime = false;
+ // ]NOCPP]
+ // &CounterClockwiseContourIntegral; is the longest valid char ref and
+ // the semicolon never gets appended to the buffer.
+ this.charRefBuf = new char[32];
+ this.bmpChar = new char[1];
+ this.astralChar = new char[2];
+ this.tagName = null;
+ this.attributeName = null;
+ this.doctypeName = null;
+ this.publicIdentifier = null;
+ this.systemIdentifier = null;
+ // [NOCPP[
+ this.attributes = null;
+ // ]NOCPP]
+ // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
+ // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
+ // CPPONLY: this.viewingXmlSource = viewingXmlSource;
+ }
+
+ public void setInterner(Interner interner) {
+ this.interner = interner;
+ }
+
+ public void initLocation(String newPublicId, String newSystemId) {
+ this.systemId = newSystemId;
+ this.publicId = newPublicId;
+
+ }
+
+ // CPPONLY: boolean isViewingXmlSource() {
+ // CPPONLY: return viewingXmlSource;
+ // CPPONLY: }
+
+ // [NOCPP[
+
+ /**
+ * Returns the mappingLangToXmlLang.
+ *
+ * @return the mappingLangToXmlLang
+ */
+ public boolean isMappingLangToXmlLang() {
+ return mappingLangToXmlLang == AttributeName.HTML_LANG;
+ }
+
+ /**
+ * Sets the mappingLangToXmlLang.
+ *
+ * @param mappingLangToXmlLang
+ * the mappingLangToXmlLang to set
+ */
+ public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
+ this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
+ : AttributeName.HTML;
+ }
+
+ /**
+ * Sets the error handler.
+ *
+ * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
+ */
+ public void setErrorHandler(ErrorHandler eh) {
+ this.errorHandler = eh;
+ }
+
+ public ErrorHandler getErrorHandler() {
+ return this.errorHandler;
+ }
+
+ /**
+ * Sets the commentPolicy.
+ *
+ * @param commentPolicy
+ * the commentPolicy to set
+ */
+ public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
+ this.commentPolicy = commentPolicy;
+ }
+
+ /**
+ * Sets the contentNonXmlCharPolicy.
+ *
+ * @param contentNonXmlCharPolicy
+ * the contentNonXmlCharPolicy to set
+ */
+ public void setContentNonXmlCharPolicy(
+ XmlViolationPolicy contentNonXmlCharPolicy) {
+ if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
+ throw new IllegalArgumentException(
+ "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
+ }
+ }
+
+ /**
+ * Sets the contentSpacePolicy.
+ *
+ * @param contentSpacePolicy
+ * the contentSpacePolicy to set
+ */
+ public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
+ this.contentSpacePolicy = contentSpacePolicy;
+ }
+
+ /**
+ * Sets the xmlnsPolicy.
+ *
+ * @param xmlnsPolicy
+ * the xmlnsPolicy to set
+ */
+ public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
+ if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
+ throw new IllegalArgumentException("Can't use FATAL here.");
+ }
+ this.xmlnsPolicy = xmlnsPolicy;
+ }
+
+ public void setNamePolicy(XmlViolationPolicy namePolicy) {
+ this.namePolicy = namePolicy;
+ }
+
+ /**
+ * Sets the html4ModeCompatibleWithXhtml1Schemata.
+ *
+ * @param html4ModeCompatibleWithXhtml1Schemata
+ * the html4ModeCompatibleWithXhtml1Schemata to set
+ */
+ public void setHtml4ModeCompatibleWithXhtml1Schemata(
+ boolean html4ModeCompatibleWithXhtml1Schemata) {
+ this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
+ }
+
+ // ]NOCPP]
+
+ // For the token handler to call
+ /**
+ * Sets the tokenizer state and the associated element name. This should
+ * only ever used to put the tokenizer into one of the states that have
+ * a special end tag expectation.
+ *
+ * @param specialTokenizerState
+ * the tokenizer state to set
+ */
+ public void setState(int specialTokenizerState) {
+ this.stateSave = specialTokenizerState;
+ this.endTagExpectation = null;
+ this.endTagExpectationAsArray = null;
+ }
+
+ // [NOCPP[
+
+ /**
+ * Sets the tokenizer state and the associated element name. This should
+ * only ever used to put the tokenizer into one of the states that have
+ * a special end tag expectation. For use from the tokenizer test harness.
+ *
+ * @param specialTokenizerState
+ * the tokenizer state to set
+ * @param endTagExpectation
+ * the expected end tag for transitioning back to normal
+ */
+ public void setStateAndEndTagExpectation(int specialTokenizerState,
+ @Local String endTagExpectation) {
+ this.stateSave = specialTokenizerState;
+ if (specialTokenizerState == Tokenizer.DATA) {
+ return;
+ }
+ @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
+ this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
+ asArray.length, interner);
+ endTagExpectationToArray();
+ }
+
+ // ]NOCPP]
+
+ /**
+ * Sets the tokenizer state and the associated element name. This should
+ * only ever used to put the tokenizer into one of the states that have
+ * a special end tag expectation.
+ *
+ * @param specialTokenizerState
+ * the tokenizer state to set
+ * @param endTagExpectation
+ * the expected end tag for transitioning back to normal
+ */
+ public void setStateAndEndTagExpectation(int specialTokenizerState,
+ ElementName endTagExpectation) {
+ this.stateSave = specialTokenizerState;
+ this.endTagExpectation = endTagExpectation;
+ endTagExpectationToArray();
+ }
+
+ private void endTagExpectationToArray() {
+ switch (endTagExpectation.getGroup()) {
+ case TreeBuilder.TITLE:
+ endTagExpectationAsArray = TITLE_ARR;
+ return;
+ case TreeBuilder.SCRIPT:
+ endTagExpectationAsArray = SCRIPT_ARR;
+ return;
+ case TreeBuilder.STYLE:
+ endTagExpectationAsArray = STYLE_ARR;
+ return;
+ case TreeBuilder.PLAINTEXT:
+ endTagExpectationAsArray = PLAINTEXT_ARR;
+ return;
+ case TreeBuilder.XMP:
+ endTagExpectationAsArray = XMP_ARR;
+ return;
+ case TreeBuilder.TEXTAREA:
+ endTagExpectationAsArray = TEXTAREA_ARR;
+ return;
+ case TreeBuilder.IFRAME:
+ endTagExpectationAsArray = IFRAME_ARR;
+ return;
+ case TreeBuilder.NOEMBED:
+ endTagExpectationAsArray = NOEMBED_ARR;
+ return;
+ case TreeBuilder.NOSCRIPT:
+ endTagExpectationAsArray = NOSCRIPT_ARR;
+ return;
+ case TreeBuilder.NOFRAMES:
+ endTagExpectationAsArray = NOFRAMES_ARR;
+ return;
+ default:
+ assert false: "Bad end tag expectation.";
+ return;
+ }
+ }
+
+ /**
+ * For C++ use only.
+ */
+ public void setLineNumber(int line) {
+ // CPPONLY: this.attributeLine = line; // XXX is this needed?
+ this.line = line;
+ }
+
+ // start Locator impl
+
+ /**
+ * @see org.xml.sax.Locator#getLineNumber()
+ */
+ @Inline public int getLineNumber() {
+ return line;
+ }
+
+ // [NOCPP[
+
+ /**
+ * @see org.xml.sax.Locator#getColumnNumber()
+ */
+ @Inline public int getColumnNumber() {
+ return -1;
+ }
+
+ /**
+ * @see org.xml.sax.Locator#getPublicId()
+ */
+ public String getPublicId() {
+ return publicId;
+ }
+
+ /**
+ * @see org.xml.sax.Locator#getSystemId()
+ */
+ public String getSystemId() {
+ return systemId;
+ }
+
+ // end Locator impl
+
+ // end public API
+
+ public void notifyAboutMetaBoundary() {
+ metaBoundaryPassed = true;
+ }
+
+ void turnOnAdditionalHtml4Errors() {
+ html4 = true;
+ }
+
+ // ]NOCPP]
+
+ HtmlAttributes emptyAttributes() {
+ // [NOCPP[
+ if (newAttributesEachTime) {
+ return new HtmlAttributes(mappingLangToXmlLang);
+ } else {
+ // ]NOCPP]
+ return HtmlAttributes.EMPTY_ATTRIBUTES;
+ // [NOCPP[
+ }
+ // ]NOCPP]
+ }
+
+ @Inline private void appendCharRefBuf(char c) {
+ // CPPONLY: assert charRefBufLen < charRefBuf.length:
+ // CPPONLY: "RELEASE: Attempted to overrun charRefBuf!";
+ charRefBuf[charRefBufLen++] = c;
+ }
+
+ private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ appendCharRefBufToStrBuf();
+ } else {
+ if (charRefBufLen > 0) {
+ tokenHandler.characters(charRefBuf, 0, charRefBufLen);
+ charRefBufLen = 0;
+ }
+ }
+ }
+
+ @Inline private void clearStrBufAfterUse() {
+ strBufLen = 0;
+ }
+
+ @Inline private void clearStrBufBeforeUse() {
+ assert strBufLen == 0: "strBufLen not reset after previous use!";
+ strBufLen = 0; // no-op in the absence of bugs
+ }
+
+ @Inline private void clearStrBufAfterOneHyphen() {
+ assert strBufLen == 1: "strBufLen length not one!";
+ assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
+ strBufLen = 0;
+ }
+
+ /**
+ * Appends to the buffer.
+ *
+ * @param c
+ * the UTF-16 code unit to append
+ */
+ @Inline private void appendStrBuf(char c) {
+ // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient.";
+ // CPPONLY: if (strBufLen == strBuf.length) {
+ // CPPONLY: if (!EnsureBufferSpace(1)) {
+ // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
+ // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
+ // CPPONLY: }
+ strBuf[strBufLen++] = c;
+ }
+
+ /**
+ * The buffer as a String. Currently only used for error reporting.
+ *
+ * <p>
+ * C++ memory note: The return value must be released.
+ *
+ * @return the buffer as a string
+ */
+ protected String strBufToString() {
+ String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
+ // CPPONLY: , tokenHandler
+ );
+ clearStrBufAfterUse();
+ return str;
+ }
+
+ /**
+ * Returns the buffer as a local name. The return value is released in
+ * emitDoctypeToken().
+ *
+ * @return the buffer as local name
+ */
+ private void strBufToDoctypeName() {
+ doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
+ interner);
+ clearStrBufAfterUse();
+ }
+
+ /**
+ * Emits the buffer as character tokens.
+ *
+ * @throws SAXException
+ * if the token handler threw
+ */
+ private void emitStrBuf() throws SAXException {
+ if (strBufLen > 0) {
+ tokenHandler.characters(strBuf, 0, strBufLen);
+ clearStrBufAfterUse();
+ }
+ }
+
+ @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
+ // [NOCPP[
+ switch (commentPolicy) {
+ case ALTER_INFOSET:
+ appendStrBuf(' ');
+ // FALLTHROUGH
+ case ALLOW:
+ warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
+ // ]NOCPP]
+ appendStrBuf('-');
+ // [NOCPP[
+ break;
+ case FATAL:
+ fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
+ break;
+ }
+ // ]NOCPP]
+ }
+
+ // [NOCPP[
+ private void maybeAppendSpaceToBogusComment() throws SAXException {
+ switch (commentPolicy) {
+ case ALTER_INFOSET:
+ appendStrBuf(' ');
+ // FALLTHROUGH
+ case ALLOW:
+ warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
+ break;
+ case FATAL:
+ fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
+ break;
+ }
+ }
+
+ // ]NOCPP]
+
+ @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
+ throws SAXException {
+ errConsecutiveHyphens();
+ // [NOCPP[
+ switch (commentPolicy) {
+ case ALTER_INFOSET:
+ strBufLen--;
+ // WARNING!!! This expands the worst case of the buffer length
+ // given the length of input!
+ appendStrBuf(' ');
+ appendStrBuf('-');
+ // FALLTHROUGH
+ case ALLOW:
+ warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
+ // ]NOCPP]
+ appendStrBuf(c);
+ // [NOCPP[
+ break;
+ case FATAL:
+ fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
+ break;
+ }
+ // ]NOCPP]
+ }
+
+ private void appendStrBuf(@NoLength char[] buffer, int offset, int length) {
+ int newLen = strBufLen + length;
+ // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient.";
+ // CPPONLY: if (strBuf.length < newLen) {
+ // CPPONLY: if (!EnsureBufferSpace(length)) {
+ // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
+ // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
+ // CPPONLY: }
+ System.arraycopy(buffer, offset, strBuf, strBufLen, length);
+ strBufLen = newLen;
+ }
+
+ /**
+ * Append the contents of the char reference buffer to the main one.
+ */
+ @Inline private void appendCharRefBufToStrBuf() {
+ appendStrBuf(charRefBuf, 0, charRefBufLen);
+ charRefBufLen = 0;
+ }
+
+ /**
+ * Emits the current comment token.
+ *
+ * @param pos
+ * TODO
+ *
+ * @throws SAXException
+ */
+ private void emitComment(int provisionalHyphens, int pos)
+ throws SAXException {
+ // [NOCPP[
+ if (wantsComments) {
+ // ]NOCPP]
+ tokenHandler.comment(strBuf, 0, strBufLen
+ - provisionalHyphens);
+ // [NOCPP[
+ }
+ // ]NOCPP]
+ clearStrBufAfterUse();
+ cstart = pos + 1;
+ }
+
+ /**
+ * Flushes coalesced character tokens.
+ *
+ * @param buf
+ * TODO
+ * @param pos
+ * TODO
+ *
+ * @throws SAXException
+ */
+ protected void flushChars(@NoLength char[] buf, int pos)
+ throws SAXException {
+ if (pos > cstart) {
+ tokenHandler.characters(buf, cstart, pos - cstart);
+ }
+ cstart = Integer.MAX_VALUE;
+ }
+
+ /**
+ * Reports an condition that would make the infoset incompatible with XML
+ * 1.0 as fatal.
+ *
+ * @param message
+ * the message
+ * @throws SAXException
+ * @throws SAXParseException
+ */
+ public void fatal(String message) throws SAXException {
+ SAXParseException spe = new SAXParseException(message, this);
+ if (errorHandler != null) {
+ errorHandler.fatalError(spe);
+ }
+ throw spe;
+ }
+
+ /**
+ * Reports a Parse Error.
+ *
+ * @param message
+ * the message
+ * @throws SAXException
+ */
+ public void err(String message) throws SAXException {
+ if (errorHandler == null) {
+ return;
+ }
+ SAXParseException spe = new SAXParseException(message, this);
+ errorHandler.error(spe);
+ }
+
+ public void errTreeBuilder(String message) throws SAXException {
+ ErrorHandler eh = null;
+ if (tokenHandler instanceof TreeBuilder<?>) {
+ TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler;
+ eh = treeBuilder.getErrorHandler();
+ }
+ if (eh == null) {
+ eh = errorHandler;
+ }
+ if (eh == null) {
+ return;
+ }
+ SAXParseException spe = new SAXParseException(message, this);
+ eh.error(spe);
+ }
+
+ /**
+ * Reports a warning
+ *
+ * @param message
+ * the message
+ * @throws SAXException
+ */
+ public void warn(String message) throws SAXException {
+ if (errorHandler == null) {
+ return;
+ }
+ SAXParseException spe = new SAXParseException(message, this);
+ errorHandler.warning(spe);
+ }
+
+ private void strBufToElementNameString() {
+ tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
+ interner);
+ clearStrBufAfterUse();
+ }
+
+ private int emitCurrentTagToken(boolean selfClosing, int pos)
+ throws SAXException {
+ cstart = pos + 1;
+ maybeErrSlashInEndTag(selfClosing);
+ stateSave = Tokenizer.DATA;
+ HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
+ : attributes);
+ if (endTag) {
+ /*
+ * When an end tag token is emitted, the content model flag must be
+ * switched to the PCDATA state.
+ */
+ maybeErrAttributesOnEndTag(attrs);
+ // CPPONLY: if (!viewingXmlSource) {
+ tokenHandler.endTag(tagName);
+ // CPPONLY: }
+ // CPPONLY: if (newAttributesEachTime) {
+ // CPPONLY: Portability.delete(attributes);
+ // CPPONLY: attributes = null;
+ // CPPONLY: }
+ } else {
+ // CPPONLY: if (viewingXmlSource) {
+ // CPPONLY: assert newAttributesEachTime;
+ // CPPONLY: Portability.delete(attributes);
+ // CPPONLY: attributes = null;
+ // CPPONLY: } else {
+ tokenHandler.startTag(tagName, attrs, selfClosing);
+ // CPPONLY: }
+ }
+ tagName.release();
+ tagName = null;
+ if (newAttributesEachTime) {
+ attributes = null;
+ } else {
+ attributes.clear(mappingLangToXmlLang);
+ }
+ /*
+ * The token handler may have called setStateAndEndTagExpectation
+ * and changed stateSave since the start of this method.
+ */
+ return stateSave;
+ }
+
+ private void attributeNameComplete() throws SAXException {
+ attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
+ // [NOCPP[
+ , namePolicy != XmlViolationPolicy.ALLOW
+ // ]NOCPP]
+ , interner);
+ clearStrBufAfterUse();
+
+ if (attributes == null) {
+ attributes = new HtmlAttributes(mappingLangToXmlLang);
+ }
+
+ /*
+ * When the user agent leaves the attribute name state (and before
+ * emitting the tag token, if appropriate), the complete attribute's
+ * name must be compared to the other attributes on the same token; if
+ * there is already an attribute on the token with the exact same name,
+ * then this is a parse error and the new attribute must be dropped,
+ * along with the value that gets associated with it (if any).
+ */
+ if (attributes.contains(attributeName)) {
+ errDuplicateAttribute();
+ attributeName.release();
+ attributeName = null;
+ }
+ }
+
+ private void addAttributeWithoutValue() throws SAXException {
+ noteAttributeWithoutValue();
+
+ // [NOCPP[
+ if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
+ && ElementName.META == tagName) {
+ err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
+ }
+ // ]NOCPP]
+ if (attributeName != null) {
+ // [NOCPP[
+ if (html4) {
+ if (attributeName.isBoolean()) {
+ if (html4ModeCompatibleWithXhtml1Schemata) {
+ attributes.addAttribute(attributeName,
+ attributeName.getLocal(AttributeName.HTML),
+ xmlnsPolicy);
+ } else {
+ attributes.addAttribute(attributeName, "", xmlnsPolicy);
+ }
+ } else {
+ if (AttributeName.BORDER != attributeName) {
+ err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
+ attributes.addAttribute(attributeName, "", xmlnsPolicy);
+ }
+ }
+ } else {
+ if (AttributeName.SRC == attributeName
+ || AttributeName.HREF == attributeName) {
+ warn("Attribute \u201C"
+ + attributeName.getLocal(AttributeName.HTML)
+ + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
+ }
+ // ]NOCPP]
+ attributes.addAttribute(attributeName,
+ Portability.newEmptyString()
+ // [NOCPP[
+ , xmlnsPolicy
+ // ]NOCPP]
+ // CPPONLY: , attributeLine
+ );
+ // [NOCPP[
+ }
+ // ]NOCPP]
+ attributeName = null; // attributeName has been adopted by the
+ // |attributes| object
+ } else {
+ clearStrBufAfterUse();
+ }
+ }
+
+ private void addAttributeWithValue() throws SAXException {
+ // [NOCPP[
+ if (metaBoundaryPassed && ElementName.META == tagName
+ && AttributeName.CHARSET == attributeName) {
+ err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
+ }
+ // ]NOCPP]
+ if (attributeName != null) {
+ String val = strBufToString(); // Ownership transferred to
+ // HtmlAttributes
+ // CPPONLY: if (mViewSource) {
+ // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
+ // CPPONLY: }
+ // [NOCPP[
+ if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
+ && attributeName.isCaseFolded()) {
+ val = newAsciiLowerCaseStringFromString(val);
+ }
+ // ]NOCPP]
+ attributes.addAttribute(attributeName, val
+ // [NOCPP[
+ , xmlnsPolicy
+ // ]NOCPP]
+ // CPPONLY: , attributeLine
+ );
+ attributeName = null; // attributeName has been adopted by the
+ // |attributes| object
+ } else {
+ // We have a duplicate attribute. Explicitly discard its value.
+ clearStrBufAfterUse();
+ }
+ }
+
+ // [NOCPP[
+
+ private static String newAsciiLowerCaseStringFromString(String str) {
+ if (str == null) {
+ return null;
+ }
+ char[] buf = new char[str.length()];
+ for (int i = 0; i < str.length(); i++) {
+ char c = str.charAt(i);
+ if (c >= 'A' && c <= 'Z') {
+ c += 0x20;
+ }
+ buf[i] = c;
+ }
+ return new String(buf);
+ }
+
+ protected void startErrorReporting() throws SAXException {
+
+ }
+
+ // ]NOCPP]
+
+ public void start() throws SAXException {
+ initializeWithoutStarting();
+ tokenHandler.startTokenization(this);
+ // [NOCPP[
+ startErrorReporting();
+ // ]NOCPP]
+ }
+
+ public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
+ int state = stateSave;
+ int returnState = returnStateSave;
+ char c = '\u0000';
+ shouldSuspend = false;
+ lastCR = false;
+
+ int start = buffer.getStart();
+ int end = buffer.getEnd();
+
+ // In C++, the caller of tokenizeBuffer needs to do this explicitly.
+ // [NOCPP[
+ ensureBufferSpace(end - start);
+ // ]NOCPP]
+
+ /**
+ * The index of the last <code>char</code> read from <code>buf</code>.
+ */
+ int pos = start - 1;
+
+ /**
+ * The index of the first <code>char</code> in <code>buf</code> that is
+ * part of a coalesced run of character tokens or
+ * <code>Integer.MAX_VALUE</code> if there is not a current run being
+ * coalesced.
+ */
+ switch (state) {
+ case DATA:
+ case RCDATA:
+ case SCRIPT_DATA:
+ case PLAINTEXT:
+ case RAWTEXT:
+ case CDATA_SECTION:
+ case SCRIPT_DATA_ESCAPED:
+ case SCRIPT_DATA_ESCAPE_START:
+ case SCRIPT_DATA_ESCAPE_START_DASH:
+ case SCRIPT_DATA_ESCAPED_DASH:
+ case SCRIPT_DATA_ESCAPED_DASH_DASH:
+ case SCRIPT_DATA_DOUBLE_ESCAPE_START:
+ case SCRIPT_DATA_DOUBLE_ESCAPED:
+ case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
+ case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
+ case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
+ case SCRIPT_DATA_DOUBLE_ESCAPE_END:
+ cstart = start;
+ break;
+ default:
+ cstart = Integer.MAX_VALUE;
+ break;
+ }
+
+ /**
+ * The number of <code>char</code>s in <code>buf</code> that have
+ * meaning. (The rest of the array is garbage and should not be
+ * examined.)
+ */
+ // CPPONLY: if (mViewSource) {
+ // CPPONLY: mViewSource.SetBuffer(buffer);
+ // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
+ // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
+ // CPPONLY: } else {
+ // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
+ // CPPONLY: }
+ // [NOCPP[
+ pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
+ end);
+ // ]NOCPP]
+ if (pos == end) {
+ // exiting due to end of buffer
+ buffer.setStart(pos);
+ } else {
+ buffer.setStart(pos + 1);
+ }
+ return lastCR;
+ }
+
+ // [NOCPP[
+ private void ensureBufferSpace(int inputLength) throws SAXException {
+ // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
+ // Adding to the general worst case instead of only the
+ // TreeBuilder-exposed worst case to avoid re-introducing a bug when
+ // unifying the tokenizer and tree builder buffers in the future.
+ int worstCase = strBufLen + inputLength + charRefBufLen + 2;
+ tokenHandler.ensureBufferSpace(worstCase);
+ if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
+ // When altering infoset, if the comment contents are consecutive
+ // hyphens, each hyphen generates a space, too. These buffer
+ // contents never get emitted as characters() to the tokenHandler,
+ // which is why this calculation happens after the call to
+ // ensureBufferSpace on tokenHandler.
+ worstCase *= 2;
+ }
+ if (strBuf == null) {
+ // Add an arbitrary small value to avoid immediate reallocation
+ // once there are a few characters in the buffer.
+ strBuf = new char[worstCase + 128];
+ } else if (worstCase > strBuf.length) {
+ // HotSpot reportedly allocates memory with 8-byte accuracy, so
+ // there's no point in trying to do math here to avoid slop.
+ // Maybe we should add some small constant to worstCase here
+ // but not doing that without profiling. In C++ with jemalloc,
+ // the corresponding method should do math to round up here
+ // to avoid slop.
+ char[] newBuf = new char[worstCase];
+ System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
+ strBuf = newBuf;
+ }
+ }
+ // ]NOCPP]
+
+ @SuppressWarnings("unused") private int stateLoop(int state, char c,
+ int pos, @NoLength char[] buf, boolean reconsume, int returnState,
+ int endPos) throws SAXException {
+ /*
+ * Idioms used in this code:
+ *
+ *
+ * Consuming the next input character
+ *
+ * To consume the next input character, the code does this: if (++pos ==
+ * endPos) { break stateloop; } c = checkChar(buf, pos);
+ *
+ *
+ * Staying in a state
+ *
+ * When there's a state that the tokenizer may stay in over multiple
+ * input characters, the state has a wrapper |for(;;)| loop and staying
+ * in the state continues the loop.
+ *
+ *
+ * Switching to another state
+ *
+ * To switch to another state, the code sets the state variable to the
+ * magic number of the new state. Then it either continues stateloop or
+ * breaks out of the state's own wrapper loop if the target state is
+ * right after the current state in source order. (This is a partial
+ * workaround for Java's lack of goto.)
+ *
+ *
+ * Reconsume support
+ *
+ * The spec sometimes says that an input character is reconsumed in
+ * another state. If a state can ever be entered so that an input
+ * character can be reconsumed in it, the state's code starts with an
+ * |if (reconsume)| that sets reconsume to false and skips over the
+ * normal code for consuming a new character.
+ *
+ * To reconsume the current character in another state, the code sets
+ * |reconsume| to true and then switches to the other state.
+ *
+ *
+ * Emitting character tokens
+ *
+ * This method emits character tokens lazily. Whenever a new range of
+ * character tokens starts, the field cstart must be set to the start
+ * index of the range. The flushChars() method must be called at the end
+ * of a range to flush it.
+ *
+ *
+ * U+0000 handling
+ *
+ * The various states have to handle the replacement of U+0000 with
+ * U+FFFD. However, if U+0000 would be reconsumed in another state, the
+ * replacement doesn't need to happen, because it's handled by the
+ * reconsuming state.
+ *
+ *
+ * LF handling
+ *
+ * Every state needs to increment the line number upon LF unless the LF
+ * gets reconsumed by another state which increments the line number.
+ *
+ *
+ * CR handling
+ *
+ * Every state needs to handle CR unless the CR gets reconsumed and is
+ * handled by the reconsuming state. The CR needs to be handled as if it
+ * were and LF, the lastCR field must be set to true and then this
+ * method must return. The IO driver will then swallow the next
+ * character if it is an LF to coalesce CRLF.
+ */
+ stateloop: for (;;) {
+ switch (state) {
+ case DATA:
+ dataloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ switch (c) {
+ case '&':
+ /*
+ * U+0026 AMPERSAND (&) Switch to the character
+ * reference in data state.
+ */
+ flushChars(buf, pos);
+ assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+ appendCharRefBuf(c);
+ setAdditionalAndRememberAmpersandLocation('\u0000');
+ returnState = state;
+ state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
+ continue stateloop;
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Switch to the tag
+ * open state.
+ */
+ flushChars(buf, pos);
+
+ state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
+ break dataloop; // FALL THROUGH continue
+ // stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ continue;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the input character as a
+ * character token.
+ *
+ * Stay in the data state.
+ */
+ continue;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case TAG_OPEN:
+ tagopenloop: for (;;) {
+ /*
+ * The behavior of this state depends on the content
+ * model flag.
+ */
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * If the content model flag is set to the PCDATA state
+ * Consume the next input character:
+ */
+ if (c >= 'A' && c <= 'Z') {
+ /*
+ * U+0041 LATIN CAPITAL LETTER A through to U+005A
+ * LATIN CAPITAL LETTER Z Create a new start tag
+ * token,
+ */
+ endTag = false;
+ /*
+ * set its tag name to the lowercase version of the
+ * input character (add 0x0020 to the character's
+ * code point),
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf((char) (c + 0x20));
+ /* then switch to the tag name state. */
+ state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
+ /*
+ * (Don't emit the token yet; further details will
+ * be filled in before it is emitted.)
+ */
+ break tagopenloop;
+ // continue stateloop;
+ } else if (c >= 'a' && c <= 'z') {
+ /*
+ * U+0061 LATIN SMALL LETTER A through to U+007A
+ * LATIN SMALL LETTER Z Create a new start tag
+ * token,
+ */
+ endTag = false;
+ /*
+ * set its tag name to the input character,
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ /* then switch to the tag name state. */
+ state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
+ /*
+ * (Don't emit the token yet; further details will
+ * be filled in before it is emitted.)
+ */
+ break tagopenloop;
+ // continue stateloop;
+ }
+ switch (c) {
+ case '!':
+ /*
+ * U+0021 EXCLAMATION MARK (!) Switch to the
+ * markup declaration open state.
+ */
+ state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
+ continue stateloop;
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Switch to the close tag
+ * open state.
+ */
+ state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
+ continue stateloop;
+ case '?':
+ // CPPONLY: if (viewingXmlSource) {
+ // CPPONLY: state = transition(state,
+ // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
+ // CPPONLY: reconsume,
+ // CPPONLY: pos);
+ // CPPONLY: continue stateloop;
+ // CPPONLY: }
+ /*
+ * U+003F QUESTION MARK (?) Parse error.
+ */
+ errProcessingInstruction();
+ /*
+ * Switch to the bogus comment state.
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Parse error.
+ */
+ errLtGt();
+ /*
+ * Emit a U+003C LESS-THAN SIGN character token
+ * and a U+003E GREATER-THAN SIGN character
+ * token.
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
+ /* Switch to the data state. */
+ cstart = pos + 1;
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ default:
+ /*
+ * Anything else Parse error.
+ */
+ errBadCharAfterLt(c);
+ /*
+ * Emit a U+003C LESS-THAN SIGN character token
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ /*
+ * and reconsume the current input character in
+ * the data state.
+ */
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALL THROUGH DON'T REORDER
+ case TAG_NAME:
+ tagnameloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ strBufToElementNameString();
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the before attribute name state.
+ */
+ strBufToElementNameString();
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ break tagnameloop;
+ // continue stateloop;
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Switch to the self-closing
+ * start tag state.
+ */
+ strBufToElementNameString();
+ state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * tag token.
+ */
+ strBufToElementNameString();
+ state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ /*
+ * Switch to the data state.
+ */
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ if (c >= 'A' && c <= 'Z') {
+ /*
+ * U+0041 LATIN CAPITAL LETTER A through to
+ * U+005A LATIN CAPITAL LETTER Z Append the
+ * lowercase version of the current input
+ * character (add 0x0020 to the character's
+ * code point) to the current tag token's
+ * tag name.
+ */
+ c += 0x20;
+ }
+ /*
+ * Anything else Append the current input
+ * character to the current tag token's tag
+ * name.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the tag name state.
+ */
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case BEFORE_ATTRIBUTE_NAME:
+ beforeattributenameloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the before attribute name state.
+ */
+ continue;
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Switch to the self-closing
+ * start tag state.
+ */
+ state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * tag token.
+ */
+ state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ /*
+ * Switch to the data state.
+ */
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ case '\"':
+ case '\'':
+ case '<':
+ case '=':
+ /*
+ * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
+ * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
+ * SIGN (=) Parse error.
+ */
+ errBadCharBeforeAttributeNameOrNull(c);
+ /*
+ * Treat it as per the "anything else" entry
+ * below.
+ */
+ default:
+ /*
+ * Anything else Start a new attribute in the
+ * current tag token.
+ */
+ if (c >= 'A' && c <= 'Z') {
+ /*
+ * U+0041 LATIN CAPITAL LETTER A through to
+ * U+005A LATIN CAPITAL LETTER Z Set that
+ * attribute's name to the lowercase version
+ * of the current input character (add
+ * 0x0020 to the character's code point)
+ */
+ c += 0x20;
+ }
+ // CPPONLY: attributeLine = line;
+ /*
+ * Set that attribute's name to the current
+ * input character,
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ /*
+ * and its value to the empty string.
+ */
+ // Will do later.
+ /*
+ * Switch to the attribute name state.
+ */
+ state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
+ break beforeattributenameloop;
+ // continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case ATTRIBUTE_NAME:
+ attributenameloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ attributeNameComplete();
+ state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the after attribute name state.
+ */
+ attributeNameComplete();
+ state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
+ continue stateloop;
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Switch to the self-closing
+ * start tag state.
+ */
+ attributeNameComplete();
+ addAttributeWithoutValue();
+ state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
+ continue stateloop;
+ case '=':
+ /*
+ * U+003D EQUALS SIGN (=) Switch to the before
+ * attribute value state.
+ */
+ attributeNameComplete();
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
+ break attributenameloop;
+ // continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * tag token.
+ */
+ attributeNameComplete();
+ addAttributeWithoutValue();
+ state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ /*
+ * Switch to the data state.
+ */
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ case '\"':
+ case '\'':
+ case '<':
+ /*
+ * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
+ * (') U+003C LESS-THAN SIGN (<) Parse error.
+ */
+ errQuoteOrLtInAttributeNameOrNull(c);
+ /*
+ * Treat it as per the "anything else" entry
+ * below.
+ */
+ default:
+ if (c >= 'A' && c <= 'Z') {
+ /*
+ * U+0041 LATIN CAPITAL LETTER A through to
+ * U+005A LATIN CAPITAL LETTER Z Append the
+ * lowercase version of the current input
+ * character (add 0x0020 to the character's
+ * code point) to the current attribute's
+ * name.
+ */
+ c += 0x20;
+ }
+ /*
+ * Anything else Append the current input
+ * character to the current attribute's name.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the attribute name state.
+ */
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case BEFORE_ATTRIBUTE_VALUE:
+ beforeattributevalueloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the before attribute value state.
+ */
+ continue;
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Switch to the
+ * attribute value (double-quoted) state.
+ */
+ // CPPONLY: attributeLine = line;
+ clearStrBufBeforeUse();
+ state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
+ break beforeattributevalueloop;
+ // continue stateloop;
+ case '&':
+ /*
+ * U+0026 AMPERSAND (&) Switch to the attribute
+ * value (unquoted) state and reconsume this
+ * input character.
+ */
+ // CPPONLY: attributeLine = line;
+ clearStrBufBeforeUse();
+ reconsume = true;
+ state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
+ noteUnquotedAttributeValue();
+ continue stateloop;
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Switch to the attribute
+ * value (single-quoted) state.
+ */
+ // CPPONLY: attributeLine = line;
+ clearStrBufBeforeUse();
+ state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Parse error.
+ */
+ errAttributeValueMissing();
+ /*
+ * Emit the current tag token.
+ */
+ addAttributeWithoutValue();
+ state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ /*
+ * Switch to the data state.
+ */
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ case '<':
+ case '=':
+ case '`':
+ /*
+ * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
+ * (=) U+0060 GRAVE ACCENT (`)
+ */
+ errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
+ /*
+ * Treat it as per the "anything else" entry
+ * below.
+ */
+ default:
+ // [NOCPP[
+ errHtml4NonNameInUnquotedAttribute(c);
+ // ]NOCPP]
+ /*
+ * Anything else Append the current input
+ * character to the current attribute's value.
+ */
+ // CPPONLY: attributeLine = line;
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ /*
+ * Switch to the attribute value (unquoted)
+ * state.
+ */
+
+ state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
+ noteUnquotedAttributeValue();
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
+ attributevaluedoublequotedloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Switch to the after
+ * attribute value (quoted) state.
+ */
+ addAttributeWithValue();
+
+ state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
+ break attributevaluedoublequotedloop;
+ // continue stateloop;
+ case '&':
+ /*
+ * U+0026 AMPERSAND (&) Switch to the character
+ * reference in attribute value state, with the
+ * additional allowed character being U+0022
+ * QUOTATION MARK (").
+ */
+ assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+ appendCharRefBuf(c);
+ setAdditionalAndRememberAmpersandLocation('\"');
+ returnState = state;
+ state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append the current input
+ * character to the current attribute's value.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the attribute value (double-quoted)
+ * state.
+ */
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case AFTER_ATTRIBUTE_VALUE_QUOTED:
+ afterattributevaluequotedloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the before attribute name state.
+ */
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ continue stateloop;
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Switch to the self-closing
+ * start tag state.
+ */
+ state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
+ break afterattributevaluequotedloop;
+ // continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * tag token.
+ */
+ state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ /*
+ * Switch to the data state.
+ */
+ continue stateloop;
+ default:
+ /*
+ * Anything else Parse error.
+ */
+ errNoSpaceBetweenAttributes();
+ /*
+ * Reconsume the character in the before
+ * attribute name state.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case SELF_CLOSING_START_TAG:
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Set the self-closing
+ * flag of the current tag token. Emit the current
+ * tag token.
+ */
+ // [NOCPP[
+ errHtml4XmlVoidSyntax();
+ // ]NOCPP]
+ state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ /*
+ * Switch to the data state.
+ */
+ continue stateloop;
+ default:
+ /* Anything else Parse error. */
+ errSlashNotFollowedByGt();
+ /*
+ * Reconsume the character in the before attribute
+ * name state.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ continue stateloop;
+ }
+ // XXX reorder point
+ case ATTRIBUTE_VALUE_UNQUOTED:
+ for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ addAttributeWithValue();
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the before attribute name state.
+ */
+ addAttributeWithValue();
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ continue stateloop;
+ case '&':
+ /*
+ * U+0026 AMPERSAND (&) Switch to the character
+ * reference in attribute value state, with the
+ * additional allowed character being U+003E
+ * GREATER-THAN SIGN (>)
+ */
+ assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+ appendCharRefBuf(c);
+ setAdditionalAndRememberAmpersandLocation('>');
+ returnState = state;
+ state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * tag token.
+ */
+ addAttributeWithValue();
+ state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ /*
+ * Switch to the data state.
+ */
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ case '<':
+ case '\"':
+ case '\'':
+ case '=':
+ case '`':
+ /*
+ * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
+ * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
+ * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
+ */
+ errUnquotedAttributeValOrNull(c);
+ /*
+ * Treat it as per the "anything else" entry
+ * below.
+ */
+ // fall through
+ default:
+ // [NOCPP]
+ errHtml4NonNameInUnquotedAttribute(c);
+ // ]NOCPP]
+ /*
+ * Anything else Append the current input
+ * character to the current attribute's value.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the attribute value (unquoted) state.
+ */
+ continue;
+ }
+ }
+ // XXX reorder point
+ case AFTER_ATTRIBUTE_NAME:
+ for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the after attribute name state.
+ */
+ continue;
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Switch to the self-closing
+ * start tag state.
+ */
+ addAttributeWithoutValue();
+ state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
+ continue stateloop;
+ case '=':
+ /*
+ * U+003D EQUALS SIGN (=) Switch to the before
+ * attribute value state.
+ */
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * tag token.
+ */
+ addAttributeWithoutValue();
+ state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ /*
+ * Switch to the data state.
+ */
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ case '\"':
+ case '\'':
+ case '<':
+ errQuoteOrLtInAttributeNameOrNull(c);
+ /*
+ * Treat it as per the "anything else" entry
+ * below.
+ */
+ default:
+ addAttributeWithoutValue();
+ /*
+ * Anything else Start a new attribute in the
+ * current tag token.
+ */
+ if (c >= 'A' && c <= 'Z') {
+ /*
+ * U+0041 LATIN CAPITAL LETTER A through to
+ * U+005A LATIN CAPITAL LETTER Z Set that
+ * attribute's name to the lowercase version
+ * of the current input character (add
+ * 0x0020 to the character's code point)
+ */
+ c += 0x20;
+ }
+ /*
+ * Set that attribute's name to the current
+ * input character,
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ /*
+ * and its value to the empty string.
+ */
+ // Will do later.
+ /*
+ * Switch to the attribute name state.
+ */
+ state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // XXX reorder point
+ case MARKUP_DECLARATION_OPEN:
+ markupdeclarationopenloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * If the next two characters are both U+002D
+ * HYPHEN-MINUS characters (-), consume those two
+ * characters, create a comment token whose data is the
+ * empty string, and switch to the comment start state.
+ *
+ * Otherwise, if the next seven characters are an ASCII
+ * case-insensitive match for the word "DOCTYPE", then
+ * consume those characters and switch to the DOCTYPE
+ * state.
+ *
+ * Otherwise, if the insertion mode is
+ * "in foreign content" and the current node is not an
+ * element in the HTML namespace and the next seven
+ * characters are an case-sensitive match for the string
+ * "[CDATA[" (the five uppercase letters "CDATA" with a
+ * U+005B LEFT SQUARE BRACKET character before and
+ * after), then consume those characters and switch to
+ * the CDATA section state.
+ *
+ * Otherwise, is is a parse error. Switch to the bogus
+ * comment state. The next character that is consumed,
+ * if any, is the first character that will be in the
+ * comment.
+ */
+ switch (c) {
+ case '-':
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
+ break markupdeclarationopenloop;
+ // continue stateloop;
+ case 'd':
+ case 'D':
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ index = 0;
+ state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
+ continue stateloop;
+ case '[':
+ if (tokenHandler.cdataSectionAllowed()) {
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ index = 0;
+ state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
+ continue stateloop;
+ }
+ // else fall through
+ default:
+ errBogusComment();
+ clearStrBufBeforeUse();
+ reconsume = true;
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case MARKUP_DECLARATION_HYPHEN:
+ markupdeclarationhyphenloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ switch (c) {
+ case '\u0000':
+ break stateloop;
+ case '-':
+ clearStrBufAfterOneHyphen();
+ state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
+ break markupdeclarationhyphenloop;
+ // continue stateloop;
+ default:
+ errBogusComment();
+ reconsume = true;
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case COMMENT_START:
+ commentstartloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Comment start state
+ *
+ *
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Switch to the comment
+ * start dash state.
+ */
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Parse error.
+ */
+ errPrematureEndOfComment();
+ /* Emit the comment token. */
+ emitComment(0, pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ break commentstartloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append the input character to
+ * the comment token's data.
+ */
+ appendStrBuf(c);
+ /*
+ * Switch to the comment state.
+ */
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ break commentstartloop;
+ // continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case COMMENT:
+ commentloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Comment state Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Switch to the comment
+ * end dash state
+ */
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
+ break commentloop;
+ // continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append the input character to
+ * the comment token's data.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the comment state.
+ */
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case COMMENT_END_DASH:
+ commentenddashloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Comment end dash state Consume the next input
+ * character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Switch to the comment
+ * end state
+ */
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
+ break commentenddashloop;
+ // continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append a U+002D HYPHEN-MINUS
+ * (-) character and the input character to the
+ * comment token's data.
+ */
+ appendStrBuf(c);
+ /*
+ * Switch to the comment state.
+ */
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case COMMENT_END:
+ commentendloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Comment end dash state Consume the next input
+ * character:
+ */
+ switch (c) {
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the comment
+ * token.
+ */
+ emitComment(2, pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '-':
+ /* U+002D HYPHEN-MINUS (-) Parse error. */
+ /*
+ * Append a U+002D HYPHEN-MINUS (-) character to
+ * the comment token's data.
+ */
+ adjustDoubleHyphenAndAppendToStrBufAndErr(c);
+ /*
+ * Stay in the comment end state.
+ */
+ continue;
+ case '\r':
+ adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ break stateloop;
+ case '\n':
+ adjustDoubleHyphenAndAppendToStrBufLineFeed();
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ continue stateloop;
+ case '!':
+ errHyphenHyphenBang();
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Append two U+002D HYPHEN-MINUS (-) characters
+ * and the input character to the comment
+ * token's data.
+ */
+ adjustDoubleHyphenAndAppendToStrBufAndErr(c);
+ /*
+ * Switch to the comment state.
+ */
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // XXX reorder point
+ case COMMENT_END_BANG:
+ for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Comment end bang state
+ *
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the comment
+ * token.
+ */
+ emitComment(3, pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '-':
+ /*
+ * Append two U+002D HYPHEN-MINUS (-) characters
+ * and a U+0021 EXCLAMATION MARK (!) character
+ * to the comment token's data.
+ */
+ appendStrBuf(c);
+ /*
+ * Switch to the comment end dash state.
+ */
+ state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append two U+002D HYPHEN-MINUS
+ * (-) characters, a U+0021 EXCLAMATION MARK (!)
+ * character, and the input character to the
+ * comment token's data. Switch to the comment
+ * state.
+ */
+ appendStrBuf(c);
+ /*
+ * Switch to the comment state.
+ */
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // XXX reorder point
+ case COMMENT_START_DASH:
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Comment start dash state
+ *
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Switch to the comment end
+ * state
+ */
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
+ continue stateloop;
+ case '>':
+ errPrematureEndOfComment();
+ /* Emit the comment token. */
+ emitComment(1, pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Append a U+002D HYPHEN-MINUS character (-) and
+ * the current input character to the comment
+ * token's data.
+ */
+ appendStrBuf(c);
+ /*
+ * Switch to the comment state.
+ */
+ state = transition(state, Tokenizer.COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ // XXX reorder point
+ case CDATA_START:
+ for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ if (index < 6) { // CDATA_LSQB.length
+ if (c == Tokenizer.CDATA_LSQB[index]) {
+ appendStrBuf(c);
+ } else {
+ errBogusComment();
+ reconsume = true;
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ index++;
+ continue;
+ } else {
+ clearStrBufAfterUse();
+ cstart = pos; // start coalescing
+ reconsume = true;
+ state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
+ break; // FALL THROUGH continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case CDATA_SECTION:
+ cdatasectionloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ switch (c) {
+ case ']':
+ flushChars(buf, pos);
+ state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
+ break cdatasectionloop; // FALL THROUGH
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ continue;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ default:
+ continue;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case CDATA_RSQB:
+ cdatarsqb: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ switch (c) {
+ case ']':
+ state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
+ break cdatarsqb;
+ default:
+ tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
+ 1);
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case CDATA_RSQB_RSQB:
+ cdatarsqbrsqb: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ switch (c) {
+ case ']':
+ // Saw a third ]. Emit one ] (logically the
+ // first one) and stay in this state to
+ // remember that the last two characters seen
+ // have been ]].
+ tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
+ continue;
+ case '>':
+ cstart = pos + 1;
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ default:
+ tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // XXX reorder point
+ case ATTRIBUTE_VALUE_SINGLE_QUOTED:
+ attributevaluesinglequotedloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Switch to the after
+ * attribute value (quoted) state.
+ */
+ addAttributeWithValue();
+
+ state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '&':
+ /*
+ * U+0026 AMPERSAND (&) Switch to the character
+ * reference in attribute value state, with the
+ * + additional allowed character being U+0027
+ * APOSTROPHE (').
+ */
+ assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+ appendCharRefBuf(c);
+ setAdditionalAndRememberAmpersandLocation('\'');
+ returnState = state;
+ state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
+ break attributevaluesinglequotedloop;
+ // continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append the current input
+ * character to the current attribute's value.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the attribute value (double-quoted)
+ * state.
+ */
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case CONSUME_CHARACTER_REFERENCE:
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ if (c == '\u0000') {
+ break stateloop;
+ }
+ /*
+ * Unlike the definition is the spec, this state does not
+ * return a value and never requires the caller to
+ * backtrack. This state takes care of emitting characters
+ * or appending to the current attribute value. It also
+ * takes care of that in the case when consuming the
+ * character reference fails.
+ */
+ /*
+ * This section defines how to consume a character
+ * reference. This definition is used when parsing character
+ * references in text and in attributes.
+ *
+ * The behavior depends on the identity of the next
+ * character (the one immediately after the U+0026 AMPERSAND
+ * character):
+ */
+ switch (c) {
+ case ' ':
+ case '\t':
+ case '\n':
+ case '\r': // we'll reconsume!
+ case '\u000C':
+ case '<':
+ case '&':
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos;
+ }
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ case '#':
+ /*
+ * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
+ * SIGN.
+ */
+ appendCharRefBuf('#');
+ state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
+ continue stateloop;
+ default:
+ if (c == additional) {
+ emitOrAppendCharRefBuf(returnState);
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ if (c >= 'a' && c <= 'z') {
+ firstCharKey = c - 'a' + 26;
+ } else if (c >= 'A' && c <= 'Z') {
+ firstCharKey = c - 'A';
+ } else {
+ // No match
+ /*
+ * If no match can be made, then this is a parse
+ * error.
+ */
+ errNoNamedCharacterMatch();
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos;
+ }
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ // Didn't fail yet
+ appendCharRefBuf(c);
+ state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
+ // FALL THROUGH continue stateloop;
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case CHARACTER_REFERENCE_HILO_LOOKUP:
+ {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ if (c == '\u0000') {
+ break stateloop;
+ }
+ /*
+ * The data structure is as follows:
+ *
+ * HILO_ACCEL is a two-dimensional int array whose major
+ * index corresponds to the second character of the
+ * character reference (code point as index) and the
+ * minor index corresponds to the first character of the
+ * character reference (packed so that A-Z runs from 0
+ * to 25 and a-z runs from 26 to 51). This layout makes
+ * it easier to use the sparseness of the data structure
+ * to omit parts of it: The second dimension of the
+ * table is null when no character reference starts with
+ * the character corresponding to that row.
+ *
+ * The int value HILO_ACCEL (by these indeces) is zero
+ * if there exists no character reference starting with
+ * that two-letter prefix. Otherwise, the value is an
+ * int that packs two shorts so that the higher short is
+ * the index of the highest character reference name
+ * with that prefix in NAMES and the lower short
+ * corresponds to the index of the lowest character
+ * reference name with that prefix. (It happens that the
+ * first two character reference names share their
+ * prefix so the packed int cannot be 0 by packing the
+ * two shorts.)
+ *
+ * NAMES is an array of byte arrays where each byte
+ * array encodes the name of a character references as
+ * ASCII. The names omit the first two letters of the
+ * name. (Since storing the first two letters would be
+ * redundant with the data contained in HILO_ACCEL.) The
+ * entries are lexically sorted.
+ *
+ * For a given index in NAMES, the same index in VALUES
+ * contains the corresponding expansion as an array of
+ * two UTF-16 code units (either the character and
+ * U+0000 or a suggogate pair).
+ */
+ int hilo = 0;
+ if (c <= 'z') {
+ @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
+ if (row != null) {
+ hilo = row[firstCharKey];
+ }
+ }
+ if (hilo == 0) {
+ /*
+ * If no match can be made, then this is a parse
+ * error.
+ */
+ errNoNamedCharacterMatch();
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos;
+ }
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ // Didn't fail yet
+ appendCharRefBuf(c);
+ lo = hilo & 0xFFFF;
+ hi = hilo >> 16;
+ entCol = -1;
+ candidate = -1;
+ charRefBufMark = 0;
+ state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
+ // FALL THROUGH continue stateloop;
+ }
+ case CHARACTER_REFERENCE_TAIL:
+ outer: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ if (c == '\u0000') {
+ break stateloop;
+ }
+ entCol++;
+ /*
+ * Consume the maximum number of characters possible,
+ * with the consumed characters matching one of the
+ * identifiers in the first column of the named
+ * character references table (in a case-sensitive
+ * manner).
+ */
+ loloop: for (;;) {
+ if (hi < lo) {
+ break outer;
+ }
+ if (entCol == NamedCharacters.NAMES[lo].length()) {
+ candidate = lo;
+ charRefBufMark = charRefBufLen;
+ lo++;
+ } else if (entCol > NamedCharacters.NAMES[lo].length()) {
+ break outer;
+ } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
+ lo++;
+ } else {
+ break loloop;
+ }
+ }
+
+ hiloop: for (;;) {
+ if (hi < lo) {
+ break outer;
+ }
+ if (entCol == NamedCharacters.NAMES[hi].length()) {
+ break hiloop;
+ }
+ if (entCol > NamedCharacters.NAMES[hi].length()) {
+ break outer;
+ } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
+ hi--;
+ } else {
+ break hiloop;
+ }
+ }
+
+ if (c == ';') {
+ // If we see a semicolon, there cannot be a
+ // longer match. Break the loop. However, before
+ // breaking, take the longest match so far as the
+ // candidate, if we are just about to complete a
+ // match.
+ if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
+ candidate = lo;
+ charRefBufMark = charRefBufLen;
+ }
+ break outer;
+ }
+
+ if (hi < lo) {
+ break outer;
+ }
+ appendCharRefBuf(c);
+ continue;
+ }
+
+ if (candidate == -1) {
+ // reconsume deals with CR, LF or nul
+ /*
+ * If no match can be made, then this is a parse error.
+ */
+ errNoNamedCharacterMatch();
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos;
+ }
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ } else {
+ // c can't be CR, LF or nul if we got here
+ @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
+ if (candidateName.length() == 0
+ || candidateName.charAt(candidateName.length() - 1) != ';') {
+ /*
+ * If the last character matched is not a U+003B
+ * SEMICOLON (;), there is a parse error.
+ */
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ /*
+ * If the entity is being consumed as part of an
+ * attribute, and the last character matched is
+ * not a U+003B SEMICOLON (;),
+ */
+ char ch;
+ if (charRefBufMark == charRefBufLen) {
+ ch = c;
+ } else {
+ ch = charRefBuf[charRefBufMark];
+ }
+ if (ch == '=' || (ch >= '0' && ch <= '9')
+ || (ch >= 'A' && ch <= 'Z')
+ || (ch >= 'a' && ch <= 'z')) {
+ /*
+ * and the next character is either a U+003D
+ * EQUALS SIGN character (=) or in the range
+ * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
+ * U+0041 LATIN CAPITAL LETTER A to U+005A
+ * LATIN CAPITAL LETTER Z, or U+0061 LATIN
+ * SMALL LETTER A to U+007A LATIN SMALL
+ * LETTER Z, then, for historical reasons,
+ * all the characters that were matched
+ * after the U+0026 AMPERSAND (&) must be
+ * unconsumed, and nothing is returned.
+ */
+ errNoNamedCharacterMatch();
+ appendCharRefBufToStrBuf();
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ errUnescapedAmpersandInterpretedAsCharacterReference();
+ } else {
+ errNotSemicolonTerminated();
+ }
+ }
+
+ /*
+ * Otherwise, return a character token for the character
+ * corresponding to the entity name (as given by the
+ * second column of the named character references
+ * table).
+ */
+ // CPPONLY: completedNamedCharacterReference();
+ @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
+ if (
+ // [NOCPP[
+ val.length == 1
+ // ]NOCPP]
+ // CPPONLY: val[1] == 0
+ ) {
+ emitOrAppendOne(val, returnState);
+ } else {
+ emitOrAppendTwo(val, returnState);
+ }
+ // this is so complicated!
+ if (charRefBufMark < charRefBufLen) {
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ appendStrBuf(charRefBuf, charRefBufMark,
+ charRefBufLen - charRefBufMark);
+ } else {
+ tokenHandler.characters(charRefBuf, charRefBufMark,
+ charRefBufLen - charRefBufMark);
+ }
+ }
+ // charRefBufLen will be zeroed below!
+
+ // Check if we broke out early with c being the last
+ // character that matched as opposed to being the
+ // first one that didn't match. In the case of an
+ // early break, the next run on text should start
+ // *after* the current character and the current
+ // character shouldn't be reconsumed.
+ boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
+ charRefBufLen = 0;
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = earlyBreak ? pos + 1 : pos;
+ }
+ reconsume = !earlyBreak;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ /*
+ * If the markup contains I'm &notit; I tell you, the
+ * entity is parsed as "not", as in, I'm ¬it; I tell
+ * you. But if the markup was I'm &notin; I tell you,
+ * the entity would be parsed as "notin;", resulting in
+ * I'm ∉ I tell you.
+ */
+ }
+ // XXX reorder point
+ case CONSUME_NCR:
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ value = 0;
+ seenDigits = false;
+ /*
+ * The behavior further depends on the character after the
+ * U+0023 NUMBER SIGN:
+ */
+ switch (c) {
+ case 'x':
+ case 'X':
+
+ /*
+ * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
+ * LETTER X Consume the X.
+ *
+ * Follow the steps below, but using the range of
+ * characters U+0030 DIGIT ZERO through to U+0039
+ * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
+ * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
+ * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
+ * LETTER F (in other words, 0-9, A-F, a-f).
+ *
+ * When it comes to interpreting the number,
+ * interpret it as a hexadecimal number.
+ */
+ appendCharRefBuf(c);
+ state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
+ continue stateloop;
+ default:
+ /*
+ * Anything else Follow the steps below, but using
+ * the range of characters U+0030 DIGIT ZERO through
+ * to U+0039 DIGIT NINE (i.e. just 0-9).
+ *
+ * When it comes to interpreting the number,
+ * interpret it as a decimal number.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
+ // FALL THROUGH continue stateloop;
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case DECIMAL_NRC_LOOP:
+ decimalloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume as many characters as match the range of
+ * characters given above.
+ */
+ assert value >= 0: "value must not become negative.";
+ if (c >= '0' && c <= '9') {
+ seenDigits = true;
+ // Avoid overflow
+ if (value <= 0x10FFFF) {
+ value *= 10;
+ value += c - '0';
+ }
+ continue;
+ } else if (c == ';') {
+ if (seenDigits) {
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos + 1;
+ }
+ state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
+ // FALL THROUGH continue stateloop;
+ break decimalloop;
+ } else {
+ errNoDigitsInNCR();
+ appendCharRefBuf(';');
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos + 1;
+ }
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ } else {
+ /*
+ * If no characters match the range, then don't
+ * consume any characters (and unconsume the U+0023
+ * NUMBER SIGN character and, if appropriate, the X
+ * character). This is a parse error; nothing is
+ * returned.
+ *
+ * Otherwise, if the next character is a U+003B
+ * SEMICOLON, consume that too. If it isn't, there
+ * is a parse error.
+ */
+ if (!seenDigits) {
+ errNoDigitsInNCR();
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos;
+ }
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ } else {
+ errCharRefLacksSemicolon();
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos;
+ }
+ reconsume = true;
+ state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
+ // FALL THROUGH continue stateloop;
+ break decimalloop;
+ }
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case HANDLE_NCR_VALUE:
+ // WARNING previous state sets reconsume
+ // We are not going to emit the contents of charRefBuf.
+ charRefBufLen = 0;
+ // XXX inline this case if the method size can take it
+ handleNcrValue(returnState);
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ // XXX reorder point
+ case HEX_NCR_LOOP:
+ for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume as many characters as match the range of
+ * characters given above.
+ */
+ assert value >= 0: "value must not become negative.";
+ if (c >= '0' && c <= '9') {
+ seenDigits = true;
+ // Avoid overflow
+ if (value <= 0x10FFFF) {
+ value *= 16;
+ value += c - '0';
+ }
+ continue;
+ } else if (c >= 'A' && c <= 'F') {
+ seenDigits = true;
+ // Avoid overflow
+ if (value <= 0x10FFFF) {
+ value *= 16;
+ value += c - 'A' + 10;
+ }
+ continue;
+ } else if (c >= 'a' && c <= 'f') {
+ seenDigits = true;
+ // Avoid overflow
+ if (value <= 0x10FFFF) {
+ value *= 16;
+ value += c - 'a' + 10;
+ }
+ continue;
+ } else if (c == ';') {
+ if (seenDigits) {
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos + 1;
+ }
+ state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
+ continue stateloop;
+ } else {
+ errNoDigitsInNCR();
+ appendCharRefBuf(';');
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos + 1;
+ }
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ } else {
+ /*
+ * If no characters match the range, then don't
+ * consume any characters (and unconsume the U+0023
+ * NUMBER SIGN character and, if appropriate, the X
+ * character). This is a parse error; nothing is
+ * returned.
+ *
+ * Otherwise, if the next character is a U+003B
+ * SEMICOLON, consume that too. If it isn't, there
+ * is a parse error.
+ */
+ if (!seenDigits) {
+ errNoDigitsInNCR();
+ emitOrAppendCharRefBuf(returnState);
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos;
+ }
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ } else {
+ errCharRefLacksSemicolon();
+ if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
+ cstart = pos;
+ }
+ reconsume = true;
+ state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ }
+ // XXX reorder point
+ case PLAINTEXT:
+ plaintextloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ switch (c) {
+ case '\u0000':
+ emitPlaintextReplacementCharacter(buf, pos);
+ continue;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the current input
+ * character as a character token. Stay in the
+ * RAWTEXT state.
+ */
+ continue;
+ }
+ }
+ // XXX reorder point
+ case CLOSE_TAG_OPEN:
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Otherwise, if the content model flag is set to the PCDATA
+ * state, or if the next few characters do match that tag
+ * name, consume the next input character:
+ */
+ switch (c) {
+ case '>':
+ /* U+003E GREATER-THAN SIGN (>) Parse error. */
+ errLtSlashGt();
+ /*
+ * Switch to the data state.
+ */
+ cstart = pos + 1;
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ silentCarriageReturn();
+ /* Anything else Parse error. */
+ errGarbageAfterLtSlash();
+ /*
+ * Switch to the bogus comment state.
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf('\n');
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ /* Anything else Parse error. */
+ errGarbageAfterLtSlash();
+ /*
+ * Switch to the bogus comment state.
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ if (c >= 'A' && c <= 'Z') {
+ c += 0x20;
+ }
+ if (c >= 'a' && c <= 'z') {
+ /*
+ * U+0061 LATIN SMALL LETTER A through to U+007A
+ * LATIN SMALL LETTER Z Create a new end tag
+ * token,
+ */
+ endTag = true;
+ /*
+ * set its tag name to the input character,
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ /*
+ * then switch to the tag name state. (Don't
+ * emit the token yet; further details will be
+ * filled in before it is emitted.)
+ */
+ state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
+ continue stateloop;
+ } else {
+ /* Anything else Parse error. */
+ errGarbageAfterLtSlash();
+ /*
+ * Switch to the bogus comment state.
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // XXX reorder point
+ case RCDATA:
+ rcdataloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ switch (c) {
+ case '&':
+ /*
+ * U+0026 AMPERSAND (&) Switch to the character
+ * reference in RCDATA state.
+ */
+ flushChars(buf, pos);
+ assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
+ appendCharRefBuf(c);
+ setAdditionalAndRememberAmpersandLocation('\u0000');
+ returnState = state;
+ state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
+ continue stateloop;
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Switch to the
+ * RCDATA less-than sign state.
+ */
+ flushChars(buf, pos);
+
+ returnState = state;
+ state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ continue;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Emit the current input character as a
+ * character token. Stay in the RCDATA state.
+ */
+ continue;
+ }
+ }
+ // XXX reorder point
+ case RAWTEXT:
+ rawtextloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ switch (c) {
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Switch to the
+ * RAWTEXT less-than sign state.
+ */
+ flushChars(buf, pos);
+
+ returnState = state;
+ state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
+ break rawtextloop;
+ // FALL THRU continue stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ continue;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Emit the current input character as a
+ * character token. Stay in the RAWTEXT state.
+ */
+ continue;
+ }
+ }
+ // XXX fallthru don't reorder
+ case RAWTEXT_RCDATA_LESS_THAN_SIGN:
+ rawtextrcdatalessthansignloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ switch (c) {
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Set the temporary buffer
+ * to the empty string. Switch to the script
+ * data end tag open state.
+ */
+ index = 0;
+ clearStrBufBeforeUse();
+ state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
+ break rawtextrcdatalessthansignloop;
+ // FALL THRU continue stateloop;
+ default:
+ /*
+ * Otherwise, emit a U+003C LESS-THAN SIGN
+ * character token
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ /*
+ * and reconsume the current input character in
+ * the data state.
+ */
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // XXX fall thru. don't reorder.
+ case NON_DATA_END_TAG_NAME:
+ for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * ASSERT! when entering this state, set index to 0 and
+ * call clearStrBufBeforeUse(); Let's implement the above
+ * without lookahead. strBuf is the 'temporary buffer'.
+ */
+ if (endTagExpectationAsArray == null) {
+ tokenHandler.characters(Tokenizer.LT_SOLIDUS,
+ 0, 2);
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ } else if (index < endTagExpectationAsArray.length) {
+ char e = endTagExpectationAsArray[index];
+ char folded = c;
+ if (c >= 'A' && c <= 'Z') {
+ folded += 0x20;
+ }
+ if (folded != e) {
+ // [NOCPP[
+ errHtml4LtSlashInRcdata(folded);
+ // ]NOCPP]
+ tokenHandler.characters(Tokenizer.LT_SOLIDUS,
+ 0, 2);
+ emitStrBuf();
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ appendStrBuf(c);
+ index++;
+ continue;
+ } else {
+ endTag = true;
+ // XXX replace contentModelElement with different
+ // type
+ tagName = endTagExpectation;
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ clearStrBufAfterUse(); // strBuf not used
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE
+ * FEED (LF) U+000C FORM FEED (FF) U+0020
+ * SPACE If the current end tag token is an
+ * appropriate end tag token, then switch to
+ * the before attribute name state.
+ */
+ clearStrBufAfterUse(); // strBuf not used
+ state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
+ continue stateloop;
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) If the current end tag
+ * token is an appropriate end tag token,
+ * then switch to the self-closing start tag
+ * state.
+ */
+ clearStrBufAfterUse(); // strBuf not used
+ state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) If the
+ * current end tag token is an appropriate
+ * end tag token, then emit the current tag
+ * token and switch to the data state.
+ */
+ clearStrBufAfterUse(); // strBuf not used
+ state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
+ if (shouldSuspend) {
+ break stateloop;
+ }
+ continue stateloop;
+ default:
+ /*
+ * Emit a U+003C LESS-THAN SIGN character
+ * token, a U+002F SOLIDUS character token,
+ * a character token for each of the
+ * characters in the temporary buffer (in
+ * the order they were added to the buffer),
+ * and reconsume the current input character
+ * in the RAWTEXT state.
+ */
+ // [NOCPP[
+ errWarnLtSlashInRcdata();
+ // ]NOCPP]
+ tokenHandler.characters(
+ Tokenizer.LT_SOLIDUS, 0, 2);
+ emitStrBuf();
+ cstart = pos; // don't drop the
+ // character
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ }
+ // XXX reorder point
+ // BEGIN HOTSPOT WORKAROUND
+ case BOGUS_COMMENT:
+ boguscommentloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume every character up to and including the first
+ * U+003E GREATER-THAN SIGN character (>) or the end of
+ * the file (EOF), whichever comes first. Emit a comment
+ * token whose data is the concatenation of all the
+ * characters starting from and including the character
+ * that caused the state machine to switch into the
+ * bogus comment state, up to and including the
+ * character immediately before the last consumed
+ * character (i.e. up to the character just before the
+ * U+003E or EOF character). (If the comment was started
+ * by the end of the file (EOF), the token is empty.)
+ *
+ * Switch to the data state.
+ *
+ * If the end of the file was reached, reconsume the EOF
+ * character.
+ */
+ switch (c) {
+ case '>':
+ emitComment(0, pos);
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '-':
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
+ break boguscommentloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ appendStrBuf(c);
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case BOGUS_COMMENT_HYPHEN:
+ boguscommenthyphenloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ switch (c) {
+ case '>':
+ // [NOCPP[
+ maybeAppendSpaceToBogusComment();
+ // ]NOCPP]
+ emitComment(0, pos);
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '-':
+ appendSecondHyphenToBogusComment();
+ continue boguscommenthyphenloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ appendStrBuf(c);
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // XXX reorder point
+ case SCRIPT_DATA:
+ scriptdataloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ switch (c) {
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Switch to the
+ * script data less-than sign state.
+ */
+ flushChars(buf, pos);
+ returnState = state;
+ state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
+ break scriptdataloop; // FALL THRU continue
+ // stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ continue;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the current input
+ * character as a character token. Stay in the
+ * script data state.
+ */
+ continue;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_LESS_THAN_SIGN:
+ scriptdatalessthansignloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ switch (c) {
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Set the temporary buffer
+ * to the empty string. Switch to the script
+ * data end tag open state.
+ */
+ index = 0;
+ clearStrBufBeforeUse();
+ state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
+ continue stateloop;
+ case '!':
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ cstart = pos;
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
+ break scriptdatalessthansignloop; // FALL THRU
+ // continue
+ // stateloop;
+ default:
+ /*
+ * Otherwise, emit a U+003C LESS-THAN SIGN
+ * character token
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ /*
+ * and reconsume the current input character in
+ * the data state.
+ */
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_ESCAPE_START:
+ scriptdataescapestartloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Emit a U+002D
+ * HYPHEN-MINUS character token. Switch to the
+ * script data escape start dash state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
+ break scriptdataescapestartloop; // FALL THRU
+ // continue
+ // stateloop;
+ default:
+ /*
+ * Anything else Reconsume the current input
+ * character in the script data state.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_ESCAPE_START_DASH:
+ scriptdataescapestartdashloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Emit a U+002D
+ * HYPHEN-MINUS character token. Switch to the
+ * script data escaped dash dash state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
+ break scriptdataescapestartdashloop;
+ // continue stateloop;
+ default:
+ /*
+ * Anything else Reconsume the current input
+ * character in the script data state.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_ESCAPED_DASH_DASH:
+ scriptdataescapeddashdashloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Emit a U+002D
+ * HYPHEN-MINUS character token. Stay in the
+ * script data escaped dash dash state.
+ */
+ continue;
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Switch to the
+ * script data escaped less-than sign state.
+ */
+ flushChars(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit a U+003E
+ * GREATER-THAN SIGN character token. Switch to
+ * the script data state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ break scriptdataescapeddashdashloop;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the current input
+ * character as a character token. Switch to the
+ * script data escaped state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ break scriptdataescapeddashdashloop;
+ // continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_ESCAPED:
+ scriptdataescapedloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Emit a U+002D
+ * HYPHEN-MINUS character token. Switch to the
+ * script data escaped dash state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
+ break scriptdataescapedloop; // FALL THRU
+ // continue
+ // stateloop;
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Switch to the
+ * script data escaped less-than sign state.
+ */
+ flushChars(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ continue;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the current input
+ * character as a character token. Stay in the
+ * script data escaped state.
+ */
+ continue;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_ESCAPED_DASH:
+ scriptdataescapeddashloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Emit a U+002D
+ * HYPHEN-MINUS character token. Switch to the
+ * script data escaped dash dash state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
+ continue stateloop;
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Switch to the
+ * script data escaped less-than sign state.
+ */
+ flushChars(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
+ break scriptdataescapeddashloop;
+ // continue stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the current input
+ * character as a character token. Switch to the
+ * script data escaped state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
+ scriptdataescapedlessthanloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Set the temporary buffer
+ * to the empty string. Switch to the script
+ * data escaped end tag open state.
+ */
+ index = 0;
+ clearStrBufBeforeUse();
+ returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
+ state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
+ continue stateloop;
+ case 'S':
+ case 's':
+ /*
+ * U+0041 LATIN CAPITAL LETTER A through to
+ * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
+ * LESS-THAN SIGN character token and the
+ * current input character as a character token.
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ cstart = pos;
+ index = 1;
+ /*
+ * Set the temporary buffer to the empty string.
+ * Append the lowercase version of the current
+ * input character (add 0x0020 to the
+ * character's code point) to the temporary
+ * buffer. Switch to the script data double
+ * escape start state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
+ break scriptdataescapedlessthanloop;
+ // continue stateloop;
+ default:
+ /*
+ * Anything else Emit a U+003C LESS-THAN SIGN
+ * character token and reconsume the current
+ * input character in the script data escaped
+ * state.
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_DOUBLE_ESCAPE_START:
+ scriptdatadoubleescapestartloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ assert index > 0;
+ if (index < 6) { // SCRIPT_ARR.length
+ char folded = c;
+ if (c >= 'A' && c <= 'Z') {
+ folded += 0x20;
+ }
+ if (folded != Tokenizer.SCRIPT_ARR[index]) {
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ index++;
+ continue;
+ }
+ switch (c) {
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ case ' ':
+ case '\t':
+ case '\u000C':
+ case '/':
+ case '>':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
+ * (>) Emit the current input character as a
+ * character token. If the temporary buffer is
+ * the string "script", then switch to the
+ * script data double escaped state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ break scriptdatadoubleescapestartloop;
+ // continue stateloop;
+ default:
+ /*
+ * Anything else Reconsume the current input
+ * character in the script data escaped state.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_DOUBLE_ESCAPED:
+ scriptdatadoubleescapedloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Emit a U+002D
+ * HYPHEN-MINUS character token. Switch to the
+ * script data double escaped dash state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
+ break scriptdatadoubleescapedloop; // FALL THRU
+ // continue
+ // stateloop;
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Emit a U+003C
+ * LESS-THAN SIGN character token. Switch to the
+ * script data double escaped less-than sign
+ * state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ continue;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the current input
+ * character as a character token. Stay in the
+ * script data double escaped state.
+ */
+ continue;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
+ scriptdatadoubleescapeddashloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Emit a U+002D
+ * HYPHEN-MINUS character token. Switch to the
+ * script data double escaped dash dash state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
+ break scriptdatadoubleescapeddashloop;
+ // continue stateloop;
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Emit a U+003C
+ * LESS-THAN SIGN character token. Switch to the
+ * script data double escaped less-than sign
+ * state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the current input
+ * character as a character token. Switch to the
+ * script data double escaped state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
+ scriptdatadoubleescapeddashdashloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '-':
+ /*
+ * U+002D HYPHEN-MINUS (-) Emit a U+002D
+ * HYPHEN-MINUS character token. Stay in the
+ * script data double escaped dash dash state.
+ */
+ continue;
+ case '<':
+ /*
+ * U+003C LESS-THAN SIGN (<) Emit a U+003C
+ * LESS-THAN SIGN character token. Switch to the
+ * script data double escaped less-than sign
+ * state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
+ break scriptdatadoubleescapeddashdashloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit a U+003E
+ * GREATER-THAN SIGN character token. Switch to
+ * the script data state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ emitReplacementCharacter(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ default:
+ /*
+ * Anything else Emit the current input
+ * character as a character token. Switch to the
+ * script data double escaped state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
+ scriptdatadoubleescapedlessthanloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '/':
+ /*
+ * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
+ * character token. Set the temporary buffer to
+ * the empty string. Switch to the script data
+ * double escape end state.
+ */
+ index = 0;
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
+ break scriptdatadoubleescapedlessthanloop;
+ default:
+ /*
+ * Anything else Reconsume the current input
+ * character in the script data double escaped
+ * state.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
+ case SCRIPT_DATA_DOUBLE_ESCAPE_END:
+ scriptdatadoubleescapeendloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ if (index < 6) { // SCRIPT_ARR.length
+ char folded = c;
+ if (c >= 'A' && c <= 'Z') {
+ folded += 0x20;
+ }
+ if (folded != Tokenizer.SCRIPT_ARR[index]) {
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ index++;
+ continue;
+ }
+ switch (c) {
+ case '\r':
+ emitCarriageReturn(buf, pos);
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ case ' ':
+ case '\t':
+ case '\u000C':
+ case '/':
+ case '>':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
+ * (>) Emit the current input character as a
+ * character token. If the temporary buffer is
+ * the string "script", then switch to the
+ * script data escaped state.
+ */
+ state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
+ continue stateloop;
+ default:
+ /*
+ * Reconsume the current input character in the
+ * script data double escaped state.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // XXX reorder point
+ case MARKUP_DECLARATION_OCTYPE:
+ markupdeclarationdoctypeloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ if (index < 6) { // OCTYPE.length
+ char folded = c;
+ if (c >= 'A' && c <= 'Z') {
+ folded += 0x20;
+ }
+ if (folded == Tokenizer.OCTYPE[index]) {
+ appendStrBuf(c);
+ } else {
+ errBogusComment();
+ reconsume = true;
+ state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
+ continue stateloop;
+ }
+ index++;
+ continue;
+ } else {
+ reconsume = true;
+ state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
+ break markupdeclarationdoctypeloop;
+ // continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case DOCTYPE:
+ doctypeloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ initDoctypeFields();
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the before DOCTYPE name state.
+ */
+ state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
+ break doctypeloop;
+ // continue stateloop;
+ default:
+ /*
+ * Anything else Parse error.
+ */
+ errMissingSpaceBeforeDoctypeName();
+ /*
+ * Reconsume the current character in the before
+ * DOCTYPE name state.
+ */
+ reconsume = true;
+ state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
+ break doctypeloop;
+ // continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case BEFORE_DOCTYPE_NAME:
+ beforedoctypenameloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the before DOCTYPE name state.
+ */
+ continue;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Parse error.
+ */
+ errNamelessDoctype();
+ /*
+ * Create a new DOCTYPE token. Set its
+ * force-quirks flag to on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit the token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ if (c >= 'A' && c <= 'Z') {
+ /*
+ * U+0041 LATIN CAPITAL LETTER A through to
+ * U+005A LATIN CAPITAL LETTER Z Create a
+ * new DOCTYPE token. Set the token's name
+ * to the lowercase version of the input
+ * character (add 0x0020 to the character's
+ * code point).
+ */
+ c += 0x20;
+ }
+ /* Anything else Create a new DOCTYPE token. */
+ /*
+ * Set the token's name name to the current
+ * input character.
+ */
+ clearStrBufBeforeUse();
+ appendStrBuf(c);
+ /*
+ * Switch to the DOCTYPE name state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
+ break beforedoctypenameloop;
+ // continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case DOCTYPE_NAME:
+ doctypenameloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ strBufToDoctypeName();
+ state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the after DOCTYPE name state.
+ */
+ strBufToDoctypeName();
+ state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
+ break doctypenameloop;
+ // continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * DOCTYPE token.
+ */
+ strBufToDoctypeName();
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * U+0041 LATIN CAPITAL LETTER A through to
+ * U+005A LATIN CAPITAL LETTER Z Append the
+ * lowercase version of the input character (add
+ * 0x0020 to the character's code point) to the
+ * current DOCTYPE token's name.
+ */
+ if (c >= 'A' && c <= 'Z') {
+ c += 0x0020;
+ }
+ /*
+ * Anything else Append the current input
+ * character to the current DOCTYPE token's
+ * name.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the DOCTYPE name state.
+ */
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case AFTER_DOCTYPE_NAME:
+ afterdoctypenameloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the after DOCTYPE name state.
+ */
+ continue;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case 'p':
+ case 'P':
+ index = 0;
+ state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
+ break afterdoctypenameloop;
+ // continue stateloop;
+ case 's':
+ case 'S':
+ index = 0;
+ state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
+ continue stateloop;
+ default:
+ /*
+ * Otherwise, this is the parse error.
+ */
+ bogusDoctype();
+
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ // done by bogusDoctype();
+ /*
+ * Switch to the bogus DOCTYPE state.
+ */
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case DOCTYPE_UBLIC:
+ doctypeublicloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * If the six characters starting from the current input
+ * character are an ASCII case-insensitive match for the
+ * word "PUBLIC", then consume those characters and
+ * switch to the before DOCTYPE public identifier state.
+ */
+ if (index < 5) { // UBLIC.length
+ char folded = c;
+ if (c >= 'A' && c <= 'Z') {
+ folded += 0x20;
+ }
+ if (folded != Tokenizer.UBLIC[index]) {
+ bogusDoctype();
+ // forceQuirks = true;
+ reconsume = true;
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ index++;
+ continue;
+ } else {
+ reconsume = true;
+ state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
+ break doctypeublicloop;
+ // continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case AFTER_DOCTYPE_PUBLIC_KEYWORD:
+ afterdoctypepublickeywordloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the before DOCTYPE public
+ * identifier state.
+ */
+ state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
+ break afterdoctypepublickeywordloop;
+ // FALL THROUGH continue stateloop
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Parse Error.
+ */
+ errNoSpaceBetweenDoctypePublicKeywordAndQuote();
+ /*
+ * Set the DOCTYPE token's public identifier to
+ * the empty string (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE public identifier
+ * (double-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Parse Error.
+ */
+ errNoSpaceBetweenDoctypePublicKeywordAndQuote();
+ /*
+ * Set the DOCTYPE token's public identifier to
+ * the empty string (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE public identifier
+ * (single-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /* U+003E GREATER-THAN SIGN (>) Parse error. */
+ errExpectedPublicId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ default:
+ bogusDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ // done by bogusDoctype();
+ /*
+ * Switch to the bogus DOCTYPE state.
+ */
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
+ beforedoctypepublicidentifierloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the before DOCTYPE public identifier
+ * state.
+ */
+ continue;
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Set the DOCTYPE
+ * token's public identifier to the empty string
+ * (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE public identifier
+ * (double-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
+ break beforedoctypepublicidentifierloop;
+ // continue stateloop;
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Set the DOCTYPE token's
+ * public identifier to the empty string (not
+ * missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE public identifier
+ * (single-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /* U+003E GREATER-THAN SIGN (>) Parse error. */
+ errExpectedPublicId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ default:
+ bogusDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ // done by bogusDoctype();
+ /*
+ * Switch to the bogus DOCTYPE state.
+ */
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
+ doctypepublicidentifierdoublequotedloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Switch to the after
+ * DOCTYPE public identifier state.
+ */
+ publicIdentifier = strBufToString();
+ state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
+ break doctypepublicidentifierdoublequotedloop;
+ // continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Parse error.
+ */
+ errGtInPublicId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ publicIdentifier = strBufToString();
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append the current input
+ * character to the current DOCTYPE token's
+ * public identifier.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the DOCTYPE public identifier
+ * (double-quoted) state.
+ */
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
+ afterdoctypepublicidentifierloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the between DOCTYPE public and
+ * system identifiers state.
+ */
+ state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
+ break afterdoctypepublicidentifierloop;
+ // continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Parse error.
+ */
+ errNoSpaceBetweenPublicAndSystemIds();
+ /*
+ * Set the DOCTYPE token's system identifier to
+ * the empty string (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE system identifier
+ * (double-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Parse error.
+ */
+ errNoSpaceBetweenPublicAndSystemIds();
+ /*
+ * Set the DOCTYPE token's system identifier to
+ * the empty string (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE system identifier
+ * (single-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ default:
+ bogusDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ // done by bogusDoctype();
+ /*
+ * Switch to the bogus DOCTYPE state.
+ */
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
+ betweendoctypepublicandsystemidentifiersloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the between DOCTYPE public and system
+ * identifiers state.
+ */
+ continue;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Set the DOCTYPE
+ * token's system identifier to the empty string
+ * (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE system identifier
+ * (double-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
+ break betweendoctypepublicandsystemidentifiersloop;
+ // continue stateloop;
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Set the DOCTYPE token's
+ * system identifier to the empty string (not
+ * missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE system identifier
+ * (single-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ default:
+ bogusDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ // done by bogusDoctype();
+ /*
+ * Switch to the bogus DOCTYPE state.
+ */
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
+ doctypesystemidentifierdoublequotedloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Switch to the after
+ * DOCTYPE system identifier state.
+ */
+ systemIdentifier = strBufToString();
+ state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Parse error.
+ */
+ errGtInSystemId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ systemIdentifier = strBufToString();
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append the current input
+ * character to the current DOCTYPE token's
+ * system identifier.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the DOCTYPE system identifier
+ * (double-quoted) state.
+ */
+ continue;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
+ afterdoctypesystemidentifierloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the after DOCTYPE system identifier state.
+ */
+ continue;
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit the current
+ * DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ default:
+ /*
+ * Switch to the bogus DOCTYPE state. (This does
+ * not set the DOCTYPE token's force-quirks flag
+ * to on.)
+ */
+ bogusDoctypeWithoutQuirks();
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ break afterdoctypesystemidentifierloop;
+ // continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case BOGUS_DOCTYPE:
+ for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '>':
+ /*
+ * U+003E GREATER-THAN SIGN (>) Emit that
+ * DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ default:
+ /*
+ * Anything else Stay in the bogus DOCTYPE
+ * state.
+ */
+ continue;
+ }
+ }
+ // XXX reorder point
+ case DOCTYPE_YSTEM:
+ doctypeystemloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Otherwise, if the six characters starting from the
+ * current input character are an ASCII case-insensitive
+ * match for the word "SYSTEM", then consume those
+ * characters and switch to the before DOCTYPE system
+ * identifier state.
+ */
+ if (index < 5) { // YSTEM.length
+ char folded = c;
+ if (c >= 'A' && c <= 'Z') {
+ folded += 0x20;
+ }
+ if (folded != Tokenizer.YSTEM[index]) {
+ bogusDoctype();
+ reconsume = true;
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ index++;
+ continue stateloop;
+ } else {
+ reconsume = true;
+ state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
+ break doctypeystemloop;
+ // continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case AFTER_DOCTYPE_SYSTEM_KEYWORD:
+ afterdoctypesystemkeywordloop: for (;;) {
+ if (reconsume) {
+ reconsume = false;
+ } else {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ }
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE
+ * Switch to the before DOCTYPE public
+ * identifier state.
+ */
+ state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
+ break afterdoctypesystemkeywordloop;
+ // FALL THROUGH continue stateloop
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Parse Error.
+ */
+ errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
+ /*
+ * Set the DOCTYPE token's system identifier to
+ * the empty string (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE public identifier
+ * (double-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Parse Error.
+ */
+ errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
+ /*
+ * Set the DOCTYPE token's public identifier to
+ * the empty string (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE public identifier
+ * (single-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '>':
+ /* U+003E GREATER-THAN SIGN (>) Parse error. */
+ errExpectedPublicId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ default:
+ bogusDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ // done by bogusDoctype();
+ /*
+ * Switch to the bogus DOCTYPE state.
+ */
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
+ beforedoctypesystemidentifierloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\r':
+ silentCarriageReturn();
+ break stateloop;
+ case '\n':
+ silentLineFeed();
+ // fall thru
+ case ' ':
+ case '\t':
+ case '\u000C':
+ /*
+ * U+0009 CHARACTER TABULATION U+000A LINE FEED
+ * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
+ * in the before DOCTYPE system identifier
+ * state.
+ */
+ continue;
+ case '"':
+ /*
+ * U+0022 QUOTATION MARK (") Set the DOCTYPE
+ * token's system identifier to the empty string
+ * (not missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE system identifier
+ * (double-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
+ continue stateloop;
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Set the DOCTYPE token's
+ * system identifier to the empty string (not
+ * missing),
+ */
+ clearStrBufBeforeUse();
+ /*
+ * then switch to the DOCTYPE system identifier
+ * (single-quoted) state.
+ */
+ state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
+ break beforedoctypesystemidentifierloop;
+ // continue stateloop;
+ case '>':
+ /* U+003E GREATER-THAN SIGN (>) Parse error. */
+ errExpectedSystemId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ default:
+ bogusDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ // done by bogusDoctype();
+ /*
+ * Switch to the bogus DOCTYPE state.
+ */
+ state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
+ continue stateloop;
+ }
+ }
+ // FALLTHRU DON'T REORDER
+ case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
+ for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Switch to the after
+ * DOCTYPE system identifier state.
+ */
+ systemIdentifier = strBufToString();
+ state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
+ continue stateloop;
+ case '>':
+ errGtInSystemId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ systemIdentifier = strBufToString();
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append the current input
+ * character to the current DOCTYPE token's
+ * system identifier.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the DOCTYPE system identifier
+ * (double-quoted) state.
+ */
+ continue;
+ }
+ }
+ // XXX reorder point
+ case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
+ for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ /*
+ * Consume the next input character:
+ */
+ switch (c) {
+ case '\'':
+ /*
+ * U+0027 APOSTROPHE (') Switch to the after
+ * DOCTYPE public identifier state.
+ */
+ publicIdentifier = strBufToString();
+ state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
+ continue stateloop;
+ case '>':
+ errGtInPublicId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ publicIdentifier = strBufToString();
+ emitDoctypeToken(pos);
+ /*
+ * Switch to the data state.
+ */
+ state = transition(state, Tokenizer.DATA, reconsume, pos);
+ continue stateloop;
+ case '\r':
+ appendStrBufCarriageReturn();
+ break stateloop;
+ case '\n':
+ appendStrBufLineFeed();
+ continue;
+ case '\u0000':
+ c = '\uFFFD';
+ // fall thru
+ default:
+ /*
+ * Anything else Append the current input
+ * character to the current DOCTYPE token's
+ * public identifier.
+ */
+ appendStrBuf(c);
+ /*
+ * Stay in the DOCTYPE public identifier
+ * (single-quoted) state.
+ */
+ continue;
+ }
+ }
+ // XXX reorder point
+ case PROCESSING_INSTRUCTION:
+ processinginstructionloop: for (;;) {
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ switch (c) {
+ case '?':
+ state = transition(
+ state,
+ Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
+ reconsume, pos);
+ break processinginstructionloop;
+ // continue stateloop;
+ default:
+ continue;
+ }
+ }
+ case PROCESSING_INSTRUCTION_QUESTION_MARK:
+ if (++pos == endPos) {
+ break stateloop;
+ }
+ c = checkChar(buf, pos);
+ switch (c) {
+ case '>':
+ state = transition(state, Tokenizer.DATA,
+ reconsume, pos);
+ continue stateloop;
+ default:
+ state = transition(state,
+ Tokenizer.PROCESSING_INSTRUCTION,
+ reconsume, pos);
+ continue stateloop;
+ }
+ // END HOTSPOT WORKAROUND
+ }
+ }
+ flushChars(buf, pos);
+ /*
+ * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
+ */
+ // Save locals
+ stateSave = state;
+ returnStateSave = returnState;
+ return pos;
+ }
+
+ // HOTSPOT WORKAROUND INSERTION POINT
+
+ // [NOCPP[
+
+ protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException {
+ return to;
+ }
+
+ // ]NOCPP]
+
+ private void initDoctypeFields() {
+ // Discard the characters "DOCTYPE" accumulated as a potential bogus
+ // comment into strBuf.
+ clearStrBufAfterUse();
+ doctypeName = "";
+ if (systemIdentifier != null) {
+ Portability.releaseString(systemIdentifier);
+ systemIdentifier = null;
+ }
+ if (publicIdentifier != null) {
+ Portability.releaseString(publicIdentifier);
+ publicIdentifier = null;
+ }
+ forceQuirks = false;
+ }
+
+ @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
+ throws SAXException {
+ silentCarriageReturn();
+ adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
+ }
+
+ @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
+ throws SAXException {
+ silentLineFeed();
+ adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
+ }
+
+ @Inline private void appendStrBufLineFeed() {
+ silentLineFeed();
+ appendStrBuf('\n');
+ }
+
+ @Inline private void appendStrBufCarriageReturn() {
+ silentCarriageReturn();
+ appendStrBuf('\n');
+ }
+
+ @Inline protected void silentCarriageReturn() {
+ ++line;
+ lastCR = true;
+ }
+
+ @Inline protected void silentLineFeed() {
+ ++line;
+ }
+
+ private void emitCarriageReturn(@NoLength char[] buf, int pos)
+ throws SAXException {
+ silentCarriageReturn();
+ flushChars(buf, pos);
+ tokenHandler.characters(Tokenizer.LF, 0, 1);
+ cstart = Integer.MAX_VALUE;
+ }
+
+ private void emitReplacementCharacter(@NoLength char[] buf, int pos)
+ throws SAXException {
+ flushChars(buf, pos);
+ tokenHandler.zeroOriginatingReplacementCharacter();
+ cstart = pos + 1;
+ }
+
+ private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
+ throws SAXException {
+ flushChars(buf, pos);
+ tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
+ cstart = pos + 1;
+ }
+
+ private void setAdditionalAndRememberAmpersandLocation(char add) {
+ additional = add;
+ // [NOCPP[
+ ampersandLocation = new LocatorImpl(this);
+ // ]NOCPP]
+ }
+
+ private void bogusDoctype() throws SAXException {
+ errBogusDoctype();
+ forceQuirks = true;
+ }
+
+ private void bogusDoctypeWithoutQuirks() throws SAXException {
+ errBogusDoctype();
+ forceQuirks = false;
+ }
+
+ private void handleNcrValue(int returnState) throws SAXException {
+ /*
+ * If one or more characters match the range, then take them all and
+ * interpret the string of characters as a number (either hexadecimal or
+ * decimal as appropriate).
+ */
+ if (value <= 0xFFFF) {
+ if (value >= 0x80 && value <= 0x9f) {
+ /*
+ * If that number is one of the numbers in the first column of
+ * the following table, then this is a parse error.
+ */
+ errNcrInC1Range();
+ /*
+ * Find the row with that number in the first column, and return
+ * a character token for the Unicode character given in the
+ * second column of that row.
+ */
+ @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
+ emitOrAppendOne(val, returnState);
+ // [NOCPP[
+ } else if (value == 0xC
+ && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
+ if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
+ emitOrAppendOne(Tokenizer.SPACE, returnState);
+ } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
+ fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
+ }
+ // ]NOCPP]
+ } else if (value == 0x0) {
+ errNcrZero();
+ emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
+ } else if ((value & 0xF800) == 0xD800) {
+ errNcrSurrogate();
+ emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
+ } else {
+ /*
+ * Otherwise, return a character token for the Unicode character
+ * whose code point is that number.
+ */
+ char ch = (char) value;
+ // [NOCPP[
+ if (value == 0x0D) {
+ errNcrCr();
+ } else if ((value <= 0x0008) || (value == 0x000B)
+ || (value >= 0x000E && value <= 0x001F)) {
+ ch = errNcrControlChar(ch);
+ } else if (value >= 0xFDD0 && value <= 0xFDEF) {
+ errNcrUnassigned();
+ } else if ((value & 0xFFFE) == 0xFFFE) {
+ ch = errNcrNonCharacter(ch);
+ } else if (value >= 0x007F && value <= 0x009F) {
+ errNcrControlChar();
+ } else {
+ maybeWarnPrivateUse(ch);
+ }
+ // ]NOCPP]
+ bmpChar[0] = ch;
+ emitOrAppendOne(bmpChar, returnState);
+ }
+ } else if (value <= 0x10FFFF) {
+ // [NOCPP[
+ maybeWarnPrivateUseAstral();
+ if ((value & 0xFFFE) == 0xFFFE) {
+ errAstralNonCharacter(value);
+ }
+ // ]NOCPP]
+ astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
+ astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
+ emitOrAppendTwo(astralChar, returnState);
+ } else {
+ errNcrOutOfRange();
+ emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
+ }
+ }
+
+ public void eof() throws SAXException {
+ int state = stateSave;
+ int returnState = returnStateSave;
+
+ eofloop: for (;;) {
+ switch (state) {
+ case SCRIPT_DATA_LESS_THAN_SIGN:
+ case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
+ /*
+ * Otherwise, emit a U+003C LESS-THAN SIGN character token
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ /*
+ * and reconsume the current input character in the data
+ * state.
+ */
+ break eofloop;
+ case TAG_OPEN:
+ /*
+ * The behavior of this state depends on the content model
+ * flag.
+ */
+ /*
+ * Anything else Parse error.
+ */
+ errEofAfterLt();
+ /*
+ * Emit a U+003C LESS-THAN SIGN character token
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ /*
+ * and reconsume the current input character in the data
+ * state.
+ */
+ break eofloop;
+ case RAWTEXT_RCDATA_LESS_THAN_SIGN:
+ /*
+ * Emit a U+003C LESS-THAN SIGN character token
+ */
+ tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
+ /*
+ * and reconsume the current input character in the RCDATA
+ * state.
+ */
+ break eofloop;
+ case NON_DATA_END_TAG_NAME:
+ /*
+ * Emit a U+003C LESS-THAN SIGN character token, a U+002F
+ * SOLIDUS character token,
+ */
+ tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
+ /*
+ * a character token for each of the characters in the
+ * temporary buffer (in the order they were added to the
+ * buffer),
+ */
+ emitStrBuf();
+ /*
+ * and reconsume the current input character in the RCDATA
+ * state.
+ */
+ break eofloop;
+ case CLOSE_TAG_OPEN:
+ /* EOF Parse error. */
+ errEofAfterLt();
+ /*
+ * Emit a U+003C LESS-THAN SIGN character token and a U+002F
+ * SOLIDUS character token.
+ */
+ tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case TAG_NAME:
+ /*
+ * EOF Parse error.
+ */
+ errEofInTagName();
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case BEFORE_ATTRIBUTE_NAME:
+ case AFTER_ATTRIBUTE_VALUE_QUOTED:
+ case SELF_CLOSING_START_TAG:
+ /* EOF Parse error. */
+ errEofWithoutGt();
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case ATTRIBUTE_NAME:
+ /*
+ * EOF Parse error.
+ */
+ errEofInAttributeName();
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case AFTER_ATTRIBUTE_NAME:
+ case BEFORE_ATTRIBUTE_VALUE:
+ /* EOF Parse error. */
+ errEofWithoutGt();
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
+ case ATTRIBUTE_VALUE_SINGLE_QUOTED:
+ case ATTRIBUTE_VALUE_UNQUOTED:
+ /* EOF Parse error. */
+ errEofInAttributeValue();
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case BOGUS_COMMENT:
+ emitComment(0, 0);
+ break eofloop;
+ case BOGUS_COMMENT_HYPHEN:
+ // [NOCPP[
+ maybeAppendSpaceToBogusComment();
+ // ]NOCPP]
+ emitComment(0, 0);
+ break eofloop;
+ case MARKUP_DECLARATION_OPEN:
+ errBogusComment();
+ emitComment(0, 0);
+ break eofloop;
+ case MARKUP_DECLARATION_HYPHEN:
+ errBogusComment();
+ emitComment(0, 0);
+ break eofloop;
+ case MARKUP_DECLARATION_OCTYPE:
+ if (index < 6) {
+ errBogusComment();
+ emitComment(0, 0);
+ } else {
+ /* EOF Parse error. */
+ errEofInDoctype();
+ /*
+ * Create a new DOCTYPE token. Set its force-quirks flag
+ * to on.
+ */
+ doctypeName = "";
+ if (systemIdentifier != null) {
+ Portability.releaseString(systemIdentifier);
+ systemIdentifier = null;
+ }
+ if (publicIdentifier != null) {
+ Portability.releaseString(publicIdentifier);
+ publicIdentifier = null;
+ }
+ forceQuirks = true;
+ /*
+ * Emit the token.
+ */
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ }
+ break eofloop;
+ case COMMENT_START:
+ case COMMENT:
+ /*
+ * EOF Parse error.
+ */
+ errEofInComment();
+ /* Emit the comment token. */
+ emitComment(0, 0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case COMMENT_END:
+ errEofInComment();
+ /* Emit the comment token. */
+ emitComment(2, 0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case COMMENT_END_DASH:
+ case COMMENT_START_DASH:
+ errEofInComment();
+ /* Emit the comment token. */
+ emitComment(1, 0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case COMMENT_END_BANG:
+ errEofInComment();
+ /* Emit the comment token. */
+ emitComment(3, 0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case DOCTYPE:
+ case BEFORE_DOCTYPE_NAME:
+ errEofInDoctype();
+ /*
+ * Create a new DOCTYPE token. Set its force-quirks flag to
+ * on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit the token.
+ */
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case DOCTYPE_NAME:
+ errEofInDoctype();
+ strBufToDoctypeName();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case DOCTYPE_UBLIC:
+ case DOCTYPE_YSTEM:
+ case AFTER_DOCTYPE_NAME:
+ case AFTER_DOCTYPE_PUBLIC_KEYWORD:
+ case AFTER_DOCTYPE_SYSTEM_KEYWORD:
+ case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
+ errEofInDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
+ case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
+ /* EOF Parse error. */
+ errEofInPublicId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ publicIdentifier = strBufToString();
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
+ case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
+ case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
+ errEofInDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
+ case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
+ /* EOF Parse error. */
+ errEofInSystemId();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ systemIdentifier = strBufToString();
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
+ errEofInDoctype();
+ /*
+ * Set the DOCTYPE token's force-quirks flag to on.
+ */
+ forceQuirks = true;
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case BOGUS_DOCTYPE:
+ /*
+ * Emit that DOCTYPE token.
+ */
+ emitDoctypeToken(0);
+ /*
+ * Reconsume the EOF character in the data state.
+ */
+ break eofloop;
+ case CONSUME_CHARACTER_REFERENCE:
+ /*
+ * Unlike the definition is the spec, this state does not
+ * return a value and never requires the caller to
+ * backtrack. This state takes care of emitting characters
+ * or appending to the current attribute value. It also
+ * takes care of that in the case when consuming the entity
+ * fails.
+ */
+ /*
+ * This section defines how to consume an entity. This
+ * definition is used when parsing entities in text and in
+ * attributes.
+ *
+ * The behavior depends on the identity of the next
+ * character (the one immediately after the U+0026 AMPERSAND
+ * character):
+ */
+
+ emitOrAppendCharRefBuf(returnState);
+ state = returnState;
+ continue;
+ case CHARACTER_REFERENCE_HILO_LOOKUP:
+ errNoNamedCharacterMatch();
+ emitOrAppendCharRefBuf(returnState);
+ state = returnState;
+ continue;
+ case CHARACTER_REFERENCE_TAIL:
+ outer: for (;;) {
+ char c = '\u0000';
+ entCol++;
+ /*
+ * Consume the maximum number of characters possible,
+ * with the consumed characters matching one of the
+ * identifiers in the first column of the named
+ * character references table (in a case-sensitive
+ * manner).
+ */
+ hiloop: for (;;) {
+ if (hi == -1) {
+ break hiloop;
+ }
+ if (entCol == NamedCharacters.NAMES[hi].length()) {
+ break hiloop;
+ }
+ if (entCol > NamedCharacters.NAMES[hi].length()) {
+ break outer;
+ } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
+ hi--;
+ } else {
+ break hiloop;
+ }
+ }
+
+ loloop: for (;;) {
+ if (hi < lo) {
+ break outer;
+ }
+ if (entCol == NamedCharacters.NAMES[lo].length()) {
+ candidate = lo;
+ charRefBufMark = charRefBufLen;
+ lo++;
+ } else if (entCol > NamedCharacters.NAMES[lo].length()) {
+ break outer;
+ } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
+ lo++;
+ } else {
+ break loloop;
+ }
+ }
+ if (hi < lo) {
+ break outer;
+ }
+ continue;
+ }
+
+ if (candidate == -1) {
+ /*
+ * If no match can be made, then this is a parse error.
+ */
+ errNoNamedCharacterMatch();
+ emitOrAppendCharRefBuf(returnState);
+ state = returnState;
+ continue eofloop;
+ } else {
+ @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
+ if (candidateName.length() == 0
+ || candidateName.charAt(candidateName.length() - 1) != ';') {
+ /*
+ * If the last character matched is not a U+003B
+ * SEMICOLON (;), there is a parse error.
+ */
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ /*
+ * If the entity is being consumed as part of an
+ * attribute, and the last character matched is
+ * not a U+003B SEMICOLON (;),
+ */
+ char ch;
+ if (charRefBufMark == charRefBufLen) {
+ ch = '\u0000';
+ } else {
+ ch = charRefBuf[charRefBufMark];
+ }
+ if ((ch >= '0' && ch <= '9')
+ || (ch >= 'A' && ch <= 'Z')
+ || (ch >= 'a' && ch <= 'z')) {
+ /*
+ * and the next character is in the range
+ * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
+ * U+0041 LATIN CAPITAL LETTER A to U+005A
+ * LATIN CAPITAL LETTER Z, or U+0061 LATIN
+ * SMALL LETTER A to U+007A LATIN SMALL
+ * LETTER Z, then, for historical reasons,
+ * all the characters that were matched
+ * after the U+0026 AMPERSAND (&) must be
+ * unconsumed, and nothing is returned.
+ */
+ errNoNamedCharacterMatch();
+ appendCharRefBufToStrBuf();
+ state = returnState;
+ continue eofloop;
+ }
+ }
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ errUnescapedAmpersandInterpretedAsCharacterReference();
+ } else {
+ errNotSemicolonTerminated();
+ }
+ }
+
+ /*
+ * Otherwise, return a character token for the character
+ * corresponding to the entity name (as given by the
+ * second column of the named character references
+ * table).
+ */
+ @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
+ if (
+ // [NOCPP[
+ val.length == 1
+ // ]NOCPP]
+ // CPPONLY: val[1] == 0
+ ) {
+ emitOrAppendOne(val, returnState);
+ } else {
+ emitOrAppendTwo(val, returnState);
+ }
+ // this is so complicated!
+ if (charRefBufMark < charRefBufLen) {
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ appendStrBuf(charRefBuf, charRefBufMark,
+ charRefBufLen - charRefBufMark);
+ } else {
+ tokenHandler.characters(charRefBuf, charRefBufMark,
+ charRefBufLen - charRefBufMark);
+ }
+ }
+ charRefBufLen = 0;
+ state = returnState;
+ continue eofloop;
+ /*
+ * If the markup contains I'm &notit; I tell you, the
+ * entity is parsed as "not", as in, I'm ¬it; I tell
+ * you. But if the markup was I'm &notin; I tell you,
+ * the entity would be parsed as "notin;", resulting in
+ * I'm ∉ I tell you.
+ */
+ }
+ case CONSUME_NCR:
+ case DECIMAL_NRC_LOOP:
+ case HEX_NCR_LOOP:
+ /*
+ * If no characters match the range, then don't consume any
+ * characters (and unconsume the U+0023 NUMBER SIGN
+ * character and, if appropriate, the X character). This is
+ * a parse error; nothing is returned.
+ *
+ * Otherwise, if the next character is a U+003B SEMICOLON,
+ * consume that too. If it isn't, there is a parse error.
+ */
+ if (!seenDigits) {
+ errNoDigitsInNCR();
+ emitOrAppendCharRefBuf(returnState);
+ state = returnState;
+ continue;
+ } else {
+ errCharRefLacksSemicolon();
+ }
+ // WARNING previous state sets reconsume
+ handleNcrValue(returnState);
+ state = returnState;
+ continue;
+ case CDATA_RSQB:
+ tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
+ break eofloop;
+ case CDATA_RSQB_RSQB:
+ tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
+ break eofloop;
+ case DATA:
+ default:
+ break eofloop;
+ }
+ }
+ // case DATA:
+ /*
+ * EOF Emit an end-of-file token.
+ */
+ tokenHandler.eof();
+ return;
+ }
+
+ private void emitDoctypeToken(int pos) throws SAXException {
+ cstart = pos + 1;
+ tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
+ forceQuirks);
+ // It is OK and sufficient to release these here, since
+ // there's no way out of the doctype states than through paths
+ // that call this method.
+ doctypeName = null;
+ Portability.releaseString(publicIdentifier);
+ publicIdentifier = null;
+ Portability.releaseString(systemIdentifier);
+ systemIdentifier = null;
+ }
+
+ @Inline protected char checkChar(@NoLength char[] buf, int pos)
+ throws SAXException {
+ return buf[pos];
+ }
+
+ public boolean internalEncodingDeclaration(String internalCharset)
+ throws SAXException {
+ if (encodingDeclarationHandler != null) {
+ return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
+ }
+ return false;
+ }
+
+ /**
+ * @param val
+ * @throws SAXException
+ */
+ private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
+ throws SAXException {
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ appendStrBuf(val[0]);
+ appendStrBuf(val[1]);
+ } else {
+ tokenHandler.characters(val, 0, 2);
+ }
+ }
+
+ private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
+ throws SAXException {
+ if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
+ appendStrBuf(val[0]);
+ } else {
+ tokenHandler.characters(val, 0, 1);
+ }
+ }
+
+ public void end() throws SAXException {
+ strBuf = null;
+ doctypeName = null;
+ if (systemIdentifier != null) {
+ Portability.releaseString(systemIdentifier);
+ systemIdentifier = null;
+ }
+ if (publicIdentifier != null) {
+ Portability.releaseString(publicIdentifier);
+ publicIdentifier = null;
+ }
+ if (tagName != null) {
+ tagName.release();
+ tagName = null;
+ }
+ if (attributeName != null) {
+ attributeName.release();
+ attributeName = null;
+ }
+ tokenHandler.endTokenization();
+ if (attributes != null) {
+ // [NOCPP[
+ attributes = null;
+ // ]NOCPP]
+ // CPPONLY: attributes.clear(mappingLangToXmlLang);
+ }
+ }
+
+ public void requestSuspension() {
+ shouldSuspend = true;
+ }
+
+ // [NOCPP[
+
+ public void becomeConfident() {
+ confident = true;
+ }
+
+ /**
+ * Returns the nextCharOnNewLine.
+ *
+ * @return the nextCharOnNewLine
+ */
+ public boolean isNextCharOnNewLine() {
+ return false;
+ }
+
+ public boolean isPrevCR() {
+ return lastCR;
+ }
+
+ /**
+ * Returns the line.
+ *
+ * @return the line
+ */
+ public int getLine() {
+ return -1;
+ }
+
+ /**
+ * Returns the col.
+ *
+ * @return the col
+ */
+ public int getCol() {
+ return -1;
+ }
+
+ // ]NOCPP]
+
+ public boolean isInDataState() {
+ return (stateSave == DATA);
+ }
+
+ public void resetToDataState() {
+ clearStrBufAfterUse();
+ charRefBufLen = 0;
+ stateSave = Tokenizer.DATA;
+ // line = 1; XXX line numbers
+ lastCR = false;
+ index = 0;
+ forceQuirks = false;
+ additional = '\u0000';
+ entCol = -1;
+ firstCharKey = -1;
+ lo = 0;
+ hi = 0; // will always be overwritten before use anyway
+ candidate = -1;
+ charRefBufMark = 0;
+ value = 0;
+ seenDigits = false;
+ endTag = false;
+ shouldSuspend = false;
+ initDoctypeFields();
+ if (tagName != null) {
+ tagName.release();
+ tagName = null;
+ }
+ if (attributeName != null) {
+ attributeName.release();
+ attributeName = null;
+ }
+ if (newAttributesEachTime) {
+ if (attributes != null) {
+ Portability.delete(attributes);
+ attributes = null;
+ }
+ }
+ }
+
+ public void loadState(Tokenizer other) throws SAXException {
+ strBufLen = other.strBufLen;
+ if (strBufLen > strBuf.length) {
+ strBuf = new char[strBufLen];
+ }
+ System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
+
+ charRefBufLen = other.charRefBufLen;
+ System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);
+
+ stateSave = other.stateSave;
+ returnStateSave = other.returnStateSave;
+ endTagExpectation = other.endTagExpectation;
+ endTagExpectationAsArray = other.endTagExpectationAsArray;
+ // line = 1; XXX line numbers
+ lastCR = other.lastCR;
+ index = other.index;
+ forceQuirks = other.forceQuirks;
+ additional = other.additional;
+ entCol = other.entCol;
+ firstCharKey = other.firstCharKey;
+ lo = other.lo;
+ hi = other.hi;
+ candidate = other.candidate;
+ charRefBufMark = other.charRefBufMark;
+ value = other.value;
+ seenDigits = other.seenDigits;
+ endTag = other.endTag;
+ shouldSuspend = false;
+
+ if (other.doctypeName == null) {
+ doctypeName = null;
+ } else {
+ doctypeName = Portability.newLocalFromLocal(other.doctypeName,
+ interner);
+ }
+
+ Portability.releaseString(systemIdentifier);
+ if (other.systemIdentifier == null) {
+ systemIdentifier = null;
+ } else {
+ systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
+ }
+
+ Portability.releaseString(publicIdentifier);
+ if (other.publicIdentifier == null) {
+ publicIdentifier = null;
+ } else {
+ publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
+ }
+
+ if (tagName != null) {
+ tagName.release();
+ }
+ if (other.tagName == null) {
+ tagName = null;
+ } else {
+ tagName = other.tagName.cloneElementName(interner);
+ }
+
+ if (attributeName != null) {
+ attributeName.release();
+ }
+ if (other.attributeName == null) {
+ attributeName = null;
+ } else {
+ attributeName = other.attributeName.cloneAttributeName(interner);
+ }
+
+ Portability.delete(attributes);
+ if (other.attributes == null) {
+ attributes = null;
+ } else {
+ attributes = other.attributes.cloneAttributes(interner);
+ }
+ }
+
+ public void initializeWithoutStarting() throws SAXException {
+ confident = false;
+ strBuf = null;
+ line = 1;
+ // CPPONLY: attributeLine = 1;
+ // [NOCPP[
+ html4 = false;
+ metaBoundaryPassed = false;
+ wantsComments = tokenHandler.wantsComments();
+ if (!newAttributesEachTime) {
+ attributes = new HtmlAttributes(mappingLangToXmlLang);
+ }
+ // ]NOCPP]
+ resetToDataState();
+ }
+
+ protected void errGarbageAfterLtSlash() throws SAXException {
+ }
+
+ protected void errLtSlashGt() throws SAXException {
+ }
+
+ protected void errWarnLtSlashInRcdata() throws SAXException {
+ }
+
+ protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
+ }
+
+ protected void errCharRefLacksSemicolon() throws SAXException {
+ }
+
+ protected void errNoDigitsInNCR() throws SAXException {
+ }
+
+ protected void errGtInSystemId() throws SAXException {
+ }
+
+ protected void errGtInPublicId() throws SAXException {
+ }
+
+ protected void errNamelessDoctype() throws SAXException {
+ }
+
+ protected void errConsecutiveHyphens() throws SAXException {
+ }
+
+ protected void errPrematureEndOfComment() throws SAXException {
+ }
+
+ protected void errBogusComment() throws SAXException {
+ }
+
+ protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
+ }
+
+ protected void errSlashNotFollowedByGt() throws SAXException {
+ }
+
+ protected void errHtml4XmlVoidSyntax() throws SAXException {
+ }
+
+ protected void errNoSpaceBetweenAttributes() throws SAXException {
+ }
+
+ protected void errHtml4NonNameInUnquotedAttribute(char c)
+ throws SAXException {
+ }
+
+ protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
+ throws SAXException {
+ }
+
+ protected void errAttributeValueMissing() throws SAXException {
+ }
+
+ protected void errBadCharBeforeAttributeNameOrNull(char c)
+ throws SAXException {
+ }
+
+ protected void errEqualsSignBeforeAttributeName() throws SAXException {
+ }
+
+ protected void errBadCharAfterLt(char c) throws SAXException {
+ }
+
+ protected void errLtGt() throws SAXException {
+ }
+
+ protected void errProcessingInstruction() throws SAXException {
+ }
+
+ protected void errUnescapedAmpersandInterpretedAsCharacterReference()
+ throws SAXException {
+ }
+
+ protected void errNotSemicolonTerminated() throws SAXException {
+ }
+
+ protected void errNoNamedCharacterMatch() throws SAXException {
+ }
+
+ protected void errQuoteBeforeAttributeName(char c) throws SAXException {
+ }
+
+ protected void errQuoteOrLtInAttributeNameOrNull(char c)
+ throws SAXException {
+ }
+
+ protected void errExpectedPublicId() throws SAXException {
+ }
+
+ protected void errBogusDoctype() throws SAXException {
+ }
+
+ protected void maybeWarnPrivateUseAstral() throws SAXException {
+ }
+
+ protected void maybeWarnPrivateUse(char ch) throws SAXException {
+ }
+
+ protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
+ throws SAXException {
+ }
+
+ protected void maybeErrSlashInEndTag(boolean selfClosing)
+ throws SAXException {
+ }
+
+ protected char errNcrNonCharacter(char ch) throws SAXException {
+ return ch;
+ }
+
+ protected void errAstralNonCharacter(int ch) throws SAXException {
+ }
+
+ protected void errNcrSurrogate() throws SAXException {
+ }
+
+ protected char errNcrControlChar(char ch) throws SAXException {
+ return ch;
+ }
+
+ protected void errNcrCr() throws SAXException {
+ }
+
+ protected void errNcrInC1Range() throws SAXException {
+ }
+
+ protected void errEofInPublicId() throws SAXException {
+ }
+
+ protected void errEofInComment() throws SAXException {
+ }
+
+ protected void errEofInDoctype() throws SAXException {
+ }
+
+ protected void errEofInAttributeValue() throws SAXException {
+ }
+
+ protected void errEofInAttributeName() throws SAXException {
+ }
+
+ protected void errEofWithoutGt() throws SAXException {
+ }
+
+ protected void errEofInTagName() throws SAXException {
+ }
+
+ protected void errEofInEndTag() throws SAXException {
+ }
+
+ protected void errEofAfterLt() throws SAXException {
+ }
+
+ protected void errNcrOutOfRange() throws SAXException {
+ }
+
+ protected void errNcrUnassigned() throws SAXException {
+ }
+
+ protected void errDuplicateAttribute() throws SAXException {
+ }
+
+ protected void errEofInSystemId() throws SAXException {
+ }
+
+ protected void errExpectedSystemId() throws SAXException {
+ }
+
+ protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
+ }
+
+ protected void errHyphenHyphenBang() throws SAXException {
+ }
+
+ protected void errNcrControlChar() throws SAXException {
+ }
+
+ protected void errNcrZero() throws SAXException {
+ }
+
+ protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
+ throws SAXException {
+ }
+
+ protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
+ }
+
+ protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
+ throws SAXException {
+ }
+
+ protected void noteAttributeWithoutValue() throws SAXException {
+ }
+
+ protected void noteUnquotedAttributeValue() throws SAXException {
+ }
+
+ /**
+ * Sets the encodingDeclarationHandler.
+ *
+ * @param encodingDeclarationHandler
+ * the encodingDeclarationHandler to set
+ */
+ public void setEncodingDeclarationHandler(
+ EncodingDeclarationHandler encodingDeclarationHandler) {
+ this.encodingDeclarationHandler = encodingDeclarationHandler;
+ }
+
+ void destructor() {
+ // The translator will write refcount tracing stuff here
+ Portability.delete(attributes);
+ attributes = null;
+ }
+
+ // [NOCPP[
+
+ /**
+ * Sets an offset to be added to the position reported to
+ * <code>TransitionHandler</code>.
+ *
+ * @param offset the offset
+ */
+ public void setTransitionBaseOffset(int offset) {
+
+ }
+
+ // ]NOCPP]
+
+}