summaryrefslogtreecommitdiffstats
path: root/parser/html/javasrc/Tokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'parser/html/javasrc/Tokenizer.java')
-rw-r--r--parser/html/javasrc/Tokenizer.java7089
1 files changed, 0 insertions, 7089 deletions
diff --git a/parser/html/javasrc/Tokenizer.java b/parser/html/javasrc/Tokenizer.java
deleted file mode 100644
index f141d94d7..000000000
--- a/parser/html/javasrc/Tokenizer.java
+++ /dev/null
@@ -1,7089 +0,0 @@
-/*
- * Copyright (c) 2005-2007 Henri Sivonen
- * Copyright (c) 2007-2015 Mozilla Foundation
- * Copyright (c) 2019 Moonchild Productions
- * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla
- * Foundation, and Opera Software ASA.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * The comments following this one that use the same comment syntax as this
- * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007
- * amended as of June 18 2008 and May 31 2010.
- * That document came with this statement:
- * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and
- * Opera Software ASA. You are granted a license to use, reproduce and
- * create derivative works of this document."
- */
-
-package nu.validator.htmlparser.impl;
-
-import org.xml.sax.ErrorHandler;
-import org.xml.sax.Locator;
-import org.xml.sax.SAXException;
-import org.xml.sax.SAXParseException;
-
-import nu.validator.htmlparser.annotation.Auto;
-import nu.validator.htmlparser.annotation.CharacterName;
-import nu.validator.htmlparser.annotation.Const;
-import nu.validator.htmlparser.annotation.Inline;
-import nu.validator.htmlparser.annotation.Local;
-import nu.validator.htmlparser.annotation.NoLength;
-import nu.validator.htmlparser.common.EncodingDeclarationHandler;
-import nu.validator.htmlparser.common.Interner;
-import nu.validator.htmlparser.common.TokenHandler;
-import nu.validator.htmlparser.common.XmlViolationPolicy;
-
-/**
- * An implementation of
- * https://html.spec.whatwg.org/multipage/syntax.html#tokenization
- *
- * This class implements the <code>Locator</code> interface. This is not an
- * incidental implementation detail: Users of this class are encouraged to make
- * use of the <code>Locator</code> nature.
- *
- * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer
- * can be configured to treat these conditions as fatal or to coerce the infoset
- * to something that XML 1.0 allows.
- *
- * @version $Id$
- * @author hsivonen
- */
-public class Tokenizer implements Locator {
-
- /**
- * Mask with every bit set except the lowest (<code>~1</code>). ANDing a
- * state value with it yields zero only for <code>DATA</code> (0) and
- * <code>RCDATA</code> (1); <code>emitOrAppendCharRefBuf</code> relies on
- * that to tell those two return states apart from all others.
- */
- private static final int DATA_AND_RCDATA_MASK = ~1;
-
- public static final int DATA = 0;
-
- public static final int RCDATA = 1;
-
- public static final int SCRIPT_DATA = 2;
-
- public static final int RAWTEXT = 3;
-
- public static final int SCRIPT_DATA_ESCAPED = 4;
-
- public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;
-
- public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;
-
- public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;
-
- public static final int PLAINTEXT = 8;
-
- public static final int TAG_OPEN = 9;
-
- public static final int CLOSE_TAG_OPEN = 10;
-
- public static final int TAG_NAME = 11;
-
- public static final int BEFORE_ATTRIBUTE_NAME = 12;
-
- public static final int ATTRIBUTE_NAME = 13;
-
- public static final int AFTER_ATTRIBUTE_NAME = 14;
-
- public static final int BEFORE_ATTRIBUTE_VALUE = 15;
-
- public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;
-
- public static final int BOGUS_COMMENT = 17;
-
- public static final int MARKUP_DECLARATION_OPEN = 18;
-
- public static final int DOCTYPE = 19;
-
- public static final int BEFORE_DOCTYPE_NAME = 20;
-
- public static final int DOCTYPE_NAME = 21;
-
- public static final int AFTER_DOCTYPE_NAME = 22;
-
- public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;
-
- public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;
-
- public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;
-
- public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;
-
- public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;
-
- public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;
-
- public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;
-
- public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;
-
- public static final int BOGUS_DOCTYPE = 31;
-
- public static final int COMMENT_START = 32;
-
- public static final int COMMENT_START_DASH = 33;
-
- public static final int COMMENT = 34;
-
- public static final int COMMENT_END_DASH = 35;
-
- public static final int COMMENT_END = 36;
-
- public static final int COMMENT_END_BANG = 37;
-
- public static final int NON_DATA_END_TAG_NAME = 38;
-
- public static final int MARKUP_DECLARATION_HYPHEN = 39;
-
- public static final int MARKUP_DECLARATION_OCTYPE = 40;
-
- public static final int DOCTYPE_UBLIC = 41;
-
- public static final int DOCTYPE_YSTEM = 42;
-
- public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;
-
- public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;
-
- public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;
-
- public static final int CONSUME_CHARACTER_REFERENCE = 46;
-
- public static final int CONSUME_NCR = 47;
-
- public static final int CHARACTER_REFERENCE_TAIL = 48;
-
- public static final int HEX_NCR_LOOP = 49;
-
- // NOTE(review): "NRC" looks like a transposition of "NCR" (compare
- // HEX_NCR_LOOP and CONSUME_NCR). The constant is public, so confirm there
- // are no external users before renaming it.
- public static final int DECIMAL_NRC_LOOP = 50;
-
- public static final int HANDLE_NCR_VALUE = 51;
-
- public static final int HANDLE_NCR_VALUE_RECONSUME = 52;
-
- public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;
-
- public static final int SELF_CLOSING_START_TAG = 54;
-
- public static final int CDATA_START = 55;
-
- public static final int CDATA_SECTION = 56;
-
- public static final int CDATA_RSQB = 57;
-
- public static final int CDATA_RSQB_RSQB = 58;
-
- public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;
-
- public static final int SCRIPT_DATA_ESCAPE_START = 60;
-
- public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;
-
- public static final int SCRIPT_DATA_ESCAPED_DASH = 62;
-
- public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;
-
- public static final int BOGUS_COMMENT_HYPHEN = 64;
-
- public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;
-
- public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;
-
- public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;
-
- public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;
-
- public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;
-
- public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;
-
- public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;
-
- public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;
-
- public static final int PROCESSING_INSTRUCTION = 73;
-
- public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
-
- /**
- * Constant for UTF-16 surrogate arithmetic: 0xD800 - (0x10000 >> 10), i.e.
- * 0xD7C0.
- *
- * NOTE(review): presumably added to (codePoint >> 10) to obtain the lead
- * surrogate of an astral code point; the use site is outside this part of
- * the file -- confirm.
- */
- private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));
-
- /**
- * UTF-16 code unit array containing less than and greater than for emitting
- * those characters on certain parse errors.
- */
- private static final @NoLength char[] LT_GT = { '<', '>' };
-
- /**
- * UTF-16 code unit array containing less than and solidus for emitting
- * those characters on certain parse errors.
- */
- private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };
-
- /**
- * UTF-16 code unit array containing ]] for emitting those characters on
- * state transitions.
- */
- private static final @NoLength char[] RSQB_RSQB = { ']', ']' };
-
- /**
- * Array version of U+FFFD.
- */
- private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
-
- // [NOCPP[
-
- /**
- * Array version of space.
- */
- private static final @NoLength char[] SPACE = { ' ' };
-
- // ]NOCPP]
-
- /**
- * Array version of line feed.
- */
- private static final @NoLength char[] LF = { '\n' };
-
- /**
- * "CDATA[" as <code>char[]</code>
- */
- private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
- 'A', '[' };
-
- /**
- * "octype" as <code>char[]</code>
- */
- private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
- 'e' };
-
- /**
- * "ublic" as <code>char[]</code>
- */
- private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };
-
- /**
- * "ystem" as <code>char[]</code>
- */
- private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };
-
- // Lower-case element names as char arrays. endTagExpectationToArray()
- // picks one of these according to the group of the expected end tag and
- // stores it in endTagExpectationAsArray. Unlike the arrays above, these
- // are not annotated @NoLength.
- private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };
-
- private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };
-
- private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };
-
- private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
- 'e', 'x', 't' };
-
- private static final char[] XMP_ARR = { 'x', 'm', 'p' };
-
- private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
- 'e', 'a' };
-
- private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };
-
- private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
- 'd' };
-
- private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
- 'p', 't' };
-
- private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
- 'e', 's' };
-
- /**
- * The token handler.
- */
- protected final TokenHandler tokenHandler;
-
- protected EncodingDeclarationHandler encodingDeclarationHandler;
-
- // [NOCPP[
-
- /**
- * The error handler.
- */
- protected ErrorHandler errorHandler;
-
- // ]NOCPP]
-
- /**
- * Whether the previous char read was CR.
- */
- protected boolean lastCR;
-
- protected int stateSave;
-
- private int returnStateSave;
-
- protected int index;
-
- private boolean forceQuirks;
-
- private char additional;
-
- private int entCol;
-
- private int firstCharKey;
-
- private int lo;
-
- private int hi;
-
- private int candidate;
-
- private int charRefBufMark;
-
- protected int value;
-
- private boolean seenDigits;
-
- protected int cstart;
-
- /**
- * The SAX public id of the resource being tokenized. It is set by
- * <code>initLocation()</code> and only handed back to callers through
- * <code>getPublicId()</code> as part of the locator data.
- */
- private String publicId;
-
- /**
- * The SAX system id of the resource being tokenized. It is set by
- * <code>initLocation()</code> and only handed back to callers through
- * <code>getSystemId()</code> as part of the locator data.
- */
- private String systemId;
-
- /**
- * Buffer for bufferable things other than those that fit the description
- * of <code>charRefBuf</code>.
- */
- private @Auto char[] strBuf;
-
- /**
- * Number of significant <code>char</code>s in <code>strBuf</code>.
- */
- private int strBufLen;
-
- /**
- * Buffer for characters that might form a character reference but may
- * end up not forming one.
- */
- private final @Auto char[] charRefBuf;
-
- /**
- * Number of significant <code>char</code>s in <code>charRefBuf</code>.
- */
- private int charRefBufLen;
-
- /**
- * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
- */
- private final @Auto char[] bmpChar;
-
- /**
- * Buffer for expanding astral NCRs.
- */
- private final @Auto char[] astralChar;
-
- /**
- * The element whose end tag closes the current CDATA or RCDATA element.
- */
- protected ElementName endTagExpectation = null;
-
- private char[] endTagExpectationAsArray; // not @Auto!
-
- /**
- * <code>true</code> if tokenizing an end tag
- */
- protected boolean endTag;
-
- /**
- * The current tag token name.
- */
- private ElementName tagName = null;
-
- /**
- * The current attribute name.
- */
- protected AttributeName attributeName = null;
-
- // [NOCPP[
-
- /**
- * Whether comment tokens are emitted.
- */
- private boolean wantsComments = false;
-
- /**
- * <code>true</code> when HTML4-specific additional errors are requested.
- */
- protected boolean html4;
-
- /**
- * Whether the stream is past the first 1024 bytes.
- */
- private boolean metaBoundaryPassed;
-
- // ]NOCPP]
-
- /**
- * The name of the current doctype token.
- */
- private @Local String doctypeName;
-
- /**
- * The public id of the current doctype token.
- */
- private String publicIdentifier;
-
- /**
- * The system id of the current doctype token.
- */
- private String systemIdentifier;
-
- /**
- * The attribute holder.
- */
- private HtmlAttributes attributes;
-
- // [NOCPP[
-
- /**
- * The policy for vertical tab and form feed.
- */
- private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;
-
- /**
- * The policy for comments.
- */
- private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;
-
- private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;
-
- private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;
-
- private boolean html4ModeCompatibleWithXhtml1Schemata;
-
- private int mappingLangToXmlLang;
-
- // ]NOCPP]
-
- private final boolean newAttributesEachTime;
-
- private boolean shouldSuspend;
-
- protected boolean confident;
-
- private int line;
-
- /*
- * The line number of the current attribute. First set to the line of the
- * attribute name and if there is a value, set to the line the value
- * started on.
- */
- // CPPONLY: private int attributeLine;
-
- private Interner interner;
-
- // CPPONLY: private boolean viewingXmlSource;
-
- // [NOCPP[
-
- protected LocatorImpl ampersandLocation;
-
- /**
- * Creates a tokenizer with an explicit attribute-allocation policy. This
- * constructor sits inside a [NOCPP[ section.
- *
- * @param tokenHandler
- * the handler for receiving tokens
- * @param newAttributesEachTime
- * when <code>true</code>, <code>emptyAttributes()</code> returns a
- * fresh <code>HtmlAttributes</code> and the attribute holder is
- * dropped after each tag token instead of being cleared for reuse
- */
- public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
- this.tokenHandler = tokenHandler;
- this.encodingDeclarationHandler = null;
- this.newAttributesEachTime = newAttributesEachTime;
- // &CounterClockwiseContourIntegral; is the longest valid char ref and
- // the semicolon never gets appended to the buffer.
- this.charRefBuf = new char[32];
- // One UTF-16 code unit for a BMP character, two for an astral one.
- this.bmpChar = new char[1];
- this.astralChar = new char[2];
- this.tagName = null;
- this.attributeName = null;
- this.doctypeName = null;
- this.publicIdentifier = null;
- this.systemIdentifier = null;
- this.attributes = null;
- }
-
- // ]NOCPP]
-
- /**
- * Creates a tokenizer. On the Java side <code>newAttributesEachTime</code>
- * is fixed to <code>false</code> and no attribute holder is allocated
- * here; the CPPONLY lines show the values used by the C++ build instead.
- *
- * @param tokenHandler
- * the handler for receiving tokens
- */
- public Tokenizer(TokenHandler tokenHandler
- // CPPONLY: , boolean viewingXmlSource
- ) {
- this.tokenHandler = tokenHandler;
- this.encodingDeclarationHandler = null;
- // [NOCPP[
- this.newAttributesEachTime = false;
- // ]NOCPP]
- // &CounterClockwiseContourIntegral; is the longest valid char ref and
- // the semicolon never gets appended to the buffer.
- this.charRefBuf = new char[32];
- // One UTF-16 code unit for a BMP character, two for an astral one.
- this.bmpChar = new char[1];
- this.astralChar = new char[2];
- this.tagName = null;
- this.attributeName = null;
- this.doctypeName = null;
- this.publicIdentifier = null;
- this.systemIdentifier = null;
- // [NOCPP[
- this.attributes = null;
- // ]NOCPP]
- // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
- // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
- // CPPONLY: this.viewingXmlSource = viewingXmlSource;
- }
-
- /**
- * Sets the interner that this class passes along when it creates element
- * names, attribute names and the doctype name from the buffer.
- *
- * @param interner
- * the interner to use
- */
- public void setInterner(Interner interner) {
- this.interner = interner;
- }
-
- /**
- * Stores the ids that <code>getPublicId()</code> and
- * <code>getSystemId()</code> report as locator data.
- *
- * @param newPublicId
- * the public id of the resource being tokenized
- * @param newSystemId
- * the system id of the resource being tokenized
- */
- public void initLocation(String newPublicId, String newSystemId) {
- this.systemId = newSystemId;
- this.publicId = newPublicId;
-
- }
-
- // CPPONLY: boolean isViewingXmlSource() {
- // CPPONLY: return viewingXmlSource;
- // CPPONLY: }
-
- // [NOCPP[
-
- /**
- * Tells whether lang-to-xml:lang mapping is switched on, i.e. whether the
- * stored attribute mode equals <code>AttributeName.HTML_LANG</code>.
- *
- * @return <code>true</code> if the mapping is enabled
- */
- public boolean isMappingLangToXmlLang() {
- return mappingLangToXmlLang == AttributeName.HTML_LANG;
- }
-
- /**
- * Switches lang-to-xml:lang mapping on or off. The boolean is stored as an
- * attribute mode: <code>AttributeName.HTML_LANG</code> when enabled,
- * <code>AttributeName.HTML</code> otherwise.
- *
- * @param mappingLangToXmlLang
- * <code>true</code> to enable the mapping
- */
- public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
- this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
- : AttributeName.HTML;
- }
-
- /**
- * Sets the error handler that receives the warnings, errors and fatal
- * errors reported by this tokenizer. With a <code>null</code> handler,
- * <code>warn()</code> and <code>err()</code> do nothing.
- *
- * @param eh
- * the error handler, possibly <code>null</code>
- * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
- */
- public void setErrorHandler(ErrorHandler eh) {
- this.errorHandler = eh;
- }
-
- /**
- * Returns the error handler.
- *
- * @return the current error handler, or <code>null</code> if none is set
- */
- public ErrorHandler getErrorHandler() {
- return this.errorHandler;
- }
-
- /**
- * Sets the policy applied when a comment contains two consecutive hyphens
- * or a trailing hyphen, which makes the document not mappable to XML 1.0.
- *
- * @param commentPolicy
- * the commentPolicy to set
- */
- public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
- this.commentPolicy = commentPolicy;
- }
-
- /**
- * Validates the contentNonXmlCharPolicy. This class stores nothing here:
- * it accepts only <code>XmlViolationPolicy.ALLOW</code> and rejects every
- * other value.
- *
- * @param contentNonXmlCharPolicy
- * the contentNonXmlCharPolicy to set
- * @throws IllegalArgumentException
- * if the policy is anything other than <code>ALLOW</code>
- */
- public void setContentNonXmlCharPolicy(
- XmlViolationPolicy contentNonXmlCharPolicy) {
- if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
- throw new IllegalArgumentException(
- "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
- }
- }
-
- /**
- * Sets the contentSpacePolicy, i.e. the policy for vertical tab and form
- * feed.
- *
- * @param contentSpacePolicy
- * the contentSpacePolicy to set
- */
- public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
- this.contentSpacePolicy = contentSpacePolicy;
- }
-
- /**
- * Sets the xmlnsPolicy, which is handed to the attribute holder whenever
- * an attribute is added.
- *
- * @param xmlnsPolicy
- * the xmlnsPolicy to set; <code>FATAL</code> is not supported
- * @throws IllegalArgumentException
- * if the policy is <code>XmlViolationPolicy.FATAL</code>
- */
- public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
- if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
- throw new IllegalArgumentException("Can't use FATAL here.");
- }
- this.xmlnsPolicy = xmlnsPolicy;
- }
-
- /**
- * Sets the namePolicy. <code>attributeNameComplete()</code> passes
- * "policy is not ALLOW" as a flag when it looks up the attribute name.
- *
- * @param namePolicy
- * the namePolicy to set
- */
- public void setNamePolicy(XmlViolationPolicy namePolicy) {
- this.namePolicy = namePolicy;
- }
-
- /**
- * Sets the html4ModeCompatibleWithXhtml1Schemata flag. In HTML4 mode the
- * flag makes a valueless boolean attribute take its own local name as the
- * value and makes values of case-folded attributes on start tags
- * lower-case.
- *
- * @param html4ModeCompatibleWithXhtml1Schemata
- * the html4ModeCompatibleWithXhtml1Schemata to set
- */
- public void setHtml4ModeCompatibleWithXhtml1Schemata(
- boolean html4ModeCompatibleWithXhtml1Schemata) {
- this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
- }
-
- // ]NOCPP]
-
- // For the token handler to call
- /**
- * Sets the tokenizer state and clears any end tag expectation (both the
- * element name and its char array form). Use
- * <code>setStateAndEndTagExpectation()</code> for states that need an
- * expected end tag.
- *
- * @param specialTokenizerState
- * the tokenizer state to set
- */
- public void setState(int specialTokenizerState) {
- this.stateSave = specialTokenizerState;
- this.endTagExpectation = null;
- this.endTagExpectationAsArray = null;
- }
-
- // [NOCPP[
-
- /**
- * Sets the tokenizer state and the associated element name. This should
- * only ever be used to put the tokenizer into one of the states that have
- * a special end tag expectation. For use from the tokenizer test harness.
- *
- * When the state is <code>DATA</code>, only the state is stored and the
- * previous end tag expectation is left untouched.
- *
- * @param specialTokenizerState
- * the tokenizer state to set
- * @param endTagExpectation
- * the expected end tag for transitioning back to normal
- */
- public void setStateAndEndTagExpectation(int specialTokenizerState,
- @Local String endTagExpectation) {
- this.stateSave = specialTokenizerState;
- if (specialTokenizerState == Tokenizer.DATA) {
- return;
- }
- // Resolve the name to an ElementName, then cache its char array form.
- @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
- this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
- asArray.length, interner);
- endTagExpectationToArray();
- }
-
- // ]NOCPP]
-
- /**
- * Sets the tokenizer state and the associated element name. This should
- * only ever be used to put the tokenizer into one of the states that have
- * a special end tag expectation. The char array form of the expected name
- * is refreshed as well.
- *
- * @param specialTokenizerState
- * the tokenizer state to set
- * @param endTagExpectation
- * the expected end tag for transitioning back to normal; it is
- * dereferenced immediately, so it must not be <code>null</code>
- */
- public void setStateAndEndTagExpectation(int specialTokenizerState,
- ElementName endTagExpectation) {
- this.stateSave = specialTokenizerState;
- this.endTagExpectation = endTagExpectation;
- endTagExpectationToArray();
- }
-
- /**
- * Points <code>endTagExpectationAsArray</code> at the shared constant
- * array matching the group of <code>endTagExpectation</code>. For any
- * other group an assertion fires; with assertions disabled the previous
- * array is left in place.
- */
- private void endTagExpectationToArray() {
- switch (endTagExpectation.getGroup()) {
- case TreeBuilder.TITLE:
- endTagExpectationAsArray = TITLE_ARR;
- return;
- case TreeBuilder.SCRIPT:
- endTagExpectationAsArray = SCRIPT_ARR;
- return;
- case TreeBuilder.STYLE:
- endTagExpectationAsArray = STYLE_ARR;
- return;
- case TreeBuilder.PLAINTEXT:
- endTagExpectationAsArray = PLAINTEXT_ARR;
- return;
- case TreeBuilder.XMP:
- endTagExpectationAsArray = XMP_ARR;
- return;
- case TreeBuilder.TEXTAREA:
- endTagExpectationAsArray = TEXTAREA_ARR;
- return;
- case TreeBuilder.IFRAME:
- endTagExpectationAsArray = IFRAME_ARR;
- return;
- case TreeBuilder.NOEMBED:
- endTagExpectationAsArray = NOEMBED_ARR;
- return;
- case TreeBuilder.NOSCRIPT:
- endTagExpectationAsArray = NOSCRIPT_ARR;
- return;
- case TreeBuilder.NOFRAMES:
- endTagExpectationAsArray = NOFRAMES_ARR;
- return;
- default:
- // Only the groups listed above are expected here.
- assert false: "Bad end tag expectation.";
- return;
- }
- }
-
- /**
- * Overrides the current line number, which <code>getLineNumber()</code>
- * reports. For C++ use only.
- *
- * @param line
- * the line number to set
- */
- public void setLineNumber(int line) {
- // CPPONLY: this.attributeLine = line; // XXX is this needed?
- this.line = line;
- }
-
- // start Locator impl
-
- /**
- * Returns the current line number tracked by the tokenizer.
- *
- * @return the value of the <code>line</code> field
- * @see org.xml.sax.Locator#getLineNumber()
- */
- @Inline public int getLineNumber() {
- return line;
- }
-
- // [NOCPP[
-
- /**
- * Always returns -1: this class does not report a column number.
- *
- * @return -1
- * @see org.xml.sax.Locator#getColumnNumber()
- */
- @Inline public int getColumnNumber() {
- return -1;
- }
-
- /**
- * Returns the public id stored by <code>initLocation()</code>.
- *
- * @return the public id, or <code>null</code> if none has been set
- * @see org.xml.sax.Locator#getPublicId()
- */
- public String getPublicId() {
- return publicId;
- }
-
- /**
- * Returns the system id stored by <code>initLocation()</code>.
- *
- * @return the system id, or <code>null</code> if none has been set
- * @see org.xml.sax.Locator#getSystemId()
- */
- public String getSystemId() {
- return systemId;
- }
-
- // end Locator impl
-
- // end public API
-
- /**
- * Records that the stream has passed the meta boundary. From then on a
- * <code>charset</code> attribute on a <code>meta</code> element is
- * reported as an error when the attribute is added.
- */
- public void notifyAboutMetaBoundary() {
- metaBoundaryPassed = true;
- }
-
- /**
- * Requests the HTML4-specific additional errors by setting the
- * <code>html4</code> flag.
- */
- void turnOnAdditionalHtml4Errors() {
- html4 = true;
- }
-
- // ]NOCPP]
-
- /**
- * Returns an attribute holder without attributes: a newly allocated one
- * when <code>newAttributesEachTime</code> is set, otherwise the shared
- * <code>HtmlAttributes.EMPTY_ATTRIBUTES</code> instance. Only the shared
- * instance branch lies outside the [NOCPP[ sections.
- *
- * @return an empty attribute holder
- */
- HtmlAttributes emptyAttributes() {
- // [NOCPP[
- if (newAttributesEachTime) {
- return new HtmlAttributes(mappingLangToXmlLang);
- } else {
- // ]NOCPP]
- return HtmlAttributes.EMPTY_ATTRIBUTES;
- // [NOCPP[
- }
- // ]NOCPP]
- }
-
- /**
- * Appends one code unit to the character reference buffer. The Java code
- * performs no explicit bounds check; the buffer is allocated with 32
- * elements in the constructors.
- *
- * @param c
- * the UTF-16 code unit to append
- */
- @Inline private void appendCharRefBuf(char c) {
- // CPPONLY: assert charRefBufLen < charRefBuf.length:
- // CPPONLY: "RELEASE: Attempted to overrun charRefBuf!";
- charRefBuf[charRefBufLen++] = c;
- }
-
- /**
- * Disposes of the buffered character reference characters according to
- * the return state: for <code>DATA</code> and <code>RCDATA</code> they are
- * emitted to the token handler as characters, for every other state they
- * are moved to <code>strBuf</code>. Either way the character reference
- * buffer ends up empty.
- *
- * @param returnState
- * the state the tokenizer returns to after the character reference
- * @throws SAXException
- * if the token handler threw
- */
- private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
- // Non-zero after masking means the state is neither DATA nor RCDATA.
- if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
- appendCharRefBufToStrBuf();
- } else {
- if (charRefBufLen > 0) {
- tokenHandler.characters(charRefBuf, 0, charRefBufLen);
- charRefBufLen = 0;
- }
- }
- }
-
- /**
- * Marks <code>strBuf</code> as empty once its content has been consumed.
- * The array itself is kept.
- */
- @Inline private void clearStrBufAfterUse() {
- strBufLen = 0;
- }
-
- /**
- * Resets <code>strBuf</code> before new content is collected. The buffer
- * is expected to be empty already; the assertion catches a missing reset
- * after an earlier use.
- */
- @Inline private void clearStrBufBeforeUse() {
- assert strBufLen == 0: "strBufLen not reset after previous use!";
- strBufLen = 0; // no-op in the absence of bugs
- }
-
- /**
- * Empties <code>strBuf</code> when it is expected to hold exactly one
- * hyphen; both expectations are checked by assertions only.
- */
- @Inline private void clearStrBufAfterOneHyphen() {
- assert strBufLen == 1: "strBufLen length not one!";
- assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
- strBufLen = 0;
- }
-
- /**
- * Appends one code unit to <code>strBuf</code>. The Java code does not
- * grow the buffer here; only the CPPONLY lines ensure buffer space.
- * NOTE(review): the Java side presumably sizes the buffer elsewhere --
- * confirm against the code that allocates <code>strBuf</code>.
- *
- * @param c
- * the UTF-16 code unit to append
- */
- @Inline private void appendStrBuf(char c) {
- // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient.";
- // CPPONLY: if (strBufLen == strBuf.length) {
- // CPPONLY: if (!EnsureBufferSpace(1)) {
- // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
- // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
- // CPPONLY: }
- strBuf[strBufLen++] = c;
- }
-
- /**
- * Returns the content of <code>strBuf</code> as a String and empties the
- * buffer. Within this class it supplies the attribute value in
- * <code>addAttributeWithValue()</code>.
- *
- * <p>
- * C++ memory note: The return value must be released.
- *
- * @return the buffer as a string
- */
- protected String strBufToString() {
- String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
- // CPPONLY: , tokenHandler
- );
- clearStrBufAfterUse();
- return str;
- }
-
- /**
- * Converts the content of <code>strBuf</code> to a local name, stores it
- * in <code>doctypeName</code> and empties the buffer. Nothing is returned.
- * The stored name is released in emitDoctypeToken().
- */
- private void strBufToDoctypeName() {
- doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
- interner);
- clearStrBufAfterUse();
- }
-
- /**
- * Emits the content of <code>strBuf</code> as character tokens and empties
- * the buffer. Does nothing when the buffer is empty.
- *
- * @throws SAXException
- * if the token handler threw
- */
- private void emitStrBuf() throws SAXException {
- if (strBufLen > 0) {
- tokenHandler.characters(strBuf, 0, strBufLen);
- clearStrBufAfterUse();
- }
- }
-
- /**
- * Appends a hyphen that follows another hyphen in a bogus comment,
- * honoring the comment policy: ALTER_INFOSET inserts a space first, ALLOW
- * only warns, and FATAL reports a fatal error without appending. Outside
- * the [NOCPP[ sections only the plain append remains.
- *
- * @throws SAXException
- * if the error handler threw or the policy is FATAL
- */
- @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
- // [NOCPP[
- switch (commentPolicy) {
- case ALTER_INFOSET:
- appendStrBuf(' ');
- // FALLTHROUGH
- case ALLOW:
- warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
- // ]NOCPP]
- appendStrBuf('-');
- // [NOCPP[
- break;
- case FATAL:
- fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
- break;
- }
- // ]NOCPP]
- }
-
- // [NOCPP[
- /**
- * Handles a trailing hyphen in a comment according to the comment policy:
- * ALTER_INFOSET appends a space and warns, ALLOW only warns, and FATAL
- * reports a fatal error.
- *
- * @throws SAXException
- * if the error handler threw or the policy is FATAL
- */
- private void maybeAppendSpaceToBogusComment() throws SAXException {
- switch (commentPolicy) {
- case ALTER_INFOSET:
- appendStrBuf(' ');
- // FALLTHROUGH
- case ALLOW:
- warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
- break;
- case FATAL:
- fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
- break;
- }
- }
-
- // ]NOCPP]
-
- /**
- * Reports consecutive hyphens and appends <code>c</code> to
- * <code>strBuf</code>, honoring the comment policy. ALTER_INFOSET first
- * replaces the last buffered code unit with a space followed by a hyphen,
- * then continues like ALLOW, which warns and appends <code>c</code>.
- * FATAL reports a fatal error and appends nothing.
- *
- * @param c
- * the UTF-16 code unit to append
- * @throws SAXException
- * if the error handler threw or the policy is FATAL
- */
- @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
- throws SAXException {
- errConsecutiveHyphens();
- // [NOCPP[
- switch (commentPolicy) {
- case ALTER_INFOSET:
- // Drop the last buffered code unit; it is re-added after a space.
- strBufLen--;
- // WARNING!!! This expands the worst case of the buffer length
- // given the length of input!
- appendStrBuf(' ');
- appendStrBuf('-');
- // FALLTHROUGH
- case ALLOW:
- warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
- // ]NOCPP]
- appendStrBuf(c);
- // [NOCPP[
- break;
- case FATAL:
- fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
- break;
- }
- // ]NOCPP]
- }
-
- /**
- * Appends a range of code units to <code>strBuf</code>. The Java code does
- * not grow the buffer here; only the CPPONLY lines ensure buffer space.
- *
- * @param buffer
- * the source array
- * @param offset
- * index of the first code unit to copy from <code>buffer</code>
- * @param length
- * the number of code units to copy
- */
- private void appendStrBuf(@NoLength char[] buffer, int offset, int length) {
- int newLen = strBufLen + length;
- // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient.";
- // CPPONLY: if (strBuf.length < newLen) {
- // CPPONLY: if (!EnsureBufferSpace(length)) {
- // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
- // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
- // CPPONLY: }
- System.arraycopy(buffer, offset, strBuf, strBufLen, length);
- strBufLen = newLen;
- }
-
- /**
- * Appends the content of the character reference buffer to
- * <code>strBuf</code> and then empties the character reference buffer.
- */
- @Inline private void appendCharRefBufToStrBuf() {
- appendStrBuf(charRefBuf, 0, charRefBufLen);
- charRefBufLen = 0;
- }
-
- /**
- * Emits the current comment token from <code>strBuf</code> and empties the
- * buffer. In the Java build the token handler is only called when
- * comments are wanted.
- *
- * @param provisionalHyphens
- * the number of code units at the end of the buffer that are left
- * out of the emitted comment
- * @param pos
- * the current position; <code>cstart</code> is set to the
- * position after it
- *
- * @throws SAXException
- * if the token handler threw
- */
- private void emitComment(int provisionalHyphens, int pos)
- throws SAXException {
- // [NOCPP[
- if (wantsComments) {
- // ]NOCPP]
- tokenHandler.comment(strBuf, 0, strBufLen
- - provisionalHyphens);
- // [NOCPP[
- }
- // ]NOCPP]
- clearStrBufAfterUse();
- cstart = pos + 1;
- }
-
- /**
- * Flushes coalesced character tokens: the code units from
- * <code>cstart</code> up to, but not including, <code>pos</code> are
- * passed to the token handler when that range is not empty. Afterwards
- * <code>cstart</code> is set to <code>Integer.MAX_VALUE</code>, so that a
- * further flush emits nothing until <code>cstart</code> is set again.
- *
- * @param buf
- * the buffer holding the characters
- * @param pos
- * the index just past the last character to flush
- *
- * @throws SAXException
- * if the token handler threw
- */
- protected void flushChars(@NoLength char[] buf, int pos)
- throws SAXException {
- if (pos > cstart) {
- tokenHandler.characters(buf, cstart, pos - cstart);
- }
- cstart = Integer.MAX_VALUE;
- }
-
- /**
- * Reports a condition that would make the infoset incompatible with XML
- * 1.0 as fatal. The error handler, if any, is notified first; the
- * exception is then always thrown, so this method never returns normally.
- *
- * @param message
- * the message
- * @throws SAXException
- * if the error handler threw
- * @throws SAXParseException
- * always, unless the error handler threw first; this tokenizer
- * is the locator of the exception
- */
- public void fatal(String message) throws SAXException {
- SAXParseException spe = new SAXParseException(message, this);
- if (errorHandler != null) {
- errorHandler.fatalError(spe);
- }
- throw spe;
- }
-
- /**
- * Forwards a parse error to the error handler, if one is set. This
- * tokenizer serves as the locator of the reported exception. Without an
- * error handler the call has no effect.
- *
- * @param message
- *            the error text
- * @throws SAXException
- *             if the error handler threw
- */
- public void err(String message) throws SAXException {
-     if (errorHandler != null) {
-         errorHandler.error(new SAXParseException(message, this));
-     }
- }
-
- /**
- * Reports a parse error through the tree builder's error handler when the
- * token handler is a <code>TreeBuilder</code> that has one, and through
- * this tokenizer's own error handler otherwise. Without any error handler
- * the call has no effect.
- *
- * @param message
- *            the error text
- * @throws SAXException
- *             if the chosen error handler threw
- */
- public void errTreeBuilder(String message) throws SAXException {
-     ErrorHandler target = (tokenHandler instanceof TreeBuilder<?>)
-             ? ((TreeBuilder<?>) tokenHandler).getErrorHandler()
-             : null;
-     if (target == null) {
-         // Fall back to the tokenizer's own handler.
-         target = errorHandler;
-     }
-     if (target != null) {
-         target.error(new SAXParseException(message, this));
-     }
- }
-
- /**
- * Forwards a warning to the error handler, if one is set. This tokenizer
- * serves as the locator of the reported exception. Without an error
- * handler the call has no effect.
- *
- * @param message
- *            the warning text
- * @throws SAXException
- *             if the error handler threw
- */
- public void warn(String message) throws SAXException {
-     if (errorHandler != null) {
-         errorHandler.warning(new SAXParseException(message, this));
-     }
- }
-
- /**
- * Looks up the element name spelled by the content of <code>strBuf</code>,
- * stores it in <code>tagName</code> and empties the buffer.
- */
- private void strBufToElementNameString() {
- tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen,
- interner);
- clearStrBufAfterUse();
- }
-
- /**
- * Emits the current tag token as a start tag or an end tag, then releases
- * the tag name and resets the attribute holder.
- *
- * @param selfClosing
- * whether the tag was written with a trailing solidus
- * @param pos
- * the current position; <code>cstart</code> is set to the
- * position after it
- * @return the state to continue in: <code>DATA</code> unless the token
- * handler changed <code>stateSave</code> during the callback
- * @throws SAXException
- * if the token handler or an error handler threw
- */
- private int emitCurrentTagToken(boolean selfClosing, int pos)
- throws SAXException {
- cstart = pos + 1;
- maybeErrSlashInEndTag(selfClosing);
- // Default continuation state; the token handler may override it below.
- stateSave = Tokenizer.DATA;
- // Never hand a null attribute holder to the token handler.
- HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES
- : attributes);
- if (endTag) {
- /*
- * When an end tag token is emitted, the content model flag must be
- * switched to the PCDATA state.
- */
- maybeErrAttributesOnEndTag(attrs);
- // CPPONLY: if (!viewingXmlSource) {
- tokenHandler.endTag(tagName);
- // CPPONLY: }
- // CPPONLY: if (newAttributesEachTime) {
- // CPPONLY: Portability.delete(attributes);
- // CPPONLY: attributes = null;
- // CPPONLY: }
- } else {
- // CPPONLY: if (viewingXmlSource) {
- // CPPONLY: assert newAttributesEachTime;
- // CPPONLY: Portability.delete(attributes);
- // CPPONLY: attributes = null;
- // CPPONLY: } else {
- tokenHandler.startTag(tagName, attrs, selfClosing);
- // CPPONLY: }
- }
- tagName.release();
- tagName = null;
- if (newAttributesEachTime) {
- // The holder is not reused; a later attribute allocates a new one.
- attributes = null;
- } else {
- // NOTE(review): this dereferences attributes unconditionally, so a
- // holder must already exist in this mode -- confirm it is allocated
- // outside this part of the file.
- attributes.clear(mappingLangToXmlLang);
- }
- /*
- * The token handler may have called setStateAndEndTagExpectation
- * and changed stateSave since the start of this method.
- */
- return stateSave;
- }
-
- /**
- * Turns the content of <code>strBuf</code> into the current attribute
- * name, empties the buffer and allocates the attribute holder if needed.
- * A name that is already present on the current token is reported,
- * released and replaced by <code>null</code>, which makes the later
- * add-attribute step drop the attribute.
- *
- * @throws SAXException
- * if reporting the duplicate threw
- */
- private void attributeNameComplete() throws SAXException {
- attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
- // [NOCPP[
- , namePolicy != XmlViolationPolicy.ALLOW
- // ]NOCPP]
- , interner);
- clearStrBufAfterUse();
-
- if (attributes == null) {
- attributes = new HtmlAttributes(mappingLangToXmlLang);
- }
-
- /*
- * When the user agent leaves the attribute name state (and before
- * emitting the tag token, if appropriate), the complete attribute's
- * name must be compared to the other attributes on the same token; if
- * there is already an attribute on the token with the exact same name,
- * then this is a parse error and the new attribute must be dropped,
- * along with the value that gets associated with it (if any).
- */
- if (attributes.contains(attributeName)) {
- errDuplicateAttribute();
- attributeName.release();
- attributeName = null;
- }
- }
-
- /**
- * Adds the current attribute to the attribute holder with an empty value,
- * unless <code>attributeNameComplete()</code> dropped the name as a
- * duplicate (<code>attributeName</code> is <code>null</code> then).
- *
- * In HTML4 mode a boolean attribute gets its own local name as the value
- * when html4ModeCompatibleWithXhtml1Schemata is set; a non-boolean
- * attribute other than <code>border</code> is reported as an HTML4-only
- * error before it is added. Outside HTML4 mode a valueless
- * <code>src</code> or <code>href</code> only triggers a warning.
- *
- * @throws SAXException
- * if an error handler threw
- */
- private void addAttributeWithoutValue() throws SAXException {
- noteAttributeWithoutValue();
-
- // [NOCPP[
- // The boundary is 1024 bytes, matching the metaBoundaryPassed field
- // documentation. The same message in addAttributeWithValue() should be
- // kept in step with this one.
- if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
- && ElementName.META == tagName) {
- err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 1024 bytes.");
- }
- // ]NOCPP]
- if (attributeName != null) {
- // [NOCPP[
- if (html4) {
- if (attributeName.isBoolean()) {
- if (html4ModeCompatibleWithXhtml1Schemata) {
- attributes.addAttribute(attributeName,
- attributeName.getLocal(AttributeName.HTML),
- xmlnsPolicy);
- } else {
- attributes.addAttribute(attributeName, "", xmlnsPolicy);
- }
- } else {
- // NOTE(review): a valueless "border" is neither reported nor
- // added here -- confirm that dropping it is intended.
- if (AttributeName.BORDER != attributeName) {
- err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
- attributes.addAttribute(attributeName, "", xmlnsPolicy);
- }
- }
- } else {
- if (AttributeName.SRC == attributeName
- || AttributeName.HREF == attributeName) {
- warn("Attribute \u201C"
- + attributeName.getLocal(AttributeName.HTML)
- + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
- }
- // ]NOCPP]
- attributes.addAttribute(attributeName,
- Portability.newEmptyString()
- // [NOCPP[
- , xmlnsPolicy
- // ]NOCPP]
- // CPPONLY: , attributeLine
- );
- // [NOCPP[
- }
- // ]NOCPP]
- attributeName = null; // attributeName has been adopted by the
- // |attributes| object
- } else {
- // Duplicate attribute: nothing to add, just discard the buffer.
- clearStrBufAfterUse();
- }
- }
-
- /**
- * Adds the attribute named by <code>attributeName</code> to the current
- * tag's <code>attributes</code>, using the string obtained from
- * <code>strBufToString()</code> as its value.
- *
- * If <code>attributeName</code> is <code>null</code> (the attribute was a
- * duplicate), the buffered value is discarded instead. After a successful
- * add, <code>attributeName</code> is reset to <code>null</code>.
- *
- * @throws SAXException
- *             if a callee (for example error reporting) throws it
- */
- private void addAttributeWithValue() throws SAXException {
- // [NOCPP[
- // Report a charset attribute on a meta element once metaBoundaryPassed
- // has been set.
- if (metaBoundaryPassed && ElementName.META == tagName
- && AttributeName.CHARSET == attributeName) {
- err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
- }
- // ]NOCPP]
- if (attributeName != null) {
- String val = strBufToString(); // Ownership transferred to
- // HtmlAttributes
- // CPPONLY: if (mViewSource) {
- // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
- // CPPONLY: }
- // [NOCPP[
- // For start tags in the HTML4 mode that is compatible with the XHTML 1
- // schemata, values of case-folded attributes are ASCII-lowercased
- // before being stored.
- if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
- && attributeName.isCaseFolded()) {
- val = newAsciiLowerCaseStringFromString(val);
- }
- // ]NOCPP]
- attributes.addAttribute(attributeName, val
- // [NOCPP[
- , xmlnsPolicy
- // ]NOCPP]
- // CPPONLY: , attributeLine
- );
- attributeName = null; // attributeName has been adopted by the
- // |attributes| object
- } else {
- // We have a duplicate attribute. Explicitly discard its value.
- clearStrBufAfterUse();
- }
- }
-
- // [NOCPP[
-
- /**
-  * Returns a copy of <code>str</code> in which the ASCII capital letters
-  * A through Z have been replaced with their lower-case counterparts.
-  * Every other character is copied unchanged.
-  *
-  * @param str
-  *            the string to fold; may be <code>null</code>
-  * @return a newly allocated string with ASCII upper case folded to lower
-  *         case, or <code>null</code> if <code>str</code> is
-  *         <code>null</code>
-  */
- private static String newAsciiLowerCaseStringFromString(String str) {
-     if (str == null) {
-         return null;
-     }
-     // Work on a private copy of the characters and fold it in place.
-     char[] folded = str.toCharArray();
-     for (int idx = folded.length - 1; idx >= 0; idx--) {
-         char ch = folded[idx];
-         if ('A' <= ch && ch <= 'Z') {
-             // Upper and lower case ASCII letters are 0x20 apart.
-             folded[idx] = (char) (ch + 0x20);
-         }
-     }
-     return new String(folded);
- }
-
- /**
- * Hook that <code>start()</code> calls after it has notified the token
- * handler that tokenization is starting. This implementation does nothing;
- * the method is protected, presumably so that subclasses can override it.
- *
- * @throws SAXException
- *             declared for the benefit of overriding implementations; this
- *             implementation never throws
- */
- protected void startErrorReporting() throws SAXException {
- // Intentionally empty.
- }
-
- // ]NOCPP]
-
- /**
- * Starts a tokenization run: initializes the tokenizer via
- * <code>initializeWithoutStarting()</code>, notifies the token handler via
- * <code>startTokenization(this)</code> and then calls
- * <code>startErrorReporting()</code>.
- *
- * @throws SAXException
- *             if the token handler or the error reporting hook throws it
- */
- public void start() throws SAXException {
- initializeWithoutStarting();
- tokenHandler.startTokenization(this);
- // [NOCPP[
- startErrorReporting();
- // ]NOCPP]
- }
-
- /**
- * Runs the tokenizer state machine over the unread part of
- * <code>buffer</code>, resuming from the state stored in
- * <code>stateSave</code> and <code>returnStateSave</code>.
- *
- * On return the start of <code>buffer</code> has been advanced past the
- * characters that were consumed.
- *
- * @param buffer
- *            the buffer to tokenize; the characters from
- *            <code>getStart()</code> up to but excluding
- *            <code>getEnd()</code> are read
- * @return the value of <code>lastCR</code> after the run; per the CR
- *         handling notes in <code>stateLoop</code> it is set when the run
- *         stopped on a CR so that the caller can swallow a following LF
- * @throws SAXException
- *             if the token handler or error reporting throws it
- */
- public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
- int state = stateSave;
- int returnState = returnStateSave;
- char c = '\u0000';
- // Reset the per-call flags before running the state machine.
- shouldSuspend = false;
- lastCR = false;
-
- int start = buffer.getStart();
- int end = buffer.getEnd();
-
- // In C++, the caller of tokenizeBuffer needs to do this explicitly.
- // [NOCPP[
- ensureBufferSpace(end - start);
- // ]NOCPP]
-
- /**
- * The index of the last <code>char</code> read from <code>buf</code>.
- * It starts one before the first unread <code>char</code> because
- * <code>stateLoop</code> increments it before reading.
- */
- int pos = start - 1;
-
- /**
- * Initialize <code>cstart</code>, the index of the first
- * <code>char</code> in <code>buf</code> that is part of a coalesced run
- * of character tokens, or <code>Integer.MAX_VALUE</code> if there is not
- * a current run being coalesced. In the states listed below a run starts
- * at the beginning of this buffer.
- */
- switch (state) {
- case DATA:
- case RCDATA:
- case SCRIPT_DATA:
- case PLAINTEXT:
- case RAWTEXT:
- case CDATA_SECTION:
- case SCRIPT_DATA_ESCAPED:
- case SCRIPT_DATA_ESCAPE_START:
- case SCRIPT_DATA_ESCAPE_START_DASH:
- case SCRIPT_DATA_ESCAPED_DASH:
- case SCRIPT_DATA_ESCAPED_DASH_DASH:
- case SCRIPT_DATA_DOUBLE_ESCAPE_START:
- case SCRIPT_DATA_DOUBLE_ESCAPED:
- case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
- case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
- case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
- case SCRIPT_DATA_DOUBLE_ESCAPE_END:
- cstart = start;
- break;
- default:
- cstart = Integer.MAX_VALUE;
- break;
- }
-
- /**
- * Run the state machine. <code>end</code> is passed as the limit, so
- * only the <code>char</code>s of <code>buf</code> before that index have
- * meaning. (The rest of the array is garbage and should not be
- * examined.)
- */
- // CPPONLY: if (mViewSource) {
- // CPPONLY: mViewSource.SetBuffer(buffer);
- // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
- // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
- // CPPONLY: } else {
- // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
- // CPPONLY: }
- // [NOCPP[
- pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
- end);
- // ]NOCPP]
- if (pos == end) {
- // exiting due to end of buffer
- buffer.setStart(pos);
- } else {
- // The loop stopped early on the char at pos, which has already been
- // consumed, so the next run resumes right after it.
- buffer.setStart(pos + 1);
- }
- return lastCR;
- }
-
- // [NOCPP[
- /**
-  * Makes sure that both the token handler's buffer and this tokenizer's
-  * <code>strBuf</code> can hold the worst-case output of tokenizing
-  * <code>inputLength</code> more characters.
-  *
-  * @param inputLength
-  *            the number of input characters about to be tokenized
-  * @throws SAXException
-  *             if the token handler throws it
-  */
- private void ensureBufferSpace(int inputLength) throws SAXException {
-     // The extra 2 accounts for emissions of LT_GT, LT_SOLIDUS and
-     // RSQB_RSQB. It is added to the general worst case rather than only
-     // to the TreeBuilder-exposed worst case so that a bug is not
-     // re-introduced if the tokenizer and tree builder buffers are unified
-     // in the future.
-     int handlerNeed = strBufLen + inputLength + charRefBufLen + 2;
-     tokenHandler.ensureBufferSpace(handlerNeed);
-
-     // When altering the infoset, each hyphen of a run of consecutive
-     // hyphens in a comment generates a space, too. Those buffer contents
-     // never reach the tokenHandler as characters(), which is why the
-     // doubling is applied only after the tokenHandler call above.
-     int ownNeed = (commentPolicy == XmlViolationPolicy.ALTER_INFOSET)
-             ? handlerNeed * 2
-             : handlerNeed;
-
-     if (strBuf == null) {
-         // Pad with an arbitrary small value to avoid immediate
-         // reallocation once there are a few characters in the buffer.
-         strBuf = new char[ownNeed + 128];
-         return;
-     }
-     if (ownNeed <= strBuf.length) {
-         // The existing buffer is already large enough.
-         return;
-     }
-     // HotSpot reportedly allocates memory with 8-byte accuracy, so there
-     // is no point in doing math here to avoid slop. Adding a small
-     // constant might help but is not done without profiling. In C++ with
-     // jemalloc, the corresponding method should round up to avoid slop.
-     char[] grown = new char[ownNeed];
-     System.arraycopy(strBuf, 0, grown, 0, strBufLen);
-     strBuf = grown;
- }
- // ]NOCPP]
-
- @SuppressWarnings("unused") private int stateLoop(int state, char c,
- int pos, @NoLength char[] buf, boolean reconsume, int returnState,
- int endPos) throws SAXException {
- /*
- * Idioms used in this code:
- *
- *
- * Consuming the next input character
- *
- * To consume the next input character, the code does this: if (++pos ==
- * endPos) { break stateloop; } c = checkChar(buf, pos);
- *
- *
- * Staying in a state
- *
- * When there's a state that the tokenizer may stay in over multiple
- * input characters, the state has a wrapper |for(;;)| loop and staying
- * in the state continues the loop.
- *
- *
- * Switching to another state
- *
- * To switch to another state, the code sets the state variable to the
- * magic number of the new state. Then it either continues stateloop or
- * breaks out of the state's own wrapper loop if the target state is
- * right after the current state in source order. (This is a partial
- * workaround for Java's lack of goto.)
- *
- *
- * Reconsume support
- *
- * The spec sometimes says that an input character is reconsumed in
- * another state. If a state can ever be entered so that an input
- * character can be reconsumed in it, the state's code starts with an
- * |if (reconsume)| that sets reconsume to false and skips over the
- * normal code for consuming a new character.
- *
- * To reconsume the current character in another state, the code sets
- * |reconsume| to true and then switches to the other state.
- *
- *
- * Emitting character tokens
- *
- * This method emits character tokens lazily. Whenever a new range of
- * character tokens starts, the field cstart must be set to the start
- * index of the range. The flushChars() method must be called at the end
- * of a range to flush it.
- *
- *
- * U+0000 handling
- *
- * The various states have to handle the replacement of U+0000 with
- * U+FFFD. However, if U+0000 would be reconsumed in another state, the
- * replacement doesn't need to happen, because it's handled by the
- * reconsuming state.
- *
- *
- * LF handling
- *
- * Every state needs to increment the line number upon LF unless the LF
- * gets reconsumed by another state which increments the line number.
- *
- *
- * CR handling
- *
- * Every state needs to handle CR unless the CR gets reconsumed and is
- * handled by the reconsuming state. The CR needs to be handled as if it
- * were an LF; in addition, the lastCR field must be set to true and then this
- * method must return. The IO driver will then swallow the next
- * character if it is an LF to coalesce CRLF.
- */
- stateloop: for (;;) {
- switch (state) {
- case DATA:
- dataloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- switch (c) {
- case '&':
- /*
- * U+0026 AMPERSAND (&) Switch to the character
- * reference in data state.
- */
- flushChars(buf, pos);
- assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
- appendCharRefBuf(c);
- setAdditionalAndRememberAmpersandLocation('\u0000');
- returnState = state;
- state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
- continue stateloop;
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Switch to the tag
- * open state.
- */
- flushChars(buf, pos);
-
- state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos);
- break dataloop; // FALL THROUGH continue
- // stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- continue;
- case '\r':
- emitCarriageReturn(buf, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the input character as a
- * character token.
- *
- * Stay in the data state.
- */
- continue;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case TAG_OPEN:
- tagopenloop: for (;;) {
- /*
- * The behavior of this state depends on the content
- * model flag.
- */
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * If the content model flag is set to the PCDATA state
- * Consume the next input character:
- */
- if (c >= 'A' && c <= 'Z') {
- /*
- * U+0041 LATIN CAPITAL LETTER A through to U+005A
- * LATIN CAPITAL LETTER Z Create a new start tag
- * token,
- */
- endTag = false;
- /*
- * set its tag name to the lowercase version of the
- * input character (add 0x0020 to the character's
- * code point),
- */
- clearStrBufBeforeUse();
- appendStrBuf((char) (c + 0x20));
- /* then switch to the tag name state. */
- state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
- /*
- * (Don't emit the token yet; further details will
- * be filled in before it is emitted.)
- */
- break tagopenloop;
- // continue stateloop;
- } else if (c >= 'a' && c <= 'z') {
- /*
- * U+0061 LATIN SMALL LETTER A through to U+007A
- * LATIN SMALL LETTER Z Create a new start tag
- * token,
- */
- endTag = false;
- /*
- * set its tag name to the input character,
- */
- clearStrBufBeforeUse();
- appendStrBuf(c);
- /* then switch to the tag name state. */
- state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
- /*
- * (Don't emit the token yet; further details will
- * be filled in before it is emitted.)
- */
- break tagopenloop;
- // continue stateloop;
- }
- switch (c) {
- case '!':
- /*
- * U+0021 EXCLAMATION MARK (!) Switch to the
- * markup declaration open state.
- */
- state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos);
- continue stateloop;
- case '/':
- /*
- * U+002F SOLIDUS (/) Switch to the close tag
- * open state.
- */
- state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos);
- continue stateloop;
- case '?':
- // CPPONLY: if (viewingXmlSource) {
- // CPPONLY: state = transition(state,
- // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION,
- // CPPONLY: reconsume,
- // CPPONLY: pos);
- // CPPONLY: continue stateloop;
- // CPPONLY: }
- /*
- * U+003F QUESTION MARK (?) Parse error.
- */
- errProcessingInstruction();
- /*
- * Switch to the bogus comment state.
- */
- clearStrBufBeforeUse();
- appendStrBuf(c);
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Parse error.
- */
- errLtGt();
- /*
- * Emit a U+003C LESS-THAN SIGN character token
- * and a U+003E GREATER-THAN SIGN character
- * token.
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 2);
- /* Switch to the data state. */
- cstart = pos + 1;
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- default:
- /*
- * Anything else Parse error.
- */
- errBadCharAfterLt(c);
- /*
- * Emit a U+003C LESS-THAN SIGN character token
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- /*
- * and reconsume the current input character in
- * the data state.
- */
- cstart = pos;
- reconsume = true;
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- }
- }
- // FALL THROUGH DON'T REORDER
- case TAG_NAME:
- tagnameloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- strBufToElementNameString();
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the before attribute name state.
- */
- strBufToElementNameString();
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- break tagnameloop;
- // continue stateloop;
- case '/':
- /*
- * U+002F SOLIDUS (/) Switch to the self-closing
- * start tag state.
- */
- strBufToElementNameString();
- state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * tag token.
- */
- strBufToElementNameString();
- state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- /*
- * Switch to the data state.
- */
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- if (c >= 'A' && c <= 'Z') {
- /*
- * U+0041 LATIN CAPITAL LETTER A through to
- * U+005A LATIN CAPITAL LETTER Z Append the
- * lowercase version of the current input
- * character (add 0x0020 to the character's
- * code point) to the current tag token's
- * tag name.
- */
- c += 0x20;
- }
- /*
- * Anything else Append the current input
- * character to the current tag token's tag
- * name.
- */
- appendStrBuf(c);
- /*
- * Stay in the tag name state.
- */
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case BEFORE_ATTRIBUTE_NAME:
- beforeattributenameloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the before attribute name state.
- */
- continue;
- case '/':
- /*
- * U+002F SOLIDUS (/) Switch to the self-closing
- * start tag state.
- */
- state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * tag token.
- */
- state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- /*
- * Switch to the data state.
- */
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- case '\"':
- case '\'':
- case '<':
- case '=':
- /*
- * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
- * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
- * SIGN (=) Parse error.
- */
- errBadCharBeforeAttributeNameOrNull(c);
- /*
- * Treat it as per the "anything else" entry
- * below.
- */
- default:
- /*
- * Anything else Start a new attribute in the
- * current tag token.
- */
- if (c >= 'A' && c <= 'Z') {
- /*
- * U+0041 LATIN CAPITAL LETTER A through to
- * U+005A LATIN CAPITAL LETTER Z Set that
- * attribute's name to the lowercase version
- * of the current input character (add
- * 0x0020 to the character's code point)
- */
- c += 0x20;
- }
- // CPPONLY: attributeLine = line;
- /*
- * Set that attribute's name to the current
- * input character,
- */
- clearStrBufBeforeUse();
- appendStrBuf(c);
- /*
- * and its value to the empty string.
- */
- // Will do later.
- /*
- * Switch to the attribute name state.
- */
- state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
- break beforeattributenameloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case ATTRIBUTE_NAME:
- attributenameloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- attributeNameComplete();
- state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the after attribute name state.
- */
- attributeNameComplete();
- state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos);
- continue stateloop;
- case '/':
- /*
- * U+002F SOLIDUS (/) Switch to the self-closing
- * start tag state.
- */
- attributeNameComplete();
- addAttributeWithoutValue();
- state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
- continue stateloop;
- case '=':
- /*
- * U+003D EQUALS SIGN (=) Switch to the before
- * attribute value state.
- */
- attributeNameComplete();
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
- break attributenameloop;
- // continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * tag token.
- */
- attributeNameComplete();
- addAttributeWithoutValue();
- state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- /*
- * Switch to the data state.
- */
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- case '\"':
- case '\'':
- case '<':
- /*
- * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
- * (') U+003C LESS-THAN SIGN (<) Parse error.
- */
- errQuoteOrLtInAttributeNameOrNull(c);
- /*
- * Treat it as per the "anything else" entry
- * below.
- */
- default:
- if (c >= 'A' && c <= 'Z') {
- /*
- * U+0041 LATIN CAPITAL LETTER A through to
- * U+005A LATIN CAPITAL LETTER Z Append the
- * lowercase version of the current input
- * character (add 0x0020 to the character's
- * code point) to the current attribute's
- * name.
- */
- c += 0x20;
- }
- /*
- * Anything else Append the current input
- * character to the current attribute's name.
- */
- appendStrBuf(c);
- /*
- * Stay in the attribute name state.
- */
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case BEFORE_ATTRIBUTE_VALUE:
- beforeattributevalueloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the before attribute value state.
- */
- continue;
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Switch to the
- * attribute value (double-quoted) state.
- */
- // CPPONLY: attributeLine = line;
- clearStrBufBeforeUse();
- state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos);
- break beforeattributevalueloop;
- // continue stateloop;
- case '&':
- /*
- * U+0026 AMPERSAND (&) Switch to the attribute
- * value (unquoted) state and reconsume this
- * input character.
- */
- // CPPONLY: attributeLine = line;
- clearStrBufBeforeUse();
- reconsume = true;
- state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
- noteUnquotedAttributeValue();
- continue stateloop;
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Switch to the attribute
- * value (single-quoted) state.
- */
- // CPPONLY: attributeLine = line;
- clearStrBufBeforeUse();
- state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Parse error.
- */
- errAttributeValueMissing();
- /*
- * Emit the current tag token.
- */
- addAttributeWithoutValue();
- state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- /*
- * Switch to the data state.
- */
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- case '<':
- case '=':
- case '`':
- /*
- * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN
- * (=) U+0060 GRAVE ACCENT (`)
- */
- errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c);
- /*
- * Treat it as per the "anything else" entry
- * below.
- */
- default:
- // [NOCPP[
- errHtml4NonNameInUnquotedAttribute(c);
- // ]NOCPP]
- /*
- * Anything else Append the current input
- * character to the current attribute's value.
- */
- // CPPONLY: attributeLine = line;
- clearStrBufBeforeUse();
- appendStrBuf(c);
- /*
- * Switch to the attribute value (unquoted)
- * state.
- */
-
- state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos);
- noteUnquotedAttributeValue();
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
- attributevaluedoublequotedloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Switch to the after
- * attribute value (quoted) state.
- */
- addAttributeWithValue();
-
- state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
- break attributevaluedoublequotedloop;
- // continue stateloop;
- case '&':
- /*
- * U+0026 AMPERSAND (&) Switch to the character
- * reference in attribute value state, with the
- * additional allowed character being U+0022
- * QUOTATION MARK (").
- */
- assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
- appendCharRefBuf(c);
- setAdditionalAndRememberAmpersandLocation('\"');
- returnState = state;
- state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
- continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append the current input
- * character to the current attribute's value.
- */
- appendStrBuf(c);
- /*
- * Stay in the attribute value (double-quoted)
- * state.
- */
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case AFTER_ATTRIBUTE_VALUE_QUOTED:
- afterattributevaluequotedloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the before attribute name state.
- */
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- continue stateloop;
- case '/':
- /*
- * U+002F SOLIDUS (/) Switch to the self-closing
- * start tag state.
- */
- state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
- break afterattributevaluequotedloop;
- // continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * tag token.
- */
- state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- /*
- * Switch to the data state.
- */
- continue stateloop;
- default:
- /*
- * Anything else Parse error.
- */
- errNoSpaceBetweenAttributes();
- /*
- * Reconsume the character in the before
- * attribute name state.
- */
- reconsume = true;
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case SELF_CLOSING_START_TAG:
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Set the self-closing
- * flag of the current tag token. Emit the current
- * tag token.
- */
- // [NOCPP[
- errHtml4XmlVoidSyntax();
- // ]NOCPP]
- state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- /*
- * Switch to the data state.
- */
- continue stateloop;
- default:
- /* Anything else Parse error. */
- errSlashNotFollowedByGt();
- /*
- * Reconsume the character in the before attribute
- * name state.
- */
- reconsume = true;
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- continue stateloop;
- }
- // XXX reorder point
- case ATTRIBUTE_VALUE_UNQUOTED:
- for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- addAttributeWithValue();
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the before attribute name state.
- */
- addAttributeWithValue();
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- continue stateloop;
- case '&':
- /*
- * U+0026 AMPERSAND (&) Switch to the character
- * reference in attribute value state, with the
- * additional allowed character being U+003E
- * GREATER-THAN SIGN (>)
- */
- assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
- appendCharRefBuf(c);
- setAdditionalAndRememberAmpersandLocation('>');
- returnState = state;
- state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * tag token.
- */
- addAttributeWithValue();
- state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- /*
- * Switch to the data state.
- */
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- case '<':
- case '\"':
- case '\'':
- case '=':
- case '`':
- /*
- * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE
- * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS
- * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error.
- */
- errUnquotedAttributeValOrNull(c);
- /*
- * Treat it as per the "anything else" entry
- * below.
- */
- // fall through
- default:
- // [NOCPP]
- errHtml4NonNameInUnquotedAttribute(c);
- // ]NOCPP]
- /*
- * Anything else Append the current input
- * character to the current attribute's value.
- */
- appendStrBuf(c);
- /*
- * Stay in the attribute value (unquoted) state.
- */
- continue;
- }
- }
- // XXX reorder point
- case AFTER_ATTRIBUTE_NAME:
- for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the after attribute name state.
- */
- continue;
- case '/':
- /*
- * U+002F SOLIDUS (/) Switch to the self-closing
- * start tag state.
- */
- addAttributeWithoutValue();
- state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
- continue stateloop;
- case '=':
- /*
- * U+003D EQUALS SIGN (=) Switch to the before
- * attribute value state.
- */
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * tag token.
- */
- addAttributeWithoutValue();
- state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- /*
- * Switch to the data state.
- */
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- case '\"':
- case '\'':
- case '<':
- errQuoteOrLtInAttributeNameOrNull(c);
- /*
- * Treat it as per the "anything else" entry
- * below.
- */
- default:
- addAttributeWithoutValue();
- /*
- * Anything else Start a new attribute in the
- * current tag token.
- */
- if (c >= 'A' && c <= 'Z') {
- /*
- * U+0041 LATIN CAPITAL LETTER A through to
- * U+005A LATIN CAPITAL LETTER Z Set that
- * attribute's name to the lowercase version
- * of the current input character (add
- * 0x0020 to the character's code point)
- */
- c += 0x20;
- }
- /*
- * Set that attribute's name to the current
- * input character,
- */
- clearStrBufBeforeUse();
- appendStrBuf(c);
- /*
- * and its value to the empty string.
- */
- // Will do later.
- /*
- * Switch to the attribute name state.
- */
- state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos);
- continue stateloop;
- }
- }
- // XXX reorder point
- case MARKUP_DECLARATION_OPEN:
- markupdeclarationopenloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * If the next two characters are both U+002D
- * HYPHEN-MINUS characters (-), consume those two
- * characters, create a comment token whose data is the
- * empty string, and switch to the comment start state.
- *
- * Otherwise, if the next seven characters are an ASCII
- * case-insensitive match for the word "DOCTYPE", then
- * consume those characters and switch to the DOCTYPE
- * state.
- *
- * Otherwise, if the insertion mode is
- * "in foreign content" and the current node is not an
- * element in the HTML namespace and the next seven
- * characters are a case-sensitive match for the string
- * "[CDATA[" (the five uppercase letters "CDATA" with a
- * U+005B LEFT SQUARE BRACKET character before and
- * after), then consume those characters and switch to
- * the CDATA section state.
- *
- * Otherwise, this is a parse error. Switch to the bogus
- * comment state. The next character that is consumed,
- * if any, is the first character that will be in the
- * comment.
- */
- switch (c) {
- case '-':
- clearStrBufBeforeUse();
- appendStrBuf(c);
- state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos);
- break markupdeclarationopenloop;
- // continue stateloop;
- case 'd':
- case 'D':
- clearStrBufBeforeUse();
- appendStrBuf(c);
- index = 0;
- state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos);
- continue stateloop;
- case '[':
- if (tokenHandler.cdataSectionAllowed()) {
- clearStrBufBeforeUse();
- appendStrBuf(c);
- index = 0;
- state = transition(state, Tokenizer.CDATA_START, reconsume, pos);
- continue stateloop;
- }
- // else fall through
- default:
- errBogusComment();
- clearStrBufBeforeUse();
- reconsume = true;
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case MARKUP_DECLARATION_HYPHEN:
- markupdeclarationhyphenloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- switch (c) {
- case '\u0000':
- break stateloop;
- case '-':
- clearStrBufAfterOneHyphen();
- state = transition(state, Tokenizer.COMMENT_START, reconsume, pos);
- break markupdeclarationhyphenloop;
- // continue stateloop;
- default:
- errBogusComment();
- reconsume = true;
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case COMMENT_START:
- commentstartloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Comment start state
- *
- *
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Switch to the comment
- * start dash state.
- */
- appendStrBuf(c);
- state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Parse error.
- */
- errPrematureEndOfComment();
- /* Emit the comment token. */
- emitComment(0, pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- break commentstartloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append the input character to
- * the comment token's data.
- */
- appendStrBuf(c);
- /*
- * Switch to the comment state.
- */
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- break commentstartloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case COMMENT:
- commentloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Comment state Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Switch to the comment
- * end dash state
- */
- appendStrBuf(c);
- state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
- break commentloop;
- // continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append the input character to
- * the comment token's data.
- */
- appendStrBuf(c);
- /*
- * Stay in the comment state.
- */
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case COMMENT_END_DASH:
- commentenddashloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Comment end dash state Consume the next input
- * character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Switch to the comment
- * end state
- */
- appendStrBuf(c);
- state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
- break commentenddashloop;
- // continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append a U+002D HYPHEN-MINUS
- * (-) character and the input character to the
- * comment token's data.
- */
- appendStrBuf(c);
- /*
- * Switch to the comment state.
- */
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case COMMENT_END:
- commentendloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Comment end state Consume the next input
- * character:
- */
- switch (c) {
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the comment
- * token.
- */
- emitComment(2, pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '-':
- /* U+002D HYPHEN-MINUS (-) Parse error. */
- /*
- * Append a U+002D HYPHEN-MINUS (-) character to
- * the comment token's data.
- */
- adjustDoubleHyphenAndAppendToStrBufAndErr(c);
- /*
- * Stay in the comment end state.
- */
- continue;
- case '\r':
- adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- break stateloop;
- case '\n':
- adjustDoubleHyphenAndAppendToStrBufLineFeed();
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- continue stateloop;
- case '!':
- errHyphenHyphenBang();
- appendStrBuf(c);
- state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Append two U+002D HYPHEN-MINUS (-) characters
- * and the input character to the comment
- * token's data.
- */
- adjustDoubleHyphenAndAppendToStrBufAndErr(c);
- /*
- * Switch to the comment state.
- */
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- continue stateloop;
- }
- }
- // XXX reorder point
- case COMMENT_END_BANG:
- for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Comment end bang state
- *
- * Consume the next input character:
- */
- switch (c) {
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the comment
- * token.
- */
- emitComment(3, pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '-':
- /*
- * Append two U+002D HYPHEN-MINUS (-) characters
- * and a U+0021 EXCLAMATION MARK (!) character
- * to the comment token's data.
- */
- appendStrBuf(c);
- /*
- * Switch to the comment end dash state.
- */
- state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
- continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append two U+002D HYPHEN-MINUS
- * (-) characters, a U+0021 EXCLAMATION MARK (!)
- * character, and the input character to the
- * comment token's data. Switch to the comment
- * state.
- */
- appendStrBuf(c);
- /*
- * Switch to the comment state.
- */
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- continue stateloop;
- }
- }
- // XXX reorder point
- case COMMENT_START_DASH:
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Comment start dash state
- *
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Switch to the comment end
- * state
- */
- appendStrBuf(c);
- state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
- continue stateloop;
- case '>':
- errPrematureEndOfComment();
- /* Emit the comment token. */
- emitComment(1, pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Append a U+002D HYPHEN-MINUS character (-) and
- * the current input character to the comment
- * token's data.
- */
- appendStrBuf(c);
- /*
- * Switch to the comment state.
- */
- state = transition(state, Tokenizer.COMMENT, reconsume, pos);
- continue stateloop;
- }
- // XXX reorder point
- case CDATA_START:
- for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- if (index < 6) { // CDATA_LSQB.length
- if (c == Tokenizer.CDATA_LSQB[index]) {
- appendStrBuf(c);
- } else {
- errBogusComment();
- reconsume = true;
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- }
- index++;
- continue;
- } else {
- clearStrBufAfterUse();
- cstart = pos; // start coalescing
- reconsume = true;
- state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
- break; // FALL THROUGH continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case CDATA_SECTION:
- cdatasectionloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- switch (c) {
- case ']':
- flushChars(buf, pos);
- state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
- break cdatasectionloop; // FALL THROUGH
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- continue;
- case '\r':
- emitCarriageReturn(buf, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- default:
- continue;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case CDATA_RSQB:
- cdatarsqb: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- switch (c) {
- case ']':
- state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
- break cdatarsqb;
- default:
- tokenHandler.characters(Tokenizer.RSQB_RSQB, 0,
- 1);
- cstart = pos;
- reconsume = true;
- state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case CDATA_RSQB_RSQB:
- cdatarsqbrsqb: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- switch (c) {
- case ']':
- // Saw a third ]. Emit one ] (logically the
- // first one) and stay in this state to
- // remember that the last two characters seen
- // have been ]].
- tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
- continue;
- case '>':
- cstart = pos + 1;
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- default:
- tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
- cstart = pos;
- reconsume = true;
- state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
- continue stateloop;
- }
- }
- // XXX reorder point
- case ATTRIBUTE_VALUE_SINGLE_QUOTED:
- attributevaluesinglequotedloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Switch to the after
- * attribute value (quoted) state.
- */
- addAttributeWithValue();
-
- state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos);
- continue stateloop;
- case '&':
- /*
- * U+0026 AMPERSAND (&) Switch to the character
- * reference in attribute value state, with the
- * additional allowed character being U+0027
- * APOSTROPHE (').
- */
- assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
- appendCharRefBuf(c);
- setAdditionalAndRememberAmpersandLocation('\'');
- returnState = state;
- state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
- break attributevaluesinglequotedloop;
- // continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append the current input
- * character to the current attribute's value.
- */
- appendStrBuf(c);
- /*
- * Stay in the attribute value (single-quoted)
- * state.
- */
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case CONSUME_CHARACTER_REFERENCE:
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- if (c == '\u0000') {
- break stateloop;
- }
- /*
- * Unlike the definition in the spec, this state does not
- * return a value and never requires the caller to
- * backtrack. This state takes care of emitting characters
- * or appending to the current attribute value. It also
- * takes care of that in the case when consuming the
- * character reference fails.
- */
- /*
- * This section defines how to consume a character
- * reference. This definition is used when parsing character
- * references in text and in attributes.
- *
- * The behavior depends on the identity of the next
- * character (the one immediately after the U+0026 AMPERSAND
- * character):
- */
- switch (c) {
- case ' ':
- case '\t':
- case '\n':
- case '\r': // we'll reconsume!
- case '\u000C':
- case '<':
- case '&':
- emitOrAppendCharRefBuf(returnState);
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos;
- }
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- case '#':
- /*
- * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER
- * SIGN.
- */
- appendCharRefBuf('#');
- state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos);
- continue stateloop;
- default:
- if (c == additional) {
- emitOrAppendCharRefBuf(returnState);
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- if (c >= 'a' && c <= 'z') {
- firstCharKey = c - 'a' + 26;
- } else if (c >= 'A' && c <= 'Z') {
- firstCharKey = c - 'A';
- } else {
- // No match
- /*
- * If no match can be made, then this is a parse
- * error.
- */
- errNoNamedCharacterMatch();
- emitOrAppendCharRefBuf(returnState);
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos;
- }
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- // Didn't fail yet
- appendCharRefBuf(c);
- state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos);
- // FALL THROUGH continue stateloop;
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case CHARACTER_REFERENCE_HILO_LOOKUP:
- {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- if (c == '\u0000') {
- break stateloop;
- }
- /*
- * The data structure is as follows:
- *
- * HILO_ACCEL is a two-dimensional int array whose major
- * index corresponds to the second character of the
- * character reference (code point as index) and the
- * minor index corresponds to the first character of the
- * character reference (packed so that A-Z runs from 0
- * to 25 and a-z runs from 26 to 51). This layout makes
- * it easier to use the sparseness of the data structure
- * to omit parts of it: The second dimension of the
- * table is null when no character reference starts with
- * the character corresponding to that row.
- *
- * The int value HILO_ACCEL (by these indices) is zero
- * if there exists no character reference starting with
- * that two-letter prefix. Otherwise, the value is an
- * int that packs two shorts so that the higher short is
- * the index of the highest character reference name
- * with that prefix in NAMES and the lower short
- * corresponds to the index of the lowest character
- * reference name with that prefix. (It happens that the
- * first two character reference names share their
- * prefix so the packed int cannot be 0 by packing the
- * two shorts.)
- *
- * NAMES is an array of byte arrays where each byte
- * array encodes the name of a character reference as
- * ASCII. The names omit the first two letters of the
- * name. (Since storing the first two letters would be
- * redundant with the data contained in HILO_ACCEL.) The
- * entries are lexically sorted.
- *
- * For a given index in NAMES, the same index in VALUES
- * contains the corresponding expansion as an array of
- * two UTF-16 code units (either the character and
- * U+0000 or a surrogate pair).
- */
- int hilo = 0;
- if (c <= 'z') {
- @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c];
- if (row != null) {
- hilo = row[firstCharKey];
- }
- }
- if (hilo == 0) {
- /*
- * If no match can be made, then this is a parse
- * error.
- */
- errNoNamedCharacterMatch();
- emitOrAppendCharRefBuf(returnState);
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos;
- }
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- // Didn't fail yet
- appendCharRefBuf(c);
- lo = hilo & 0xFFFF;
- hi = hilo >> 16;
- entCol = -1;
- candidate = -1;
- charRefBufMark = 0;
- state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos);
- // FALL THROUGH continue stateloop;
- }
- case CHARACTER_REFERENCE_TAIL:
- outer: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- if (c == '\u0000') {
- break stateloop;
- }
- entCol++;
- /*
- * Consume the maximum number of characters possible,
- * with the consumed characters matching one of the
- * identifiers in the first column of the named
- * character references table (in a case-sensitive
- * manner).
- */
- loloop: for (;;) {
- if (hi < lo) {
- break outer;
- }
- if (entCol == NamedCharacters.NAMES[lo].length()) {
- candidate = lo;
- charRefBufMark = charRefBufLen;
- lo++;
- } else if (entCol > NamedCharacters.NAMES[lo].length()) {
- break outer;
- } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
- lo++;
- } else {
- break loloop;
- }
- }
-
- hiloop: for (;;) {
- if (hi < lo) {
- break outer;
- }
- if (entCol == NamedCharacters.NAMES[hi].length()) {
- break hiloop;
- }
- if (entCol > NamedCharacters.NAMES[hi].length()) {
- break outer;
- } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
- hi--;
- } else {
- break hiloop;
- }
- }
-
- if (c == ';') {
- // If we see a semicolon, there cannot be a
- // longer match. Break the loop. However, before
- // breaking, take the longest match so far as the
- // candidate, if we are just about to complete a
- // match.
- if (entCol + 1 == NamedCharacters.NAMES[lo].length()) {
- candidate = lo;
- charRefBufMark = charRefBufLen;
- }
- break outer;
- }
-
- if (hi < lo) {
- break outer;
- }
- appendCharRefBuf(c);
- continue;
- }
-
- if (candidate == -1) {
- // reconsume deals with CR, LF or nul
- /*
- * If no match can be made, then this is a parse error.
- */
- errNoNamedCharacterMatch();
- emitOrAppendCharRefBuf(returnState);
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos;
- }
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- } else {
- // c can't be CR, LF or nul if we got here
- @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
- if (candidateName.length() == 0
- || candidateName.charAt(candidateName.length() - 1) != ';') {
- /*
- * If the last character matched is not a U+003B
- * SEMICOLON (;), there is a parse error.
- */
- if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
- /*
- * If the entity is being consumed as part of an
- * attribute, and the last character matched is
- * not a U+003B SEMICOLON (;),
- */
- char ch;
- if (charRefBufMark == charRefBufLen) {
- ch = c;
- } else {
- ch = charRefBuf[charRefBufMark];
- }
- if (ch == '=' || (ch >= '0' && ch <= '9')
- || (ch >= 'A' && ch <= 'Z')
- || (ch >= 'a' && ch <= 'z')) {
- /*
- * and the next character is either a U+003D
- * EQUALS SIGN character (=) or in the range
- * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
- * U+0041 LATIN CAPITAL LETTER A to U+005A
- * LATIN CAPITAL LETTER Z, or U+0061 LATIN
- * SMALL LETTER A to U+007A LATIN SMALL
- * LETTER Z, then, for historical reasons,
- * all the characters that were matched
- * after the U+0026 AMPERSAND (&) must be
- * unconsumed, and nothing is returned.
- */
- errNoNamedCharacterMatch();
- appendCharRefBufToStrBuf();
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- }
- if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
- errUnescapedAmpersandInterpretedAsCharacterReference();
- } else {
- errNotSemicolonTerminated();
- }
- }
-
- /*
- * Otherwise, return a character token for the character
- * corresponding to the entity name (as given by the
- * second column of the named character references
- * table).
- */
- // CPPONLY: completedNamedCharacterReference();
- @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
- if (
- // [NOCPP[
- val.length == 1
- // ]NOCPP]
- // CPPONLY: val[1] == 0
- ) {
- emitOrAppendOne(val, returnState);
- } else {
- emitOrAppendTwo(val, returnState);
- }
- // this is so complicated!
- if (charRefBufMark < charRefBufLen) {
- if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
- appendStrBuf(charRefBuf, charRefBufMark,
- charRefBufLen - charRefBufMark);
- } else {
- tokenHandler.characters(charRefBuf, charRefBufMark,
- charRefBufLen - charRefBufMark);
- }
- }
- // charRefBufLen will be zeroed below!
-
- // Check if we broke out early with c being the last
- // character that matched as opposed to being the
- // first one that didn't match. In the case of an
- // early break, the next run on text should start
- // *after* the current character and the current
- // character shouldn't be reconsumed.
- boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen);
- charRefBufLen = 0;
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = earlyBreak ? pos + 1 : pos;
- }
- reconsume = !earlyBreak;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- /*
- * If the markup contains I'm &notit; I tell you, the
- * entity is parsed as "not", as in, I'm ¬it; I tell
- * you. But if the markup was I'm &notin; I tell you,
- * the entity would be parsed as "notin;", resulting in
- * I'm ∉ I tell you.
- */
- }
- // XXX reorder point
- case CONSUME_NCR:
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- value = 0;
- seenDigits = false;
- /*
- * The behavior further depends on the character after the
- * U+0023 NUMBER SIGN:
- */
- switch (c) {
- case 'x':
- case 'X':
-
- /*
- * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL
- * LETTER X Consume the X.
- *
- * Follow the steps below, but using the range of
- * characters U+0030 DIGIT ZERO through to U+0039
- * DIGIT NINE, U+0061 LATIN SMALL LETTER A through
- * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN
- * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL
- * LETTER F (in other words, 0-9, A-F, a-f).
- *
- * When it comes to interpreting the number,
- * interpret it as a hexadecimal number.
- */
- appendCharRefBuf(c);
- state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos);
- continue stateloop;
- default:
- /*
- * Anything else Follow the steps below, but using
- * the range of characters U+0030 DIGIT ZERO through
- * to U+0039 DIGIT NINE (i.e. just 0-9).
- *
- * When it comes to interpreting the number,
- * interpret it as a decimal number.
- */
- reconsume = true;
- state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos);
- // FALL THROUGH continue stateloop;
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case DECIMAL_NRC_LOOP:
- decimalloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume as many characters as match the range of
- * characters given above.
- */
- assert value >= 0: "value must not become negative.";
- if (c >= '0' && c <= '9') {
- seenDigits = true;
- // Avoid overflow
- if (value <= 0x10FFFF) {
- value *= 10;
- value += c - '0';
- }
- continue;
- } else if (c == ';') {
- if (seenDigits) {
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos + 1;
- }
- state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
- // FALL THROUGH continue stateloop;
- break decimalloop;
- } else {
- errNoDigitsInNCR();
- appendCharRefBuf(';');
- emitOrAppendCharRefBuf(returnState);
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos + 1;
- }
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- } else {
- /*
- * If no characters match the range, then don't
- * consume any characters (and unconsume the U+0023
- * NUMBER SIGN character and, if appropriate, the X
- * character). This is a parse error; nothing is
- * returned.
- *
- * Otherwise, if the next character is a U+003B
- * SEMICOLON, consume that too. If it isn't, there
- * is a parse error.
- */
- if (!seenDigits) {
- errNoDigitsInNCR();
- emitOrAppendCharRefBuf(returnState);
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos;
- }
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- } else {
- errCharRefLacksSemicolon();
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos;
- }
- reconsume = true;
- state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
- // FALL THROUGH continue stateloop;
- break decimalloop;
- }
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case HANDLE_NCR_VALUE:
- // WARNING previous state sets reconsume
- // We are not going to emit the contents of charRefBuf.
- charRefBufLen = 0;
- // XXX inline this case if the method size can take it
- handleNcrValue(returnState);
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- // XXX reorder point
- case HEX_NCR_LOOP:
- for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume as many characters as match the range of
- * characters given above.
- */
- assert value >= 0: "value must not become negative.";
- if (c >= '0' && c <= '9') {
- seenDigits = true;
- // Avoid overflow
- if (value <= 0x10FFFF) {
- value *= 16;
- value += c - '0';
- }
- continue;
- } else if (c >= 'A' && c <= 'F') {
- seenDigits = true;
- // Avoid overflow
- if (value <= 0x10FFFF) {
- value *= 16;
- value += c - 'A' + 10;
- }
- continue;
- } else if (c >= 'a' && c <= 'f') {
- seenDigits = true;
- // Avoid overflow
- if (value <= 0x10FFFF) {
- value *= 16;
- value += c - 'a' + 10;
- }
- continue;
- } else if (c == ';') {
- if (seenDigits) {
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos + 1;
- }
- state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
- continue stateloop;
- } else {
- errNoDigitsInNCR();
- appendCharRefBuf(';');
- emitOrAppendCharRefBuf(returnState);
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos + 1;
- }
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- } else {
- /*
- * If no characters match the range, then don't
- * consume any characters (and unconsume the U+0023
- * NUMBER SIGN character and, if appropriate, the X
- * character). This is a parse error; nothing is
- * returned.
- *
- * Otherwise, if the next character is a U+003B
- * SEMICOLON, consume that too. If it isn't, there
- * is a parse error.
- */
- if (!seenDigits) {
- errNoDigitsInNCR();
- emitOrAppendCharRefBuf(returnState);
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos;
- }
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- } else {
- errCharRefLacksSemicolon();
- if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
- cstart = pos;
- }
- reconsume = true;
- state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
- continue stateloop;
- }
- }
- }
- // XXX reorder point
- case PLAINTEXT:
- plaintextloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- switch (c) {
- case '\u0000':
- emitPlaintextReplacementCharacter(buf, pos);
- continue;
- case '\r':
- emitCarriageReturn(buf, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the current input
- * character as a character token. Stay in the
- * PLAINTEXT state.
- */
- continue;
- }
- }
- // XXX reorder point
- case CLOSE_TAG_OPEN:
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Otherwise, if the content model flag is set to the PCDATA
- * state, or if the next few characters do match that tag
- * name, consume the next input character:
- */
- switch (c) {
- case '>':
- /* U+003E GREATER-THAN SIGN (>) Parse error. */
- errLtSlashGt();
- /*
- * Switch to the data state.
- */
- cstart = pos + 1;
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\r':
- silentCarriageReturn();
- /* Anything else Parse error. */
- errGarbageAfterLtSlash();
- /*
- * Switch to the bogus comment state.
- */
- clearStrBufBeforeUse();
- appendStrBuf('\n');
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- /* Anything else Parse error. */
- errGarbageAfterLtSlash();
- /*
- * Switch to the bogus comment state.
- */
- clearStrBufBeforeUse();
- appendStrBuf(c);
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- if (c >= 'A' && c <= 'Z') {
- c += 0x20;
- }
- if (c >= 'a' && c <= 'z') {
- /*
- * U+0061 LATIN SMALL LETTER A through to U+007A
- * LATIN SMALL LETTER Z Create a new end tag
- * token,
- */
- endTag = true;
- /*
- * set its tag name to the input character,
- */
- clearStrBufBeforeUse();
- appendStrBuf(c);
- /*
- * then switch to the tag name state. (Don't
- * emit the token yet; further details will be
- * filled in before it is emitted.)
- */
- state = transition(state, Tokenizer.TAG_NAME, reconsume, pos);
- continue stateloop;
- } else {
- /* Anything else Parse error. */
- errGarbageAfterLtSlash();
- /*
- * Switch to the bogus comment state.
- */
- clearStrBufBeforeUse();
- appendStrBuf(c);
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- }
- }
- // XXX reorder point
- case RCDATA:
- rcdataloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- switch (c) {
- case '&':
- /*
- * U+0026 AMPERSAND (&) Switch to the character
- * reference in RCDATA state.
- */
- flushChars(buf, pos);
- assert charRefBufLen == 0: "charRefBufLen not reset after previous use!";
- appendCharRefBuf(c);
- setAdditionalAndRememberAmpersandLocation('\u0000');
- returnState = state;
- state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
- continue stateloop;
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Switch to the
- * RCDATA less-than sign state.
- */
- flushChars(buf, pos);
-
- returnState = state;
- state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
- continue stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- continue;
- case '\r':
- emitCarriageReturn(buf, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Emit the current input character as a
- * character token. Stay in the RCDATA state.
- */
- continue;
- }
- }
- // XXX reorder point
- case RAWTEXT:
- rawtextloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- switch (c) {
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Switch to the
- * RAWTEXT less-than sign state.
- */
- flushChars(buf, pos);
-
- returnState = state;
- state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
- break rawtextloop;
- // FALL THRU continue stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- continue;
- case '\r':
- emitCarriageReturn(buf, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Emit the current input character as a
- * character token. Stay in the RAWTEXT state.
- */
- continue;
- }
- }
- // XXX fallthru don't reorder
- case RAWTEXT_RCDATA_LESS_THAN_SIGN:
- rawtextrcdatalessthansignloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- switch (c) {
- case '/':
- /*
- * U+002F SOLIDUS (/) Set the temporary buffer
- * to the empty string. Switch to the shared
- * non-data end tag name state.
- */
- index = 0;
- clearStrBufBeforeUse();
- state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
- break rawtextrcdatalessthansignloop;
- // FALL THRU continue stateloop;
- default:
- /*
- * Otherwise, emit a U+003C LESS-THAN SIGN
- * character token
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- /*
- * and reconsume the current input character in
- * the state saved in returnState.
- */
- cstart = pos;
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- }
- // XXX fall thru. don't reorder.
- case NON_DATA_END_TAG_NAME:
- for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * ASSERT! when entering this state, set index to 0 and
- * call clearStrBufBeforeUse(); Let's implement the above
- * without lookahead. strBuf is the 'temporary buffer'.
- */
- if (endTagExpectationAsArray == null) {
- tokenHandler.characters(Tokenizer.LT_SOLIDUS,
- 0, 2);
- cstart = pos;
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- } else if (index < endTagExpectationAsArray.length) {
- char e = endTagExpectationAsArray[index];
- char folded = c;
- if (c >= 'A' && c <= 'Z') {
- folded += 0x20;
- }
- if (folded != e) {
- // [NOCPP[
- errHtml4LtSlashInRcdata(folded);
- // ]NOCPP]
- tokenHandler.characters(Tokenizer.LT_SOLIDUS,
- 0, 2);
- emitStrBuf();
- cstart = pos;
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- appendStrBuf(c);
- index++;
- continue;
- } else {
- endTag = true;
- // XXX replace contentModelElement with different
- // type
- tagName = endTagExpectation;
- switch (c) {
- case '\r':
- silentCarriageReturn();
- clearStrBufAfterUse(); // strBuf not used
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE
- * FEED (LF) U+000C FORM FEED (FF) U+0020
- * SPACE If the current end tag token is an
- * appropriate end tag token, then switch to
- * the before attribute name state.
- */
- clearStrBufAfterUse(); // strBuf not used
- state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
- continue stateloop;
- case '/':
- /*
- * U+002F SOLIDUS (/) If the current end tag
- * token is an appropriate end tag token,
- * then switch to the self-closing start tag
- * state.
- */
- clearStrBufAfterUse(); // strBuf not used
- state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) If the
- * current end tag token is an appropriate
- * end tag token, then emit the current tag
- * token and switch to the data state.
- */
- clearStrBufAfterUse(); // strBuf not used
- state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos);
- if (shouldSuspend) {
- break stateloop;
- }
- continue stateloop;
- default:
- /*
- * Emit a U+003C LESS-THAN SIGN character
- * token, a U+002F SOLIDUS character token,
- * a character token for each of the
- * characters in the temporary buffer (in
- * the order they were added to the buffer),
- * and reconsume the current input character
- * in the state saved in returnState.
- */
- // [NOCPP[
- errWarnLtSlashInRcdata();
- // ]NOCPP]
- tokenHandler.characters(
- Tokenizer.LT_SOLIDUS, 0, 2);
- emitStrBuf();
- cstart = pos; // don't drop the
- // character
- reconsume = true;
- state = transition(state, returnState, reconsume, pos);
- continue stateloop;
- }
- }
- }
- // XXX reorder point
- // BEGIN HOTSPOT WORKAROUND
- case BOGUS_COMMENT:
- boguscommentloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume every character up to and including the first
- * U+003E GREATER-THAN SIGN character (>) or the end of
- * the file (EOF), whichever comes first. Emit a comment
- * token whose data is the concatenation of all the
- * characters starting from and including the character
- * that caused the state machine to switch into the
- * bogus comment state, up to and including the
- * character immediately before the last consumed
- * character (i.e. up to the character just before the
- * U+003E or EOF character). (If the comment was started
- * by the end of the file (EOF), the token is empty.)
- *
- * Switch to the data state.
- *
- * If the end of the file was reached, reconsume the EOF
- * character.
- */
- switch (c) {
- case '>':
- emitComment(0, pos);
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '-':
- appendStrBuf(c);
- state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos);
- break boguscommentloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- appendStrBuf(c);
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case BOGUS_COMMENT_HYPHEN:
- boguscommenthyphenloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- switch (c) {
- case '>':
- // [NOCPP[
- maybeAppendSpaceToBogusComment();
- // ]NOCPP]
- emitComment(0, pos);
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '-':
- appendSecondHyphenToBogusComment();
- continue boguscommenthyphenloop;
- case '\r':
- appendStrBufCarriageReturn();
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- appendStrBuf(c);
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- }
- }
- // XXX reorder point
- case SCRIPT_DATA:
- scriptdataloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- switch (c) {
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Switch to the
- * script data less-than sign state.
- */
- flushChars(buf, pos);
- returnState = state;
- state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos);
- break scriptdataloop; // FALL THRU continue
- // stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- continue;
- case '\r':
- emitCarriageReturn(buf, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the current input
- * character as a character token. Stay in the
- * script data state.
- */
- continue;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_LESS_THAN_SIGN:
- scriptdatalessthansignloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- switch (c) {
- case '/':
- /*
- * U+002F SOLIDUS (/) Set the temporary buffer
- * to the empty string. Switch to the script
- * data end tag open state.
- */
- index = 0;
- clearStrBufBeforeUse();
- state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
- continue stateloop;
- case '!':
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- cstart = pos;
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos);
- break scriptdatalessthansignloop; // FALL THRU
- // continue
- // stateloop;
- default:
- /*
- * Otherwise, emit a U+003C LESS-THAN SIGN
- * character token
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- /*
- * and reconsume the current input character in
- * the script data state.
- */
- cstart = pos;
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_ESCAPE_START:
- scriptdataescapestartloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Emit a U+002D
- * HYPHEN-MINUS character token. Switch to the
- * script data escape start dash state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos);
- break scriptdataescapestartloop; // FALL THRU
- // continue
- // stateloop;
- default:
- /*
- * Anything else Reconsume the current input
- * character in the script data state.
- */
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_ESCAPE_START_DASH:
- scriptdataescapestartdashloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Emit a U+002D
- * HYPHEN-MINUS character token. Switch to the
- * script data escaped dash dash state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
- break scriptdataescapestartdashloop;
- // continue stateloop;
- default:
- /*
- * Anything else Reconsume the current input
- * character in the script data state.
- */
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_ESCAPED_DASH_DASH:
- scriptdataescapeddashdashloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Emit a U+002D
- * HYPHEN-MINUS character token. Stay in the
- * script data escaped dash dash state.
- */
- continue;
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Switch to the
- * script data escaped less-than sign state.
- */
- flushChars(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit a U+003E
- * GREATER-THAN SIGN character token. Switch to
- * the script data state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
- continue stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- break scriptdataescapeddashdashloop;
- case '\r':
- emitCarriageReturn(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the current input
- * character as a character token. Switch to the
- * script data escaped state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- break scriptdataescapeddashdashloop;
- // continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_ESCAPED:
- scriptdataescapedloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Emit a U+002D
- * HYPHEN-MINUS character token. Switch to the
- * script data escaped dash state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos);
- break scriptdataescapedloop; // FALL THRU
- // continue
- // stateloop;
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Switch to the
- * script data escaped less-than sign state.
- */
- flushChars(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
- continue stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- continue;
- case '\r':
- emitCarriageReturn(buf, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the current input
- * character as a character token. Stay in the
- * script data escaped state.
- */
- continue;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_ESCAPED_DASH:
- scriptdataescapeddashloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Emit a U+002D
- * HYPHEN-MINUS character token. Switch to the
- * script data escaped dash dash state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos);
- continue stateloop;
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Switch to the
- * script data escaped less-than sign state.
- */
- flushChars(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
- break scriptdataescapeddashloop;
- // continue stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- continue stateloop;
- case '\r':
- emitCarriageReturn(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the current input
- * character as a character token. Switch to the
- * script data escaped state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
- scriptdataescapedlessthanloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '/':
- /*
- * U+002F SOLIDUS (/) Set the temporary buffer
- * to the empty string. Switch to the script
- * data escaped end tag open state.
- */
- index = 0;
- clearStrBufBeforeUse();
- returnState = Tokenizer.SCRIPT_DATA_ESCAPED;
- state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
- continue stateloop;
- case 'S':
- case 's':
- /*
- * U+0041 LATIN CAPITAL LETTER A through to
- * U+005A LATIN CAPITAL LETTER Z Emit a U+003C
- * LESS-THAN SIGN character token and the
- * current input character as a character token.
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- cstart = pos;
- index = 1;
- /*
- * Set the temporary buffer to the empty string.
- * Append the lowercase version of the current
- * input character (add 0x0020 to the
- * character's code point) to the temporary
- * buffer. Switch to the script data double
- * escape start state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos);
- break scriptdataescapedlessthanloop;
- // continue stateloop;
- default:
- /*
- * Anything else Emit a U+003C LESS-THAN SIGN
- * character token and reconsume the current
- * input character in the script data escaped
- * state.
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- cstart = pos;
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_DOUBLE_ESCAPE_START:
- scriptdatadoubleescapestartloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- assert index > 0;
- if (index < 6) { // SCRIPT_ARR.length
- char folded = c;
- if (c >= 'A' && c <= 'Z') {
- folded += 0x20;
- }
- if (folded != Tokenizer.SCRIPT_ARR[index]) {
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- index++;
- continue;
- }
- switch (c) {
- case '\r':
- emitCarriageReturn(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- case ' ':
- case '\t':
- case '\u000C':
- case '/':
- case '>':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
- * (>) Emit the current input character as a
- * character token. If the temporary buffer is
- * the string "script", then switch to the
- * script data double escaped state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- break scriptdatadoubleescapestartloop;
- // continue stateloop;
- default:
- /*
- * Anything else Reconsume the current input
- * character in the script data escaped state.
- */
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_DOUBLE_ESCAPED:
- scriptdatadoubleescapedloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Emit a U+002D
- * HYPHEN-MINUS character token. Switch to the
- * script data double escaped dash state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos);
- break scriptdatadoubleescapedloop; // FALL THRU
- // continue
- // stateloop;
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Emit a U+003C
- * LESS-THAN SIGN character token. Switch to the
- * script data double escaped less-than sign
- * state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
- continue stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- continue;
- case '\r':
- emitCarriageReturn(buf, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the current input
- * character as a character token. Stay in the
- * script data double escaped state.
- */
- continue;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
- scriptdatadoubleescapeddashloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Emit a U+002D
- * HYPHEN-MINUS character token. Switch to the
- * script data double escaped dash dash state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos);
- break scriptdatadoubleescapeddashloop;
- // continue stateloop;
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Emit a U+003C
- * LESS-THAN SIGN character token. Switch to the
- * script data double escaped less-than sign
- * state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
- continue stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- continue stateloop;
- case '\r':
- emitCarriageReturn(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the current input
- * character as a character token. Switch to the
- * script data double escaped state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
- scriptdatadoubleescapeddashdashloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '-':
- /*
- * U+002D HYPHEN-MINUS (-) Emit a U+002D
- * HYPHEN-MINUS character token. Stay in the
- * script data double escaped dash dash state.
- */
- continue;
- case '<':
- /*
- * U+003C LESS-THAN SIGN (<) Emit a U+003C
- * LESS-THAN SIGN character token. Switch to the
- * script data double escaped less-than sign
- * state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos);
- break scriptdatadoubleescapeddashdashloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit a U+003E
- * GREATER-THAN SIGN character token. Switch to
- * the script data state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos);
- continue stateloop;
- case '\u0000':
- emitReplacementCharacter(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- continue stateloop;
- case '\r':
- emitCarriageReturn(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- default:
- /*
- * Anything else Emit the current input
- * character as a character token. Switch to the
- * script data double escaped state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
- scriptdatadoubleescapedlessthanloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '/':
- /*
- * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS
- * character token. Set the temporary buffer to
- * the empty string. Switch to the script data
- * double escape end state.
- */
- index = 0;
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos);
- break scriptdatadoubleescapedlessthanloop;
- default:
- /*
- * Anything else Reconsume the current input
- * character in the script data double escaped
- * state.
- */
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- }
- // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
- case SCRIPT_DATA_DOUBLE_ESCAPE_END:
- scriptdatadoubleescapeendloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- if (index < 6) { // SCRIPT_ARR.length
- char folded = c;
- if (c >= 'A' && c <= 'Z') {
- folded += 0x20;
- }
- if (folded != Tokenizer.SCRIPT_ARR[index]) {
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- index++;
- continue;
- }
- switch (c) {
- case '\r':
- emitCarriageReturn(buf, pos);
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- case ' ':
- case '\t':
- case '\u000C':
- case '/':
- case '>':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN
- * (>) Emit the current input character as a
- * character token. If the temporary buffer is
- * the string "script", then switch to the
- * script data escaped state.
- */
- state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos);
- continue stateloop;
- default:
- /*
- * Reconsume the current input character in the
- * script data double escaped state.
- */
- reconsume = true;
- state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos);
- continue stateloop;
- }
- }
- // XXX reorder point
- case MARKUP_DECLARATION_OCTYPE:
- markupdeclarationdoctypeloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- if (index < 6) { // OCTYPE.length
- char folded = c;
- if (c >= 'A' && c <= 'Z') {
- folded += 0x20;
- }
- if (folded == Tokenizer.OCTYPE[index]) {
- appendStrBuf(c);
- } else {
- errBogusComment();
- reconsume = true;
- state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
- continue stateloop;
- }
- index++;
- continue;
- } else {
- reconsume = true;
- state = transition(state, Tokenizer.DOCTYPE, reconsume, pos);
- break markupdeclarationdoctypeloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case DOCTYPE:
- doctypeloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- initDoctypeFields();
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the before DOCTYPE name state.
- */
- state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
- break doctypeloop;
- // continue stateloop;
- default:
- /*
- * Anything else Parse error.
- */
- errMissingSpaceBeforeDoctypeName();
- /*
- * Reconsume the current character in the before
- * DOCTYPE name state.
- */
- reconsume = true;
- state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos);
- break doctypeloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case BEFORE_DOCTYPE_NAME:
- beforedoctypenameloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the before DOCTYPE name state.
- */
- continue;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Parse error.
- */
- errNamelessDoctype();
- /*
- * Create a new DOCTYPE token. Set its
- * force-quirks flag to on.
- */
- forceQuirks = true;
- /*
- * Emit the token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- if (c >= 'A' && c <= 'Z') {
- /*
- * U+0041 LATIN CAPITAL LETTER A through to
- * U+005A LATIN CAPITAL LETTER Z Create a
- * new DOCTYPE token. Set the token's name
- * to the lowercase version of the input
- * character (add 0x0020 to the character's
- * code point).
- */
- c += 0x20;
- }
- /* Anything else Create a new DOCTYPE token. */
- /*
- * Set the token's name to the current
- * input character.
- */
- clearStrBufBeforeUse();
- appendStrBuf(c);
- /*
- * Switch to the DOCTYPE name state.
- */
- state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos);
- break beforedoctypenameloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case DOCTYPE_NAME:
- doctypenameloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- strBufToDoctypeName();
- state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the after DOCTYPE name state.
- */
- strBufToDoctypeName();
- state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos);
- break doctypenameloop;
- // continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * DOCTYPE token.
- */
- strBufToDoctypeName();
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * U+0041 LATIN CAPITAL LETTER A through to
- * U+005A LATIN CAPITAL LETTER Z Append the
- * lowercase version of the input character (add
- * 0x0020 to the character's code point) to the
- * current DOCTYPE token's name.
- */
- if (c >= 'A' && c <= 'Z') {
- c += 0x0020;
- }
- /*
- * Anything else Append the current input
- * character to the current DOCTYPE token's
- * name.
- */
- appendStrBuf(c);
- /*
- * Stay in the DOCTYPE name state.
- */
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case AFTER_DOCTYPE_NAME:
- afterdoctypenameloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the after DOCTYPE name state.
- */
- continue;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case 'p':
- case 'P':
- index = 0;
- state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos);
- break afterdoctypenameloop;
- // continue stateloop;
- case 's':
- case 'S':
- index = 0;
- state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos);
- continue stateloop;
- default:
- /*
- * Otherwise, this is a parse error.
- */
- bogusDoctype();
-
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- // done by bogusDoctype();
- /*
- * Switch to the bogus DOCTYPE state.
- */
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case DOCTYPE_UBLIC:
- doctypeublicloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * If the six characters starting from the current input
- * character are an ASCII case-insensitive match for the
- * word "PUBLIC", then consume those characters and
- * switch to the before DOCTYPE public identifier state.
- */
- if (index < 5) { // UBLIC.length
- char folded = c;
- if (c >= 'A' && c <= 'Z') {
- folded += 0x20;
- }
- if (folded != Tokenizer.UBLIC[index]) {
- bogusDoctype();
- // forceQuirks = true;
- reconsume = true;
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- index++;
- continue;
- } else {
- reconsume = true;
- state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos);
- break doctypeublicloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case AFTER_DOCTYPE_PUBLIC_KEYWORD:
- afterdoctypepublickeywordloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the before DOCTYPE public
- * identifier state.
- */
- state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
- break afterdoctypepublickeywordloop;
- // FALL THROUGH continue stateloop
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Parse Error.
- */
- errNoSpaceBetweenDoctypePublicKeywordAndQuote();
- /*
- * Set the DOCTYPE token's public identifier to
- * the empty string (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE public identifier
- * (double-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
- continue stateloop;
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Parse Error.
- */
- errNoSpaceBetweenDoctypePublicKeywordAndQuote();
- /*
- * Set the DOCTYPE token's public identifier to
- * the empty string (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE public identifier
- * (single-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
- continue stateloop;
- case '>':
- /* U+003E GREATER-THAN SIGN (>) Parse error. */
- errExpectedPublicId();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- default:
- bogusDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- // done by bogusDoctype();
- /*
- * Switch to the bogus DOCTYPE state.
- */
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
- beforedoctypepublicidentifierloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the before DOCTYPE public identifier
- * state.
- */
- continue;
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Set the DOCTYPE
- * token's public identifier to the empty string
- * (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE public identifier
- * (double-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
- break beforedoctypepublicidentifierloop;
- // continue stateloop;
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Set the DOCTYPE token's
- * public identifier to the empty string (not
- * missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE public identifier
- * (single-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
- continue stateloop;
- case '>':
- /* U+003E GREATER-THAN SIGN (>) Parse error. */
- errExpectedPublicId();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- default:
- bogusDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- // done by bogusDoctype();
- /*
- * Switch to the bogus DOCTYPE state.
- */
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
- doctypepublicidentifierdoublequotedloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Switch to the after
- * DOCTYPE public identifier state.
- */
- publicIdentifier = strBufToString();
- state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
- break doctypepublicidentifierdoublequotedloop;
- // continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Parse error.
- */
- errGtInPublicId();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- publicIdentifier = strBufToString();
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append the current input
- * character to the current DOCTYPE token's
- * public identifier.
- */
- appendStrBuf(c);
- /*
- * Stay in the DOCTYPE public identifier
- * (double-quoted) state.
- */
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
- afterdoctypepublicidentifierloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the between DOCTYPE public and
- * system identifiers state.
- */
- state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos);
- break afterdoctypepublicidentifierloop;
- // continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Parse error.
- */
- errNoSpaceBetweenPublicAndSystemIds();
- /*
- * Set the DOCTYPE token's system identifier to
- * the empty string (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE system identifier
- * (double-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
- continue stateloop;
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Parse error.
- */
- errNoSpaceBetweenPublicAndSystemIds();
- /*
- * Set the DOCTYPE token's system identifier to
- * the empty string (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE system identifier
- * (single-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
- continue stateloop;
- default:
- bogusDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- // done by bogusDoctype();
- /*
- * Switch to the bogus DOCTYPE state.
- */
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
- betweendoctypepublicandsystemidentifiersloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the between DOCTYPE public and system
- * identifiers state.
- */
- continue;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Set the DOCTYPE
- * token's system identifier to the empty string
- * (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE system identifier
- * (double-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
- break betweendoctypepublicandsystemidentifiersloop;
- // continue stateloop;
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Set the DOCTYPE token's
- * system identifier to the empty string (not
- * missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE system identifier
- * (single-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
- continue stateloop;
- default:
- bogusDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- // done by bogusDoctype();
- /*
- * Switch to the bogus DOCTYPE state.
- */
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
- doctypesystemidentifierdoublequotedloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Switch to the after
- * DOCTYPE system identifier state.
- */
- systemIdentifier = strBufToString();
- state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
- continue stateloop;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Parse error.
- */
- errGtInSystemId();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- systemIdentifier = strBufToString();
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append the current input
- * character to the current DOCTYPE token's
- * system identifier.
- */
- appendStrBuf(c);
- /*
- * Stay in the DOCTYPE system identifier
- * (double-quoted) state.
- */
- continue;
- }
- }
- // FALLTHRU DON'T REORDER
- case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
- afterdoctypesystemidentifierloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the after DOCTYPE system identifier state.
- */
- continue;
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit the current
- * DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- default:
- /*
- * Switch to the bogus DOCTYPE state. (This does
- * not set the DOCTYPE token's force-quirks flag
- * to on.)
- */
- bogusDoctypeWithoutQuirks();
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- break afterdoctypesystemidentifierloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case BOGUS_DOCTYPE:
- for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '>':
- /*
- * U+003E GREATER-THAN SIGN (>) Emit that
- * DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- default:
- /*
- * Anything else Stay in the bogus DOCTYPE
- * state.
- */
- continue;
- }
- }
- // XXX reorder point
- case DOCTYPE_YSTEM:
- doctypeystemloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Otherwise, if the six characters starting from the
- * current input character are an ASCII case-insensitive
- * match for the word "SYSTEM", then consume those
- * characters and switch to the before DOCTYPE system
- * identifier state.
- */
- if (index < 5) { // YSTEM.length
- char folded = c;
- if (c >= 'A' && c <= 'Z') {
- folded += 0x20;
- }
- if (folded != Tokenizer.YSTEM[index]) {
- bogusDoctype();
- reconsume = true;
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- index++;
- continue stateloop;
- } else {
- reconsume = true;
- state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos);
- break doctypeystemloop;
- // continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case AFTER_DOCTYPE_SYSTEM_KEYWORD:
- afterdoctypesystemkeywordloop: for (;;) {
- if (reconsume) {
- reconsume = false;
- } else {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- }
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE
- * Switch to the before DOCTYPE system
- * identifier state.
- */
- state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
- break afterdoctypesystemkeywordloop;
- // FALL THROUGH continue stateloop
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Parse Error.
- */
- errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
- /*
- * Set the DOCTYPE token's system identifier to
- * the empty string (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE system identifier
- * (double-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
- continue stateloop;
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Parse Error.
- */
- errNoSpaceBetweenDoctypeSystemKeywordAndQuote();
- /*
- * Set the DOCTYPE token's system identifier to
- * the empty string (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE system identifier
- * (single-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
- continue stateloop;
- case '>':
- /* U+003E GREATER-THAN SIGN (>) Parse error. */
- errExpectedPublicId();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- default:
- bogusDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- // done by bogusDoctype();
- /*
- * Switch to the bogus DOCTYPE state.
- */
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
- beforedoctypesystemidentifierloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\r':
- silentCarriageReturn();
- break stateloop;
- case '\n':
- silentLineFeed();
- // fall thru
- case ' ':
- case '\t':
- case '\u000C':
- /*
- * U+0009 CHARACTER TABULATION U+000A LINE FEED
- * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay
- * in the before DOCTYPE system identifier
- * state.
- */
- continue;
- case '"':
- /*
- * U+0022 QUOTATION MARK (") Set the DOCTYPE
- * token's system identifier to the empty string
- * (not missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE system identifier
- * (double-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos);
- continue stateloop;
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Set the DOCTYPE token's
- * system identifier to the empty string (not
- * missing),
- */
- clearStrBufBeforeUse();
- /*
- * then switch to the DOCTYPE system identifier
- * (single-quoted) state.
- */
- state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos);
- break beforedoctypesystemidentifierloop;
- // continue stateloop;
- case '>':
- /* U+003E GREATER-THAN SIGN (>) Parse error. */
- errExpectedSystemId();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- default:
- bogusDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- // done by bogusDoctype();
- /*
- * Switch to the bogus DOCTYPE state.
- */
- state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos);
- continue stateloop;
- }
- }
- // FALLTHRU DON'T REORDER
- case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
- for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Switch to the after
- * DOCTYPE system identifier state.
- */
- systemIdentifier = strBufToString();
- state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos);
- continue stateloop;
- case '>':
- errGtInSystemId();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- systemIdentifier = strBufToString();
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append the current input
- * character to the current DOCTYPE token's
- * system identifier.
- */
- appendStrBuf(c);
- /*
- * Stay in the DOCTYPE system identifier
- * (single-quoted) state.
- */
- continue;
- }
- }
- // XXX reorder point
- case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
- for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- /*
- * Consume the next input character:
- */
- switch (c) {
- case '\'':
- /*
- * U+0027 APOSTROPHE (') Switch to the after
- * DOCTYPE public identifier state.
- */
- publicIdentifier = strBufToString();
- state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos);
- continue stateloop;
- case '>':
- errGtInPublicId();
- /*
- * Set the DOCTYPE token's force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- publicIdentifier = strBufToString();
- emitDoctypeToken(pos);
- /*
- * Switch to the data state.
- */
- state = transition(state, Tokenizer.DATA, reconsume, pos);
- continue stateloop;
- case '\r':
- appendStrBufCarriageReturn();
- break stateloop;
- case '\n':
- appendStrBufLineFeed();
- continue;
- case '\u0000':
- c = '\uFFFD';
- // fall thru
- default:
- /*
- * Anything else Append the current input
- * character to the current DOCTYPE token's
- * public identifier.
- */
- appendStrBuf(c);
- /*
- * Stay in the DOCTYPE public identifier
- * (single-quoted) state.
- */
- continue;
- }
- }
- // XXX reorder point
- case PROCESSING_INSTRUCTION:
- processinginstructionloop: for (;;) {
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- switch (c) {
- case '?':
- state = transition(
- state,
- Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,
- reconsume, pos);
- break processinginstructionloop;
- // continue stateloop;
- default:
- continue;
- }
- }
- case PROCESSING_INSTRUCTION_QUESTION_MARK:
- if (++pos == endPos) {
- break stateloop;
- }
- c = checkChar(buf, pos);
- switch (c) {
- case '>':
- state = transition(state, Tokenizer.DATA,
- reconsume, pos);
- continue stateloop;
- default:
- state = transition(state,
- Tokenizer.PROCESSING_INSTRUCTION,
- reconsume, pos);
- continue stateloop;
- }
- // END HOTSPOT WORKAROUND
- }
- }
- flushChars(buf, pos);
- /*
- * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
- */
- // Save locals
- stateSave = state;
- returnStateSave = returnState;
- return pos;
- }
-
- // HOTSPOT WORKAROUND INSERTION POINT
-
- // [NOCPP[
-
- protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException { // state-change hook: callers in the state loop assign the returned value to their state variable
- return to; // this implementation ignores from, reconsume and pos and yields the requested target state unchanged
- }
-
- // ]NOCPP]
-
- /**
-  * Resets the DOCTYPE-related fields: clears strBuf, sets the name to the
-  * empty string, switches force-quirks off and drops any system and public
-  * identifier that is still held.
-  */
- private void initDoctypeFields() {
-     // strBuf holds the characters "DOCTYPE" at this point; they were
-     // collected in case the input turned out to be a bogus comment and
-     // are of no further use.
-     clearStrBufAfterUse();
-     doctypeName = "";
-     forceQuirks = false;
-     // Identifiers that are still set are handed to
-     // Portability.releaseString() before the references are cleared.
-     if (systemIdentifier != null) {
-         Portability.releaseString(systemIdentifier);
-         systemIdentifier = null;
-     }
-     if (publicIdentifier != null) {
-         Portability.releaseString(publicIdentifier);
-         publicIdentifier = null;
-     }
- }
-
- @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn() // CR variant: record the line break, then pass a line feed to adjustDoubleHyphenAndAppendToStrBufAndErr
- throws SAXException {
- silentCarriageReturn(); // ++line and lastCR = true
- adjustDoubleHyphenAndAppendToStrBufAndErr('\n'); // the carriage return is passed on as '\n', not '\r'
- }
-
- @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed() // LF variant: record the line break, then pass the line feed to adjustDoubleHyphenAndAppendToStrBufAndErr
- throws SAXException {
- silentLineFeed(); // ++line only; lastCR is not touched
- adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
- }
-
- @Inline private void appendStrBufLineFeed() { // records a line feed in the line count and appends it to strBuf
- silentLineFeed(); // ++line only; lastCR is not touched
- appendStrBuf('\n');
- }
-
- @Inline private void appendStrBufCarriageReturn() { // records a carriage return and appends a line feed in its place
- silentCarriageReturn(); // ++line and lastCR = true
- appendStrBuf('\n'); // the buffer receives '\n', never '\r'
- }
-
- @Inline protected void silentCarriageReturn() { // bookkeeping for a carriage return; nothing is emitted or buffered here
- ++line; // the CR counts as a line break
- lastCR = true; // NOTE(review): presumably read elsewhere so that the LF of a CR LF pair is not counted again - confirm where lastCR is consumed
- }
-
- @Inline protected void silentLineFeed() { // bookkeeping for a line feed; nothing is emitted or buffered here
- ++line; // unlike silentCarriageReturn(), lastCR is left as it is
- }
-
- private void emitCarriageReturn(@NoLength char[] buf, int pos) // handles a carriage return at pos: flush, then report a single line feed to the token handler
- throws SAXException {
- silentCarriageReturn(); // ++line and lastCR = true
- flushChars(buf, pos); // defined elsewhere; presumably hands the pending run that ends before pos to the handler - confirm
- tokenHandler.characters(Tokenizer.LF, 0, 1); // one character taken from Tokenizer.LF (offset 0, length 1)
- cstart = Integer.MAX_VALUE; // NOTE(review): looks like a "no pending run" sentinel that is reset elsewhere - confirm
- }
-
- private void emitReplacementCharacter(@NoLength char[] buf, int pos) // flushes, then asks the token handler for a zero-originating replacement character
- throws SAXException {
- flushChars(buf, pos); // defined elsewhere; presumably emits the pending run that ends before pos - confirm
- tokenHandler.zeroOriginatingReplacementCharacter(); // the handler decides what to produce; no character array is passed
- cstart = pos + 1; // cstart moves to the index after pos, so the character at pos is not part of the next run
- }
-
- private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos) // flushes, then sends one character from REPLACEMENT_CHARACTER as ordinary character data
- throws SAXException {
- flushChars(buf, pos); // defined elsewhere; presumably emits the pending run that ends before pos - confirm
- tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1); // offset 0, length 1; goes through characters() rather than zeroOriginatingReplacementCharacter()
- cstart = pos + 1; // cstart moves to the index after pos, so the character at pos is not part of the next run
- }
-
- private void setAdditionalAndRememberAmpersandLocation(char add) { // stores add in the 'additional' field and records a locator for the current position
- additional = add;
- // [NOCPP[
- ampersandLocation = new LocatorImpl(this); // built from this tokenizer; presumably a snapshot of the current line/column - confirm in LocatorImpl
- // ]NOCPP]
- }
-
- private void bogusDoctype() throws SAXException { // calls errBogusDoctype() and turns the force-quirks flag on
- errBogusDoctype(); // runs first: if it throws, forceQuirks is left untouched
- forceQuirks = true;
- }
-
- private void bogusDoctypeWithoutQuirks() throws SAXException { // same error call as bogusDoctype(), but the force-quirks flag ends up off
- errBogusDoctype(); // runs first: if it throws, forceQuirks is left untouched
- forceQuirks = false; // explicitly cleared rather than merely left alone
- }
-
- /**
-  * Converts the number held in the {@code value} field (the numeric
-  * character reference read so far) into output characters and hands them,
-  * together with {@code returnState}, to {@code emitOrAppendOne} or
-  * {@code emitOrAppendTwo}.
-  *
-  * The cases are tested from the outside in: values above 0x10FFFF, values
-  * above 0xFFFF, then the special cases at or below 0xFFFF, and finally the
-  * ordinary single-char case.
-  *
-  * @param returnState passed through unchanged to the emit-or-append helpers
-  * @throws SAXException propagated from the error reporters and the
-  *         emit-or-append helpers
-  */
- private void handleNcrValue(int returnState) throws SAXException {
-     if (value > 0x10FFFF) {
-         // Larger than the highest code point: report the error and
-         // substitute the replacement character.
-         errNcrOutOfRange();
-         emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
-         return;
-     }
-
-     if (value > 0xFFFF) {
-         // Needs two chars. The first is LEAD_OFFSET plus the bits above
-         // the low ten, the second is 0xDC00 plus the low ten bits.
-         // [NOCPP[
-         maybeWarnPrivateUseAstral();
-         if ((value & 0xFFFE) == 0xFFFE) {
-             errAstralNonCharacter(value);
-         }
-         // ]NOCPP]
-         astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
-         astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
-         emitOrAppendTwo(astralChar, returnState);
-         return;
-     }
-
-     // Everything below deals with value <= 0xFFFF.
-     if (value >= 0x80 && value <= 0x9f) {
-         /*
-          * 0x80..0x9F is a parse error. The output is taken from the
-          * NamedCharacters.WINDOWS_1252 table, which is indexed relative
-          * to 0x80.
-          */
-         errNcrInC1Range();
-         @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
-         emitOrAppendOne(val, returnState);
-         return;
-     }
-     // [NOCPP[
-     if (value == 0xC
-             && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
-         // Form feed while the content space policy is not ALLOW: emit a
-         // space for ALTER_INFOSET, call fatal() for FATAL, and emit
-         // nothing for any other policy value.
-         if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
-             emitOrAppendOne(Tokenizer.SPACE, returnState);
-         } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
-             fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
-         }
-         return;
-     }
-     // ]NOCPP]
-     if (value == 0x0) {
-         errNcrZero();
-         emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
-         return;
-     }
-     if ((value & 0xF800) == 0xD800) {
-         // The masked comparison matches 0xD800..0xDFFF here.
-         errNcrSurrogate();
-         emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
-         return;
-     }
-
-     /*
-      * Ordinary case: the output is the char whose code is the number.
-      * The checks inside the NOCPP region only report problems or let a
-      * reporter substitute the char; the first matching check wins.
-      */
-     char ch = (char) value;
-     // [NOCPP[
-     if (value == 0x0D) {
-         errNcrCr();
-     } else if (value <= 0x0008 || value == 0x000B
-             || (value >= 0x000E && value <= 0x001F)) {
-         ch = errNcrControlChar(ch);
-     } else if (value >= 0xFDD0 && value <= 0xFDEF) {
-         errNcrUnassigned();
-     } else if ((value & 0xFFFE) == 0xFFFE) {
-         ch = errNcrNonCharacter(ch);
-     } else if (value >= 0x007F && value <= 0x009F) {
-         errNcrControlChar();
-     } else {
-         maybeWarnPrivateUse(ch);
-     }
-     // ]NOCPP]
-     bmpChar[0] = ch;
-     emitOrAppendOne(bmpChar, returnState);
- }
-
- public void eof() throws SAXException {
- int state = stateSave;
- int returnState = returnStateSave;
-
- eofloop: for (;;) {
- switch (state) {
- case SCRIPT_DATA_LESS_THAN_SIGN:
- case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
- /*
- * Otherwise, emit a U+003C LESS-THAN SIGN character token
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- /*
- * and reconsume the current input character in the data
- * state.
- */
- break eofloop;
- case TAG_OPEN:
- /*
- * The behavior of this state depends on the content model
- * flag.
- */
- /*
- * Anything else Parse error.
- */
- errEofAfterLt();
- /*
- * Emit a U+003C LESS-THAN SIGN character token
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- /*
- * and reconsume the current input character in the data
- * state.
- */
- break eofloop;
- case RAWTEXT_RCDATA_LESS_THAN_SIGN:
- /*
- * Emit a U+003C LESS-THAN SIGN character token
- */
- tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
- /*
- * and reconsume the current input character in the RCDATA
- * state.
- */
- break eofloop;
- case NON_DATA_END_TAG_NAME:
- /*
- * Emit a U+003C LESS-THAN SIGN character token, a U+002F
- * SOLIDUS character token,
- */
- tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
- /*
- * a character token for each of the characters in the
- * temporary buffer (in the order they were added to the
- * buffer),
- */
- emitStrBuf();
- /*
- * and reconsume the current input character in the RCDATA
- * state.
- */
- break eofloop;
- case CLOSE_TAG_OPEN:
- /* EOF Parse error. */
- errEofAfterLt();
- /*
- * Emit a U+003C LESS-THAN SIGN character token and a U+002F
- * SOLIDUS character token.
- */
- tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case TAG_NAME:
- /*
- * EOF Parse error.
- */
- errEofInTagName();
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case BEFORE_ATTRIBUTE_NAME:
- case AFTER_ATTRIBUTE_VALUE_QUOTED:
- case SELF_CLOSING_START_TAG:
- /* EOF Parse error. */
- errEofWithoutGt();
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case ATTRIBUTE_NAME:
- /*
- * EOF Parse error.
- */
- errEofInAttributeName();
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case AFTER_ATTRIBUTE_NAME:
- case BEFORE_ATTRIBUTE_VALUE:
- /* EOF Parse error. */
- errEofWithoutGt();
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
- case ATTRIBUTE_VALUE_SINGLE_QUOTED:
- case ATTRIBUTE_VALUE_UNQUOTED:
- /* EOF Parse error. */
- errEofInAttributeValue();
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case BOGUS_COMMENT:
- emitComment(0, 0);
- break eofloop;
- case BOGUS_COMMENT_HYPHEN:
- // [NOCPP[
- maybeAppendSpaceToBogusComment();
- // ]NOCPP]
- emitComment(0, 0);
- break eofloop;
- case MARKUP_DECLARATION_OPEN:
- errBogusComment();
- emitComment(0, 0);
- break eofloop;
- case MARKUP_DECLARATION_HYPHEN:
- errBogusComment();
- emitComment(0, 0);
- break eofloop;
- case MARKUP_DECLARATION_OCTYPE:
- if (index < 6) {
- errBogusComment();
- emitComment(0, 0);
- } else {
- /* EOF Parse error. */
- errEofInDoctype();
- /*
- * Create a new DOCTYPE token. Set its force-quirks flag
- * to on.
- */
- doctypeName = "";
- if (systemIdentifier != null) {
- Portability.releaseString(systemIdentifier);
- systemIdentifier = null;
- }
- if (publicIdentifier != null) {
- Portability.releaseString(publicIdentifier);
- publicIdentifier = null;
- }
- forceQuirks = true;
- /*
- * Emit the token.
- */
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- }
- break eofloop;
- case COMMENT_START:
- case COMMENT:
- /*
- * EOF Parse error.
- */
- errEofInComment();
- /* Emit the comment token. */
- emitComment(0, 0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case COMMENT_END:
- errEofInComment();
- /* Emit the comment token. */
- emitComment(2, 0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case COMMENT_END_DASH:
- case COMMENT_START_DASH:
- errEofInComment();
- /* Emit the comment token. */
- emitComment(1, 0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case COMMENT_END_BANG:
- errEofInComment();
- /* Emit the comment token. */
- emitComment(3, 0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case DOCTYPE:
- case BEFORE_DOCTYPE_NAME:
- errEofInDoctype();
- /*
- * Create a new DOCTYPE token. Set its force-quirks flag to
- * on.
- */
- forceQuirks = true;
- /*
- * Emit the token.
- */
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case DOCTYPE_NAME:
- errEofInDoctype();
- strBufToDoctypeName();
- /*
- * Set the DOCTYPE token's force-quirks flag to on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case DOCTYPE_UBLIC:
- case DOCTYPE_YSTEM:
- case AFTER_DOCTYPE_NAME:
- case AFTER_DOCTYPE_PUBLIC_KEYWORD:
- case AFTER_DOCTYPE_SYSTEM_KEYWORD:
- case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
- errEofInDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
- case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
- /* EOF Parse error. */
- errEofInPublicId();
- /*
- * Set the DOCTYPE token's force-quirks flag to on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- publicIdentifier = strBufToString();
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
- case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
- case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
- errEofInDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
- case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
- /* EOF Parse error. */
- errEofInSystemId();
- /*
- * Set the DOCTYPE token's force-quirks flag to on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- systemIdentifier = strBufToString();
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
- errEofInDoctype();
- /*
- * Set the DOCTYPE token's force-quirks flag to on.
- */
- forceQuirks = true;
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case BOGUS_DOCTYPE:
- /*
- * Emit that DOCTYPE token.
- */
- emitDoctypeToken(0);
- /*
- * Reconsume the EOF character in the data state.
- */
- break eofloop;
- case CONSUME_CHARACTER_REFERENCE:
- /*
- * Unlike the definition in the spec, this state does not
- * return a value and never requires the caller to
- * backtrack. This state takes care of emitting characters
- * or appending to the current attribute value. It also
- * takes care of that in the case when consuming the entity
- * fails.
- */
- /*
- * This section defines how to consume an entity. This
- * definition is used when parsing entities in text and in
- * attributes.
- *
- * The behavior depends on the identity of the next
- * character (the one immediately after the U+0026 AMPERSAND
- * character):
- */
-
- emitOrAppendCharRefBuf(returnState);
- state = returnState;
- continue;
- case CHARACTER_REFERENCE_HILO_LOOKUP:
- errNoNamedCharacterMatch();
- emitOrAppendCharRefBuf(returnState);
- state = returnState;
- continue;
- case CHARACTER_REFERENCE_TAIL:
- outer: for (;;) {
- char c = '\u0000';
- entCol++;
- /*
- * Consume the maximum number of characters possible,
- * with the consumed characters matching one of the
- * identifiers in the first column of the named
- * character references table (in a case-sensitive
- * manner).
- */
- hiloop: for (;;) {
- if (hi == -1) {
- break hiloop;
- }
- if (entCol == NamedCharacters.NAMES[hi].length()) {
- break hiloop;
- }
- if (entCol > NamedCharacters.NAMES[hi].length()) {
- break outer;
- } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
- hi--;
- } else {
- break hiloop;
- }
- }
-
- loloop: for (;;) {
- if (hi < lo) {
- break outer;
- }
- if (entCol == NamedCharacters.NAMES[lo].length()) {
- candidate = lo;
- charRefBufMark = charRefBufLen;
- lo++;
- } else if (entCol > NamedCharacters.NAMES[lo].length()) {
- break outer;
- } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
- lo++;
- } else {
- break loloop;
- }
- }
- if (hi < lo) {
- break outer;
- }
- continue;
- }
-
- if (candidate == -1) {
- /*
- * If no match can be made, then this is a parse error.
- */
- errNoNamedCharacterMatch();
- emitOrAppendCharRefBuf(returnState);
- state = returnState;
- continue eofloop;
- } else {
- @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
- if (candidateName.length() == 0
- || candidateName.charAt(candidateName.length() - 1) != ';') {
- /*
- * If the last character matched is not a U+003B
- * SEMICOLON (;), there is a parse error.
- */
- if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
- /*
- * If the entity is being consumed as part of an
- * attribute, and the last character matched is
- * not a U+003B SEMICOLON (;),
- */
- char ch;
- if (charRefBufMark == charRefBufLen) {
- ch = '\u0000';
- } else {
- ch = charRefBuf[charRefBufMark];
- }
- if ((ch >= '0' && ch <= '9')
- || (ch >= 'A' && ch <= 'Z')
- || (ch >= 'a' && ch <= 'z')) {
- /*
- * and the next character is in the range
- * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
- * U+0041 LATIN CAPITAL LETTER A to U+005A
- * LATIN CAPITAL LETTER Z, or U+0061 LATIN
- * SMALL LETTER A to U+007A LATIN SMALL
- * LETTER Z, then, for historical reasons,
- * all the characters that were matched
- * after the U+0026 AMPERSAND (&) must be
- * unconsumed, and nothing is returned.
- */
- errNoNamedCharacterMatch();
- appendCharRefBufToStrBuf();
- state = returnState;
- continue eofloop;
- }
- }
- if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
- errUnescapedAmpersandInterpretedAsCharacterReference();
- } else {
- errNotSemicolonTerminated();
- }
- }
-
- /*
- * Otherwise, return a character token for the character
- * corresponding to the entity name (as given by the
- * second column of the named character references
- * table).
- */
- @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
- if (
- // [NOCPP[
- val.length == 1
- // ]NOCPP]
- // CPPONLY: val[1] == 0
- ) {
- emitOrAppendOne(val, returnState);
- } else {
- emitOrAppendTwo(val, returnState);
- }
- // this is so complicated!
- if (charRefBufMark < charRefBufLen) {
- if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
- appendStrBuf(charRefBuf, charRefBufMark,
- charRefBufLen - charRefBufMark);
- } else {
- tokenHandler.characters(charRefBuf, charRefBufMark,
- charRefBufLen - charRefBufMark);
- }
- }
- charRefBufLen = 0;
- state = returnState;
- continue eofloop;
- /*
- * If the markup contains I'm &notit; I tell you, the
- * entity is parsed as "not", as in, I'm ¬it; I tell
- * you. But if the markup was I'm &notin; I tell you,
- * the entity would be parsed as "notin;", resulting in
- * I'm ∉ I tell you.
- */
- }
- case CONSUME_NCR:
- case DECIMAL_NRC_LOOP:
- case HEX_NCR_LOOP:
- /*
- * If no characters match the range, then don't consume any
- * characters (and unconsume the U+0023 NUMBER SIGN
- * character and, if appropriate, the X character). This is
- * a parse error; nothing is returned.
- *
- * Otherwise, if the next character is a U+003B SEMICOLON,
- * consume that too. If it isn't, there is a parse error.
- */
- if (!seenDigits) {
- errNoDigitsInNCR();
- emitOrAppendCharRefBuf(returnState);
- state = returnState;
- continue;
- } else {
- errCharRefLacksSemicolon();
- }
- // WARNING previous state sets reconsume
- handleNcrValue(returnState);
- state = returnState;
- continue;
- case CDATA_RSQB:
- tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
- break eofloop;
- case CDATA_RSQB_RSQB:
- tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
- break eofloop;
- case DATA:
- default:
- break eofloop;
- }
- }
- // case DATA:
- /*
- * EOF Emit an end-of-file token.
- */
- tokenHandler.eof();
- return;
- }
-
- /**
-  * Passes the current doctype name, public identifier, system identifier
-  * and force-quirks flag to the token handler, then drops this tokenizer's
-  * references to the three strings.
-  *
-  * @param pos position used to advance <code>cstart</code>, which is set
-  *        to <code>pos + 1</code> before the handler is called
-  * @throws SAXException propagated from the token handler
-  */
- private void emitDoctypeToken(int pos) throws SAXException {
-     cstart = pos + 1;
-     tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
-             forceQuirks);
-     // Every path out of the doctype states runs through this method, so
-     // releasing the identifiers here is both safe and sufficient.
-     Portability.releaseString(publicIdentifier);
-     Portability.releaseString(systemIdentifier);
-     doctypeName = null;
-     publicIdentifier = null;
-     systemIdentifier = null;
- }
-
- /**
- * Returns the char stored at index <code>pos</code> of <code>buf</code>.
- * This implementation performs no additional checking of the value.
- *
- * @param buf the buffer to read from
- * @param pos the index to read
- * @return <code>buf[pos]</code>
- * @throws SAXException declared by the signature; this body does not
- * throw it itself
- */
- @Inline protected char checkChar(@NoLength char[] buf, int pos)
- throws SAXException {
- return buf[pos];
- }
-
- /**
-  * Forwards an internal encoding declaration to the registered
-  * <code>encodingDeclarationHandler</code>, if one is set.
-  *
-  * @param internalCharset the value handed through, unchanged, to the
-  *        handler
-  * @return the handler's result, or <code>false</code> when no handler
-  *         is set
-  * @throws SAXException propagated from the handler
-  */
- public boolean internalEncodingDeclaration(String internalCharset)
-         throws SAXException {
-     if (encodingDeclarationHandler == null) {
-         return false;
-     }
-     return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset);
- }
-
- /**
-  * Delivers the first two chars of <code>val</code>. When
-  * <code>returnState</code> has no bits in common with
-  * <code>DATA_AND_RCDATA_MASK</code> they go to the token handler as
-  * characters; otherwise each is passed to <code>appendStrBuf</code>.
-  *
-  * @param val array whose elements 0 and 1 are delivered
-  * @param returnState the state value tested against the mask
-  * @throws SAXException propagated from the calls made here
-  */
- private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState)
-         throws SAXException {
-     if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
-         tokenHandler.characters(val, 0, 2);
-     } else {
-         appendStrBuf(val[0]);
-         appendStrBuf(val[1]);
-     }
- }
-
- /**
-  * Delivers the first char of <code>val</code>. When
-  * <code>returnState</code> has no bits in common with
-  * <code>DATA_AND_RCDATA_MASK</code> it goes to the token handler as a
-  * character; otherwise it is passed to <code>appendStrBuf</code>.
-  *
-  * @param val array whose element 0 is delivered
-  * @param returnState the state value tested against the mask
-  * @throws SAXException propagated from the calls made here
-  */
- private void emitOrAppendOne(@Const @NoLength char[] val, int returnState)
-         throws SAXException {
-     if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
-         tokenHandler.characters(val, 0, 1);
-     } else {
-         appendStrBuf(val[0]);
-     }
- }
-
- /**
-  * Drops the buffers and names held by this tokenizer and tells the token
-  * handler that tokenization has ended.
-  *
-  * @throws SAXException propagated from the token handler
-  */
- public void end() throws SAXException {
-     strBuf = null;
-     doctypeName = null;
-     if (publicIdentifier != null) {
-         Portability.releaseString(publicIdentifier);
-         publicIdentifier = null;
-     }
-     if (systemIdentifier != null) {
-         Portability.releaseString(systemIdentifier);
-         systemIdentifier = null;
-     }
-     if (attributeName != null) {
-         attributeName.release();
-         attributeName = null;
-     }
-     if (tagName != null) {
-         tagName.release();
-         tagName = null;
-     }
-     tokenHandler.endTokenization();
-     // The attributes holder is dealt with only after the handler has
-     // been notified, matching the original statement order.
-     if (attributes != null) {
-         // [NOCPP[
-         attributes = null;
-         // ]NOCPP]
-         // CPPONLY: attributes.clear(mappingLangToXmlLang);
-     }
- }
-
- /**
- * Sets the <code>shouldSuspend</code> flag to <code>true</code>. The code
- * that reads the flag is not part of this method.
- */
- public void requestSuspension() {
- shouldSuspend = true;
- }
-
- // [NOCPP[
-
- /**
- * Sets the <code>confident</code> flag to <code>true</code>.
- */
- public void becomeConfident() {
- confident = true;
- }
-
- /**
- * Always returns <code>false</code> in this class; no tokenizer state is
- * consulted.
- *
- * @return <code>false</code>
- */
- public boolean isNextCharOnNewLine() {
- return false;
- }
-
- /**
- * Returns the current value of the <code>lastCR</code> flag.
- *
- * @return <code>lastCR</code>
- */
- public boolean isPrevCR() {
- return lastCR;
- }
-
- /**
- * Always returns <code>-1</code> in this class; no line counter is read
- * here.
- *
- * @return <code>-1</code>
- */
- public int getLine() {
- return -1;
- }
-
- /**
- * Always returns <code>-1</code> in this class; no column counter is read
- * here.
- *
- * @return <code>-1</code>
- */
- public int getCol() {
- return -1;
- }
-
- // ]NOCPP]
-
- /**
-  * Tells whether the saved tokenizer state is the data state.
-  *
-  * @return <code>true</code> if <code>stateSave</code> equals
-  *         <code>DATA</code>
-  */
- public boolean isInDataState() {
-     return stateSave == DATA;
- }
-
- /**
-  * Puts the tokenizer back into the data state: clears the string and
-  * character-reference buffers, resets the bookkeeping fields to their
-  * start values, re-initializes the doctype fields and releases any
-  * pending tag name, attribute name and (when attributes are created
-  * anew each time) attributes holder.
-  */
- public void resetToDataState() {
-     clearStrBufAfterUse();
-     stateSave = Tokenizer.DATA;
-     // line = 1; XXX line numbers
-
-     // Flags.
-     lastCR = false;
-     forceQuirks = false;
-     seenDigits = false;
-     endTag = false;
-     shouldSuspend = false;
-
-     // Character reference bookkeeping.
-     charRefBufLen = 0;
-     charRefBufMark = 0;
-     entCol = -1;
-     firstCharKey = -1;
-     lo = 0;
-     hi = 0; // will always be overwritten before use anyway
-     candidate = -1;
-     value = 0;
-
-     // Remaining scalar state.
-     index = 0;
-     additional = '\u0000';
-
-     initDoctypeFields();
-
-     if (tagName != null) {
-         tagName.release();
-         tagName = null;
-     }
-     if (attributeName != null) {
-         attributeName.release();
-         attributeName = null;
-     }
-     if (newAttributesEachTime && attributes != null) {
-         Portability.delete(attributes);
-         attributes = null;
-     }
- }
-
- /**
-  * Copies the tokenization state of <code>other</code> into this
-  * tokenizer. Buffer contents are copied, scalar fields are assigned, and
-  * the doctype strings, tag name, attribute name and attributes are
-  * duplicated after this tokenizer's previous values have been released.
-  * <code>shouldSuspend</code> is not copied; it is reset to
-  * <code>false</code>.
-  *
-  * @param other the tokenizer whose state is copied
-  * @throws SAXException declared by the signature; possibly raised by the
-  *         cloning calls made here
-  */
- public void loadState(Tokenizer other) throws SAXException {
-     // String buffer: grow if the source holds more than fits here.
-     strBufLen = other.strBufLen;
-     if (strBuf.length < strBufLen) {
-         strBuf = new char[strBufLen];
-     }
-     System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen);
-
-     // Character reference buffer: copied into the existing array.
-     charRefBufLen = other.charRefBufLen;
-     System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen);
-
-     // States and end tag expectation.
-     stateSave = other.stateSave;
-     returnStateSave = other.returnStateSave;
-     endTagExpectation = other.endTagExpectation;
-     endTagExpectationAsArray = other.endTagExpectationAsArray;
-     // line = 1; XXX line numbers
-
-     // Flags.
-     lastCR = other.lastCR;
-     forceQuirks = other.forceQuirks;
-     seenDigits = other.seenDigits;
-     endTag = other.endTag;
-     shouldSuspend = false;
-
-     // Character reference bookkeeping and remaining scalars.
-     index = other.index;
-     additional = other.additional;
-     entCol = other.entCol;
-     firstCharKey = other.firstCharKey;
-     lo = other.lo;
-     hi = other.hi;
-     candidate = other.candidate;
-     charRefBufMark = other.charRefBufMark;
-     value = other.value;
-
-     if (other.doctypeName != null) {
-         doctypeName = Portability.newLocalFromLocal(other.doctypeName,
-                 interner);
-     } else {
-         doctypeName = null;
-     }
-
-     // For each of the following, the old value is released before the
-     // field is overwritten.
-     Portability.releaseString(systemIdentifier);
-     if (other.systemIdentifier != null) {
-         systemIdentifier = Portability.newStringFromString(other.systemIdentifier);
-     } else {
-         systemIdentifier = null;
-     }
-
-     Portability.releaseString(publicIdentifier);
-     if (other.publicIdentifier != null) {
-         publicIdentifier = Portability.newStringFromString(other.publicIdentifier);
-     } else {
-         publicIdentifier = null;
-     }
-
-     if (tagName != null) {
-         tagName.release();
-     }
-     if (other.tagName != null) {
-         tagName = other.tagName.cloneElementName(interner);
-     } else {
-         tagName = null;
-     }
-
-     if (attributeName != null) {
-         attributeName.release();
-     }
-     if (other.attributeName != null) {
-         attributeName = other.attributeName.cloneAttributeName(interner);
-     } else {
-         attributeName = null;
-     }
-
-     Portability.delete(attributes);
-     if (other.attributes != null) {
-         attributes = other.attributes.cloneAttributes(interner);
-     } else {
-         attributes = null;
-     }
- }
-
- /**
- * Puts the tokenizer into its initial configuration and finishes by
- * calling <code>resetToDataState()</code>. <code>strBuf</code> is set to
- * <code>null</code> here rather than allocated.
- *
- * @throws SAXException declared by the signature; possibly raised by the
- * token handler call made here
- */
- public void initializeWithoutStarting() throws SAXException {
- confident = false;
- strBuf = null;
- line = 1;
- // CPPONLY: attributeLine = 1;
- // The statements between the NOCPP markers are presumably Java-only
- // (excluded from the translated build) -- confirm against the translator.
- // [NOCPP[
- html4 = false;
- metaBoundaryPassed = false;
- wantsComments = tokenHandler.wantsComments();
- // A single attributes holder is created up front only when a new one
- // is not wanted for each use.
- if (!newAttributesEachTime) {
- attributes = new HtmlAttributes(mappingLangToXmlLang);
- }
- // ]NOCPP]
- resetToDataState();
- }
-
- // The methods from here down to noteUnquotedAttributeValue() are
- // protected hooks. In this class their bodies are empty, except for the
- // two char-returning ones, which return their argument unchanged. The
- // names suggest they are invoked when the tokenizer meets the
- // corresponding parse error or noteworthy construct, presumably so that
- // a subclass can report it -- confirm against the call sites.
- protected void errGarbageAfterLtSlash() throws SAXException {
- }
-
- protected void errLtSlashGt() throws SAXException {
- }
-
- protected void errWarnLtSlashInRcdata() throws SAXException {
- }
-
- protected void errHtml4LtSlashInRcdata(char folded) throws SAXException {
- }
-
- protected void errCharRefLacksSemicolon() throws SAXException {
- }
-
- protected void errNoDigitsInNCR() throws SAXException {
- }
-
- protected void errGtInSystemId() throws SAXException {
- }
-
- protected void errGtInPublicId() throws SAXException {
- }
-
- protected void errNamelessDoctype() throws SAXException {
- }
-
- protected void errConsecutiveHyphens() throws SAXException {
- }
-
- protected void errPrematureEndOfComment() throws SAXException {
- }
-
- protected void errBogusComment() throws SAXException {
- }
-
- protected void errUnquotedAttributeValOrNull(char c) throws SAXException {
- }
-
- protected void errSlashNotFollowedByGt() throws SAXException {
- }
-
- protected void errHtml4XmlVoidSyntax() throws SAXException {
- }
-
- protected void errNoSpaceBetweenAttributes() throws SAXException {
- }
-
- protected void errHtml4NonNameInUnquotedAttribute(char c)
- throws SAXException {
- }
-
- protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c)
- throws SAXException {
- }
-
- protected void errAttributeValueMissing() throws SAXException {
- }
-
- protected void errBadCharBeforeAttributeNameOrNull(char c)
- throws SAXException {
- }
-
- protected void errEqualsSignBeforeAttributeName() throws SAXException {
- }
-
- protected void errBadCharAfterLt(char c) throws SAXException {
- }
-
- protected void errLtGt() throws SAXException {
- }
-
- protected void errProcessingInstruction() throws SAXException {
- }
-
- protected void errUnescapedAmpersandInterpretedAsCharacterReference()
- throws SAXException {
- }
-
- protected void errNotSemicolonTerminated() throws SAXException {
- }
-
- protected void errNoNamedCharacterMatch() throws SAXException {
- }
-
- protected void errQuoteBeforeAttributeName(char c) throws SAXException {
- }
-
- protected void errQuoteOrLtInAttributeNameOrNull(char c)
- throws SAXException {
- }
-
- protected void errExpectedPublicId() throws SAXException {
- }
-
- protected void errBogusDoctype() throws SAXException {
- }
-
- protected void maybeWarnPrivateUseAstral() throws SAXException {
- }
-
- protected void maybeWarnPrivateUse(char ch) throws SAXException {
- }
-
- protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs)
- throws SAXException {
- }
-
- protected void maybeErrSlashInEndTag(boolean selfClosing)
- throws SAXException {
- }
-
- // Returns ch unchanged in this class.
- protected char errNcrNonCharacter(char ch) throws SAXException {
- return ch;
- }
-
- protected void errAstralNonCharacter(int ch) throws SAXException {
- }
-
- protected void errNcrSurrogate() throws SAXException {
- }
-
- // Returns ch unchanged in this class. A separate no-argument overload
- // with a void return type is declared further down.
- protected char errNcrControlChar(char ch) throws SAXException {
- return ch;
- }
-
- protected void errNcrCr() throws SAXException {
- }
-
- protected void errNcrInC1Range() throws SAXException {
- }
-
- protected void errEofInPublicId() throws SAXException {
- }
-
- protected void errEofInComment() throws SAXException {
- }
-
- protected void errEofInDoctype() throws SAXException {
- }
-
- protected void errEofInAttributeValue() throws SAXException {
- }
-
- protected void errEofInAttributeName() throws SAXException {
- }
-
- protected void errEofWithoutGt() throws SAXException {
- }
-
- protected void errEofInTagName() throws SAXException {
- }
-
- protected void errEofInEndTag() throws SAXException {
- }
-
- protected void errEofAfterLt() throws SAXException {
- }
-
- protected void errNcrOutOfRange() throws SAXException {
- }
-
- protected void errNcrUnassigned() throws SAXException {
- }
-
- protected void errDuplicateAttribute() throws SAXException {
- }
-
- protected void errEofInSystemId() throws SAXException {
- }
-
- protected void errExpectedSystemId() throws SAXException {
- }
-
- protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
- }
-
- protected void errHyphenHyphenBang() throws SAXException {
- }
-
- // No-argument overload; distinct from errNcrControlChar(char), which
- // returns a char.
- protected void errNcrControlChar() throws SAXException {
- }
-
- protected void errNcrZero() throws SAXException {
- }
-
- protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote()
- throws SAXException {
- }
-
- protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException {
- }
-
- protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote()
- throws SAXException {
- }
-
- protected void noteAttributeWithoutValue() throws SAXException {
- }
-
- protected void noteUnquotedAttributeValue() throws SAXException {
- }
-
- /**
- * Sets the handler that <code>internalEncodingDeclaration</code> forwards
- * to. While the handler is <code>null</code>,
- * <code>internalEncodingDeclaration</code> returns <code>false</code>
- * without forwarding.
- *
- * @param encodingDeclarationHandler
- * the encodingDeclarationHandler to set
- */
- public void setEncodingDeclarationHandler(
- EncodingDeclarationHandler encodingDeclarationHandler) {
- this.encodingDeclarationHandler = encodingDeclarationHandler;
- }
-
- /**
- * Hands <code>attributes</code> to <code>Portability.delete</code> and
- * then clears the field.
- */
- void destructor() {
- // The translator will write refcount tracing stuff here
- Portability.delete(attributes);
- attributes = null;
- }
-
- // [NOCPP[
-
- /**
- * Intended to set an offset to be added to the position reported to
- * <code>TransitionHandler</code>. The body in this class is empty, so the
- * argument is ignored here.
- *
- * @param offset the offset (unused by this implementation)
- */
- public void setTransitionBaseOffset(int offset) {
-
- }
-
- // ]NOCPP]
-
-}