summary | refs | log | tree | commit | diff | stats
path: root/parser/html/java/htmlparser
diff options
context:
space:
mode:
author: Matt A. Tobin <email@mattatobin.com> 2020-01-15 22:07:59 -0500
committer: Matt A. Tobin <email@mattatobin.com> 2020-01-15 22:07:59 -0500
commit: e93b3733744e859e7dc4987d0b117be698246b6d (patch)
tree: 809c3c749c916006bae87a7e32937f36559e42b5 /parser/html/java/htmlparser
parent: 3da18fda029a038784525e840c831ffd73b25c33 (diff)
downloadUXP-e93b3733744e859e7dc4987d0b117be698246b6d.tar
UXP-e93b3733744e859e7dc4987d0b117be698246b6d.tar.gz
UXP-e93b3733744e859e7dc4987d0b117be698246b6d.tar.lz
UXP-e93b3733744e859e7dc4987d0b117be698246b6d.tar.xz
UXP-e93b3733744e859e7dc4987d0b117be698246b6d.zip
Fix an issue with the html5 tokenizer and tree builder (java htmlparser)
Diffstat (limited to 'parser/html/java/htmlparser')
-rw-r--r-- parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java 32
-rw-r--r-- parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java 21
2 files changed, 35 insertions, 18 deletions
diff --git a/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java b/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java
index 75ba2e1e4..0f8e518ef 100644
--- a/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java
+++ b/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java
@@ -680,6 +680,22 @@ public class Tokenizer implements Locator {
*
* @param specialTokenizerState
* the tokenizer state to set
+ */
+ public void setState(int specialTokenizerState) {
+ this.stateSave = specialTokenizerState;
+ this.endTagExpectation = null;
+ this.endTagExpectationAsArray = null;
+ }
+
+ // [NOCPP[
+
+ /**
+ * Sets the tokenizer state and the associated element name. This should
+ * only ever be used to put the tokenizer into one of the states that have
+ * a special end tag expectation. For use from the tokenizer test harness.
+ *
+ * @param specialTokenizerState
+ * the tokenizer state to set
* @param endTagExpectation
* the expected end tag for transitioning back to normal
*/
@@ -695,6 +711,8 @@ public class Tokenizer implements Locator {
endTagExpectationToArray();
}
+ // ]NOCPP]
+
/**
* Sets the tokenizer state and the associated element name. This should
* only ever used to put the tokenizer into one of the states that have
@@ -3749,11 +3767,17 @@ public class Tokenizer implements Locator {
c = checkChar(buf, pos);
/*
* ASSERT! when entering this state, set index to 0 and
- * call clearStrBufBeforeUse() assert (contentModelElement !=
- * null); Let's implement the above without lookahead.
- * strBuf is the 'temporary buffer'.
+ * call clearStrBufBeforeUse(); Let's implement the above
+ * without lookahead. strBuf is the 'temporary buffer'.
*/
- if (index < endTagExpectationAsArray.length) {
+ if (endTagExpectationAsArray == null) {
+ tokenHandler.characters(Tokenizer.LT_SOLIDUS,
+ 0, 2);
+ cstart = pos;
+ reconsume = true;
+ state = transition(state, returnState, reconsume, pos);
+ continue stateloop;
+ } else if (index < endTagExpectationAsArray.length) {
char e = endTagExpectationAsArray[index];
char folded = c;
if (c >= 'A' && c <= 'Z') {
diff --git a/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java b/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java
index c69a66ea3..947657eb1 100644
--- a/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java
+++ b/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java
@@ -640,8 +640,7 @@ public abstract class TreeBuilder<T> implements TokenHandler,
);
currentPtr++;
stack[currentPtr] = node;
- tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
- contextName);
+ tokenizer.setState(Tokenizer.DATA);
// The frameset-ok flag is set even though <frameset> never
// ends up being allowed as HTML frameset in the fragment case.
mode = FRAMESET_OK;
@@ -671,8 +670,7 @@ public abstract class TreeBuilder<T> implements TokenHandler,
);
currentPtr++;
stack[currentPtr] = node;
- tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
- contextName);
+ tokenizer.setState(Tokenizer.DATA);
// The frameset-ok flag is set even though <frameset> never
// ends up being allowed as HTML frameset in the fragment case.
mode = FRAMESET_OK;
@@ -691,23 +689,18 @@ public abstract class TreeBuilder<T> implements TokenHandler,
resetTheInsertionMode();
formPointer = getFormPointerForContext(contextNode);
if ("title" == contextName || "textarea" == contextName) {
- tokenizer.setStateAndEndTagExpectation(Tokenizer.RCDATA,
- contextName);
+ tokenizer.setState(Tokenizer.RCDATA);
} else if ("style" == contextName || "xmp" == contextName
|| "iframe" == contextName || "noembed" == contextName
|| "noframes" == contextName
|| (scriptingEnabled && "noscript" == contextName)) {
- tokenizer.setStateAndEndTagExpectation(Tokenizer.RAWTEXT,
- contextName);
+ tokenizer.setState(Tokenizer.RAWTEXT);
} else if ("plaintext" == contextName) {
- tokenizer.setStateAndEndTagExpectation(Tokenizer.PLAINTEXT,
- contextName);
+ tokenizer.setState(Tokenizer.PLAINTEXT);
} else if ("script" == contextName) {
- tokenizer.setStateAndEndTagExpectation(
- Tokenizer.SCRIPT_DATA, contextName);
+ tokenizer.setState(Tokenizer.SCRIPT_DATA);
} else {
- tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
- contextName);
+ tokenizer.setState(Tokenizer.DATA);
}
}
contextName = null;