From e93b3733744e859e7dc4987d0b117be698246b6d Mon Sep 17 00:00:00 2001
From: "Matt A. Tobin"
Date: Wed, 15 Jan 2020 22:07:59 -0500
Subject: Fix an issue with the html5 tokenizer and tree builder (java htmlparser)

---
 .../nu/validator/htmlparser/impl/Tokenizer.java    | 32 +++++++++++++++++++---
 .../nu/validator/htmlparser/impl/TreeBuilder.java  | 21 +++++---------
 2 files changed, 35 insertions(+), 18 deletions(-)

(limited to 'parser/html')

diff --git a/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java b/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java
index 75ba2e1e4..0f8e518ef 100644
--- a/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java
+++ b/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/Tokenizer.java
@@ -680,6 +680,22 @@ public class Tokenizer implements Locator {
      *
      * @param specialTokenizerState
      *            the tokenizer state to set
+     */
+    public void setState(int specialTokenizerState) {
+        this.stateSave = specialTokenizerState;
+        this.endTagExpectation = null;
+        this.endTagExpectationAsArray = null;
+    }
+
+    // [NOCPP[
+
+    /**
+     * Sets the tokenizer state and the associated element name. This should
+     * only ever used to put the tokenizer into one of the states that have
+     * a special end tag expectation. For use from the tokenizer test harness.
+     *
+     * @param specialTokenizerState
+     *            the tokenizer state to set
      * @param endTagExpectation
      *            the expected end tag for transitioning back to normal
      */
@@ -695,6 +711,8 @@ public class Tokenizer implements Locator {
         endTagExpectationToArray();
     }
 
+    // ]NOCPP]
+
     /**
      * Sets the tokenizer state and the associated element name. This should
      * only ever used to put the tokenizer into one of the states that have
@@ -3749,11 +3767,17 @@ public class Tokenizer implements Locator {
                         c = checkChar(buf, pos);
                         /*
                          * ASSERT! when entering this state, set index to 0 and
-                         * call clearStrBufBeforeUse() assert (contentModelElement !=
-                         * null); Let's implement the above without lookahead.
-                         * strBuf is the 'temporary buffer'.
+                         * call clearStrBufBeforeUse(); Let's implement the above
+                         * without lookahead. strBuf is the 'temporary buffer'.
                          */
-                        if (index < endTagExpectationAsArray.length) {
+                        if (endTagExpectationAsArray == null) {
+                            tokenHandler.characters(Tokenizer.LT_SOLIDUS,
+                                    0, 2);
+                            cstart = pos;
+                            reconsume = true;
+                            state = transition(state, returnState, reconsume, pos);
+                            continue stateloop;
+                        } else if (index < endTagExpectationAsArray.length) {
                             char e = endTagExpectationAsArray[index];
                             char folded = c;
                             if (c >= 'A' && c <= 'Z') {
diff --git a/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java b/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java
index c69a66ea3..947657eb1 100644
--- a/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java
+++ b/parser/html/java/htmlparser/src/nu/validator/htmlparser/impl/TreeBuilder.java
@@ -640,8 +640,7 @@ public abstract class TreeBuilder implements TokenHandler,
                     );
                     currentPtr++;
                     stack[currentPtr] = node;
-                    tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
-                            contextName);
+                    tokenizer.setState(Tokenizer.DATA);
                     // The frameset-ok flag is set even though <frameset> never
                     // ends up being allowed as HTML frameset in the fragment case.
                     mode = FRAMESET_OK;
@@ -671,8 +670,7 @@ public abstract class TreeBuilder implements TokenHandler,
                     );
                     currentPtr++;
                     stack[currentPtr] = node;
-                    tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
-                            contextName);
+                    tokenizer.setState(Tokenizer.DATA);
                     // The frameset-ok flag is set even though <frameset> never
                     // ends up being allowed as HTML frameset in the fragment case.
                     mode = FRAMESET_OK;
@@ -691,23 +689,18 @@ public abstract class TreeBuilder implements TokenHandler,
                     resetTheInsertionMode();
                     formPointer = getFormPointerForContext(contextNode);
                     if ("title" == contextName || "textarea" == contextName) {
-                        tokenizer.setStateAndEndTagExpectation(Tokenizer.RCDATA,
-                                contextName);
+                        tokenizer.setState(Tokenizer.RCDATA);
                     } else if ("style" == contextName || "xmp" == contextName
                             || "iframe" == contextName || "noembed" == contextName
                             || "noframes" == contextName
                             || (scriptingEnabled && "noscript" == contextName)) {
-                        tokenizer.setStateAndEndTagExpectation(Tokenizer.RAWTEXT,
-                                contextName);
+                        tokenizer.setState(Tokenizer.RAWTEXT);
                     } else if ("plaintext" == contextName) {
-                        tokenizer.setStateAndEndTagExpectation(Tokenizer.PLAINTEXT,
-                                contextName);
+                        tokenizer.setState(Tokenizer.PLAINTEXT);
                     } else if ("script" == contextName) {
-                        tokenizer.setStateAndEndTagExpectation(
-                                Tokenizer.SCRIPT_DATA, contextName);
+                        tokenizer.setState(Tokenizer.SCRIPT_DATA);
                     } else {
-                        tokenizer.setStateAndEndTagExpectation(Tokenizer.DATA,
-                                contextName);
+                        tokenizer.setState(Tokenizer.DATA);
                     }
                 }
                 contextName = null;
-- 
cgit v1.2.3