diff options
Diffstat (limited to 'server/src/org/jsoup/parser/HtmlTreeBuilderState.java')
-rw-r--r-- | server/src/org/jsoup/parser/HtmlTreeBuilderState.java | 1645 |
1 files changed, 917 insertions, 728 deletions
diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java b/server/src/org/jsoup/parser/HtmlTreeBuilderState.java index ceab9faa5a..258d547a49 100644 --- a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/server/src/org/jsoup/parser/HtmlTreeBuilderState.java @@ -1,17 +1,24 @@ package org.jsoup.parser; -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.*; - import java.util.Iterator; import java.util.LinkedList; +import org.jsoup.helper.DescendableLinkedList; +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.DocumentType; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; + /** - * The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states. + * The Tree Builder's current state. Each state embodies the processing for the + * state, and transitions to other states. */ enum HtmlTreeBuilderState { Initial { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return true; // ignore whitespace @@ -21,10 +28,13 @@ enum HtmlTreeBuilderState { // todo: parse error check on expected doctypes // todo: quirk state check on doctype ids Token.Doctype d = t.asDoctype(); - DocumentType doctype = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri()); + DocumentType doctype = new DocumentType(d.getName(), + d.getPublicIdentifier(), d.getSystemIdentifier(), + tb.getBaseUri()); tb.getDocument().appendChild(doctype); - if (d.isForceQuirks()) + if (d.isForceQuirks()) { tb.getDocument().quirksMode(Document.QuirksMode.quirks); + } tb.transition(BeforeHtml); } else { // todo: check not iframe srcdoc @@ -35,6 +45,7 @@ enum HtmlTreeBuilderState { } }, BeforeHtml { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isDoctype()) { tb.error(this); @@ -46,7 +57,9 @@ enum HtmlTreeBuilderState { } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { tb.insert(t.asStartTag()); tb.transition(BeforeHead); - } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { + } else if (t.isEndTag() + && (StringUtil.in(t.asEndTag().name(), "head", "body", + "html", "br"))) { return anythingElse(t, tb); } else if (t.isEndTag()) { tb.error(this); @@ -64,6 +77,7 @@ enum HtmlTreeBuilderState { } }, BeforeHead { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return true; @@ -78,7 +92,9 @@ enum HtmlTreeBuilderState { Element head = tb.insert(t.asStartTag()); tb.setHeadElement(head); tb.transition(InHead); - } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { + } else if (t.isEndTag() + && (StringUtil.in(t.asEndTag().name(), "head", "body", + "html", "br"))) { tb.process(new Token.StartTag("head")); return tb.process(t); } else if (t.isEndTag()) { @@ -92,67 +108,71 @@ enum HtmlTreeBuilderState { } }, InHead { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); return true; } switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + return false; + case StartTag: + Token.StartTag start = t.asStartTag(); + String name = start.name(); + if (name.equals("html")) { + return InBody.process(t, tb); + } else if (StringUtil.in(name, "base", "basefont", "bgsound", + "command", "link")) { + Element el = tb.insertEmpty(start); + // jsoup special: update base the frist time it is seen + if (name.equals("base") && el.hasAttr("href")) { + tb.maybeSetBaseUri(el); + } + } else if (name.equals("meta")) { + Element meta = tb.insertEmpty(start); + // todo: charset switches + } else if (name.equals("title")) { + handleRcData(start, tb); + } else if (StringUtil.in(name, "noframes", "style")) { + handleRawtext(start, tb); + } else if (name.equals("noscript")) { + // else if noscript && scripting flag = true: rawtext (jsoup + // doesn't run script, to handle as noscript) + tb.insert(start); + tb.transition(InHeadNoscript); + } else if (name.equals("script")) { + // skips some script rules as won't execute them + tb.insert(start); + tb.tokeniser.transition(TokeniserState.ScriptData); + tb.markInsertionMode(); + tb.transition(Text); + } else if (name.equals("head")) { tb.error(this); return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return InBody.process(t, tb); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) { - Element el = tb.insertEmpty(start); - // jsoup special: update base the frist time it is seen - if (name.equals("base") && el.hasAttr("href")) - tb.maybeSetBaseUri(el); - } else if (name.equals("meta")) { - Element meta = tb.insertEmpty(start); - // todo: charset switches - } else if (name.equals("title")) { - handleRcData(start, tb); - } else if (StringUtil.in(name, "noframes", "style")) { - handleRawtext(start, tb); - } else if (name.equals("noscript")) { - // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript) - tb.insert(start); - tb.transition(InHeadNoscript); - } else if (name.equals("script")) { - // skips some script rules as won't execute them - tb.insert(start); - tb.tokeniser.transition(TokeniserState.ScriptData); - tb.markInsertionMode(); - tb.transition(Text); - } else if (name.equals("head")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("head")) { - tb.pop(); - tb.transition(AfterHead); - } else if (StringUtil.in(name, "body", "html", "br")) { - return anythingElse(t, tb); - } else { - tb.error(this); - return false; - } - break; - default: + } else { return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag end = t.asEndTag(); + name = end.name(); + if (name.equals("head")) { + tb.pop(); + tb.transition(AfterHead); + } else if (StringUtil.in(name, "body", "html", "br")) { + return anythingElse(t, tb); + } else { + tb.error(this); + return false; + } + break; + default: + return anythingElse(t, tb); } return true; } @@ -163,6 +183,7 @@ enum HtmlTreeBuilderState { } }, InHeadNoscript { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isDoctype()) { tb.error(this); @@ -171,12 +192,16 @@ enum HtmlTreeBuilderState { } else if (t.isEndTag() && t.asEndTag().name().equals("noscript")) { tb.pop(); tb.transition(InHead); - } else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "basefont", "bgsound", "link", "meta", "noframes", "style"))) { + } else if (isWhitespace(t) + || t.isComment() + || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), + "basefont", "bgsound", "link", "meta", "noframes", + "style"))) { return tb.process(t, InHead); } else if (t.isEndTag() && t.asEndTag().name().equals("br")) { return anythingElse(t, tb); - } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), "head", "noscript")) || t.isEndTag()) { + } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), + "head", "noscript")) || t.isEndTag()) { tb.error(this); return false; } else { @@ -192,6 +217,7 @@ enum HtmlTreeBuilderState { } }, AfterHead { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); @@ -211,7 +237,8 @@ enum HtmlTreeBuilderState { } else if (name.equals("frameset")) { tb.insert(startTag); tb.transition(InFrameset); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) { + } else if (StringUtil.in(name, "base", "basefont", "bgsound", + "link", "meta", "noframes", "script", "style", "title")) { tb.error(this); Element head = tb.getHeadElement(); tb.push(head); @@ -243,519 +270,604 @@ enum HtmlTreeBuilderState { } }, InBody { + @Override boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { - case Character: { - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - // todo confirm that check - tb.error(this); - return false; - } else if (isWhitespace(c)) { - tb.reconstructFormattingElements(); - tb.insert(c); - } else { - tb.reconstructFormattingElements(); - tb.insert(c); - tb.framesetOk(false); - } - break; - } - case Comment: { - tb.insert(t.asComment()); - break; - } - case Doctype: { + case Character: { + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { + // todo confirm that check tb.error(this); return false; + } else if (isWhitespace(c)) { + tb.reconstructFormattingElements(); + tb.insert(c); + } else { + tb.reconstructFormattingElements(); + tb.insert(c); + tb.framesetOk(false); } - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - tb.error(this); - // merge attributes onto real html - Element html = tb.getStack().getFirst(); - for (Attribute attribute : startTag.getAttributes()) { - if (!html.hasAttr(attribute.getKey())) - html.attributes().put(attribute); - } - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title")) { - return tb.process(t, InHead); - } else if (name.equals("body")) { - tb.error(this); - LinkedList<Element> stack = tb.getStack(); - if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { - // only in fragment case - return false; // ignore - } else { - tb.framesetOk(false); - Element body = stack.get(1); - for (Attribute attribute : startTag.getAttributes()) { - if (!body.hasAttr(attribute.getKey())) - body.attributes().put(attribute); - } - } - } else if (name.equals("frameset")) { - tb.error(this); - LinkedList<Element> stack = tb.getStack(); - if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { - // only in fragment case - return false; // ignore - } else if (!tb.framesetOk()) { - return false; // ignore frameset - } else { - Element second = stack.get(1); - if (second.parent() != null) - second.remove(); - // pop up to html element - while (stack.size() > 1) - stack.removeLast(); - tb.insert(startTag); - tb.transition(InFrameset); - } - } else if (StringUtil.in(name, - "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", - "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", - "p", "section", "summary", "ul")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - if (StringUtil.in(tb.currentElement().nodeName(), "h1", "h2", "h3", "h4", "h5", "h6")) { - tb.error(this); - tb.pop(); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "pre", "listing")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - // todo: ignore LF if next token - tb.framesetOk(false); - } else if (name.equals("form")) { - if (tb.getFormElement() != null) { - tb.error(this); - return false; - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); + break; + } + case Comment: { + tb.insert(t.asComment()); + break; + } + case Doctype: { + tb.error(this); + return false; + } + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("html")) { + tb.error(this); + // merge attributes onto real html + Element html = tb.getStack().getFirst(); + for (Attribute attribute : startTag.getAttributes()) { + if (!html.hasAttr(attribute.getKey())) { + html.attributes().put(attribute); } - Element form = tb.insert(startTag); - tb.setFormElement(form); - } else if (name.equals("li")) { + } + } else if (StringUtil.in(name, "base", "basefont", "bgsound", + "command", "link", "meta", "noframes", "script", + "style", "title")) { + return tb.process(t, InHead); + } else if (name.equals("body")) { + tb.error(this); + LinkedList<Element> stack = tb.getStack(); + if (stack.size() == 1 + || (stack.size() > 2 && !stack.get(1).nodeName() + .equals("body"))) { + // only in fragment case + return false; // ignore + } else { tb.framesetOk(false); - LinkedList<Element> stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (el.nodeName().equals("li")) { - tb.process(new Token.EndTag("li")); - break; + Element body = stack.get(1); + for (Attribute attribute : startTag.getAttributes()) { + if (!body.hasAttr(attribute.getKey())) { + body.attributes().put(attribute); } - if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) - break; } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "dd", "dt")) { - tb.framesetOk(false); - LinkedList<Element> stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (StringUtil.in(el.nodeName(), "dd", "dt")) { - tb.process(new Token.EndTag(el.nodeName())); - break; - } - if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) - break; + } + } else if (name.equals("frameset")) { + tb.error(this); + LinkedList<Element> stack = tb.getStack(); + if (stack.size() == 1 + || (stack.size() > 2 && !stack.get(1).nodeName() + .equals("body"))) { + // only in fragment case + return false; // ignore + } else if (!tb.framesetOk()) { + return false; // ignore frameset + } else { + Element second = stack.get(1); + if (second.parent() != null) { + second.remove(); } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); + // pop up to html element + while (stack.size() > 1) { + stack.removeLast(); } tb.insert(startTag); - } else if (name.equals("plaintext")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); + tb.transition(InFrameset); + } + } else if (StringUtil.in(name, "address", "article", "aside", + "blockquote", "center", "details", "dir", "div", "dl", + "fieldset", "figcaption", "figure", "footer", "header", + "hgroup", "menu", "nav", "ol", "p", "section", + "summary", "ul")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", + "h6")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + if (StringUtil.in(tb.currentElement().nodeName(), "h1", + "h2", "h3", "h4", "h5", "h6")) { + tb.error(this); + tb.pop(); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "pre", "listing")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + // todo: ignore LF if next token + tb.framesetOk(false); + } else if (name.equals("form")) { + if (tb.getFormElement() != null) { + tb.error(this); + return false; + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + Element form = tb.insert(startTag); + tb.setFormElement(form); + } else if (name.equals("li")) { + tb.framesetOk(false); + LinkedList<Element> stack = tb.getStack(); + for (int i = stack.size() - 1; i > 0; i--) { + Element el = stack.get(i); + if (el.nodeName().equals("li")) { + tb.process(new Token.EndTag("li")); + break; } - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once in, never gets out - } else if (name.equals("button")) { - if (tb.inButtonScope("button")) { - // close and reprocess - tb.error(this); - tb.process(new Token.EndTag("button")); - tb.process(startTag); - } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); + if (tb.isSpecial(el) + && !StringUtil.in(el.nodeName(), "address", + "div", "p")) { + break; } - } else if (name.equals("a")) { - if (tb.getActiveFormattingElement("a") != null) { - tb.error(this); - tb.process(new Token.EndTag("a")); - - // still on stack? - Element remainingA = tb.getFromStack("a"); - if (remainingA != null) { - tb.removeFromActiveFormattingElements(remainingA); - tb.removeFromStack(remainingA); - } + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "dd", "dt")) { + tb.framesetOk(false); + LinkedList<Element> stack = tb.getStack(); + for (int i = stack.size() - 1; i > 0; i--) { + Element el = stack.get(i); + if (StringUtil.in(el.nodeName(), "dd", "dt")) { + tb.process(new Token.EndTag(el.nodeName())); + break; } - tb.reconstructFormattingElements(); - Element a = tb.insert(startTag); - tb.pushActiveFormattingElements(a); - } else if (StringUtil.in(name, - "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u")) { - tb.reconstructFormattingElements(); - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (name.equals("nobr")) { - tb.reconstructFormattingElements(); - if (tb.inScope("nobr")) { - tb.error(this); - tb.process(new Token.EndTag("nobr")); - tb.reconstructFormattingElements(); + if (tb.isSpecial(el) + && !StringUtil.in(el.nodeName(), "address", + "div", "p")) { + break; } - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (StringUtil.in(name, "applet", "marquee", "object")) { + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (name.equals("plaintext")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once + // in, + // never + // gets + // out + } else if (name.equals("button")) { + if (tb.inButtonScope("button")) { + // close and reprocess + tb.error(this); + tb.process(new Token.EndTag("button")); + tb.process(startTag); + } else { tb.reconstructFormattingElements(); tb.insert(startTag); - tb.insertMarkerToFormattingElements(); tb.framesetOk(false); - } else if (name.equals("table")) { - if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); + } + } else if (name.equals("a")) { + if (tb.getActiveFormattingElement("a") != null) { + tb.error(this); + tb.process(new Token.EndTag("a")); + + // still on stack? + Element remainingA = tb.getFromStack("a"); + if (remainingA != null) { + tb.removeFromActiveFormattingElements(remainingA); + tb.removeFromStack(remainingA); } - tb.insert(startTag); - tb.framesetOk(false); - tb.transition(InTable); - } else if (StringUtil.in(name, "area", "br", "embed", "img", "keygen", "wbr")) { - tb.reconstructFormattingElements(); - tb.insertEmpty(startTag); - tb.framesetOk(false); - } else if (name.equals("input")) { + } + tb.reconstructFormattingElements(); + Element a = tb.insert(startTag); + tb.pushActiveFormattingElements(a); + } else if (StringUtil.in(name, "b", "big", "code", "em", + "font", "i", "s", "small", "strike", "strong", "tt", + "u")) { + tb.reconstructFormattingElements(); + Element el = tb.insert(startTag); + tb.pushActiveFormattingElements(el); + } else if (name.equals("nobr")) { + tb.reconstructFormattingElements(); + if (tb.inScope("nobr")) { + tb.error(this); + tb.process(new Token.EndTag("nobr")); tb.reconstructFormattingElements(); - Element el = tb.insertEmpty(startTag); - if (!el.attr("type").equalsIgnoreCase("hidden")) - tb.framesetOk(false); - } else if (StringUtil.in(name, "param", "source", "track")) { - tb.insertEmpty(startTag); - } else if (name.equals("hr")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insertEmpty(startTag); + } + Element el = tb.insert(startTag); + tb.pushActiveFormattingElements(el); + } else if (StringUtil.in(name, "applet", "marquee", "object")) { + tb.reconstructFormattingElements(); + tb.insert(startTag); + tb.insertMarkerToFormattingElements(); + tb.framesetOk(false); + } else if (name.equals("table")) { + if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks + && tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + tb.framesetOk(false); + tb.transition(InTable); + } else if (StringUtil.in(name, "area", "br", "embed", "img", + "keygen", "wbr")) { + tb.reconstructFormattingElements(); + tb.insertEmpty(startTag); + tb.framesetOk(false); + } else if (name.equals("input")) { + tb.reconstructFormattingElements(); + Element el = tb.insertEmpty(startTag); + if (!el.attr("type").equalsIgnoreCase("hidden")) { tb.framesetOk(false); - } else if (name.equals("image")) { - // we're not supposed to ask. - startTag.name("img"); - return tb.process(startTag); - } else if (name.equals("isindex")) { - // how much do we care about the early 90s? - tb.error(this); - if (tb.getFormElement() != null) - return false; + } + } else if (StringUtil.in(name, "param", "source", "track")) { + tb.insertEmpty(startTag); + } else if (name.equals("hr")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insertEmpty(startTag); + tb.framesetOk(false); + } else if (name.equals("image")) { + // we're not supposed to ask. + startTag.name("img"); + return tb.process(startTag); + } else if (name.equals("isindex")) { + // how much do we care about the early 90s? + tb.error(this); + if (tb.getFormElement() != null) { + return false; + } - tb.tokeniser.acknowledgeSelfClosingFlag(); - tb.process(new Token.StartTag("form")); - if (startTag.attributes.hasKey("action")) { - Element form = tb.getFormElement(); - form.attr("action", startTag.attributes.get("action")); - } - tb.process(new Token.StartTag("hr")); - tb.process(new Token.StartTag("label")); - // hope you like english. - String prompt = startTag.attributes.hasKey("prompt") ? - startTag.attributes.get("prompt") : - "This is a searchable index. Enter search keywords: "; + tb.tokeniser.acknowledgeSelfClosingFlag(); + tb.process(new Token.StartTag("form")); + if (startTag.attributes.hasKey("action")) { + Element form = tb.getFormElement(); + form.attr("action", startTag.attributes.get("action")); + } + tb.process(new Token.StartTag("hr")); + tb.process(new Token.StartTag("label")); + // hope you like english. + String prompt = startTag.attributes.hasKey("prompt") ? startTag.attributes + .get("prompt") + : "This is a searchable index. Enter search keywords: "; - tb.process(new Token.Character(prompt)); + tb.process(new Token.Character(prompt)); - // input - Attributes inputAttribs = new Attributes(); - for (Attribute attr : startTag.attributes) { - if (!StringUtil.in(attr.getKey(), "name", "action", "prompt")) - inputAttribs.put(attr); + // input + Attributes inputAttribs = new Attributes(); + for (Attribute attr : startTag.attributes) { + if (!StringUtil.in(attr.getKey(), "name", "action", + "prompt")) { + inputAttribs.put(attr); } - inputAttribs.put("name", "isindex"); - tb.process(new Token.StartTag("input", inputAttribs)); - tb.process(new Token.EndTag("label")); - tb.process(new Token.StartTag("hr")); - tb.process(new Token.EndTag("form")); - } else if (name.equals("textarea")) { - tb.insert(startTag); - // todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.) - tb.tokeniser.transition(TokeniserState.Rcdata); - tb.markInsertionMode(); - tb.framesetOk(false); - tb.transition(Text); - } else if (name.equals("xmp")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.reconstructFormattingElements(); - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("iframe")) { - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("noembed")) { - // also handle noscript if script enabled - handleRawtext(startTag, tb); - } else if (name.equals("select")) { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); + } + inputAttribs.put("name", "isindex"); + tb.process(new Token.StartTag("input", inputAttribs)); + tb.process(new Token.EndTag("label")); + tb.process(new Token.StartTag("hr")); + tb.process(new Token.EndTag("form")); + } else if (name.equals("textarea")) { + tb.insert(startTag); + // todo: If the next token is a U+000A LINE FEED (LF) + // character token, then ignore that token and move on to + // the next one. (Newlines at the start of textarea elements + // are ignored as an authoring convenience.) + tb.tokeniser.transition(TokeniserState.Rcdata); + tb.markInsertionMode(); + tb.framesetOk(false); + tb.transition(Text); + } else if (name.equals("xmp")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.reconstructFormattingElements(); + tb.framesetOk(false); + handleRawtext(startTag, tb); + } else if (name.equals("iframe")) { + tb.framesetOk(false); + handleRawtext(startTag, tb); + } else if (name.equals("noembed")) { + // also handle noscript if script enabled + handleRawtext(startTag, tb); + } else if (name.equals("select")) { + tb.reconstructFormattingElements(); + tb.insert(startTag); + tb.framesetOk(false); - HtmlTreeBuilderState state = tb.state(); - if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell)) - tb.transition(InSelectInTable); - else - tb.transition(InSelect); - } else if (StringUtil.in("optgroup", "option")) { - if (tb.currentElement().nodeName().equals("option")) - tb.process(new Token.EndTag("option")); - tb.reconstructFormattingElements(); - tb.insert(startTag); - } else if (StringUtil.in("rp", "rt")) { - if (tb.inScope("ruby")) { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("ruby")) { - tb.error(this); - tb.popStackToBefore("ruby"); // i.e. close up to but not include name - } - tb.insert(startTag); + HtmlTreeBuilderState state = tb.state(); + if (state.equals(InTable) || state.equals(InCaption) + || state.equals(InTableBody) || state.equals(InRow) + || state.equals(InCell)) { + tb.transition(InSelectInTable); + } else { + tb.transition(InSelect); + } + } else if (StringUtil.in("optgroup", "option")) { + if (tb.currentElement().nodeName().equals("option")) { + tb.process(new Token.EndTag("option")); + } + tb.reconstructFormattingElements(); + tb.insert(startTag); + } else if (StringUtil.in("rp", "rt")) { + if (tb.inScope("ruby")) { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals("ruby")) { + tb.error(this); + tb.popStackToBefore("ruby"); // i.e. close up to but + // not include name } - } else if (name.equals("math")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml) - tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (name.equals("svg")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "svg" (xlink, svg) tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (StringUtil.in(name, - "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr")) { + } + } else if (name.equals("math")) { + tb.reconstructFormattingElements(); + // todo: handle A start tag whose tag name is "math" (i.e. + // foreign, mathml) + tb.insert(startTag); + tb.tokeniser.acknowledgeSelfClosingFlag(); + } else if (name.equals("svg")) { + tb.reconstructFormattingElements(); + // todo: handle A start tag whose tag name is "svg" (xlink, + // svg) + tb.insert(startTag); + tb.tokeniser.acknowledgeSelfClosingFlag(); + } else if (StringUtil.in(name, "caption", "col", "colgroup", + "frame", "head", "tbody", "td", "tfoot", "th", "thead", + "tr")) { + tb.error(this); + return false; + } else { + tb.reconstructFormattingElements(); + tb.insert(startTag); + } + break; + + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (name.equals("body")) { + if (!tb.inScope("body")) { tb.error(this); return false; } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); + // todo: error if stack contains something not dd, dt, + // li, optgroup, option, p, rp, rt, tbody, td, tfoot, + // th, thead, tr, body, html + tb.transition(AfterBody); } - break; - - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("body")) { - if (!tb.inScope("body")) { + } else if (name.equals("html")) { + boolean notIgnored = tb.process(new Token.EndTag("body")); + if (notIgnored) { + return tb.process(endTag); + } + } else if (StringUtil.in(name, "address", "article", "aside", + "blockquote", "button", "center", "details", "dir", + "div", "dl", "fieldset", "figcaption", "figure", + "footer", "header", "hgroup", "listing", "menu", "nav", + "ol", "pre", "section", "summary", "ul")) { + // todo: refactor these lookups + if (!tb.inScope(name)) { + // nothing to close + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html - tb.transition(AfterBody); } - } else if (name.equals("html")) { - boolean notIgnored = tb.process(new Token.EndTag("body")); - if (notIgnored) - return tb.process(endTag); - } else if (StringUtil.in(name, - "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", - "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", - "nav", "ol", "pre", "section", "summary", "ul")) { - // todo: refactor these lookups - if (!tb.inScope(name)) { - // nothing to close + tb.popStackToClose(name); + } + } else if (name.equals("form")) { + Element currentForm = tb.getFormElement(); + tb.setFormElement(null); + if (currentForm == null || !tb.inScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); } - } else if (name.equals("form")) { - Element currentForm = tb.getFormElement(); - tb.setFormElement(null); - if (currentForm == null || !tb.inScope(name)) { + // remove currentForm from stack. will shift anything + // under up. + tb.removeFromStack(currentForm); + } + } else if (name.equals("p")) { + if (!tb.inButtonScope(name)) { + tb.error(this); + tb.process(new Token.StartTag(name)); // if no p to + // close, creates + // an empty + // <p></p> + return tb.process(endTag); + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - // remove currentForm from stack. will shift anything under up. - tb.removeFromStack(currentForm); } - } else if (name.equals("p")) { - if (!tb.inButtonScope(name)) { + tb.popStackToClose(name); + } + } else if (name.equals("li")) { + if (!tb.inListItemScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - tb.process(new Token.StartTag(name)); // if no p to close, creates an empty <p></p> - return tb.process(endTag); - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); } - } else if (name.equals("li")) { - if (!tb.inListItemScope(name)) { + tb.popStackToClose(name); + } + } else if (StringUtil.in(name, "dd", "dt")) { + if (!tb.inScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); } - } else if (StringUtil.in(name, "dd", "dt")) { - if (!tb.inScope(name)) { + tb.popStackToClose(name); + } + } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", + "h6")) { + if (!tb.inScope(new String[] { "h1", "h2", "h3", "h4", + "h5", "h6" })) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); } - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { - if (!tb.inScope(new String[]{"h1", "h2", "h3", "h4", "h5", "h6"})) { + tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); + } + } else if (name.equals("sarcasm")) { + // *sigh* + return anyOtherEndTag(t, tb); + } else if (StringUtil.in(name, "a", "b", "big", "code", "em", + "font", "i", "nobr", "s", "small", "strike", "strong", + "tt", "u")) { + // Adoption Agency Algorithm. + OUTER: for (int i = 0; i < 8; i++) { + Element formatEl = tb.getActiveFormattingElement(name); + if (formatEl == null) { + return anyOtherEndTag(t, tb); + } else if (!tb.onStack(formatEl)) { + tb.error(this); + tb.removeFromActiveFormattingElements(formatEl); + return true; + } else if (!tb.inScope(formatEl.nodeName())) { tb.error(this); return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); + } else if (tb.currentElement() != formatEl) { + tb.error(this); + } + + Element furthestBlock = null; + Element commonAncestor = null; + boolean seenFormattingElement = false; + LinkedList<Element> stack = tb.getStack(); + for (int si = 0; si < stack.size(); si++) { + Element el = stack.get(si); + if (el == formatEl) { + commonAncestor = stack.get(si - 1); + seenFormattingElement = true; + } else if (seenFormattingElement + && tb.isSpecial(el)) { + furthestBlock = el; + break; + } + } + if (furthestBlock == null) { + tb.popStackToClose(formatEl.nodeName()); + tb.removeFromActiveFormattingElements(formatEl); + return true; } - } else if (name.equals("sarcasm")) { - // *sigh* - return anyOtherEndTag(t, tb); - } else if (StringUtil.in(name, - "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u")) { - // Adoption Agency Algorithm. - OUTER: - for (int i = 0; i < 8; i++) { - Element formatEl = tb.getActiveFormattingElement(name); - if (formatEl == null) - return anyOtherEndTag(t, tb); - else if (!tb.onStack(formatEl)) { - tb.error(this); - tb.removeFromActiveFormattingElements(formatEl); - return true; - } else if (!tb.inScope(formatEl.nodeName())) { - tb.error(this); - return false; - } else if (tb.currentElement() != formatEl) - tb.error(this); - Element furthestBlock = null; - Element commonAncestor = null; - boolean seenFormattingElement = false; - LinkedList<Element> stack = tb.getStack(); - for (int si = 0; si < stack.size(); si++) { - Element el = stack.get(si); - if (el == formatEl) { - commonAncestor = stack.get(si - 1); - seenFormattingElement = true; - } else if (seenFormattingElement && tb.isSpecial(el)) { - furthestBlock = el; - break; - } + // todo: Let a bookmark note the position of the + // formatting element in the list of active formatting + // elements relative to the elements on either side of + // it in the list. + // does that mean: int pos of format el in list? + Element node = furthestBlock; + Element lastNode = furthestBlock; + INNER: for (int j = 0; j < 3; j++) { + if (tb.onStack(node)) { + node = tb.aboveOnStack(node); } - if (furthestBlock == null) { - tb.popStackToClose(formatEl.nodeName()); - tb.removeFromActiveFormattingElements(formatEl); - return true; + if (!tb.isInActiveFormattingElements(node)) { // note + // no + // bookmark + // check + tb.removeFromStack(node); + continue INNER; + } else if (node == formatEl) { + break INNER; } - // todo: Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list. - // does that mean: int pos of format el in list? - Element node = furthestBlock; - Element lastNode = furthestBlock; - INNER: - for (int j = 0; j < 3; j++) { - if (tb.onStack(node)) - node = tb.aboveOnStack(node); - if (!tb.isInActiveFormattingElements(node)) { // note no bookmark check - tb.removeFromStack(node); - continue INNER; - } else if (node == formatEl) - break INNER; + Element replacement = new Element(Tag.valueOf(node + .nodeName()), tb.getBaseUri()); + tb.replaceActiveFormattingElement(node, replacement); + tb.replaceOnStack(node, replacement); + node = replacement; - Element replacement = new Element(Tag.valueOf(node.nodeName()), tb.getBaseUri()); - tb.replaceActiveFormattingElement(node, replacement); - tb.replaceOnStack(node, replacement); - node = replacement; + if (lastNode == furthestBlock) { + // todo: move the aforementioned bookmark to be + // immediately after the new node in the list of + // active formatting elements. + // not getting how this bookmark both straddles + // the element above, but is inbetween here... + } + if (lastNode.parent() != null) { + lastNode.remove(); + } + node.appendChild(lastNode); - if (lastNode == furthestBlock) { - // todo: move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements. - // not getting how this bookmark both straddles the element above, but is inbetween here... - } - if (lastNode.parent() != null) - lastNode.remove(); - node.appendChild(lastNode); + lastNode = node; + } - lastNode = node; + if (StringUtil.in(commonAncestor.nodeName(), "table", + "tbody", "tfoot", "thead", "tr")) { + if (lastNode.parent() != null) { + lastNode.remove(); } - - if (StringUtil.in(commonAncestor.nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { - if (lastNode.parent() != null) - lastNode.remove(); - tb.insertInFosterParent(lastNode); - } else { - if (lastNode.parent() != null) - lastNode.remove(); - commonAncestor.appendChild(lastNode); + tb.insertInFosterParent(lastNode); + } else { + if (lastNode.parent() != null) { + lastNode.remove(); } + commonAncestor.appendChild(lastNode); + } - Element adopter = new Element(Tag.valueOf(name), tb.getBaseUri()); - Node[] childNodes = furthestBlock.childNodes().toArray(new Node[furthestBlock.childNodes().size()]); - for (Node childNode : childNodes) { - adopter.appendChild(childNode); // append will reparent. thus the clone to avoid concurrent mod. - } - furthestBlock.appendChild(adopter); - tb.removeFromActiveFormattingElements(formatEl); - // todo: insert the new element into the list of active formatting elements at the position of the aforementioned bookmark. - tb.removeFromStack(formatEl); - tb.insertOnStackAfter(furthestBlock, adopter); + Element adopter = new Element(Tag.valueOf(name), + tb.getBaseUri()); + Node[] childNodes = furthestBlock.childNodes().toArray( + new Node[furthestBlock.childNodes().size()]); + for (Node childNode : childNodes) { + adopter.appendChild(childNode); // append will + // reparent. thus + // the clone to + // avoid concurrent + // mod. } - } else if (StringUtil.in(name, "applet", "marquee", "object")) { - if (!tb.inScope("name")) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - tb.clearFormattingElementsToLastMarker(); + furthestBlock.appendChild(adopter); + tb.removeFromActiveFormattingElements(formatEl); + // todo: insert the new element into the list of active + // formatting elements at the position of the + // aforementioned bookmark. + tb.removeFromStack(formatEl); + tb.insertOnStackAfter(furthestBlock, adopter); + } + } else if (StringUtil.in(name, "applet", "marquee", "object")) { + if (!tb.inScope("name")) { + if (!tb.inScope(name)) { + tb.error(this); + return false; } - } else if (name.equals("br")) { - tb.error(this); - tb.process(new Token.StartTag("br")); - return false; - } else { - return anyOtherEndTag(t, tb); + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) { + tb.error(this); + } + tb.popStackToClose(name); + tb.clearFormattingElementsToLastMarker(); } + } else if (name.equals("br")) { + tb.error(this); + tb.process(new Token.StartTag("br")); + return false; + } else { + return anyOtherEndTag(t, tb); + } - break; - case EOF: - // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html - // stop parsing - break; + break; + case EOF: + // todo: error if stack contains something not dd, dt, li, p, + // tbody, td, tfoot, th, thead, tr, body, html + // stop parsing + break; } return true; } @@ -768,8 +880,9 @@ enum HtmlTreeBuilderState { Element node = it.next(); if (node.nodeName().equals(name)) { tb.generateImpliedEndTags(name); - if (!name.equals(tb.currentElement().nodeName())) + if (!name.equals(tb.currentElement().nodeName())) { tb.error(this); + } tb.popStackToClose(name); break; } else { @@ -784,6 +897,7 @@ enum HtmlTreeBuilderState { }, Text { // in script, style etc. normally treated as data tags + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isCharacter()) { tb.insert(t.asCharacter()); @@ -794,7 +908,8 @@ enum HtmlTreeBuilderState { tb.transition(tb.originalState()); return tb.process(t); } else if (t.isEndTag()) { - // if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts + // if: An end tag whose tag name is "script" -- scripting + // nesting level, if evaluating scripts tb.pop(); tb.transition(tb.originalState()); } @@ -802,6 +917,7 @@ enum HtmlTreeBuilderState { } }, InTable { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isCharacter()) { tb.newPendingTableCharacters(); @@ -839,21 +955,23 @@ enum HtmlTreeBuilderState { } else if (name.equals("table")) { tb.error(this); boolean processed = tb.process(new Token.EndTag("table")); - if (processed) // only ignored if in fragment + if (processed) { return tb.process(t); + } } else if (StringUtil.in(name, "style", "script")) { return tb.process(t, InHead); } else if (name.equals("input")) { - if (!startTag.attributes.get("type").equalsIgnoreCase("hidden")) { + if (!startTag.attributes.get("type").equalsIgnoreCase( + "hidden")) { return anythingElse(t, tb); } else { tb.insertEmpty(startTag); } } else if (name.equals("form")) { tb.error(this); - if (tb.getFormElement() != null) + if (tb.getFormElement() != null) { return false; - else { + } else { Element form = tb.insertEmpty(startTag); tb.setFormElement(form); } @@ -872,16 +990,18 @@ enum HtmlTreeBuilderState { tb.popStackToClose("table"); } tb.resetInsertionMode(); - } else if (StringUtil.in(name, - "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { + } else if (StringUtil.in(name, "body", "caption", "col", + "colgroup", "html", "tbody", "td", "tfoot", "th", + "thead", "tr")) { tb.error(this); return false; } else { return anythingElse(t, tb); } } else if (t.isEOF()) { - if (tb.currentElement().nodeName().equals("html")) + if (tb.currentElement().nodeName().equals("html")) { tb.error(this); + } return true; // stops parsing } return anythingElse(t, tb); @@ -890,7 +1010,8 @@ enum HtmlTreeBuilderState { boolean anythingElse(Token t, HtmlTreeBuilder tb) { tb.error(this); boolean processed = true; - if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { + if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", + "tfoot", "thead", "tr")) { tb.setFosterInserts(true); processed = tb.process(t, InBody); tb.setFosterInserts(false); @@ -901,42 +1022,47 @@ enum HtmlTreeBuilderState { } }, InTableText { + @Override boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.getPendingTableCharacters().add(c); - } - break; - default: - if (tb.getPendingTableCharacters().size() > 0) { - for (Token.Character character : tb.getPendingTableCharacters()) { - if (!isWhitespace(character)) { - // InTable anything else section: - tb.error(this); - if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { - tb.setFosterInserts(true); - tb.process(character, InBody); - tb.setFosterInserts(false); - } else { - tb.process(character, InBody); - } - } else - tb.insert(character); + case Character: + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { + tb.error(this); + return false; + } else { + tb.getPendingTableCharacters().add(c); + } + break; + default: + if (tb.getPendingTableCharacters().size() > 0) { + for (Token.Character character : tb + .getPendingTableCharacters()) { + if (!isWhitespace(character)) { + // InTable anything else section: + tb.error(this); + if (StringUtil.in(tb.currentElement().nodeName(), + "table", "tbody", "tfoot", "thead", "tr")) { + tb.setFosterInserts(true); + tb.process(character, InBody); + tb.setFosterInserts(false); + } else { + tb.process(character, InBody); + } + } else { + tb.insert(character); } - tb.newPendingTableCharacters(); } - tb.transition(tb.originalState()); - return tb.process(t); + tb.newPendingTableCharacters(); + } + tb.transition(tb.originalState()); + return tb.process(t); } return true; } }, InCaption { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isEndTag() && t.asEndTag().name().equals("caption")) { Token.EndTag endTag = t.asEndTag(); @@ -946,23 +1072,27 @@ enum HtmlTreeBuilderState { return false; } else { tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("caption")) + if (!tb.currentElement().nodeName().equals("caption")) { tb.error(this); + } tb.popStackToClose("caption"); tb.clearFormattingElementsToLastMarker(); tb.transition(InTable); } - } else if (( - t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr") || - t.isEndTag() && t.asEndTag().name().equals("table")) - ) { + } else if ((t.isStartTag() + && StringUtil.in(t.asStartTag().name(), "caption", "col", + "colgroup", "tbody", "td", "tfoot", "th", "thead", + "tr") || t.isEndTag() + && t.asEndTag().name().equals("table"))) { tb.error(this); boolean processed = tb.process(new Token.EndTag("caption")); - if (processed) + if (processed) { return tb.process(t); - } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), - "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { + } + } else if (t.isEndTag() + && StringUtil.in(t.asEndTag().name(), "body", "col", + "colgroup", "html", "tbody", "td", "tfoot", "th", + "thead", "tr")) { tb.error(this); return false; } else { @@ -972,113 +1102,127 @@ enum HtmlTreeBuilderState { } }, InColumnGroup { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); return true; } switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - break; - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) - return tb.process(t, InBody); - else if (name.equals("col")) - tb.insertEmpty(startTag); - else - return anythingElse(t, tb); - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("colgroup")) { - if (tb.currentElement().nodeName().equals("html")) { // frag case - tb.error(this); - return false; - } else { - tb.pop(); - tb.transition(InTable); - } - } else - return anythingElse(t, tb); - break; - case EOF: - if (tb.currentElement().nodeName().equals("html")) - return true; // stop parsing; frag case - else - return anythingElse(t, tb); - default: + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + break; + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("html")) { + return tb.process(t, InBody); + } else if (name.equals("col")) { + tb.insertEmpty(startTag); + } else { + return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (name.equals("colgroup")) { + if (tb.currentElement().nodeName().equals("html")) { // frag + // case + tb.error(this); + return false; + } else { + tb.pop(); + tb.transition(InTable); + } + } else { + return anythingElse(t, tb); + } + break; + case EOF: + if (tb.currentElement().nodeName().equals("html")) { + return true; // stop parsing; frag case + } else { return anythingElse(t, tb); + } + default: + return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { boolean processed = tb.process(new Token.EndTag("colgroup")); - if (processed) // only ignored in frag case + if (processed) { return tb.process(t); + } return true; } }, InTableBody { + @Override boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("tr")) { - tb.clearStackToTableBodyContext(); - tb.insert(startTag); - tb.transition(InRow); - } else if (StringUtil.in(name, "th", "td")) { - tb.error(this); - tb.process(new Token.StartTag("tr")); - return tb.process(startTag); - } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead")) { - return exitTableBody(t, tb); - } else - return anythingElse(t, tb); - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.clearStackToTableBodyContext(); - tb.pop(); - tb.transition(InTable); - } - } else if (name.equals("table")) { - return exitTableBody(t, tb); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) { + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("tr")) { + tb.clearStackToTableBodyContext(); + tb.insert(startTag); + tb.transition(InRow); + } else if (StringUtil.in(name, "th", "td")) { + tb.error(this); + tb.process(new Token.StartTag("tr")); + return tb.process(startTag); + } else if (StringUtil.in(name, "caption", "col", "colgroup", + "tbody", "tfoot", "thead")) { + return exitTableBody(t, tb); + } else { + return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (StringUtil.in(name, "tbody", "tfoot", "thead")) { + if (!tb.inTableScope(name)) { tb.error(this); return false; - } else - return anythingElse(t, tb); - break; - default: + } else { + tb.clearStackToTableBodyContext(); + tb.pop(); + tb.transition(InTable); + } + } else if (name.equals("table")) { + return exitTableBody(t, tb); + } else if (StringUtil.in(name, "body", "caption", "col", + "colgroup", "html", "td", "th", "tr")) { + tb.error(this); + return false; + } else { return anythingElse(t, tb); + } + break; + default: + return anythingElse(t, tb); } return true; } private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { - if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb.inScope("tfoot"))) { + if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb + .inScope("tfoot"))) { // frag case tb.error(this); return false; } tb.clearStackToTableBodyContext(); - tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, tfoot, thead + tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, + // tfoot, + // thead return tb.process(t); } @@ -1087,6 +1231,7 @@ enum HtmlTreeBuilderState { } }, InRow { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isStartTag()) { Token.StartTag startTag = t.asStartTag(); @@ -1097,7 +1242,8 @@ enum HtmlTreeBuilderState { tb.insert(startTag); tb.transition(InCell); tb.insertMarkerToFormattingElements(); - } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) { + } else if (StringUtil.in(name, "caption", "col", "colgroup", + "tbody", "tfoot", "thead", "tr")) { return handleMissingTr(t, tb); } else { return anythingElse(t, tb); @@ -1123,7 +1269,8 @@ enum HtmlTreeBuilderState { } tb.process(new Token.EndTag("tr")); return tb.process(t); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th")) { + } else if (StringUtil.in(name, "body", "caption", "col", + "colgroup", "html", "td", "th")) { tb.error(this); return false; } else { @@ -1141,13 +1288,15 @@ enum HtmlTreeBuilderState { private boolean handleMissingTr(Token t, TreeBuilder tb) { boolean processed = tb.process(new Token.EndTag("tr")); - if (processed) + if (processed) { return tb.process(t); - else + } else { return false; + } } }, InCell { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isEndTag()) { Token.EndTag endTag = t.asEndTag(); @@ -1156,19 +1305,24 @@ enum HtmlTreeBuilderState { if (StringUtil.in(name, "td", "th")) { if (!tb.inTableScope(name)) { tb.error(this); - tb.transition(InRow); // might not be in scope if empty: <td /> and processing fake end tag + tb.transition(InRow); // might not be in scope if empty: + // <td /> and processing fake end + // tag return false; } tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); + } tb.popStackToClose(name); tb.clearFormattingElementsToLastMarker(); tb.transition(InRow); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html")) { + } else if (StringUtil.in(name, "body", "caption", "col", + "colgroup", "html")) { tb.error(this); return false; - } else if (StringUtil.in(name, "table", "tbody", "tfoot", "thead", "tr")) { + } else if (StringUtil.in(name, "table", "tbody", "tfoot", + "thead", "tr")) { if (!tb.inTableScope(name)) { tb.error(this); return false; @@ -1178,9 +1332,10 @@ enum HtmlTreeBuilderState { } else { return anythingElse(t, tb); } - } else if (t.isStartTag() && - StringUtil.in(t.asStartTag().name(), - "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) { + } else if (t.isStartTag() + && StringUtil.in(t.asStartTag().name(), "caption", "col", + "colgroup", "tbody", "td", "tfoot", "th", "thead", + "tr")) { if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { tb.error(this); return false; @@ -1198,91 +1353,105 @@ enum HtmlTreeBuilderState { } private void closeCell(HtmlTreeBuilder tb) { - if (tb.inTableScope("td")) + if (tb.inTableScope("td")) { tb.process(new Token.EndTag("td")); - else - tb.process(new Token.EndTag("th")); // only here if th or td in scope + } else { + tb.process(new Token.EndTag("th")); // only here if th or td in + // scope + } } }, InSelect { + @Override boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.insert(c); - } - break; - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: + case Character: + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { tb.error(this); return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) - return tb.process(start, InBody); - else if (name.equals("option")) { + } else { + tb.insert(c); + } + break; + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + return false; + case StartTag: + Token.StartTag start = t.asStartTag(); + String name = start.name(); + if (name.equals("html")) { + return tb.process(start, InBody); + } else if (name.equals("option")) { + tb.process(new Token.EndTag("option")); + tb.insert(start); + } else if (name.equals("optgroup")) { + if (tb.currentElement().nodeName().equals("option")) { tb.process(new Token.EndTag("option")); - tb.insert(start); - } else if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option")) - tb.process(new Token.EndTag("option")); - else if (tb.currentElement().nodeName().equals("optgroup")) - tb.process(new Token.EndTag("optgroup")); - tb.insert(start); - } else if (name.equals("select")) { - tb.error(this); - return tb.process(new Token.EndTag("select")); - } else if (StringUtil.in(name, "input", "keygen", "textarea")) { + } else if (tb.currentElement().nodeName() + .equals("optgroup")) { + tb.process(new Token.EndTag("optgroup")); + } + tb.insert(start); + } else if (name.equals("select")) { + tb.error(this); + return tb.process(new Token.EndTag("select")); + } else if (StringUtil.in(name, "input", "keygen", "textarea")) { + tb.error(this); + if (!tb.inSelectScope("select")) { + return false; // frag + } + tb.process(new Token.EndTag("select")); + return tb.process(start); + } else if (name.equals("script")) { + return tb.process(t, InHead); + } else { + return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag end = t.asEndTag(); + name = end.name(); + if (name.equals("optgroup")) { + if (tb.currentElement().nodeName().equals("option") + && tb.aboveOnStack(tb.currentElement()) != null + && tb.aboveOnStack(tb.currentElement()).nodeName() + .equals("optgroup")) { + tb.process(new Token.EndTag("option")); + } + if (tb.currentElement().nodeName().equals("optgroup")) { + tb.pop(); + } else { tb.error(this); - if (!tb.inSelectScope("select")) - return false; // frag - tb.process(new Token.EndTag("select")); - return tb.process(start); - } else if (name.equals("script")) { - return tb.process(t, InHead); + } + } else if (name.equals("option")) { + if (tb.currentElement().nodeName().equals("option")) { + tb.pop(); } else { - return anythingElse(t, tb); + tb.error(this); } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nodeName().equals("optgroup")) - tb.process(new Token.EndTag("option")); - if (tb.currentElement().nodeName().equals("optgroup")) - tb.pop(); - else - tb.error(this); - } else if (name.equals("option")) { - if (tb.currentElement().nodeName().equals("option")) - tb.pop(); - else - tb.error(this); - } else if (name.equals("select")) { - if (!tb.inSelectScope(name)) { - tb.error(this); - return false; - } else { - tb.popStackToClose(name); - tb.resetInsertionMode(); - } - } else - return anythingElse(t, tb); - break; - case EOF: - if (!tb.currentElement().nodeName().equals("html")) + } else if (name.equals("select")) { + if (!tb.inSelectScope(name)) { tb.error(this); - break; - default: + return false; + } else { + tb.popStackToClose(name); + tb.resetInsertionMode(); + } + } else { return anythingElse(t, tb); + } + break; + case EOF: + if (!tb.currentElement().nodeName().equals("html")) { + tb.error(this); + } + break; + default: + return anythingElse(t, tb); } return true; } @@ -1293,24 +1462,31 @@ enum HtmlTreeBuilderState { } }, InSelectInTable { + @Override boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { + if (t.isStartTag() + && StringUtil.in(t.asStartTag().name(), "caption", "table", + "tbody", "tfoot", "thead", "tr", "td", "th")) { tb.error(this); tb.process(new Token.EndTag("select")); return tb.process(t); - } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { + } else if (t.isEndTag() + && StringUtil.in(t.asEndTag().name(), "caption", "table", + "tbody", "tfoot", "thead", "tr", "td", "th")) { tb.error(this); if (tb.inTableScope(t.asEndTag().name())) { tb.process(new Token.EndTag("select")); return (tb.process(t)); - } else + } else { return false; + } } else { return tb.process(t, InSelect); } } }, AfterBody { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return tb.process(t, InBody); @@ -1339,6 +1515,7 @@ enum HtmlTreeBuilderState { } }, InFrameset { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); @@ -1368,7 +1545,9 @@ enum HtmlTreeBuilderState { return false; } else { tb.pop(); - if (!tb.isFragmentParsing() && !tb.currentElement().nodeName().equals("frameset")) { + if (!tb.isFragmentParsing() + && !tb.currentElement().nodeName() + .equals("frameset")) { tb.transition(AfterFrameset); } } @@ -1385,6 +1564,7 @@ enum HtmlTreeBuilderState { } }, AfterFrameset { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); @@ -1397,7 +1577,8 @@ enum HtmlTreeBuilderState { return tb.process(t, InBody); } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { tb.transition(AfterAfterFrameset); - } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { + } else if (t.isStartTag() + && t.asStartTag().name().equals("noframes")) { return tb.process(t, InHead); } else if (t.isEOF()) { // cool your heels, we're complete @@ -1409,10 +1590,12 @@ enum HtmlTreeBuilderState { } }, AfterAfterBody { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isComment()) { tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { + } else if (t.isDoctype() || isWhitespace(t) + || (t.isStartTag() && t.asStartTag().name().equals("html"))) { return tb.process(t, InBody); } else if (t.isEOF()) { // nice work chuck @@ -1425,14 +1608,17 @@ enum HtmlTreeBuilderState { } }, AfterAfterFrameset { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isComment()) { tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { + } else if (t.isDoctype() || isWhitespace(t) + || (t.isStartTag() && t.asStartTag().name().equals("html"))) { return tb.process(t, InBody); } else if (t.isEOF()) { // nice work chuck - } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { + } else if (t.isStartTag() + && t.asStartTag().name().equals("noframes")) { return tb.process(t, InHead); } else { tb.error(this); @@ -1442,6 +1628,7 @@ enum HtmlTreeBuilderState { } }, ForeignContent { + @Override boolean process(Token t, HtmlTreeBuilder tb) { return true; // todo: implement. Also; how do we get here? @@ -1458,8 +1645,9 @@ enum HtmlTreeBuilderState { // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " for (int i = 0; i < data.length(); i++) { char c = data.charAt(i); - if (!StringUtil.isWhitespace(c)) + if (!StringUtil.isWhitespace(c)) { return false; + } } return true; } @@ -1473,7 +1661,8 @@ enum HtmlTreeBuilderState { tb.transition(Text); } - private static void handleRawtext(Token.StartTag startTag, HtmlTreeBuilder tb) { + private static void handleRawtext(Token.StartTag startTag, + HtmlTreeBuilder tb) { tb.insert(startTag); tb.tokeniser.transition(TokeniserState.Rawtext); tb.markInsertionMode(); |