summary | refs | log | tree | commit | diff | stats
path: root/server/src/org/jsoup/parser/HtmlTreeBuilder.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/parser/HtmlTreeBuilder.java')
-rw-r--r-- server/src/org/jsoup/parser/HtmlTreeBuilder.java 672
1 files changed, 672 insertions, 0 deletions
diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilder.java b/server/src/org/jsoup/parser/HtmlTreeBuilder.java
new file mode 100644
index 0000000000..457a4c3249
--- /dev/null
+++ b/server/src/org/jsoup/parser/HtmlTreeBuilder.java
@@ -0,0 +1,672 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.DescendableLinkedList;
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * HTML Tree Builder; creates a DOM from Tokens.
+ */
+class HtmlTreeBuilder extends TreeBuilder {
+
+ private HtmlTreeBuilderState state; // the current state
+ private HtmlTreeBuilderState originalState; // original / marked state
+
+ private boolean baseUriSetFromDoc = false;
+ private Element headElement; // the current head element
+ private Element formElement; // the current form element
+ private Element contextElement; // fragment parse context -- could be null even if fragment parsing
+ private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active (open) formatting elements
+ private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars in table to be shifted out
+
+ private boolean framesetOk = true; // if ok to go into frameset
+ private boolean fosterInserts = false; // if next inserts should be fostered
+ private boolean fragmentParsing = false; // if parsing a fragment of html
+
+    /** Package-private no-arg constructor; fields keep their declared initial values. */
+    HtmlTreeBuilder() {
+    }
+
+    /**
+     * Parses a complete document. The builder is put into the {@code Initial} state first, then the
+     * superclass parse is run.
+     *
+     * @param input   HTML to parse
+     * @param baseUri base URI handed to the superclass parse
+     * @param errors  error list handed to the superclass parse
+     * @return the document produced by the superclass parse
+     */
+    @Override
+    Document parse(String input, String baseUri, ParseErrorList errors) {
+        this.state = HtmlTreeBuilderState.Initial;
+        Document parsed = super.parse(input, baseUri, errors);
+        return parsed;
+    }
+
+    /**
+     * Parses a fragment of HTML, optionally in the context of an element.
+     *
+     * @param inputFragment the HTML fragment
+     * @param context       element the fragment is parsed within; may be null
+     * @param baseUri       base URI for created nodes
+     * @param errors        error list passed to {@code initialiseParse}
+     * @return the children of the synthetic {@code html} root when a context is given, otherwise the
+     *         children of the document
+     */
+    List<Node> parseFragment(String inputFragment, Element context, String baseUri, ParseErrorList errors) {
+        state = HtmlTreeBuilderState.Initial;
+        initialiseParse(inputFragment, baseUri, errors);
+        contextElement = context;
+        fragmentParsing = true;
+
+        // no context: parse straight into the document
+        if (context == null) {
+            runParser();
+            return doc.childNodes();
+        }
+
+        // quirks setup: inherit from the context's owner document, when it has one
+        Document owner = context.ownerDocument();
+        if (owner != null)
+            doc.quirksMode(owner.quirksMode());
+
+        // initialise the tokeniser state from the context tag
+        String contextTag = context.tagName();
+        TokeniserState tokeniserStart;
+        if (StringUtil.in(contextTag, "title", "textarea"))
+            tokeniserStart = TokeniserState.Rcdata;
+        else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp"))
+            tokeniserStart = TokeniserState.Rawtext;
+        else if (contextTag.equals("script"))
+            tokeniserStart = TokeniserState.ScriptData;
+        else
+            // "noscript" (would be rawtext if scripting were enabled), "plaintext", and everything else
+            tokeniserStart = TokeniserState.Data;
+        tokeniser.transition(tokeniserStart);
+
+        // synthetic root that collects the parsed nodes
+        Element root = new Element(Tag.valueOf("html"), baseUri);
+        doc.appendChild(root);
+        stack.push(root);
+        resetInsertionMode();
+        // todo: setup form element to nearest form on context (up ancestor chain)
+
+        runParser();
+        return root.childNodes();
+    }
+
+    /**
+     * Processes a token in the builder's current state.
+     *
+     * @param token the token to handle; recorded as the current token
+     * @return the result of the state's {@code process} call
+     */
+    @Override
+    protected boolean process(Token token) {
+        return process(token, this.state);
+    }
+
+    /**
+     * Processes a token using the rules of the supplied state, without changing the builder's own state.
+     *
+     * @param token the token to handle; recorded as the current token
+     * @param state the state whose rules are applied
+     * @return the result of the state's {@code process} call
+     */
+    boolean process(Token token, HtmlTreeBuilderState state) {
+        this.currentToken = token;
+        boolean result = state.process(token, this);
+        return result;
+    }
+
+    /** Switches the builder to the given state. */
+    void transition(HtmlTreeBuilderState state) {
+        this.state = state;
+    }
+
+    /** Returns the builder's current state. */
+    HtmlTreeBuilderState state() {
+        return this.state;
+    }
+
+    /** Remembers the current state as the original (marked) state. */
+    void markInsertionMode() {
+        this.originalState = this.state;
+    }
+
+    /** Returns the state last saved by {@link #markInsertionMode()}, or null if none was saved. */
+    HtmlTreeBuilderState originalState() {
+        return this.originalState;
+    }
+
+    /** Sets whether it is OK to go into a frameset. */
+    void framesetOk(boolean framesetOk) {
+        this.framesetOk = framesetOk;
+    }
+
+    /** Returns whether it is OK to go into a frameset. */
+    boolean framesetOk() {
+        return this.framesetOk;
+    }
+
+    /** Returns the document being built. */
+    Document getDocument() {
+        return this.doc;
+    }
+
+    /** Returns the base URI currently used for created nodes. */
+    String getBaseUri() {
+        return this.baseUri;
+    }
+
+    /**
+     * Adopts the absolute {@code href} of a {@code base} element as the base URI, unless one has already
+     * been taken from the document or the element has no usable href.
+     *
+     * @param base the {@code base} element encountered during the parse
+     */
+    void maybeSetBaseUri(Element base) {
+        if (!baseUriSetFromDoc) { // only the first <base href> in a parse is honoured
+            String href = base.absUrl("href");
+            if (href.length() > 0) { // ignore <base target> etc
+                baseUri = href;
+                baseUriSetFromDoc = true;
+                // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
+                doc.setBaseUri(href);
+            }
+        }
+    }
+
+    /** Returns true once {@code parseFragment} has been invoked on this builder. */
+    boolean isFragmentParsing() {
+        return this.fragmentParsing;
+    }
+
+    /**
+     * Records a parse error for the current token, if the error list can still accept one.
+     *
+     * @param state the state in which the unexpected token was seen
+     */
+    void error(HtmlTreeBuilderState state) {
+        if (!errors.canAddError())
+            return;
+        ParseError parseError = new ParseError(reader.pos(), "Unexpected token [%s] when in state [%s]", currentToken.tokenType(), state);
+        errors.add(parseError);
+    }
+
+    /**
+     * Creates an element for a start tag, inserts it, and pushes it onto the stack.
+     * <p>
+     * A self-closing tag that is not a known tag is instead inserted as an empty element and immediately
+     * followed by a synthetic end tag, so the builder leaves whatever state the tag put it in. When the
+     * spec expects an empty tag the caller goes straight to {@code insertEmpty}, so no fake end tag is
+     * generated in that case.
+     *
+     * @param startTag the start tag token
+     * @return the newly created element
+     */
+    Element insert(Token.StartTag startTag) {
+        boolean selfClosingUnknown = startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name());
+        if (!selfClosingUnknown) {
+            Element created = new Element(Tag.valueOf(startTag.name()), baseUri, startTag.attributes);
+            insert(created);
+            return created;
+        }
+
+        Element empty = insertEmpty(startTag);
+        process(new Token.EndTag(empty.tagName())); // ensure we get out of whatever state we are in
+        return empty;
+    }
+
+    /**
+     * Creates an attribute-less element with the given tag name, inserts it, and pushes it onto the stack.
+     *
+     * @param startTagName tag name of the element to create
+     * @return the newly created element
+     */
+    Element insert(String startTagName) {
+        Tag tag = Tag.valueOf(startTagName);
+        Element created = new Element(tag, baseUri);
+        insert(created);
+        return created;
+    }
+
+    /** Inserts the element into the tree and then adds it to the end of the stack. */
+    void insert(Element el) {
+        insertNode(el);
+        push(el);
+    }
+
+    /**
+     * Creates an element for the start tag and inserts it into the tree without pushing it onto the stack.
+     * If the tag was written self-closing, the tokeniser's self-closing flag is acknowledged, and an
+     * unknown tag is marked self-closing so that output remembers it.
+     *
+     * @param startTag the start tag token
+     * @return the newly created element
+     */
+    Element insertEmpty(Token.StartTag startTag) {
+        Tag tag = Tag.valueOf(startTag.name());
+        Element created = new Element(tag, baseUri, startTag.attributes);
+        insertNode(created);
+
+        if (!startTag.isSelfClosing())
+            return created;
+
+        tokeniser.acknowledgeSelfClosingFlag();
+        if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output
+            tag.setSelfClosing();
+        return created;
+    }
+
+    /** Creates a comment node from the token's data and inserts it into the tree. */
+    void insert(Token.Comment commentToken) {
+        insertNode(new Comment(commentToken.getData(), baseUri));
+    }
+
+    /**
+     * Appends the token's characters to the current element: as a {@code DataNode} inside
+     * {@code script} or {@code style}, otherwise as a {@code TextNode}.
+     *
+     * @param characterToken the character token to insert
+     */
+    void insert(Token.Character characterToken) {
+        Element target = currentElement();
+        Node created;
+        if (StringUtil.in(target.tagName(), "script", "style")) {
+            // characters in script and style go in as datanodes, not text nodes
+            created = new DataNode(characterToken.getData(), baseUri);
+        } else {
+            created = new TextNode(characterToken.getData(), baseUri);
+        }
+        // doesn't use insertNode, because we don't foster these; and will always have a stack.
+        target.appendChild(created);
+    }
+
+    /**
+     * Places a node in the tree: under the document while the stack is still empty, via the foster parent
+     * when foster inserts are on, and otherwise under the current element.
+     */
+    private void insertNode(Node node) {
+        if (stack.isEmpty()) {
+            // the stack hasn't been set up yet, so nodes (doctype, comments) go into the doc
+            doc.appendChild(node);
+            return;
+        }
+        if (isFosterInserts()) {
+            insertInFosterParent(node);
+            return;
+        }
+        currentElement().appendChild(node);
+    }
+
+    /**
+     * Removes and returns the last element on the stack.
+     * <p>
+     * Development-time sanity checks fail validation when the element being popped is a {@code td} while
+     * the state is not named {@code InCell}, or when it is {@code html}.
+     *
+     * @return the element removed from the end of the stack
+     */
+    Element pop() {
+        // todo - dev, remove validation check
+        String topName = stack.peekLast().nodeName();
+        if (topName.equals("td") && !state.name().equals("InCell"))
+            Validate.isFalse(true, "pop td not in cell");
+        if (topName.equals("html"))
+            Validate.isFalse(true, "popping html!");
+        return stack.pollLast();
+    }
+
+    /** Adds the element to the end of the stack. */
+    void push(Element element) {
+        this.stack.add(element);
+    }
+
+    /** Returns the stack itself (not a copy). */
+    DescendableLinkedList<Element> getStack() {
+        return this.stack;
+    }
+
+    /** Returns true if this exact element instance is on the stack. */
+    boolean onStack(Element el) {
+        return isElementInQueue(this.stack, el);
+    }
+
+    /** Searches the queue from its end for the given element, comparing by identity. */
+    private boolean isElementInQueue(DescendableLinkedList<Element> queue, Element element) {
+        for (Iterator<Element> it = queue.descendingIterator(); it.hasNext(); ) {
+            if (it.next() == element)
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Finds the element nearest the end of the stack whose node name equals the given name.
+     *
+     * @param elName node name to look for
+     * @return the matching element, or null if none is on the stack
+     */
+    Element getFromStack(String elName) {
+        for (Iterator<Element> it = stack.descendingIterator(); it.hasNext(); ) {
+            Element candidate = it.next();
+            if (candidate.nodeName().equals(elName))
+                return candidate;
+        }
+        return null;
+    }
+
+    /**
+     * Removes the given element instance from the stack, searching from the end.
+     *
+     * @param el the element to remove (matched by identity)
+     * @return true if the element was found and removed
+     */
+    boolean removeFromStack(Element el) {
+        for (Iterator<Element> it = stack.descendingIterator(); it.hasNext(); ) {
+            if (it.next() == el) {
+                it.remove();
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Removes elements from the end of the stack up to and including the first one whose node name
+     * equals {@code elName}. If no element matches, the whole stack is emptied.
+     */
+    void popStackToClose(String elName) {
+        Iterator<Element> it = stack.descendingIterator();
+        boolean closed = false;
+        while (!closed && it.hasNext()) {
+            closed = it.next().nodeName().equals(elName);
+            it.remove(); // every visited element is removed, the match included
+        }
+    }
+
+    /**
+     * Removes elements from the end of the stack up to and including the first one whose node name is
+     * any of {@code elNames}. If no element matches, the whole stack is emptied.
+     */
+    void popStackToClose(String... elNames) {
+        Iterator<Element> it = stack.descendingIterator();
+        boolean closed = false;
+        while (!closed && it.hasNext()) {
+            closed = StringUtil.in(it.next().nodeName(), elNames);
+            it.remove(); // every visited element is removed, the match included
+        }
+    }
+
+    /**
+     * Removes elements from the end of the stack until one whose node name equals {@code elName} is
+     * reached; that element is left on the stack. If no element matches, the whole stack is emptied.
+     */
+    void popStackToBefore(String elName) {
+        for (Iterator<Element> it = stack.descendingIterator(); it.hasNext(); ) {
+            if (it.next().nodeName().equals(elName))
+                return;
+            it.remove();
+        }
+    }
+
+    /** Clears the stack back to the nearest {@code table} (or {@code html}) element. */
+    void clearStackToTableContext() {
+        clearStackToContext(new String[]{"table"});
+    }
+
+    /** Clears the stack back to the nearest {@code tbody}, {@code tfoot}, {@code thead} (or {@code html}) element. */
+    void clearStackToTableBodyContext() {
+        clearStackToContext(new String[]{"tbody", "tfoot", "thead"});
+    }
+
+    /** Clears the stack back to the nearest {@code tr} (or {@code html}) element. */
+    void clearStackToTableRowContext() {
+        clearStackToContext(new String[]{"tr"});
+    }
+
+    /**
+     * Removes elements from the end of the stack until one named in {@code nodeNames}, or named
+     * {@code html}, is reached; that element stays on the stack.
+     */
+    private void clearStackToContext(String... nodeNames) {
+        for (Iterator<Element> it = stack.descendingIterator(); it.hasNext(); ) {
+            String name = it.next().nodeName();
+            boolean isContext = StringUtil.in(name, nodeNames) || name.equals("html");
+            if (isContext)
+                return;
+            it.remove();
+        }
+    }
+
+    /**
+     * Returns the element that sits immediately before {@code el} on the stack (i.e. the next one the
+     * descending iterator yields after it).
+     * <p>
+     * NOTE(review): if {@code el} is the first entry of the stack there is nothing further to yield, and
+     * the unguarded {@code next()} call presumably throws; confirm against the iterator implementation.
+     *
+     * @param el an element expected to be on the stack (checked only by an {@code assert})
+     * @return the element preceding {@code el}, or null if {@code el} is not on the stack
+     */
+    Element aboveOnStack(Element el) {
+        assert onStack(el);
+        for (Iterator<Element> it = stack.descendingIterator(); it.hasNext(); ) {
+            if (it.next() == el)
+                return it.next();
+        }
+        return null;
+    }
+
+    /**
+     * Inserts {@code in} on the stack at the index directly following the last occurrence of
+     * {@code after}. Validation fails if {@code after} is not on the stack.
+     */
+    void insertOnStackAfter(Element after, Element in) {
+        int anchorIndex = stack.lastIndexOf(after);
+        Validate.isTrue(anchorIndex != -1);
+        stack.add(anchorIndex + 1, in);
+    }
+
+    /** Swaps {@code out} for {@code in} at the same position on the stack. */
+    void replaceOnStack(Element out, Element in) {
+        replaceInQueue(this.stack, out, in);
+    }
+
+    /**
+     * Replaces the last occurrence of {@code out} in the queue with {@code in}, keeping its index.
+     * Validation fails if {@code out} is not in the queue.
+     */
+    private void replaceInQueue(LinkedList<Element> queue, Element out, Element in) {
+        int index = queue.lastIndexOf(out);
+        Validate.isTrue(index != -1);
+        queue.remove(index);
+        queue.add(index, in);
+    }
+
+    /**
+     * Resets the insertion mode by walking the stack from its end towards its start and transitioning to
+     * the state that matches the first recognised element name.
+     * <p>
+     * When the walk reaches the first entry of the stack, {@code last} is set and, if a fragment context
+     * element exists, that context element is examined in place of the stack entry. A {@code td} or
+     * {@code th} selects {@code InCell} only when it is not the last node; as the last node it falls
+     * through to the final {@code InBody} case, as the HTML parsing algorithm specifies for fragments.
+     */
+    void resetInsertionMode() {
+        boolean last = false;
+        Iterator<Element> it = stack.descendingIterator();
+        while (it.hasNext()) {
+            Element node = it.next();
+            if (!it.hasNext()) { // reached the first entry of the stack
+                last = true;
+                // fragment case: judge by the context element. With no context element, keep the stack's
+                // own node rather than dereferencing null below.
+                if (contextElement != null)
+                    node = contextElement;
+            }
+            String name = node.nodeName();
+            if ("select".equals(name)) {
+                transition(HtmlTreeBuilderState.InSelect);
+                break; // frag
+            } else if (("td".equals(name) || "th".equals(name)) && !last) {
+                transition(HtmlTreeBuilderState.InCell);
+                break;
+            } else if ("tr".equals(name)) {
+                transition(HtmlTreeBuilderState.InRow);
+                break;
+            } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) {
+                transition(HtmlTreeBuilderState.InTableBody);
+                break;
+            } else if ("caption".equals(name)) {
+                transition(HtmlTreeBuilderState.InCaption);
+                break;
+            } else if ("colgroup".equals(name)) {
+                transition(HtmlTreeBuilderState.InColumnGroup);
+                break; // frag
+            } else if ("table".equals(name)) {
+                transition(HtmlTreeBuilderState.InTable);
+                break;
+            } else if ("head".equals(name)) {
+                transition(HtmlTreeBuilderState.InBody); // deliberately in body, not in head
+                break; // frag
+            } else if ("body".equals(name)) {
+                transition(HtmlTreeBuilderState.InBody);
+                break;
+            } else if ("frameset".equals(name)) {
+                transition(HtmlTreeBuilderState.InFrameset);
+                break; // frag
+            } else if ("html".equals(name)) {
+                transition(HtmlTreeBuilderState.BeforeHead);
+                break; // frag
+            } else if (last) {
+                transition(HtmlTreeBuilderState.InBody);
+                break; // frag
+            }
+        }
+    }
+
+ // todo: tidy up in specific scope methods
+    /** Single-target convenience form of the array-based {@code inSpecificScope}. */
+    private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) {
+        String[] targets = new String[]{targetName};
+        return inSpecificScope(targets, baseTypes, extraTypes);
+    }
+
+    /**
+     * Walks the stack from its end looking for any of the target names, giving up at the first element
+     * whose name is a scope boundary.
+     *
+     * @param targetNames node names being looked for
+     * @param baseTypes   node names that bound the scope
+     * @param extraTypes  additional boundary names; may be null
+     * @return true if a target is met before any boundary; false if a boundary is met first. Running
+     *         off the stack without meeting either is treated as a failure via {@code Validate.fail}.
+     */
+    private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) {
+        for (Iterator<Element> it = stack.descendingIterator(); it.hasNext(); ) {
+            String elName = it.next().nodeName();
+            if (StringUtil.in(elName, targetNames))
+                return true;
+            boolean isBoundary = StringUtil.in(elName, baseTypes)
+                    || (extraTypes != null && StringUtil.in(elName, extraTypes));
+            if (isBoundary)
+                return false;
+        }
+        Validate.fail("Should not be reachable");
+        return false;
+    }
+
+    /** Returns true if any of the target names is in the default scope. */
+    boolean inScope(String[] targetNames) {
+        String[] boundaries = new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"};
+        return inSpecificScope(targetNames, boundaries, null);
+    }
+
+    /** Returns true if the target name is in the default scope, with no extra boundary names. */
+    boolean inScope(String targetName) {
+        String[] noExtras = null;
+        return inScope(targetName, noExtras);
+    }
+
+    /**
+     * Returns true if the target name is in the default scope, additionally bounded by {@code extras}.
+     *
+     * @param targetName node name being looked for
+     * @param extras     extra boundary names on top of the default set; may be null
+     */
+    boolean inScope(String targetName, String[] extras) {
+        // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml
+        // todo: in svg namespace: foreignObject, desc, title
+        String[] boundaries = new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"};
+        return inSpecificScope(targetName, boundaries, extras);
+    }
+
+    /** Default scope check with {@code ol} and {@code ul} as extra boundaries. */
+    boolean inListItemScope(String targetName) {
+        String[] listBoundaries = new String[]{"ol", "ul"};
+        return inScope(targetName, listBoundaries);
+    }
+
+    /** Default scope check with {@code button} as an extra boundary. */
+    boolean inButtonScope(String targetName) {
+        String[] buttonBoundary = new String[]{"button"};
+        return inScope(targetName, buttonBoundary);
+    }
+
+    /** Scope check bounded only by {@code html} and {@code table}. */
+    boolean inTableScope(String targetName) {
+        String[] tableBoundaries = new String[]{"html", "table"};
+        return inSpecificScope(targetName, tableBoundaries, null);
+    }
+
+    /**
+     * Select-scope check: walks the stack from its end, where every element other than
+     * {@code optgroup} and {@code option} is a boundary.
+     *
+     * @param targetName node name being looked for
+     * @return true if the target is met before any boundary; false if a boundary is met first. Running
+     *         off the stack without meeting either is treated as a failure via {@code Validate.fail}.
+     */
+    boolean inSelectScope(String targetName) {
+        for (Iterator<Element> it = stack.descendingIterator(); it.hasNext(); ) {
+            String elName = it.next().nodeName();
+            if (elName.equals(targetName))
+                return true;
+            boolean transparent = StringUtil.in(elName, "optgroup", "option");
+            if (!transparent) // all elements except optgroup/option end the scope
+                return false;
+        }
+        Validate.fail("Should not be reachable");
+        return false;
+    }
+
+    /** Records the current head element. */
+    void setHeadElement(Element headElement) {
+        this.headElement = headElement;
+    }
+
+    /** Returns the current head element, or null if none has been set. */
+    Element getHeadElement() {
+        return this.headElement;
+    }
+
+    /** Returns whether upcoming inserts should be fostered. */
+    boolean isFosterInserts() {
+        return this.fosterInserts;
+    }
+
+    /** Sets whether upcoming inserts should be fostered. */
+    void setFosterInserts(boolean fosterInserts) {
+        this.fosterInserts = fosterInserts;
+    }
+
+    /** Returns the current form element, or null if none has been set. */
+    Element getFormElement() {
+        return this.formElement;
+    }
+
+    /** Records the current form element. */
+    void setFormElement(Element formElement) {
+        this.formElement = formElement;
+    }
+
+    /** Replaces the pending table characters with a fresh, empty list. */
+    void newPendingTableCharacters() {
+        this.pendingTableCharacters = new ArrayList<Token.Character>();
+    }
+
+    /** Returns the list of pending table characters itself (not a copy). */
+    List<Token.Character> getPendingTableCharacters() {
+        return this.pendingTableCharacters;
+    }
+
+    /** Replaces the list of pending table characters with the given list. */
+    void setPendingTableCharacters(List<Token.Character> pendingTableCharacters) {
+        this.pendingTableCharacters = pendingTableCharacters;
+    }
+
+    /**
+     11.2.5.2 Closing elements that have implied end tags<p>
+     When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a
+     dt element, an li element, an option element, an optgroup element, a p element, an rp element, or an rt element,
+     the UA must pop the current node off the stack of open elements.
+
+     @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the
+     process, then the UA must perform the above steps as if that element was not in the above list. May be null, in
+     which case no element is excluded.
+     */
+    void generateImpliedEndTags(String excludeTag) {
+        // A null excludeTag must not block the loop: with no exclusion, every listed element is popped.
+        while ((excludeTag == null || !currentElement().nodeName().equals(excludeTag)) &&
+                StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
+            pop();
+    }
+
+    /** Generates implied end tags, passing null as the tag to exclude. */
+    void generateImpliedEndTags() {
+        String noExclusion = null;
+        generateImpliedEndTags(noExclusion);
+    }
+
+    /**
+     * Returns true if the element's node name is in the fixed list of "special" element names below.
+     *
+     * @param el the element to test
+     */
+    boolean isSpecial(Element el) {
+        // todo: mathml's mi, mo, mn
+        // todo: svg's foreignObject, desc, title
+        return StringUtil.in(el.nodeName(),
+                "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound",
+                "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd",
+                "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form",
+                "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
+                "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav",
+                "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
+                "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
+                "title", "tr", "ul", "wbr", "xmp");
+    }
+
+ // active formatting elements
+    /**
+     * Adds an element to the end of the active formatting elements list.
+     * <p>
+     * Before adding, the list is scanned from its end back to the nearest marker (a null entry). If three
+     * entries equivalent to {@code in} are found, the third one reached (the earliest of them) is removed.
+     *
+     * @param in the formatting element to add
+     */
+    void pushActiveFormattingElements(Element in) {
+        int matches = 0;
+        for (Iterator<Element> iter = formattingElements.descendingIterator(); iter.hasNext(); ) {
+            Element candidate = iter.next();
+            if (candidate == null) // marker
+                break;
+
+            if (isSameFormattingElement(in, candidate) && ++matches == 3) {
+                iter.remove();
+                break;
+            }
+        }
+        formattingElements.add(in);
+    }
+
+    /**
+     * Two formatting elements count as the same when their node names and attributes are equal.
+     * Namespaces are not compared yet (todo: namespaces). {@code Element.equals} is not used because it
+     * only checks the tag, and might in future check children.
+     */
+    private boolean isSameFormattingElement(Element a, Element b) {
+        if (!a.nodeName().equals(b.nodeName()))
+            return false;
+        // todo: also require a.namespace().equals(b.namespace())
+        return a.attributes().equals(b.attributes());
+    }
+
+ /**
+ * Re-creates the trailing entries of the active formatting elements list that are not on the stack.
+ * <p>
+ * Returns at once when the list is empty, or when its last entry is a marker (null) or is on the stack.
+ * Otherwise it rewinds towards the start of the list until it hits the first entry, a marker, or an entry
+ * that is on the stack. From there to the end of the list, each entry gets a new element inserted with the
+ * same node name and attributes, and the list entry is replaced by that new element.
+ */
+ void reconstructFormattingElements() {
+ int size = formattingElements.size();
+ if (size == 0 || formattingElements.getLast() == null || onStack(formattingElements.getLast()))
+ return;
+
+ Element entry = formattingElements.getLast();
+ int pos = size - 1;
+ boolean skip = false;
+ // rewind phase: move pos back to where re-creation has to start
+ while (true) {
+ if (pos == 0) { // step 4. if none before, skip to 8
+ skip = true;
+ break;
+ }
+ entry = formattingElements.get(--pos); // step 5. one earlier than entry
+ if (entry == null || onStack(entry)) // step 6 - stop at a marker or an entry that is on the stack
+ break; // jump to 8, else continue back to 4
+ }
+ // advance phase: re-create every entry from pos through the end of the list
+ while(true) {
+ if (!skip) // step 7: one later than entry
+ entry = formattingElements.get(++pos);
+ Validate.notNull(entry); // should not occur, as we break at last element
+
+ // 8. create new element from element, 9 insert into current node, onto stack
+ skip = false; // can only skip increment from 4.
+ Element newEl = insert(entry.nodeName()); // todo: avoid fostering here?
+ // newEl.namespace(entry.namespace()); // todo: namespaces
+ newEl.attributes().addAll(entry.attributes());
+
+ // 10. replace entry with new entry (insert the new one at pos, then drop the old one now at pos + 1)
+ formattingElements.add(pos, newEl);
+ formattingElements.remove(pos + 1);
+
+ // 11
+ if (pos == size-1) // if not last entry in list, jump to 7
+ break;
+ }
+ }
+
+    /**
+     * Removes entries from the end of the active formatting elements list up to and including the
+     * nearest marker (a null entry), or until the list is empty.
+     */
+    void clearFormattingElementsToLastMarker() {
+        while (!formattingElements.isEmpty()) {
+            Element removed = formattingElements.removeLast();
+            if (removed == null) // marker reached and already removed
+                break;
+        }
+    }
+
+    /**
+     * Removes the given element instance from the active formatting elements list, searching from the
+     * end; only the first match found is removed.
+     */
+    void removeFromActiveFormattingElements(Element el) {
+        for (Iterator<Element> it = formattingElements.descendingIterator(); it.hasNext(); ) {
+            if (it.next() == el) {
+                it.remove();
+                return;
+            }
+        }
+    }
+
+    /** Returns true if this exact element instance is in the active formatting elements list. */
+    boolean isInActiveFormattingElements(Element el) {
+        return isElementInQueue(this.formattingElements, el);
+    }
+
+    /**
+     * Finds the active formatting element nearest the end of the list with the given node name, looking
+     * no further back than the nearest marker (a null entry).
+     *
+     * @param nodeName node name to look for
+     * @return the matching element, or null if a marker or the start of the list is reached first
+     */
+    Element getActiveFormattingElement(String nodeName) {
+        for (Iterator<Element> it = formattingElements.descendingIterator(); it.hasNext(); ) {
+            Element candidate = it.next();
+            if (candidate == null) // scope marker
+                return null;
+            if (candidate.nodeName().equals(nodeName))
+                return candidate;
+        }
+        return null;
+    }
+
+    /** Swaps {@code out} for {@code in} at the same position in the active formatting elements list. */
+    void replaceActiveFormattingElement(Element out, Element in) {
+        replaceInQueue(this.formattingElements, out, in);
+    }
+
+    /** Appends a marker (a null entry) to the active formatting elements list. */
+    void insertMarkerToFormattingElements() {
+        Element marker = null;
+        formattingElements.add(marker);
+    }
+
+    /**
+     * Inserts a node at the foster-parent position.
+     * <ul>
+     * <li>If a {@code table} is on the stack and has a parent, the node is placed directly before that table.</li>
+     * <li>If the table has no parent, the node is appended to the element above the table on the stack.</li>
+     * <li>If no table is on the stack (fragment case), the node is appended to the first element of the stack.</li>
+     * </ul>
+     *
+     * @param in the node to insert
+     */
+    void insertInFosterParent(Node in) {
+        Element lastTable = getFromStack("table");
+
+        if (lastTable != null && lastTable.parent() != null) {
+            lastTable.before(in);
+            return;
+        }
+
+        Element fosterParent;
+        if (lastTable != null)
+            fosterParent = aboveOnStack(lastTable);
+        else // no table == frag
+            fosterParent = stack.get(0);
+        fosterParent.appendChild(in);
+    }
+
+ @Override
+ public String toString() {
+ return "TreeBuilder{" +
+ "currentToken=" + currentToken +
+ ", state=" + state +
+ ", currentElement=" + currentElement() +
+ '}';
+ }
+}