diff options
Diffstat (limited to 'src/org/jsoup/parser')
-rw-r--r-- | src/org/jsoup/parser/CharacterReader.java | 230 | ||||
-rw-r--r-- | src/org/jsoup/parser/HtmlTreeBuilder.java | 672 | ||||
-rw-r--r-- | src/org/jsoup/parser/HtmlTreeBuilderState.java | 1482 | ||||
-rw-r--r-- | src/org/jsoup/parser/ParseError.java | 40 | ||||
-rw-r--r-- | src/org/jsoup/parser/ParseErrorList.java | 34 | ||||
-rw-r--r-- | src/org/jsoup/parser/Parser.java | 157 | ||||
-rw-r--r-- | src/org/jsoup/parser/Tag.java | 262 | ||||
-rw-r--r-- | src/org/jsoup/parser/Token.java | 252 | ||||
-rw-r--r-- | src/org/jsoup/parser/TokenQueue.java | 393 | ||||
-rw-r--r-- | src/org/jsoup/parser/Tokeniser.java | 230 | ||||
-rw-r--r-- | src/org/jsoup/parser/TokeniserState.java | 1778 | ||||
-rw-r--r-- | src/org/jsoup/parser/TreeBuilder.java | 60 | ||||
-rw-r--r-- | src/org/jsoup/parser/XmlTreeBuilder.java | 111 | ||||
-rw-r--r-- | src/org/jsoup/parser/package-info.java | 4 |
14 files changed, 0 insertions, 5705 deletions
diff --git a/src/org/jsoup/parser/CharacterReader.java b/src/org/jsoup/parser/CharacterReader.java deleted file mode 100644 index b549a571a0..0000000000 --- a/src/org/jsoup/parser/CharacterReader.java +++ /dev/null @@ -1,230 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; - -/** - CharacterReader consumes tokens off a string. To replace the old TokenQueue. - */ -class CharacterReader { - static final char EOF = (char) -1; - - private final String input; - private final int length; - private int pos = 0; - private int mark = 0; - - CharacterReader(String input) { - Validate.notNull(input); - input = input.replaceAll("\r\n?", "\n"); // normalise carriage returns to newlines - - this.input = input; - this.length = input.length(); - } - - int pos() { - return pos; - } - - boolean isEmpty() { - return pos >= length; - } - - char current() { - return isEmpty() ? EOF : input.charAt(pos); - } - - char consume() { - char val = isEmpty() ? EOF : input.charAt(pos); - pos++; - return val; - } - - void unconsume() { - pos--; - } - - void advance() { - pos++; - } - - void mark() { - mark = pos; - } - - void rewindToMark() { - pos = mark; - } - - String consumeAsString() { - return input.substring(pos, pos++); - } - - String consumeTo(char c) { - int offset = input.indexOf(c, pos); - if (offset != -1) { - String consumed = input.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return consumeToEnd(); - } - } - - String consumeTo(String seq) { - int offset = input.indexOf(seq, pos); - if (offset != -1) { - String consumed = input.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return consumeToEnd(); - } - } - - String consumeToAny(char... seq) { - int start = pos; - - OUTER: while (!isEmpty()) { - char c = input.charAt(pos); - for (char seek : seq) { - if (seek == c) - break OUTER; - } - pos++; - } - - return pos > start ? input.substring(start, pos) : ""; - } - - String consumeToEnd() { - String data = input.substring(pos, input.length()); - pos = input.length(); - return data; - } - - String consumeLetterSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) - pos++; - else - break; - } - - return input.substring(start, pos); - } - - String consumeLetterThenDigitSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) - pos++; - else - break; - } - while (!isEmpty()) { - char c = input.charAt(pos); - if (c >= '0' && c <= '9') - pos++; - else - break; - } - - return input.substring(start, pos); - } - - String consumeHexSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) - pos++; - else - break; - } - return input.substring(start, pos); - } - - String consumeDigitSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if (c >= '0' && c <= '9') - pos++; - else - break; - } - return input.substring(start, pos); - } - - boolean matches(char c) { - return !isEmpty() && input.charAt(pos) == c; - - } - - boolean matches(String seq) { - return input.startsWith(seq, pos); - } - - boolean matchesIgnoreCase(String seq) { - return input.regionMatches(true, pos, seq, 0, seq.length()); - } - - boolean matchesAny(char... seq) { - if (isEmpty()) - return false; - - char c = input.charAt(pos); - for (char seek : seq) { - if (seek == c) - return true; - } - return false; - } - - boolean matchesLetter() { - if (isEmpty()) - return false; - char c = input.charAt(pos); - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - } - - boolean matchesDigit() { - if (isEmpty()) - return false; - char c = input.charAt(pos); - return (c >= '0' && c <= '9'); - } - - boolean matchConsume(String seq) { - if (matches(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - boolean matchConsumeIgnoreCase(String seq) { - if (matchesIgnoreCase(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - boolean containsIgnoreCase(String seq) { - // used to check presence of </title>, </style>. only finds consistent case. - String loScan = seq.toLowerCase(); - String hiScan = seq.toUpperCase(); - return (input.indexOf(loScan, pos) > -1) || (input.indexOf(hiScan, pos) > -1); - } - - @Override - public String toString() { - return input.substring(pos); - } -} diff --git a/src/org/jsoup/parser/HtmlTreeBuilder.java b/src/org/jsoup/parser/HtmlTreeBuilder.java deleted file mode 100644 index 457a4c3249..0000000000 --- a/src/org/jsoup/parser/HtmlTreeBuilder.java +++ /dev/null @@ -1,672 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.*; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -/** - * HTML Tree Builder; creates a DOM from Tokens. - */ -class HtmlTreeBuilder extends TreeBuilder { - - private HtmlTreeBuilderState state; // the current state - private HtmlTreeBuilderState originalState; // original / marked state - - private boolean baseUriSetFromDoc = false; - private Element headElement; // the current head element - private Element formElement; // the current form element - private Element contextElement; // fragment parse context -- could be null even if fragment parsing - private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active (open) formatting elements - private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars in table to be shifted out - - private boolean framesetOk = true; // if ok to go into frameset - private boolean fosterInserts = false; // if next inserts should be fostered - private boolean fragmentParsing = false; // if parsing a fragment of html - - HtmlTreeBuilder() {} - - @Override - Document parse(String input, String baseUri, ParseErrorList errors) { - state = HtmlTreeBuilderState.Initial; - return super.parse(input, baseUri, errors); - } - - List<Node> parseFragment(String inputFragment, Element context, String baseUri, ParseErrorList errors) { - // context may be null - state = HtmlTreeBuilderState.Initial; - initialiseParse(inputFragment, baseUri, errors); - contextElement = context; - fragmentParsing = true; - Element root = null; - - if (context != null) { - if (context.ownerDocument() != null) // quirks setup: - doc.quirksMode(context.ownerDocument().quirksMode()); - - // initialise the tokeniser state: - String contextTag = context.tagName(); - if (StringUtil.in(contextTag, "title", "textarea")) - tokeniser.transition(TokeniserState.Rcdata); - else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp")) - tokeniser.transition(TokeniserState.Rawtext); - else if (contextTag.equals("script")) - tokeniser.transition(TokeniserState.ScriptData); - else if (contextTag.equals(("noscript"))) - tokeniser.transition(TokeniserState.Data); // if scripting enabled, rawtext - else if (contextTag.equals("plaintext")) - tokeniser.transition(TokeniserState.Data); - else - tokeniser.transition(TokeniserState.Data); // default - - root = new Element(Tag.valueOf("html"), baseUri); - doc.appendChild(root); - stack.push(root); - resetInsertionMode(); - // todo: setup form element to nearest form on context (up ancestor chain) - } - - runParser(); - if (context != null) - return root.childNodes(); - else - return doc.childNodes(); - } - - @Override - protected boolean process(Token token) { - currentToken = token; - return this.state.process(token, this); - } - - boolean process(Token token, HtmlTreeBuilderState state) { - currentToken = token; - return state.process(token, this); - } - - void transition(HtmlTreeBuilderState state) { - this.state = state; - } - - HtmlTreeBuilderState state() { - return state; - } - - void markInsertionMode() { - originalState = state; - } - - HtmlTreeBuilderState originalState() { - return originalState; - } - - void framesetOk(boolean framesetOk) { - this.framesetOk = framesetOk; - } - - boolean framesetOk() { - return framesetOk; - } - - Document getDocument() { - return doc; - } - - String getBaseUri() { - return baseUri; - } - - void maybeSetBaseUri(Element base) { - if (baseUriSetFromDoc) // only listen to the first <base href> in parse - return; - - String href = base.absUrl("href"); - if (href.length() != 0) { // ignore <base target> etc - baseUri = href; - baseUriSetFromDoc = true; - doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants - } - } - - boolean isFragmentParsing() { - return fragmentParsing; - } - - void error(HtmlTreeBuilderState state) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Unexpected token [%s] when in state [%s]", currentToken.tokenType(), state)); - } - - Element insert(Token.StartTag startTag) { - // handle empty unknown tags - // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate fake end tag. - if (startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name())) { - Element el = insertEmpty(startTag); - process(new Token.EndTag(el.tagName())); // ensure we get out of whatever state we are in - return el; - } - - Element el = new Element(Tag.valueOf(startTag.name()), baseUri, startTag.attributes); - insert(el); - return el; - } - - Element insert(String startTagName) { - Element el = new Element(Tag.valueOf(startTagName), baseUri); - insert(el); - return el; - } - - void insert(Element el) { - insertNode(el); - stack.add(el); - } - - Element insertEmpty(Token.StartTag startTag) { - Tag tag = Tag.valueOf(startTag.name()); - Element el = new Element(tag, baseUri, startTag.attributes); - insertNode(el); - if (startTag.isSelfClosing()) { - tokeniser.acknowledgeSelfClosingFlag(); - if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output - tag.setSelfClosing(); - } - return el; - } - - void insert(Token.Comment commentToken) { - Comment comment = new Comment(commentToken.getData(), baseUri); - insertNode(comment); - } - - void insert(Token.Character characterToken) { - Node node; - // characters in script and style go in as datanodes, not text nodes - if (StringUtil.in(currentElement().tagName(), "script", "style")) - node = new DataNode(characterToken.getData(), baseUri); - else - node = new TextNode(characterToken.getData(), baseUri); - currentElement().appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. - } - - private void insertNode(Node node) { - // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc - if (stack.size() == 0) - doc.appendChild(node); - else if (isFosterInserts()) - insertInFosterParent(node); - else - currentElement().appendChild(node); - } - - Element pop() { - // todo - dev, remove validation check - if (stack.peekLast().nodeName().equals("td") && !state.name().equals("InCell")) - Validate.isFalse(true, "pop td not in cell"); - if (stack.peekLast().nodeName().equals("html")) - Validate.isFalse(true, "popping html!"); - return stack.pollLast(); - } - - void push(Element element) { - stack.add(element); - } - - DescendableLinkedList<Element> getStack() { - return stack; - } - - boolean onStack(Element el) { - return isElementInQueue(stack, el); - } - - private boolean isElementInQueue(DescendableLinkedList<Element> queue, Element element) { - Iterator<Element> it = queue.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == element) { - return true; - } - } - return false; - } - - Element getFromStack(String elName) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - return next; - } - } - return null; - } - - boolean removeFromStack(Element el) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - it.remove(); - return true; - } - } - return false; - } - - void popStackToClose(String elName) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - it.remove(); - break; - } else { - it.remove(); - } - } - } - - void popStackToClose(String... elNames) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (StringUtil.in(next.nodeName(), elNames)) { - it.remove(); - break; - } else { - it.remove(); - } - } - } - - void popStackToBefore(String elName) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - break; - } else { - it.remove(); - } - } - } - - void clearStackToTableContext() { - clearStackToContext("table"); - } - - void clearStackToTableBodyContext() { - clearStackToContext("tbody", "tfoot", "thead"); - } - - void clearStackToTableRowContext() { - clearStackToContext("tr"); - } - - private void clearStackToContext(String... nodeNames) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (StringUtil.in(next.nodeName(), nodeNames) || next.nodeName().equals("html")) - break; - else - it.remove(); - } - } - - Element aboveOnStack(Element el) { - assert onStack(el); - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - return it.next(); - } - } - return null; - } - - void insertOnStackAfter(Element after, Element in) { - int i = stack.lastIndexOf(after); - Validate.isTrue(i != -1); - stack.add(i+1, in); - } - - void replaceOnStack(Element out, Element in) { - replaceInQueue(stack, out, in); - } - - private void replaceInQueue(LinkedList<Element> queue, Element out, Element in) { - int i = queue.lastIndexOf(out); - Validate.isTrue(i != -1); - queue.remove(i); - queue.add(i, in); - } - - void resetInsertionMode() { - boolean last = false; - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element node = it.next(); - if (!it.hasNext()) { - last = true; - node = contextElement; - } - String name = node.nodeName(); - if ("select".equals(name)) { - transition(HtmlTreeBuilderState.InSelect); - break; // frag - } else if (("td".equals(name) || "td".equals(name) && !last)) { - transition(HtmlTreeBuilderState.InCell); - break; - } else if ("tr".equals(name)) { - transition(HtmlTreeBuilderState.InRow); - break; - } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) { - transition(HtmlTreeBuilderState.InTableBody); - break; - } else if ("caption".equals(name)) { - transition(HtmlTreeBuilderState.InCaption); - break; - } else if ("colgroup".equals(name)) { - transition(HtmlTreeBuilderState.InColumnGroup); - break; // frag - } else if ("table".equals(name)) { - transition(HtmlTreeBuilderState.InTable); - break; - } else if ("head".equals(name)) { - transition(HtmlTreeBuilderState.InBody); - break; // frag - } else if ("body".equals(name)) { - transition(HtmlTreeBuilderState.InBody); - break; - } else if ("frameset".equals(name)) { - transition(HtmlTreeBuilderState.InFrameset); - break; // frag - } else if ("html".equals(name)) { - transition(HtmlTreeBuilderState.BeforeHead); - break; // frag - } else if (last) { - transition(HtmlTreeBuilderState.InBody); - break; // frag - } - } - } - - // todo: tidy up in specific scope methods - private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { - return inSpecificScope(new String[]{targetName}, baseTypes, extraTypes); - } - - private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element el = it.next(); - String elName = el.nodeName(); - if (StringUtil.in(elName, targetNames)) - return true; - if (StringUtil.in(elName, baseTypes)) - return false; - if (extraTypes != null && StringUtil.in(elName, extraTypes)) - return false; - } - Validate.fail("Should not be reachable"); - return false; - } - - boolean inScope(String[] targetNames) { - return inSpecificScope(targetNames, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}, null); - } - - boolean inScope(String targetName) { - return inScope(targetName, null); - } - - boolean inScope(String targetName, String[] extras) { - return inSpecificScope(targetName, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}, extras); - // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml - // todo: in svg namespace: forignOjbect, desc, title - } - - boolean inListItemScope(String targetName) { - return inScope(targetName, new String[]{"ol", "ul"}); - } - - boolean inButtonScope(String targetName) { - return inScope(targetName, new String[]{"button"}); - } - - boolean inTableScope(String targetName) { - return inSpecificScope(targetName, new String[]{"html", "table"}, null); - } - - boolean inSelectScope(String targetName) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element el = it.next(); - String elName = el.nodeName(); - if (elName.equals(targetName)) - return true; - if (!StringUtil.in(elName, "optgroup", "option")) // all elements except - return false; - } - Validate.fail("Should not be reachable"); - return false; - } - - void setHeadElement(Element headElement) { - this.headElement = headElement; - } - - Element getHeadElement() { - return headElement; - } - - boolean isFosterInserts() { - return fosterInserts; - } - - void setFosterInserts(boolean fosterInserts) { - this.fosterInserts = fosterInserts; - } - - Element getFormElement() { - return formElement; - } - - void setFormElement(Element formElement) { - this.formElement = formElement; - } - - void newPendingTableCharacters() { - pendingTableCharacters = new ArrayList<Token.Character>(); - } - - List<Token.Character> getPendingTableCharacters() { - return pendingTableCharacters; - } - - void setPendingTableCharacters(List<Token.Character> pendingTableCharacters) { - this.pendingTableCharacters = pendingTableCharacters; - } - - /** - 11.2.5.2 Closing elements that have implied end tags<p/> - When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a - dt element, an li element, an option element, an optgroup element, a p element, an rp element, or an rt element, - the UA must pop the current node off the stack of open elements. - - @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the - process, then the UA must perform the above steps as if that element was not in the above list. - */ - void generateImpliedEndTags(String excludeTag) { - while ((excludeTag != null && !currentElement().nodeName().equals(excludeTag)) && - StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) - pop(); - } - - void generateImpliedEndTags() { - generateImpliedEndTags(null); - } - - boolean isSpecial(Element el) { - // todo: mathml's mi, mo, mn - // todo: svg's foreigObject, desc, title - String name = el.nodeName(); - return StringUtil.in(name, "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", - "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd", - "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", - "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", - "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav", - "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script", - "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", - "title", "tr", "ul", "wbr", "xmp"); - } - - // active formatting elements - void pushActiveFormattingElements(Element in) { - int numSeen = 0; - Iterator<Element> iter = formattingElements.descendingIterator(); - while (iter.hasNext()) { - Element el = iter.next(); - if (el == null) // marker - break; - - if (isSameFormattingElement(in, el)) - numSeen++; - - if (numSeen == 3) { - iter.remove(); - break; - } - } - formattingElements.add(in); - } - - private boolean isSameFormattingElement(Element a, Element b) { - // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children - return a.nodeName().equals(b.nodeName()) && - // a.namespace().equals(b.namespace()) && - a.attributes().equals(b.attributes()); - // todo: namespaces - } - - void reconstructFormattingElements() { - int size = formattingElements.size(); - if (size == 0 || formattingElements.getLast() == null || onStack(formattingElements.getLast())) - return; - - Element entry = formattingElements.getLast(); - int pos = size - 1; - boolean skip = false; - while (true) { - if (pos == 0) { // step 4. if none before, skip to 8 - skip = true; - break; - } - entry = formattingElements.get(--pos); // step 5. one earlier than entry - if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack - break; // jump to 8, else continue back to 4 - } - while(true) { - if (!skip) // step 7: on later than entry - entry = formattingElements.get(++pos); - Validate.notNull(entry); // should not occur, as we break at last element - - // 8. create new element from element, 9 insert into current node, onto stack - skip = false; // can only skip increment from 4. - Element newEl = insert(entry.nodeName()); // todo: avoid fostering here? - // newEl.namespace(entry.namespace()); // todo: namespaces - newEl.attributes().addAll(entry.attributes()); - - // 10. replace entry with new entry - formattingElements.add(pos, newEl); - formattingElements.remove(pos + 1); - - // 11 - if (pos == size-1) // if not last entry in list, jump to 7 - break; - } - } - - void clearFormattingElementsToLastMarker() { - while (!formattingElements.isEmpty()) { - Element el = formattingElements.peekLast(); - formattingElements.removeLast(); - if (el == null) - break; - } - } - - void removeFromActiveFormattingElements(Element el) { - Iterator<Element> it = formattingElements.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - it.remove(); - break; - } - } - } - - boolean isInActiveFormattingElements(Element el) { - return isElementInQueue(formattingElements, el); - } - - Element getActiveFormattingElement(String nodeName) { - Iterator<Element> it = formattingElements.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == null) // scope marker - break; - else if (next.nodeName().equals(nodeName)) - return next; - } - return null; - } - - void replaceActiveFormattingElement(Element out, Element in) { - replaceInQueue(formattingElements, out, in); - } - - void insertMarkerToFormattingElements() { - formattingElements.add(null); - } - - void insertInFosterParent(Node in) { - Element fosterParent = null; - Element lastTable = getFromStack("table"); - boolean isLastTableParent = false; - if (lastTable != null) { - if (lastTable.parent() != null) { - fosterParent = lastTable.parent(); - isLastTableParent = true; - } else - fosterParent = aboveOnStack(lastTable); - } else { // no table == frag - fosterParent = stack.get(0); - } - - if (isLastTableParent) { - Validate.notNull(lastTable); // last table cannot be null by this point. - lastTable.before(in); - } - else - fosterParent.appendChild(in); - } - - @Override - public String toString() { - return "TreeBuilder{" + - "currentToken=" + currentToken + - ", state=" + state + - ", currentElement=" + currentElement() + - '}'; - } -} diff --git a/src/org/jsoup/parser/HtmlTreeBuilderState.java b/src/org/jsoup/parser/HtmlTreeBuilderState.java deleted file mode 100644 index ceab9faa5a..0000000000 --- a/src/org/jsoup/parser/HtmlTreeBuilderState.java +++ /dev/null @@ -1,1482 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.*; - -import java.util.Iterator; -import java.util.LinkedList; - -/** - * The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states. - */ -enum HtmlTreeBuilderState { - Initial { - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return true; // ignore whitespace - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - // todo: parse error check on expected doctypes - // todo: quirk state check on doctype ids - Token.Doctype d = t.asDoctype(); - DocumentType doctype = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri()); - tb.getDocument().appendChild(doctype); - if (d.isForceQuirks()) - tb.getDocument().quirksMode(Document.QuirksMode.quirks); - tb.transition(BeforeHtml); - } else { - // todo: check not iframe srcdoc - tb.transition(BeforeHtml); - return tb.process(t); // re-process token - } - return true; - } - }, - BeforeHtml { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (isWhitespace(t)) { - return true; // ignore whitespace - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - tb.insert(t.asStartTag()); - tb.transition(BeforeHead); - } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { - return anythingElse(t, tb); - } else if (t.isEndTag()) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.insert("html"); - tb.transition(BeforeHead); - return tb.process(t); - } - }, - BeforeHead { - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return true; - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return InBody.process(t, tb); // does not transition - } else if (t.isStartTag() && t.asStartTag().name().equals("head")) { - Element head = tb.insert(t.asStartTag()); - tb.setHeadElement(head); - tb.transition(InHead); - } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { - tb.process(new Token.StartTag("head")); - return tb.process(t); - } else if (t.isEndTag()) { - tb.error(this); - return false; - } else { - tb.process(new Token.StartTag("head")); - return tb.process(t); - } - return true; - } - }, - InHead { - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - return true; - } - switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return InBody.process(t, tb); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) { - Element el = tb.insertEmpty(start); - // jsoup special: update base the frist time it is seen - if (name.equals("base") && el.hasAttr("href")) - tb.maybeSetBaseUri(el); - } else if (name.equals("meta")) { - Element meta = tb.insertEmpty(start); - // todo: charset switches - } else if (name.equals("title")) { - handleRcData(start, tb); - } else if (StringUtil.in(name, "noframes", "style")) { - handleRawtext(start, tb); - } else if (name.equals("noscript")) { - // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript) - tb.insert(start); - tb.transition(InHeadNoscript); - } else if (name.equals("script")) { - // skips some script rules as won't execute them - tb.insert(start); - tb.tokeniser.transition(TokeniserState.ScriptData); - tb.markInsertionMode(); - tb.transition(Text); - } else if (name.equals("head")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("head")) { - tb.pop(); - tb.transition(AfterHead); - } else if (StringUtil.in(name, "body", "html", "br")) { - return anythingElse(t, tb); - } else { - tb.error(this); - return false; - } - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, TreeBuilder tb) { - tb.process(new Token.EndTag("head")); - return tb.process(t); - } - }, - InHeadNoscript { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isDoctype()) { - tb.error(this); - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("noscript")) { - tb.pop(); - tb.transition(InHead); - } else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "basefont", "bgsound", "link", "meta", "noframes", "style"))) { - return tb.process(t, InHead); - } else if (t.isEndTag() && t.asEndTag().name().equals("br")) { - return anythingElse(t, tb); - } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), "head", "noscript")) || t.isEndTag()) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - tb.process(new Token.EndTag("noscript")); - return tb.process(t); - } - }, - AfterHead { - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - } else if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - return tb.process(t, InBody); - } else if (name.equals("body")) { - tb.insert(startTag); - tb.framesetOk(false); - tb.transition(InBody); - } else if (name.equals("frameset")) { - tb.insert(startTag); - tb.transition(InFrameset); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) { - tb.error(this); - Element head = tb.getHeadElement(); - tb.push(head); - tb.process(t, InHead); - tb.removeFromStack(head); - } else if (name.equals("head")) { - tb.error(this); - return false; - } else { - anythingElse(t, tb); - } - } else if (t.isEndTag()) { - if (StringUtil.in(t.asEndTag().name(), "body", "html")) { - anythingElse(t, tb); - } else { - tb.error(this); - return false; - } - } else { - anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.process(new Token.StartTag("body")); - tb.framesetOk(true); - return tb.process(t); - } - }, - InBody { - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: { - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - // todo confirm that check - tb.error(this); - return false; - } else if (isWhitespace(c)) { - tb.reconstructFormattingElements(); - tb.insert(c); - } else { - tb.reconstructFormattingElements(); - tb.insert(c); - tb.framesetOk(false); - } - break; - } - case Comment: { - tb.insert(t.asComment()); - break; - } - case Doctype: { - tb.error(this); - return false; - } - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - tb.error(this); - // merge attributes onto real html - Element html = tb.getStack().getFirst(); - for (Attribute attribute : startTag.getAttributes()) { - if (!html.hasAttr(attribute.getKey())) - html.attributes().put(attribute); - } - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title")) { - return tb.process(t, InHead); - } else if (name.equals("body")) { - tb.error(this); - LinkedList<Element> stack = tb.getStack(); - if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { - // only in fragment case - return false; // ignore - } else { - tb.framesetOk(false); - Element body = stack.get(1); - for (Attribute attribute : startTag.getAttributes()) { - if (!body.hasAttr(attribute.getKey())) - body.attributes().put(attribute); - } - } - } else if (name.equals("frameset")) { - tb.error(this); - LinkedList<Element> stack = tb.getStack(); - if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { - // only in fragment case - return false; // ignore - } else if (!tb.framesetOk()) { - return false; // ignore frameset - } else { - Element second = stack.get(1); - if (second.parent() != null) - second.remove(); - // pop up to html element - while (stack.size() > 1) - stack.removeLast(); - tb.insert(startTag); - tb.transition(InFrameset); - } - } else if (StringUtil.in(name, - "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", - "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", - "p", "section", "summary", "ul")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - if (StringUtil.in(tb.currentElement().nodeName(), "h1", "h2", "h3", "h4", "h5", "h6")) { - tb.error(this); - tb.pop(); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "pre", "listing")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - // todo: ignore LF if next token - tb.framesetOk(false); - } else if (name.equals("form")) { - if (tb.getFormElement() != null) { - tb.error(this); - return false; - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - Element form = tb.insert(startTag); - tb.setFormElement(form); - } else if (name.equals("li")) { - tb.framesetOk(false); - LinkedList<Element> stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (el.nodeName().equals("li")) { - tb.process(new Token.EndTag("li")); - break; - } - if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) - break; - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "dd", "dt")) { - tb.framesetOk(false); - LinkedList<Element> stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (StringUtil.in(el.nodeName(), "dd", "dt")) { - tb.process(new Token.EndTag(el.nodeName())); - break; - } - if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) - break; - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (name.equals("plaintext")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once in, never gets out - } else if (name.equals("button")) { - if (tb.inButtonScope("button")) { - // close and reprocess - tb.error(this); - tb.process(new Token.EndTag("button")); - tb.process(startTag); - } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); - } - } else if (name.equals("a")) { - if (tb.getActiveFormattingElement("a") != null) { - tb.error(this); - tb.process(new Token.EndTag("a")); - - // still on stack? - Element remainingA = tb.getFromStack("a"); - if (remainingA != null) { - tb.removeFromActiveFormattingElements(remainingA); - tb.removeFromStack(remainingA); - } - } - tb.reconstructFormattingElements(); - Element a = tb.insert(startTag); - tb.pushActiveFormattingElements(a); - } else if (StringUtil.in(name, - "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u")) { - tb.reconstructFormattingElements(); - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (name.equals("nobr")) { - tb.reconstructFormattingElements(); - if (tb.inScope("nobr")) { - tb.error(this); - tb.process(new Token.EndTag("nobr")); - tb.reconstructFormattingElements(); - } - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (StringUtil.in(name, "applet", "marquee", "object")) { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.insertMarkerToFormattingElements(); - tb.framesetOk(false); - } else if (name.equals("table")) { - if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - tb.framesetOk(false); - tb.transition(InTable); - } else if (StringUtil.in(name, "area", "br", "embed", "img", "keygen", "wbr")) { - tb.reconstructFormattingElements(); - tb.insertEmpty(startTag); - tb.framesetOk(false); - } else if (name.equals("input")) { - tb.reconstructFormattingElements(); - Element el = tb.insertEmpty(startTag); - if (!el.attr("type").equalsIgnoreCase("hidden")) - tb.framesetOk(false); - } else if (StringUtil.in(name, "param", "source", "track")) { - tb.insertEmpty(startTag); - } else if (name.equals("hr")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insertEmpty(startTag); - tb.framesetOk(false); - } else if (name.equals("image")) { - // we're not supposed to ask. - startTag.name("img"); - return tb.process(startTag); - } else if (name.equals("isindex")) { - // how much do we care about the early 90s? - tb.error(this); - if (tb.getFormElement() != null) - return false; - - tb.tokeniser.acknowledgeSelfClosingFlag(); - tb.process(new Token.StartTag("form")); - if (startTag.attributes.hasKey("action")) { - Element form = tb.getFormElement(); - form.attr("action", startTag.attributes.get("action")); - } - tb.process(new Token.StartTag("hr")); - tb.process(new Token.StartTag("label")); - // hope you like english. - String prompt = startTag.attributes.hasKey("prompt") ? - startTag.attributes.get("prompt") : - "This is a searchable index. Enter search keywords: "; - - tb.process(new Token.Character(prompt)); - - // input - Attributes inputAttribs = new Attributes(); - for (Attribute attr : startTag.attributes) { - if (!StringUtil.in(attr.getKey(), "name", "action", "prompt")) - inputAttribs.put(attr); - } - inputAttribs.put("name", "isindex"); - tb.process(new Token.StartTag("input", inputAttribs)); - tb.process(new Token.EndTag("label")); - tb.process(new Token.StartTag("hr")); - tb.process(new Token.EndTag("form")); - } else if (name.equals("textarea")) { - tb.insert(startTag); - // todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.) - tb.tokeniser.transition(TokeniserState.Rcdata); - tb.markInsertionMode(); - tb.framesetOk(false); - tb.transition(Text); - } else if (name.equals("xmp")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.reconstructFormattingElements(); - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("iframe")) { - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("noembed")) { - // also handle noscript if script enabled - handleRawtext(startTag, tb); - } else if (name.equals("select")) { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); - - HtmlTreeBuilderState state = tb.state(); - if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell)) - tb.transition(InSelectInTable); - else - tb.transition(InSelect); - } else if (StringUtil.in("optgroup", "option")) { - if (tb.currentElement().nodeName().equals("option")) - tb.process(new Token.EndTag("option")); - tb.reconstructFormattingElements(); - tb.insert(startTag); - } else if (StringUtil.in("rp", "rt")) { - if (tb.inScope("ruby")) { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("ruby")) { - tb.error(this); - tb.popStackToBefore("ruby"); // i.e. close up to but not include name - } - tb.insert(startTag); - } - } else if (name.equals("math")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml) - tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (name.equals("svg")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "svg" (xlink, svg) - tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (StringUtil.in(name, - "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr")) { - tb.error(this); - return false; - } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); - } - break; - - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("body")) { - if (!tb.inScope("body")) { - tb.error(this); - return false; - } else { - // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html - tb.transition(AfterBody); - } - } else if (name.equals("html")) { - boolean notIgnored = tb.process(new Token.EndTag("body")); - if (notIgnored) - return tb.process(endTag); - } else if (StringUtil.in(name, - "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", - "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", - "nav", "ol", "pre", "section", "summary", "ul")) { - // todo: refactor these lookups - if (!tb.inScope(name)) { - // nothing to close - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - } - } else if (name.equals("form")) { - Element currentForm = tb.getFormElement(); - tb.setFormElement(null); - if (currentForm == null || !tb.inScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - // remove currentForm from stack. will shift anything under up. - tb.removeFromStack(currentForm); - } - } else if (name.equals("p")) { - if (!tb.inButtonScope(name)) { - tb.error(this); - tb.process(new Token.StartTag(name)); // if no p to close, creates an empty <p></p> - return tb.process(endTag); - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - } - } else if (name.equals("li")) { - if (!tb.inListItemScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - } - } else if (StringUtil.in(name, "dd", "dt")) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - } - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { - if (!tb.inScope(new String[]{"h1", "h2", "h3", "h4", "h5", "h6"})) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); - } - } else if (name.equals("sarcasm")) { - // *sigh* - return anyOtherEndTag(t, tb); - } else if (StringUtil.in(name, - "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u")) { - // Adoption Agency Algorithm. - OUTER: - for (int i = 0; i < 8; i++) { - Element formatEl = tb.getActiveFormattingElement(name); - if (formatEl == null) - return anyOtherEndTag(t, tb); - else if (!tb.onStack(formatEl)) { - tb.error(this); - tb.removeFromActiveFormattingElements(formatEl); - return true; - } else if (!tb.inScope(formatEl.nodeName())) { - tb.error(this); - return false; - } else if (tb.currentElement() != formatEl) - tb.error(this); - - Element furthestBlock = null; - Element commonAncestor = null; - boolean seenFormattingElement = false; - LinkedList<Element> stack = tb.getStack(); - for (int si = 0; si < stack.size(); si++) { - Element el = stack.get(si); - if (el == formatEl) { - commonAncestor = stack.get(si - 1); - seenFormattingElement = true; - } else if (seenFormattingElement && tb.isSpecial(el)) { - furthestBlock = el; - break; - } - } - if (furthestBlock == null) { - tb.popStackToClose(formatEl.nodeName()); - tb.removeFromActiveFormattingElements(formatEl); - return true; - } - - // todo: Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list. - // does that mean: int pos of format el in list? - Element node = furthestBlock; - Element lastNode = furthestBlock; - INNER: - for (int j = 0; j < 3; j++) { - if (tb.onStack(node)) - node = tb.aboveOnStack(node); - if (!tb.isInActiveFormattingElements(node)) { // note no bookmark check - tb.removeFromStack(node); - continue INNER; - } else if (node == formatEl) - break INNER; - - Element replacement = new Element(Tag.valueOf(node.nodeName()), tb.getBaseUri()); - tb.replaceActiveFormattingElement(node, replacement); - tb.replaceOnStack(node, replacement); - node = replacement; - - if (lastNode == furthestBlock) { - // todo: move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements. - // not getting how this bookmark both straddles the element above, but is inbetween here... - } - if (lastNode.parent() != null) - lastNode.remove(); - node.appendChild(lastNode); - - lastNode = node; - } - - if (StringUtil.in(commonAncestor.nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { - if (lastNode.parent() != null) - lastNode.remove(); - tb.insertInFosterParent(lastNode); - } else { - if (lastNode.parent() != null) - lastNode.remove(); - commonAncestor.appendChild(lastNode); - } - - Element adopter = new Element(Tag.valueOf(name), tb.getBaseUri()); - Node[] childNodes = furthestBlock.childNodes().toArray(new Node[furthestBlock.childNodes().size()]); - for (Node childNode : childNodes) { - adopter.appendChild(childNode); // append will reparent. thus the clone to avoid concurrent mod. - } - furthestBlock.appendChild(adopter); - tb.removeFromActiveFormattingElements(formatEl); - // todo: insert the new element into the list of active formatting elements at the position of the aforementioned bookmark. - tb.removeFromStack(formatEl); - tb.insertOnStackAfter(furthestBlock, adopter); - } - } else if (StringUtil.in(name, "applet", "marquee", "object")) { - if (!tb.inScope("name")) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - tb.clearFormattingElementsToLastMarker(); - } - } else if (name.equals("br")) { - tb.error(this); - tb.process(new Token.StartTag("br")); - return false; - } else { - return anyOtherEndTag(t, tb); - } - - break; - case EOF: - // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html - // stop parsing - break; - } - return true; - } - - boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) { - String name = t.asEndTag().name(); - DescendableLinkedList<Element> stack = tb.getStack(); - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element node = it.next(); - if (node.nodeName().equals(name)) { - tb.generateImpliedEndTags(name); - if (!name.equals(tb.currentElement().nodeName())) - tb.error(this); - tb.popStackToClose(name); - break; - } else { - if (tb.isSpecial(node)) { - tb.error(this); - return false; - } - } - } - return true; - } - }, - Text { - // in script, style etc. normally treated as data tags - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isCharacter()) { - tb.insert(t.asCharacter()); - } else if (t.isEOF()) { - tb.error(this); - // if current node is script: already started - tb.pop(); - tb.transition(tb.originalState()); - return tb.process(t); - } else if (t.isEndTag()) { - // if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts - tb.pop(); - tb.transition(tb.originalState()); - } - return true; - } - }, - InTable { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isCharacter()) { - tb.newPendingTableCharacters(); - tb.markInsertionMode(); - tb.transition(InTableText); - return tb.process(t); - } else if (t.isComment()) { - tb.insert(t.asComment()); - return true; - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("caption")) { - tb.clearStackToTableContext(); - tb.insertMarkerToFormattingElements(); - tb.insert(startTag); - tb.transition(InCaption); - } else if (name.equals("colgroup")) { - tb.clearStackToTableContext(); - tb.insert(startTag); - tb.transition(InColumnGroup); - } else if (name.equals("col")) { - tb.process(new Token.StartTag("colgroup")); - return tb.process(t); - } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - tb.clearStackToTableContext(); - tb.insert(startTag); - tb.transition(InTableBody); - } else if (StringUtil.in(name, "td", "th", "tr")) { - tb.process(new Token.StartTag("tbody")); - return tb.process(t); - } else if (name.equals("table")) { - tb.error(this); - boolean processed = tb.process(new Token.EndTag("table")); - if (processed) // only ignored if in fragment - return tb.process(t); - } else if (StringUtil.in(name, "style", "script")) { - return tb.process(t, InHead); - } else if (name.equals("input")) { - if (!startTag.attributes.get("type").equalsIgnoreCase("hidden")) { - return anythingElse(t, tb); - } else { - tb.insertEmpty(startTag); - } - } else if (name.equals("form")) { - tb.error(this); - if (tb.getFormElement() != null) - return false; - else { - Element form = tb.insertEmpty(startTag); - tb.setFormElement(form); - } - } else { - return anythingElse(t, tb); - } - } else if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (name.equals("table")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.popStackToClose("table"); - } - tb.resetInsertionMode(); - } else if (StringUtil.in(name, - "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - } else if (t.isEOF()) { - if (tb.currentElement().nodeName().equals("html")) - tb.error(this); - return true; // stops parsing - } - return anythingElse(t, tb); - } - - boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - boolean processed = true; - if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { - tb.setFosterInserts(true); - processed = tb.process(t, InBody); - tb.setFosterInserts(false); - } else { - processed = tb.process(t, InBody); - } - return processed; - } - }, - InTableText { - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.getPendingTableCharacters().add(c); - } - break; - default: - if (tb.getPendingTableCharacters().size() > 0) { - for (Token.Character character : tb.getPendingTableCharacters()) { - if (!isWhitespace(character)) { - // InTable anything else section: - tb.error(this); - if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { - tb.setFosterInserts(true); - tb.process(character, InBody); - tb.setFosterInserts(false); - } else { - tb.process(character, InBody); - } - } else - tb.insert(character); - } - tb.newPendingTableCharacters(); - } - tb.transition(tb.originalState()); - return tb.process(t); - } - return true; - } - }, - InCaption { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isEndTag() && t.asEndTag().name().equals("caption")) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("caption")) - tb.error(this); - tb.popStackToClose("caption"); - tb.clearFormattingElementsToLastMarker(); - tb.transition(InTable); - } - } else if (( - t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr") || - t.isEndTag() && t.asEndTag().name().equals("table")) - ) { - tb.error(this); - boolean processed = tb.process(new Token.EndTag("caption")); - if (processed) - return tb.process(t); - } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), - "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { - tb.error(this); - return false; - } else { - return tb.process(t, InBody); - } - return true; - } - }, - InColumnGroup { - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - return true; - } - switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - break; - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) - return tb.process(t, InBody); - else if (name.equals("col")) - tb.insertEmpty(startTag); - else - return anythingElse(t, tb); - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("colgroup")) { - if (tb.currentElement().nodeName().equals("html")) { // frag case - tb.error(this); - return false; - } else { - tb.pop(); - tb.transition(InTable); - } - } else - return anythingElse(t, tb); - break; - case EOF: - if (tb.currentElement().nodeName().equals("html")) - return true; // stop parsing; frag case - else - return anythingElse(t, tb); - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, TreeBuilder tb) { - boolean processed = tb.process(new Token.EndTag("colgroup")); - if (processed) // only ignored in frag case - return tb.process(t); - return true; - } - }, - InTableBody { - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("tr")) { - tb.clearStackToTableBodyContext(); - tb.insert(startTag); - tb.transition(InRow); - } else if (StringUtil.in(name, "th", "td")) { - tb.error(this); - tb.process(new Token.StartTag("tr")); - return tb.process(startTag); - } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead")) { - return exitTableBody(t, tb); - } else - return anythingElse(t, tb); - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.clearStackToTableBodyContext(); - tb.pop(); - tb.transition(InTable); - } - } else if (name.equals("table")) { - return exitTableBody(t, tb); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) { - tb.error(this); - return false; - } else - return anythingElse(t, tb); - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { - if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb.inScope("tfoot"))) { - // frag case - tb.error(this); - return false; - } - tb.clearStackToTableBodyContext(); - tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, tfoot, thead - return tb.process(t); - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InTable); - } - }, - InRow { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - - if (StringUtil.in(name, "th", "td")) { - tb.clearStackToTableRowContext(); - tb.insert(startTag); - tb.transition(InCell); - tb.insertMarkerToFormattingElements(); - } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) { - return handleMissingTr(t, tb); - } else { - return anythingElse(t, tb); - } - } else if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (name.equals("tr")) { - if (!tb.inTableScope(name)) { - tb.error(this); // frag - return false; - } - tb.clearStackToTableRowContext(); - tb.pop(); // tr - tb.transition(InTableBody); - } else if (name.equals("table")) { - return handleMissingTr(t, tb); - } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } - tb.process(new Token.EndTag("tr")); - return tb.process(t); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InTable); - } - - private boolean handleMissingTr(Token t, TreeBuilder tb) { - boolean processed = tb.process(new Token.EndTag("tr")); - if (processed) - return tb.process(t); - else - return false; - } - }, - InCell { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (StringUtil.in(name, "td", "th")) { - if (!tb.inTableScope(name)) { - tb.error(this); - tb.transition(InRow); // might not be in scope if empty: <td /> and processing fake end tag - return false; - } - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - tb.clearFormattingElementsToLastMarker(); - tb.transition(InRow); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html")) { - tb.error(this); - return false; - } else if (StringUtil.in(name, "table", "tbody", "tfoot", "thead", "tr")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } - closeCell(tb); - return tb.process(t); - } else { - return anythingElse(t, tb); - } - } else if (t.isStartTag() && - StringUtil.in(t.asStartTag().name(), - "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) { - if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { - tb.error(this); - return false; - } - closeCell(tb); - return tb.process(t); - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InBody); - } - - private void closeCell(HtmlTreeBuilder tb) { - if (tb.inTableScope("td")) - tb.process(new Token.EndTag("td")); - else - tb.process(new Token.EndTag("th")); // only here if th or td in scope - } - }, - InSelect { - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.insert(c); - } - break; - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) - return tb.process(start, InBody); - else if (name.equals("option")) { - tb.process(new Token.EndTag("option")); - tb.insert(start); - } else if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option")) - tb.process(new Token.EndTag("option")); - else if (tb.currentElement().nodeName().equals("optgroup")) - tb.process(new Token.EndTag("optgroup")); - tb.insert(start); - } else if (name.equals("select")) { - tb.error(this); - return tb.process(new Token.EndTag("select")); - } else if (StringUtil.in(name, "input", "keygen", "textarea")) { - tb.error(this); - if (!tb.inSelectScope("select")) - return false; // frag - tb.process(new Token.EndTag("select")); - return tb.process(start); - } else if (name.equals("script")) { - return tb.process(t, InHead); - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nodeName().equals("optgroup")) - tb.process(new Token.EndTag("option")); - if (tb.currentElement().nodeName().equals("optgroup")) - tb.pop(); - else - tb.error(this); - } else if (name.equals("option")) { - if (tb.currentElement().nodeName().equals("option")) - tb.pop(); - else - tb.error(this); - } else if (name.equals("select")) { - if (!tb.inSelectScope(name)) { - tb.error(this); - return false; - } else { - tb.popStackToClose(name); - tb.resetInsertionMode(); - } - } else - return anythingElse(t, tb); - break; - case EOF: - if (!tb.currentElement().nodeName().equals("html")) - tb.error(this); - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - return false; - } - }, - InSelectInTable { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { - tb.error(this); - tb.process(new Token.EndTag("select")); - return tb.process(t); - } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { - tb.error(this); - if (tb.inTableScope(t.asEndTag().name())) { - tb.process(new Token.EndTag("select")); - return (tb.process(t)); - } else - return false; - } else { - return tb.process(t, InSelect); - } - } - }, - AfterBody { - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return tb.process(t, InBody); - } else if (t.isComment()) { - tb.insert(t.asComment()); // into html node - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { - if (tb.isFragmentParsing()) { - tb.error(this); - return false; - } else { - tb.transition(AfterAfterBody); - } - } else if (t.isEOF()) { - // chillax! we're done - } else { - tb.error(this); - tb.transition(InBody); - return tb.process(t); - } - return true; - } - }, - InFrameset { - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag()) { - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return tb.process(start, InBody); - } else if (name.equals("frameset")) { - tb.insert(start); - } else if (name.equals("frame")) { - tb.insertEmpty(start); - } else if (name.equals("noframes")) { - return tb.process(start, InHead); - } else { - tb.error(this); - return false; - } - } else if (t.isEndTag() && t.asEndTag().name().equals("frameset")) { - if (tb.currentElement().nodeName().equals("html")) { // frag - tb.error(this); - return false; - } else { - tb.pop(); - if (!tb.isFragmentParsing() && !tb.currentElement().nodeName().equals("frameset")) { - tb.transition(AfterFrameset); - } - } - } else if (t.isEOF()) { - if (!tb.currentElement().nodeName().equals("html")) { - tb.error(this); - return true; - } - } else { - tb.error(this); - return false; - } - return true; - } - }, - AfterFrameset { - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { - tb.transition(AfterAfterFrameset); - } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { - return tb.process(t, InHead); - } else if (t.isEOF()) { - // cool your heels, we're complete - } else { - tb.error(this); - return false; - } - return true; - } - }, - AfterAfterBody { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { - return tb.process(t, InBody); - } else if (t.isEOF()) { - // nice work chuck - } else { - tb.error(this); - tb.transition(InBody); - return tb.process(t); - } - return true; - } - }, - AfterAfterFrameset { - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { - return tb.process(t, InBody); - } else if (t.isEOF()) { - // nice work chuck - } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { - return tb.process(t, InHead); - } else { - tb.error(this); - return false; - } - return true; - } - }, - ForeignContent { - boolean process(Token t, HtmlTreeBuilder tb) { - return true; - // todo: implement. Also; how do we get here? - } - }; - - private static String nullString = String.valueOf('\u0000'); - - abstract boolean process(Token t, HtmlTreeBuilder tb); - - private static boolean isWhitespace(Token t) { - if (t.isCharacter()) { - String data = t.asCharacter().getData(); - // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " - for (int i = 0; i < data.length(); i++) { - char c = data.charAt(i); - if (!StringUtil.isWhitespace(c)) - return false; - } - return true; - } - return false; - } - - private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) { - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.Rcdata); - tb.markInsertionMode(); - tb.transition(Text); - } - - private static void handleRawtext(Token.StartTag startTag, HtmlTreeBuilder tb) { - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.Rawtext); - tb.markInsertionMode(); - tb.transition(Text); - } -} diff --git a/src/org/jsoup/parser/ParseError.java b/src/org/jsoup/parser/ParseError.java deleted file mode 100644 index dfa090051b..0000000000 --- a/src/org/jsoup/parser/ParseError.java +++ /dev/null @@ -1,40 +0,0 @@ -package org.jsoup.parser; - -/** - * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase. - */ -public class ParseError { - private int pos; - private String errorMsg; - - ParseError(int pos, String errorMsg) { - this.pos = pos; - this.errorMsg = errorMsg; - } - - ParseError(int pos, String errorFormat, Object... args) { - this.errorMsg = String.format(errorFormat, args); - this.pos = pos; - } - - /** - * Retrieve the error message. - * @return the error message. - */ - public String getErrorMessage() { - return errorMsg; - } - - /** - * Retrieves the offset of the error. - * @return error offset within input - */ - public int getPosition() { - return pos; - } - - @Override - public String toString() { - return pos + ": " + errorMsg; - } -} diff --git a/src/org/jsoup/parser/ParseErrorList.java b/src/org/jsoup/parser/ParseErrorList.java deleted file mode 100644 index 3824ffbc4e..0000000000 --- a/src/org/jsoup/parser/ParseErrorList.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.jsoup.parser; - -import java.util.ArrayList; - -/** - * A container for ParseErrors. - * - * @author Jonathan Hedley - */ -class ParseErrorList extends ArrayList<ParseError>{ - private static final int INITIAL_CAPACITY = 16; - private final int maxSize; - - ParseErrorList(int initialCapacity, int maxSize) { - super(initialCapacity); - this.maxSize = maxSize; - } - - boolean canAddError() { - return size() < maxSize; - } - - int getMaxSize() { - return maxSize; - } - - static ParseErrorList noTracking() { - return new ParseErrorList(0, 0); - } - - static ParseErrorList tracking(int maxSize) { - return new ParseErrorList(INITIAL_CAPACITY, maxSize); - } -} diff --git a/src/org/jsoup/parser/Parser.java b/src/org/jsoup/parser/Parser.java deleted file mode 100644 index 2236219c06..0000000000 --- a/src/org/jsoup/parser/Parser.java +++ /dev/null @@ -1,157 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; - -import java.util.List; - -/** - * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods - * in {@link org.jsoup.Jsoup}. - */ -public class Parser { - private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled. - - private TreeBuilder treeBuilder; - private int maxErrors = DEFAULT_MAX_ERRORS; - private ParseErrorList errors; - - /** - * Create a new Parser, using the specified TreeBuilder - * @param treeBuilder TreeBuilder to use to parse input into Documents. - */ - public Parser(TreeBuilder treeBuilder) { - this.treeBuilder = treeBuilder; - } - - public Document parseInput(String html, String baseUri) { - errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); - Document doc = treeBuilder.parse(html, baseUri, errors); - return doc; - } - - // gets & sets - /** - * Get the TreeBuilder currently in use. - * @return current TreeBuilder. - */ - public TreeBuilder getTreeBuilder() { - return treeBuilder; - } - - /** - * Update the TreeBuilder used when parsing content. - * @param treeBuilder current TreeBuilder - * @return this, for chaining - */ - public Parser setTreeBuilder(TreeBuilder treeBuilder) { - this.treeBuilder = treeBuilder; - return this; - } - - /** - * Check if parse error tracking is enabled. - * @return current track error state. - */ - public boolean isTrackErrors() { - return maxErrors > 0; - } - - /** - * Enable or disable parse error tracking for the next parse. - * @param maxErrors the maximum number of errors to track. Set to 0 to disable. - * @return this, for chaining - */ - public Parser setTrackErrors(int maxErrors) { - this.maxErrors = maxErrors; - return this; - } - - /** - * Retrieve the parse errors, if any, from the last parse. - * @return list of parse errors, up to the size of the maximum errors tracked. - */ - public List<ParseError> getErrors() { - return errors; - } - - // static parse functions below - /** - * Parse HTML into a Document. - * - * @param html HTML to parse - * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. - * - * @return parsed Document - */ - public static Document parse(String html, String baseUri) { - TreeBuilder treeBuilder = new HtmlTreeBuilder(); - return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking()); - } - - /** - * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. - * - * @param fragmentHtml the fragment of HTML to parse - * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This - * provides stack context (for implicit element creation). - * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. - * - * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. - */ - public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { - HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); - return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking()); - } - - /** - * Parse a fragment of HTML into the {@code body} of a Document. - * - * @param bodyHtml fragment of HTML - * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. - * - * @return Document, with empty head, and HTML parsed into body - */ - public static Document parseBodyFragment(String bodyHtml, String baseUri) { - Document doc = Document.createShell(baseUri); - Element body = doc.body(); - List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); - Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented - for (Node node : nodes) { - body.appendChild(node); - } - return doc; - } - - /** - * @param bodyHtml HTML to parse - * @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. - * - * @return parsed Document - * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead. - */ - public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) { - return parse(bodyHtml, baseUri); - } - - // builders - - /** - * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, - * based on a knowledge of the semantics of the incoming tags. - * @return a new HTML parser. - */ - public static Parser htmlParser() { - return new Parser(new HtmlTreeBuilder()); - } - - /** - * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, - * rather creates a simple tree directly from the input. - * @return a new simple XML parser. - */ - public static Parser xmlParser() { - return new Parser(new XmlTreeBuilder()); - } -} diff --git a/src/org/jsoup/parser/Tag.java b/src/org/jsoup/parser/Tag.java deleted file mode 100644 index 40b7557b39..0000000000 --- a/src/org/jsoup/parser/Tag.java +++ /dev/null @@ -1,262 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; - -import java.util.HashMap; -import java.util.Map; - -/** - * HTML Tag capabilities. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Tag { - private static final Map<String, Tag> tags = new HashMap<String, Tag>(); // map of known tags - - private String tagName; - private boolean isBlock = true; // block or inline - private boolean formatAsBlock = true; // should be formatted as a block - private boolean canContainBlock = true; // Can this tag hold block level tags? - private boolean canContainInline = true; // only pcdata if not - private boolean empty = false; // can hold nothing; e.g. img - private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty. - private boolean preserveWhitespace = false; // for pre, textarea, script etc - - private Tag(String tagName) { - this.tagName = tagName.toLowerCase(); - } - - /** - * Get this tag's name. - * - * @return the tag's name - */ - public String getName() { - return tagName; - } - - /** - * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. - * <p/> - * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals(). - * - * @param tagName Name of tag, e.g. "p". Case insensitive. - * @return The tag, either defined or new generic. - */ - public static Tag valueOf(String tagName) { - Validate.notNull(tagName); - tagName = tagName.trim().toLowerCase(); - Validate.notEmpty(tagName); - - synchronized (tags) { - Tag tag = tags.get(tagName); - if (tag == null) { - // not defined: create default; go anywhere, do anything! (incl be inside a <p>) - tag = new Tag(tagName); - tag.isBlock = false; - tag.canContainBlock = true; - } - return tag; - } - } - - /** - * Gets if this is a block tag. - * - * @return if block tag - */ - public boolean isBlock() { - return isBlock; - } - - /** - * Gets if this tag should be formatted as a block (or as inline) - * - * @return if should be formatted as block or inline - */ - public boolean formatAsBlock() { - return formatAsBlock; - } - - /** - * Gets if this tag can contain block tags. - * - * @return if tag can contain block tags - */ - public boolean canContainBlock() { - return canContainBlock; - } - - /** - * Gets if this tag is an inline tag. - * - * @return if this tag is an inline tag. - */ - public boolean isInline() { - return !isBlock; - } - - /** - * Gets if this tag is a data only tag. - * - * @return if this tag is a data only tag - */ - public boolean isData() { - return !canContainInline && !isEmpty(); - } - - /** - * Get if this is an empty tag - * - * @return if this is an empty tag - */ - public boolean isEmpty() { - return empty; - } - - /** - * Get if this tag is self closing. - * - * @return if this tag should be output as self closing. - */ - public boolean isSelfClosing() { - return empty || selfClosing; - } - - /** - * Get if this is a pre-defined tag, or was auto created on parsing. - * - * @return if a known tag - */ - public boolean isKnownTag() { - return tags.containsKey(tagName); - } - - /** - * Check if this tagname is a known tag. - * - * @param tagName name of tag - * @return if known HTML tag - */ - public static boolean isKnownTag(String tagName) { - return tags.containsKey(tagName); - } - - /** - * Get if this tag should preserve whitespace within child text nodes. - * - * @return if preserve whitepace - */ - public boolean preserveWhitespace() { - return preserveWhitespace; - } - - Tag setSelfClosing() { - selfClosing = true; - return this; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof Tag)) return false; - - Tag tag = (Tag) o; - - if (canContainBlock != tag.canContainBlock) return false; - if (canContainInline != tag.canContainInline) return false; - if (empty != tag.empty) return false; - if (formatAsBlock != tag.formatAsBlock) return false; - if (isBlock != tag.isBlock) return false; - if (preserveWhitespace != tag.preserveWhitespace) return false; - if (selfClosing != tag.selfClosing) return false; - if (!tagName.equals(tag.tagName)) return false; - - return true; - } - - @Override - public int hashCode() { - int result = tagName.hashCode(); - result = 31 * result + (isBlock ? 1 : 0); - result = 31 * result + (formatAsBlock ? 1 : 0); - result = 31 * result + (canContainBlock ? 1 : 0); - result = 31 * result + (canContainInline ? 1 : 0); - result = 31 * result + (empty ? 1 : 0); - result = 31 * result + (selfClosing ? 1 : 0); - result = 31 * result + (preserveWhitespace ? 1 : 0); - return result; - } - - public String toString() { - return tagName; - } - - // internal static initialisers: - // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources - private static final String[] blockTags = { - "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", - "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", - "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", - "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", - "td", "video", "audio", "canvas", "details", "menu", "plaintext" - }; - private static final String[] inlineTags = { - "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd", - "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q", - "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup", - "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", - "summary", "command", "device" - }; - private static final String[] emptyTags = { - "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", - "device" - }; - private static final String[] formatAsInlineTags = { - "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style" - }; - private static final String[] preserveWhitespaceTags = {"pre", "plaintext", "title"}; - - static { - // creates - for (String tagName : blockTags) { - Tag tag = new Tag(tagName); - register(tag); - } - for (String tagName : inlineTags) { - Tag tag = new Tag(tagName); - tag.isBlock = false; - tag.canContainBlock = false; - tag.formatAsBlock = false; - register(tag); - } - - // mods: - for (String tagName : emptyTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.canContainBlock = false; - tag.canContainInline = false; - tag.empty = true; - } - - for (String tagName : formatAsInlineTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.formatAsBlock = false; - } - - for (String tagName : preserveWhitespaceTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.preserveWhitespace = true; - } - } - - private static Tag register(Tag tag) { - synchronized (tags) { - tags.put(tag.tagName, tag); - } - return tag; - } -} diff --git a/src/org/jsoup/parser/Token.java b/src/org/jsoup/parser/Token.java deleted file mode 100644 index 9f4f9e250d..0000000000 --- a/src/org/jsoup/parser/Token.java +++ /dev/null @@ -1,252 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; - -/** - * Parse tokens for the Tokeniser. - */ -abstract class Token { - TokenType type; - - private Token() { - } - - String tokenType() { - return this.getClass().getSimpleName(); - } - - static class Doctype extends Token { - final StringBuilder name = new StringBuilder(); - final StringBuilder publicIdentifier = new StringBuilder(); - final StringBuilder systemIdentifier = new StringBuilder(); - boolean forceQuirks = false; - - Doctype() { - type = TokenType.Doctype; - } - - String getName() { - return name.toString(); - } - - String getPublicIdentifier() { - return publicIdentifier.toString(); - } - - public String getSystemIdentifier() { - return systemIdentifier.toString(); - } - - public boolean isForceQuirks() { - return forceQuirks; - } - } - - static abstract class Tag extends Token { - protected String tagName; - private String pendingAttributeName; - private String pendingAttributeValue; - - boolean selfClosing = false; - Attributes attributes = new Attributes(); // todo: allow nodes to not have attributes - - void newAttribute() { - if (pendingAttributeName != null) { - if (pendingAttributeValue == null) - pendingAttributeValue = ""; - Attribute attribute = new Attribute(pendingAttributeName, pendingAttributeValue); - attributes.put(attribute); - } - pendingAttributeName = null; - pendingAttributeValue = null; - } - - void finaliseTag() { - // finalises for emit - if (pendingAttributeName != null) { - // todo: check if attribute name exists; if so, drop and error - newAttribute(); - } - } - - String name() { - Validate.isFalse(tagName.length() == 0); - return tagName; - } - - Tag name(String name) { - tagName = name; - return this; - } - - boolean isSelfClosing() { - return selfClosing; - } - - @SuppressWarnings({"TypeMayBeWeakened"}) - Attributes getAttributes() { - return attributes; - } - - // these appenders are rarely hit in not null state-- caused by null chars. - void appendTagName(String append) { - tagName = tagName == null ? append : tagName.concat(append); - } - - void appendTagName(char append) { - appendTagName(String.valueOf(append)); - } - - void appendAttributeName(String append) { - pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append); - } - - void appendAttributeName(char append) { - appendAttributeName(String.valueOf(append)); - } - - void appendAttributeValue(String append) { - pendingAttributeValue = pendingAttributeValue == null ? append : pendingAttributeValue.concat(append); - } - - void appendAttributeValue(char append) { - appendAttributeValue(String.valueOf(append)); - } - } - - static class StartTag extends Tag { - StartTag() { - super(); - type = TokenType.StartTag; - } - - StartTag(String name) { - this(); - this.tagName = name; - } - - StartTag(String name, Attributes attributes) { - this(); - this.tagName = name; - this.attributes = attributes; - } - - @Override - public String toString() { - return "<" + name() + " " + attributes.toString() + ">"; - } - } - - static class EndTag extends Tag{ - EndTag() { - super(); - type = TokenType.EndTag; - } - - EndTag(String name) { - this(); - this.tagName = name; - } - - @Override - public String toString() { - return "</" + name() + " " + attributes.toString() + ">"; - } - } - - static class Comment extends Token { - final StringBuilder data = new StringBuilder(); - - Comment() { - type = TokenType.Comment; - } - - String getData() { - return data.toString(); - } - - @Override - public String toString() { - return "<!--" + getData() + "-->"; - } - } - - static class Character extends Token { - private final String data; - - Character(String data) { - type = TokenType.Character; - this.data = data; - } - - String getData() { - return data; - } - - @Override - public String toString() { - return getData(); - } - } - - static class EOF extends Token { - EOF() { - type = Token.TokenType.EOF; - } - } - - boolean isDoctype() { - return type == TokenType.Doctype; - } - - Doctype asDoctype() { - return (Doctype) this; - } - - boolean isStartTag() { - return type == TokenType.StartTag; - } - - StartTag asStartTag() { - return (StartTag) this; - } - - boolean isEndTag() { - return type == TokenType.EndTag; - } - - EndTag asEndTag() { - return (EndTag) this; - } - - boolean isComment() { - return type == TokenType.Comment; - } - - Comment asComment() { - return (Comment) this; - } - - boolean isCharacter() { - return type == TokenType.Character; - } - - Character asCharacter() { - return (Character) this; - } - - boolean isEOF() { - return type == TokenType.EOF; - } - - enum TokenType { - Doctype, - StartTag, - EndTag, - Comment, - Character, - EOF - } -} diff --git a/src/org/jsoup/parser/TokenQueue.java b/src/org/jsoup/parser/TokenQueue.java deleted file mode 100644 index a2fdfe621a..0000000000 --- a/src/org/jsoup/parser/TokenQueue.java +++ /dev/null @@ -1,393 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; - -/** - * A character queue with parsing helpers. - * - * @author Jonathan Hedley - */ -public class TokenQueue { - private String queue; - private int pos = 0; - - private static final char ESC = '\\'; // escape char for chomp balanced. - - /** - Create a new TokenQueue. - @param data string of data to back queue. - */ - public TokenQueue(String data) { - Validate.notNull(data); - queue = data; - } - - /** - * Is the queue empty? - * @return true if no data left in queue. - */ - public boolean isEmpty() { - return remainingLength() == 0; - } - - private int remainingLength() { - return queue.length() - pos; - } - - /** - * Retrieves but does not remove the first character from the queue. - * @return First character, or 0 if empty. - */ - public char peek() { - return isEmpty() ? 0 : queue.charAt(pos); - } - - /** - Add a character to the start of the queue (will be the next character retrieved). - @param c character to add - */ - public void addFirst(Character c) { - addFirst(c.toString()); - } - - /** - Add a string to the start of the queue. - @param seq string to add. - */ - public void addFirst(String seq) { - // not very performant, but an edge case - queue = seq + queue.substring(pos); - pos = 0; - } - - /** - * Tests if the next characters on the queue match the sequence. Case insensitive. - * @param seq String to check queue for. - * @return true if the next characters match. - */ - public boolean matches(String seq) { - return queue.regionMatches(true, pos, seq, 0, seq.length()); - } - - /** - * Case sensitive match test. - * @param seq string to case sensitively check for - * @return true if matched, false if not - */ - public boolean matchesCS(String seq) { - return queue.startsWith(seq, pos); - } - - - /** - Tests if the next characters match any of the sequences. Case insensitive. - @param seq list of strings to case insensitively check for - @return true of any matched, false if none did - */ - public boolean matchesAny(String... seq) { - for (String s : seq) { - if (matches(s)) - return true; - } - return false; - } - - public boolean matchesAny(char... seq) { - if (isEmpty()) - return false; - - for (char c: seq) { - if (queue.charAt(pos) == c) - return true; - } - return false; - } - - public boolean matchesStartTag() { - // micro opt for matching "<x" - return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1))); - } - - /** - * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the - * queue. - * @param seq String to search for, and if found, remove from queue. - * @return true if found and removed, false if not found. - */ - public boolean matchChomp(String seq) { - if (matches(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - /** - Tests if queue starts with a whitespace character. - @return if starts with whitespace - */ - public boolean matchesWhitespace() { - return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos)); - } - - /** - Test if the queue matches a word character (letter or digit). - @return if matches a word character - */ - public boolean matchesWord() { - return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); - } - - /** - * Drops the next character off the queue. - */ - public void advance() { - if (!isEmpty()) pos++; - } - - /** - * Consume one character off queue. - * @return first character on queue. - */ - public char consume() { - return queue.charAt(pos++); - } - - /** - * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will - * throw an illegal state exception -- but you should be running match() against that condition. - <p> - Case insensitive. - * @param seq sequence to remove from head of queue. - */ - public void consume(String seq) { - if (!matches(seq)) - throw new IllegalStateException("Queue did not match expected sequence"); - int len = seq.length(); - if (len > remainingLength()) - throw new IllegalStateException("Queue not long enough to consume sequence"); - - pos += len; - } - - /** - * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. - * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b> - * @return The matched data consumed from queue. - */ - public String consumeTo(String seq) { - int offset = queue.indexOf(seq, pos); - if (offset != -1) { - String consumed = queue.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return remainder(); - } - } - - public String consumeToIgnoreCase(String seq) { - int start = pos; - String first = seq.substring(0, 1); - boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of - while (!isEmpty()) { - if (matches(seq)) - break; - - if (canScan) { - int skip = queue.indexOf(first, pos) - pos; - if (skip == 0) // this char is the skip char, but not match, so force advance of pos - pos++; - else if (skip < 0) // no chance of finding, grab to end - pos = queue.length(); - else - pos += skip; - } - else - pos++; - } - - String data = queue.substring(start, pos); - return data; - } - - /** - Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. - @param seq any number of terminators to consume to. <b>Case insensitive.</b> - @return consumed string - */ - // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this - // is is a case sensitive time... - public String consumeToAny(String... seq) { - int start = pos; - while (!isEmpty() && !matchesAny(seq)) { - pos++; - } - - String data = queue.substring(start, pos); - return data; - } - - /** - * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). - * <p> - * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go - * isEmpty() == true). - * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b> - * @return Data matched from queue. - */ - public String chompTo(String seq) { - String data = consumeTo(seq); - matchChomp(seq); - return data; - } - - public String chompToIgnoreCase(String seq) { - String data = consumeToIgnoreCase(seq); // case insensitive scan - matchChomp(seq); - return data; - } - - /** - * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", - * and leave " four" on the queue. Unbalanced openers and closers can be escaped (with \). Those escapes will be left - * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for - * contains text strings; use unescape for that. - * @param open opener - * @param close closer - * @return data matched from the queue - */ - public String chompBalanced(char open, char close) { - StringBuilder accum = new StringBuilder(); - int depth = 0; - char last = 0; - - do { - if (isEmpty()) break; - Character c = consume(); - if (last == 0 || last != ESC) { - if (c.equals(open)) - depth++; - else if (c.equals(close)) - depth--; - } - - if (depth > 0 && last != 0) - accum.append(c); // don't include the outer match pair in the return - last = c; - } while (depth > 0); - return accum.toString(); - } - - /** - * Unescaped a \ escaped string. - * @param in backslash escaped string - * @return unescaped string - */ - public static String unescape(String in) { - StringBuilder out = new StringBuilder(); - char last = 0; - for (char c : in.toCharArray()) { - if (c == ESC) { - if (last != 0 && last == ESC) - out.append(c); - } - else - out.append(c); - last = c; - } - return out.toString(); - } - - /** - * Pulls the next run of whitespace characters of the queue. - */ - public boolean consumeWhitespace() { - boolean seen = false; - while (matchesWhitespace()) { - pos++; - seen = true; - } - return seen; - } - - /** - * Retrieves the next run of word type (letter or digit) off the queue. - * @return String of word characters from queue, or empty string if none. - */ - public String consumeWord() { - int start = pos; - while (matchesWord()) - pos++; - return queue.substring(start, pos); - } - - /** - * Consume an tag name off the queue (word or :, _, -) - * - * @return tag name - */ - public String consumeTagName() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) - pos++; - - return queue.substring(start, pos); - } - - /** - * Consume a CSS element selector (tag name, but | instead of : for namespaces, to not conflict with :pseudo selects). - * - * @return tag name - */ - public String consumeElementSelector() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) - pos++; - - return queue.substring(start, pos); - } - - /** - Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _) - http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier - @return identifier - */ - public String consumeCssIdentifier() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) - pos++; - - return queue.substring(start, pos); - } - - /** - Consume an attribute key off the queue (letter, digit, -, _, :") - @return attribute key - */ - public String consumeAttributeKey() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) - pos++; - - return queue.substring(start, pos); - } - - /** - Consume and return whatever is left on the queue. - @return remained of queue. - */ - public String remainder() { - StringBuilder accum = new StringBuilder(); - while (!isEmpty()) { - accum.append(consume()); - } - return accum.toString(); - } - - public String toString() { - return queue.substring(pos); - } -} diff --git a/src/org/jsoup/parser/Tokeniser.java b/src/org/jsoup/parser/Tokeniser.java deleted file mode 100644 index ce6ee690d6..0000000000 --- a/src/org/jsoup/parser/Tokeniser.java +++ /dev/null @@ -1,230 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Entities; - -import java.util.ArrayList; -import java.util.List; - -/** - * Readers the input stream into tokens. - */ -class Tokeniser { - static final char replacementChar = '\uFFFD'; // replaces null character - - private CharacterReader reader; // html input - private ParseErrorList errors; // errors found while tokenising - - private TokeniserState state = TokeniserState.Data; // current tokenisation state - private Token emitPending; // the token we are about to emit on next read - private boolean isEmitPending = false; - private StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token - StringBuilder dataBuffer; // buffers data looking for </script> - - Token.Tag tagPending; // tag we are building up - Token.Doctype doctypePending; // doctype building up - Token.Comment commentPending; // comment building up - private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag - private boolean selfClosingFlagAcknowledged = true; - - Tokeniser(CharacterReader reader, ParseErrorList errors) { - this.reader = reader; - this.errors = errors; - } - - Token read() { - if (!selfClosingFlagAcknowledged) { - error("Self closing flag not acknowledged"); - selfClosingFlagAcknowledged = true; - } - - while (!isEmitPending) - state.read(this, reader); - - // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read: - if (charBuffer.length() > 0) { - String str = charBuffer.toString(); - charBuffer.delete(0, charBuffer.length()); - return new Token.Character(str); - } else { - isEmitPending = false; - return emitPending; - } - } - - void emit(Token token) { - Validate.isFalse(isEmitPending, "There is an unread token pending!"); - - emitPending = token; - isEmitPending = true; - - if (token.type == Token.TokenType.StartTag) { - Token.StartTag startTag = (Token.StartTag) token; - lastStartTag = startTag; - if (startTag.selfClosing) - selfClosingFlagAcknowledged = false; - } else if (token.type == Token.TokenType.EndTag) { - Token.EndTag endTag = (Token.EndTag) token; - if (endTag.attributes.size() > 0) - error("Attributes incorrectly present on end tag"); - } - } - - void emit(String str) { - // buffer strings up until last string token found, to emit only one token for a run of character refs etc. - // does not set isEmitPending; read checks that - charBuffer.append(str); - } - - void emit(char c) { - charBuffer.append(c); - } - - TokeniserState getState() { - return state; - } - - void transition(TokeniserState state) { - this.state = state; - } - - void advanceTransition(TokeniserState state) { - reader.advance(); - this.state = state; - } - - void acknowledgeSelfClosingFlag() { - selfClosingFlagAcknowledged = true; - } - - Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { - if (reader.isEmpty()) - return null; - if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) - return null; - if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) - return null; - - reader.mark(); - if (reader.matchConsume("#")) { // numbered - boolean isHexMode = reader.matchConsumeIgnoreCase("X"); - String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); - if (numRef.length() == 0) { // didn't match anything - characterReferenceError("numeric reference with no numerals"); - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) - characterReferenceError("missing semicolon"); // missing semi - int charval = -1; - try { - int base = isHexMode ? 16 : 10; - charval = Integer.valueOf(numRef, base); - } catch (NumberFormatException e) { - } // skip - if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { - characterReferenceError("character outside of valid range"); - return replacementChar; - } else { - // todo: implement number replacement table - // todo: check for extra illegal unicode points as parse errors - return (char) charval; - } - } else { // named - // get as many letters as possible, and look for matching entities. unconsume backwards till a match is found - String nameRef = reader.consumeLetterThenDigitSequence(); - String origNameRef = new String(nameRef); // for error reporting. nameRef gets chomped looking for matches - boolean looksLegit = reader.matches(';'); - boolean found = false; - while (nameRef.length() > 0 && !found) { - if (Entities.isNamedEntity(nameRef)) - found = true; - else { - nameRef = nameRef.substring(0, nameRef.length()-1); - reader.unconsume(); - } - } - if (!found) { - if (looksLegit) // named with semicolon - characterReferenceError(String.format("invalid named referenece '%s'", origNameRef)); - reader.rewindToMark(); - return null; - } - if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { - // don't want that to match - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) - characterReferenceError("missing semicolon"); // missing semi - return Entities.getCharacterByName(nameRef); - } - } - - Token.Tag createTagPending(boolean start) { - tagPending = start ? new Token.StartTag() : new Token.EndTag(); - return tagPending; - } - - void emitTagPending() { - tagPending.finaliseTag(); - emit(tagPending); - } - - void createCommentPending() { - commentPending = new Token.Comment(); - } - - void emitCommentPending() { - emit(commentPending); - } - - void createDoctypePending() { - doctypePending = new Token.Doctype(); - } - - void emitDoctypePending() { - emit(doctypePending); - } - - void createTempBuffer() { - dataBuffer = new StringBuilder(); - } - - boolean isAppropriateEndTagToken() { - if (lastStartTag == null) - return false; - return tagPending.tagName.equals(lastStartTag.tagName); - } - - String appropriateEndTagName() { - return lastStartTag.tagName; - } - - void error(TokeniserState state) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state)); - } - - void eofError(TokeniserState state) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state)); - } - - private void characterReferenceError(String message) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message)); - } - - private void error(String errorMsg) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), errorMsg)); - } - - boolean currentNodeInHtmlNS() { - // todo: implement namespaces correctly - return true; - // Element currentNode = currentNode(); - // return currentNode != null && currentNode.namespace().equals("HTML"); - } -} diff --git a/src/org/jsoup/parser/TokeniserState.java b/src/org/jsoup/parser/TokeniserState.java deleted file mode 100644 index e3013c73e9..0000000000 --- a/src/org/jsoup/parser/TokeniserState.java +++ /dev/null @@ -1,1778 +0,0 @@ -package org.jsoup.parser; - -/** - * States and transition activations for the Tokeniser. - */ -enum TokeniserState { - Data { - // in data state, gather characters until a character reference or tag is found - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '&': - t.advanceTransition(CharacterReferenceInData); - break; - case '<': - t.advanceTransition(TagOpen); - break; - case nullChar: - t.error(this); // NOT replacement character (oddly?) - t.emit(r.consume()); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('&', '<', nullChar); - t.emit(data); - break; - } - } - }, - CharacterReferenceInData { - // from & in data - void read(Tokeniser t, CharacterReader r) { - Character c = t.consumeCharacterReference(null, false); - if (c == null) - t.emit('&'); - else - t.emit(c); - t.transition(Data); - } - }, - Rcdata { - /// handles data in title, textarea etc - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '&': - t.advanceTransition(CharacterReferenceInRcdata); - break; - case '<': - t.advanceTransition(RcdataLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('&', '<', nullChar); - t.emit(data); - break; - } - } - }, - CharacterReferenceInRcdata { - void read(Tokeniser t, CharacterReader r) { - Character c = t.consumeCharacterReference(null, false); - if (c == null) - t.emit('&'); - else - t.emit(c); - t.transition(Rcdata); - } - }, - Rawtext { - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '<': - t.advanceTransition(RawtextLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('<', nullChar); - t.emit(data); - break; - } - } - }, - ScriptData { - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '<': - t.advanceTransition(ScriptDataLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('<', nullChar); - t.emit(data); - break; - } - } - }, - PLAINTEXT { - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeTo(nullChar); - t.emit(data); - break; - } - } - }, - TagOpen { - // from < in data - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '!': - t.advanceTransition(MarkupDeclarationOpen); - break; - case '/': - t.advanceTransition(EndTagOpen); - break; - case '?': - t.advanceTransition(BogusComment); - break; - default: - if (r.matchesLetter()) { - t.createTagPending(true); - t.transition(TagName); - } else { - t.error(this); - t.emit('<'); // char that got us here - t.transition(Data); - } - break; - } - } - }, - EndTagOpen { - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.emit("</"); - t.transition(Data); - } else if (r.matchesLetter()) { - t.createTagPending(false); - t.transition(TagName); - } else if (r.matches('>')) { - t.error(this); - t.advanceTransition(Data); - } else { - t.error(this); - t.advanceTransition(BogusComment); - } - } - }, - TagName { - // from < or </ in data, will have start or end tag pending - void read(Tokeniser t, CharacterReader r) { - // previous TagOpen state did NOT consume, will have a letter char in current - String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', nullChar).toLowerCase(); - t.tagPending.appendTagName(tagName); - - switch (r.consume()) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: // replacement - t.tagPending.appendTagName(replacementStr); - break; - case eof: // should emit pending tag? - t.eofError(this); - t.transition(Data); - // no default, as covered with above consumeToAny - } - } - }, - RcdataLessthanSign { - // from < in rcdata - void read(Tokeniser t, CharacterReader r) { - if (r.matches('/')) { - t.createTempBuffer(); - t.advanceTransition(RCDATAEndTagOpen); - } else if (r.matchesLetter() && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) { - // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than - // consuming to EOF; break out here - t.tagPending = new Token.EndTag(t.appropriateEndTagName()); - t.emitTagPending(); - r.unconsume(); // undo "<" - t.transition(Data); - } else { - t.emit("<"); - t.transition(Rcdata); - } - } - }, - RCDATAEndTagOpen { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.tagPending.appendTagName(Character.toLowerCase(r.current())); - t.dataBuffer.append(Character.toLowerCase(r.current())); - t.advanceTransition(RCDATAEndTagName); - } else { - t.emit("</"); - t.transition(Rcdata); - } - } - }, - RCDATAEndTagName { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.tagPending.appendTagName(name.toLowerCase()); - t.dataBuffer.append(name); - return; - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - if (t.isAppropriateEndTagToken()) - t.transition(BeforeAttributeName); - else - anythingElse(t, r); - break; - case '/': - if (t.isAppropriateEndTagToken()) - t.transition(SelfClosingStartTag); - else - anythingElse(t, r); - break; - case '>': - if (t.isAppropriateEndTagToken()) { - t.emitTagPending(); - t.transition(Data); - } - else - anythingElse(t, r); - break; - default: - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("</" + t.dataBuffer.toString()); - t.transition(Rcdata); - } - }, - RawtextLessthanSign { - void read(Tokeniser t, CharacterReader r) { - if (r.matches('/')) { - t.createTempBuffer(); - t.advanceTransition(RawtextEndTagOpen); - } else { - t.emit('<'); - t.transition(Rawtext); - } - } - }, - RawtextEndTagOpen { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.transition(RawtextEndTagName); - } else { - t.emit("</"); - t.transition(Rawtext); - } - } - }, - RawtextEndTagName { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.tagPending.appendTagName(name.toLowerCase()); - t.dataBuffer.append(name); - return; - } - - if (t.isAppropriateEndTagToken() && !r.isEmpty()) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - } - } else - anythingElse(t, r); - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("</" + t.dataBuffer.toString()); - t.transition(Rawtext); - } - }, - ScriptDataLessthanSign { - void read(Tokeniser t, CharacterReader r) { - switch (r.consume()) { - case '/': - t.createTempBuffer(); - t.transition(ScriptDataEndTagOpen); - break; - case '!': - t.emit("<!"); - t.transition(ScriptDataEscapeStart); - break; - default: - t.emit("<"); - r.unconsume(); - t.transition(ScriptData); - } - } - }, - ScriptDataEndTagOpen { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.transition(ScriptDataEndTagName); - } else { - t.emit("</"); - t.transition(ScriptData); - } - - } - }, - ScriptDataEndTagName { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.tagPending.appendTagName(name.toLowerCase()); - t.dataBuffer.append(name); - return; - } - - if (t.isAppropriateEndTagToken() && !r.isEmpty()) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - } - } else { - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("</" + t.dataBuffer.toString()); - t.transition(ScriptData); - } - }, - ScriptDataEscapeStart { - void read(Tokeniser t, CharacterReader r) { - if (r.matches('-')) { - t.emit('-'); - t.advanceTransition(ScriptDataEscapeStartDash); - } else { - t.transition(ScriptData); - } - } - }, - ScriptDataEscapeStartDash { - void read(Tokeniser t, CharacterReader r) { - if (r.matches('-')) { - t.emit('-'); - t.advanceTransition(ScriptDataEscapedDashDash); - } else { - t.transition(ScriptData); - } - } - }, - ScriptDataEscaped { - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.transition(Data); - return; - } - - switch (r.current()) { - case '-': - t.emit('-'); - t.advanceTransition(ScriptDataEscapedDash); - break; - case '<': - t.advanceTransition(ScriptDataEscapedLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - default: - String data = r.consumeToAny('-', '<', nullChar); - t.emit(data); - } - } - }, - ScriptDataEscapedDash { - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.transition(Data); - return; - } - - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - t.transition(ScriptDataEscapedDashDash); - break; - case '<': - t.transition(ScriptDataEscapedLessthanSign); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataEscaped); - break; - default: - t.emit(c); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedDashDash { - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.transition(Data); - return; - } - - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - break; - case '<': - t.transition(ScriptDataEscapedLessthanSign); - break; - case '>': - t.emit(c); - t.transition(ScriptData); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataEscaped); - break; - default: - t.emit(c); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedLessthanSign { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTempBuffer(); - t.dataBuffer.append(Character.toLowerCase(r.current())); - t.emit("<" + r.current()); - t.advanceTransition(ScriptDataDoubleEscapeStart); - } else if (r.matches('/')) { - t.createTempBuffer(); - t.advanceTransition(ScriptDataEscapedEndTagOpen); - } else { - t.emit('<'); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedEndTagOpen { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.tagPending.appendTagName(Character.toLowerCase(r.current())); - t.dataBuffer.append(r.current()); - t.advanceTransition(ScriptDataEscapedEndTagName); - } else { - t.emit("</"); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedEndTagName { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.tagPending.appendTagName(name.toLowerCase()); - t.dataBuffer.append(name); - return; - } - - if (t.isAppropriateEndTagToken() && !r.isEmpty()) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - break; - } - } else { - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("</" + t.dataBuffer.toString()); - t.transition(ScriptDataEscaped); - } - }, - ScriptDataDoubleEscapeStart { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.dataBuffer.append(name.toLowerCase()); - t.emit(name); - return; - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - case '/': - case '>': - if (t.dataBuffer.toString().equals("script")) - t.transition(ScriptDataDoubleEscaped); - else - t.transition(ScriptDataEscaped); - t.emit(c); - break; - default: - r.unconsume(); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataDoubleEscaped { - void read(Tokeniser t, CharacterReader r) { - char c = r.current(); - switch (c) { - case '-': - t.emit(c); - t.advanceTransition(ScriptDataDoubleEscapedDash); - break; - case '<': - t.emit(c); - t.advanceTransition(ScriptDataDoubleEscapedLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - String data = r.consumeToAny('-', '<', nullChar); - t.emit(data); - } - } - }, - ScriptDataDoubleEscapedDash { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - t.transition(ScriptDataDoubleEscapedDashDash); - break; - case '<': - t.emit(c); - t.transition(ScriptDataDoubleEscapedLessthanSign); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataDoubleEscaped); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.emit(c); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapedDashDash { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - break; - case '<': - t.emit(c); - t.transition(ScriptDataDoubleEscapedLessthanSign); - break; - case '>': - t.emit(c); - t.transition(ScriptData); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataDoubleEscaped); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.emit(c); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapedLessthanSign { - void read(Tokeniser t, CharacterReader r) { - if (r.matches('/')) { - t.emit('/'); - t.createTempBuffer(); - t.advanceTransition(ScriptDataDoubleEscapeEnd); - } else { - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapeEnd { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.dataBuffer.append(name.toLowerCase()); - t.emit(name); - return; - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - case '/': - case '>': - if (t.dataBuffer.toString().equals("script")) - t.transition(ScriptDataEscaped); - else - t.transition(ScriptDataDoubleEscaped); - t.emit(c); - break; - default: - r.unconsume(); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - BeforeAttributeName { - // from tagname <xxx - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; // ignore whitespace - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - case '=': - t.error(this); - t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); - t.transition(AttributeName); - break; - default: // A-Z, anything else - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - } - } - }, - AttributeName { - // from before attribute name - void read(Tokeniser t, CharacterReader r) { - String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<'); - t.tagPending.appendAttributeName(name.toLowerCase()); - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(AfterAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '=': - t.transition(BeforeAttributeValue); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeName(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - t.error(this); - t.tagPending.appendAttributeName(c); - // no default, as covered in consumeToAny - } - } - }, - AfterAttributeName { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - // ignore - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '=': - t.transition(BeforeAttributeValue); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeName(replacementChar); - t.transition(AttributeName); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - t.error(this); - t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); - t.transition(AttributeName); - break; - default: // A-Z, anything else - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - } - } - }, - BeforeAttributeValue { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - // ignore - break; - case '"': - t.transition(AttributeValue_doubleQuoted); - break; - case '&': - r.unconsume(); - t.transition(AttributeValue_unquoted); - break; - case '\'': - t.transition(AttributeValue_singleQuoted); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - t.transition(AttributeValue_unquoted); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '>': - t.error(this); - t.emitTagPending(); - t.transition(Data); - break; - case '<': - case '=': - case '`': - t.error(this); - t.tagPending.appendAttributeValue(c); - t.transition(AttributeValue_unquoted); - break; - default: - r.unconsume(); - t.transition(AttributeValue_unquoted); - } - } - }, - AttributeValue_doubleQuoted { - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('"', '&', nullChar); - if (value.length() > 0) - t.tagPending.appendAttributeValue(value); - - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterAttributeValue_quoted); - break; - case '&': - Character ref = t.consumeCharacterReference('"', true); - if (ref != null) - t.tagPending.appendAttributeValue(ref); - else - t.tagPending.appendAttributeValue('&'); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - // no default, handled in consume to any above - } - } - }, - AttributeValue_singleQuoted { - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('\'', '&', nullChar); - if (value.length() > 0) - t.tagPending.appendAttributeValue(value); - - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterAttributeValue_quoted); - break; - case '&': - Character ref = t.consumeCharacterReference('\'', true); - if (ref != null) - t.tagPending.appendAttributeValue(ref); - else - t.tagPending.appendAttributeValue('&'); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - // no default, handled in consume to any above - } - } - }, - AttributeValue_unquoted { - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`'); - if (value.length() > 0) - t.tagPending.appendAttributeValue(value); - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '&': - Character ref = t.consumeCharacterReference('>', true); - if (ref != null) - t.tagPending.appendAttributeValue(ref); - else - t.tagPending.appendAttributeValue('&'); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - case '=': - case '`': - t.error(this); - t.tagPending.appendAttributeValue(c); - break; - // no default, handled in consume to any above - } - - } - }, - // CharacterReferenceInAttributeValue state handled inline - AfterAttributeValue_quoted { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.error(this); - r.unconsume(); - t.transition(BeforeAttributeName); - } - - } - }, - SelfClosingStartTag { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.tagPending.selfClosing = true; - t.emitTagPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BeforeAttributeName); - } - } - }, - BogusComment { - void read(Tokeniser t, CharacterReader r) { - // todo: handle bogus comment starting from eof. when does that trigger? - // rewind to capture character that lead us here - r.unconsume(); - Token.Comment comment = new Token.Comment(); - comment.data.append(r.consumeTo('>')); - // todo: replace nullChar with replaceChar - t.emit(comment); - t.advanceTransition(Data); - } - }, - MarkupDeclarationOpen { - void read(Tokeniser t, CharacterReader r) { - if (r.matchConsume("--")) { - t.createCommentPending(); - t.transition(CommentStart); - } else if (r.matchConsumeIgnoreCase("DOCTYPE")) { - t.transition(Doctype); - } else if (r.matchConsume("[CDATA[")) { - // todo: should actually check current namepspace, and only non-html allows cdata. until namespace - // is implemented properly, keep handling as cdata - //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) { - t.transition(CdataSection); - } else { - t.error(this); - t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind - } - } - }, - CommentStart { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentStartDash); - break; - case nullChar: - t.error(this); - t.commentPending.data.append(replacementChar); - t.transition(Comment); - break; - case '>': - t.error(this); - t.emitCommentPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(c); - t.transition(Comment); - } - } - }, - CommentStartDash { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentStartDash); - break; - case nullChar: - t.error(this); - t.commentPending.data.append(replacementChar); - t.transition(Comment); - break; - case '>': - t.error(this); - t.emitCommentPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(c); - t.transition(Comment); - } - } - }, - Comment { - void read(Tokeniser t, CharacterReader r) { - char c = r.current(); - switch (c) { - case '-': - t.advanceTransition(CommentEndDash); - break; - case nullChar: - t.error(this); - r.advance(); - t.commentPending.data.append(replacementChar); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(r.consumeToAny('-', nullChar)); - } - } - }, - CommentEndDash { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentEnd); - break; - case nullChar: - t.error(this); - t.commentPending.data.append('-').append(replacementChar); - t.transition(Comment); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append('-').append(c); - t.transition(Comment); - } - } - }, - CommentEnd { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.emitCommentPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.commentPending.data.append("--").append(replacementChar); - t.transition(Comment); - break; - case '!': - t.error(this); - t.transition(CommentEndBang); - break; - case '-': - t.error(this); - t.commentPending.data.append('-'); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.error(this); - t.commentPending.data.append("--").append(c); - t.transition(Comment); - } - } - }, - CommentEndBang { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.commentPending.data.append("--!"); - t.transition(CommentEndDash); - break; - case '>': - t.emitCommentPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.commentPending.data.append("--!").append(replacementChar); - t.transition(Comment); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append("--!").append(c); - t.transition(Comment); - } - } - }, - Doctype { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypeName); - break; - case eof: - t.eofError(this); - t.createDoctypePending(); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BeforeDoctypeName); - } - } - }, - BeforeDoctypeName { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createDoctypePending(); - t.transition(DoctypeName); - return; - } - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; // ignore whitespace - case nullChar: - t.error(this); - t.doctypePending.name.append(replacementChar); - t.transition(DoctypeName); - break; - case eof: - t.eofError(this); - t.createDoctypePending(); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.createDoctypePending(); - t.doctypePending.name.append(c); - t.transition(DoctypeName); - } - } - }, - DoctypeName { - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.doctypePending.name.append(name.toLowerCase()); - return; - } - char c = r.consume(); - switch (c) { - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(AfterDoctypeName); - break; - case nullChar: - t.error(this); - t.doctypePending.name.append(replacementChar); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.name.append(c); - } - } - }, - AfterDoctypeName { - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - return; - } - if (r.matchesAny('\t', '\n', '\f', ' ')) - r.advance(); // ignore whitespace - else if (r.matches('>')) { - t.emitDoctypePending(); - t.advanceTransition(Data); - } else if (r.matchConsumeIgnoreCase("PUBLIC")) { - t.transition(AfterDoctypePublicKeyword); - } else if (r.matchConsumeIgnoreCase("SYSTEM")) { - t.transition(AfterDoctypeSystemKeyword); - } else { - t.error(this); - t.doctypePending.forceQuirks = true; - t.advanceTransition(BogusDoctype); - } - - } - }, - AfterDoctypePublicKeyword { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypePublicIdentifier); - break; - case '"': - t.error(this); - // set public id to empty string - t.transition(DoctypePublicIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // set public id to empty string - t.transition(DoctypePublicIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - BeforeDoctypePublicIdentifier { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '"': - // set public id to empty string - t.transition(DoctypePublicIdentifier_doubleQuoted); - break; - case '\'': - // set public id to empty string - t.transition(DoctypePublicIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - DoctypePublicIdentifier_doubleQuoted { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterDoctypePublicIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.publicIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.publicIdentifier.append(c); - } - } - }, - DoctypePublicIdentifier_singleQuoted { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterDoctypePublicIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.publicIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.publicIdentifier.append(c); - } - } - }, - AfterDoctypePublicIdentifier { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BetweenDoctypePublicAndSystemIdentifiers); - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - BetweenDoctypePublicAndSystemIdentifiers { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - AfterDoctypeSystemKeyword { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypeSystemIdentifier); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - } - } - }, - BeforeDoctypeSystemIdentifier { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '"': - // set system id to empty string - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - // set public id to empty string - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - DoctypeSystemIdentifier_doubleQuoted { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterDoctypeSystemIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.systemIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.systemIdentifier.append(c); - } - } - }, - DoctypeSystemIdentifier_singleQuoted { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterDoctypeSystemIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.systemIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.systemIdentifier.append(c); - } - } - }, - AfterDoctypeSystemIdentifier { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BogusDoctype); - // NOT force quirks - } - } - }, - BogusDoctype { - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.emitDoctypePending(); - t.transition(Data); - break; - default: - // ignore char - break; - } - } - }, - CdataSection { - void read(Tokeniser t, CharacterReader r) { - String data = r.consumeTo("]]>"); - t.emit(data); - r.matchConsume("]]>"); - t.transition(Data); - } - }; - - - abstract void read(Tokeniser t, CharacterReader r); - - private static final char nullChar = '\u0000'; - private static final char replacementChar = Tokeniser.replacementChar; - private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); - private static final char eof = CharacterReader.EOF; -} diff --git a/src/org/jsoup/parser/TreeBuilder.java b/src/org/jsoup/parser/TreeBuilder.java deleted file mode 100644 index e06caad501..0000000000 --- a/src/org/jsoup/parser/TreeBuilder.java +++ /dev/null @@ -1,60 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.util.ArrayList; -import java.util.List; - -/** - * @author Jonathan Hedley - */ -abstract class TreeBuilder { - CharacterReader reader; - Tokeniser tokeniser; - protected Document doc; // current doc we are building into - protected DescendableLinkedList<Element> stack; // the stack of open elements - protected String baseUri; // current base uri, for creating new elements - protected Token currentToken; // currentToken is used only for error tracking. - protected ParseErrorList errors; // null when not tracking errors - - protected void initialiseParse(String input, String baseUri, ParseErrorList errors) { - Validate.notNull(input, "String input must not be null"); - Validate.notNull(baseUri, "BaseURI must not be null"); - - doc = new Document(baseUri); - reader = new CharacterReader(input); - this.errors = errors; - tokeniser = new Tokeniser(reader, errors); - stack = new DescendableLinkedList<Element>(); - this.baseUri = baseUri; - } - - Document parse(String input, String baseUri) { - return parse(input, baseUri, ParseErrorList.noTracking()); - } - - Document parse(String input, String baseUri, ParseErrorList errors) { - initialiseParse(input, baseUri, errors); - runParser(); - return doc; - } - - protected void runParser() { - while (true) { - Token token = tokeniser.read(); - process(token); - - if (token.type == Token.TokenType.EOF) - break; - } - } - - protected abstract boolean process(Token token); - - protected Element currentElement() { - return stack.getLast(); - } -} diff --git a/src/org/jsoup/parser/XmlTreeBuilder.java b/src/org/jsoup/parser/XmlTreeBuilder.java deleted file mode 100644 index 3f03ad26ac..0000000000 --- a/src/org/jsoup/parser/XmlTreeBuilder.java +++ /dev/null @@ -1,111 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.*; - -import java.util.Iterator; - -/** - * @author Jonathan Hedley - */ -public class XmlTreeBuilder extends TreeBuilder { - @Override - protected void initialiseParse(String input, String baseUri, ParseErrorList errors) { - super.initialiseParse(input, baseUri, errors); - stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack) - } - - @Override - protected boolean process(Token token) { - // start tag, end tag, doctype, comment, character, eof - switch (token.type) { - case StartTag: - insert(token.asStartTag()); - break; - case EndTag: - popStackToClose(token.asEndTag()); - break; - case Comment: - insert(token.asComment()); - break; - case Character: - insert(token.asCharacter()); - break; - case Doctype: - insert(token.asDoctype()); - break; - case EOF: // could put some normalisation here if desired - break; - default: - Validate.fail("Unexpected token type: " + token.type); - } - return true; - } - - private void insertNode(Node node) { - currentElement().appendChild(node); - } - - Element insert(Token.StartTag startTag) { - Tag tag = Tag.valueOf(startTag.name()); - // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html. - Element el = new Element(tag, baseUri, startTag.attributes); - insertNode(el); - if (startTag.isSelfClosing()) { - tokeniser.acknowledgeSelfClosingFlag(); - if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above. - tag.setSelfClosing(); - } else { - stack.add(el); - } - return el; - } - - void insert(Token.Comment commentToken) { - Comment comment = new Comment(commentToken.getData(), baseUri); - insertNode(comment); - } - - void insert(Token.Character characterToken) { - Node node = new TextNode(characterToken.getData(), baseUri); - insertNode(node); - } - - void insert(Token.Doctype d) { - DocumentType doctypeNode = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri); - insertNode(doctypeNode); - } - - /** - * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not - * found, skips. - * - * @param endTag - */ - private void popStackToClose(Token.EndTag endTag) { - String elName = endTag.name(); - Element firstFound = null; - - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - firstFound = next; - break; - } - } - if (firstFound == null) - return; // not found, skip - - it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == firstFound) { - it.remove(); - break; - } else { - it.remove(); - } - } - } -} diff --git a/src/org/jsoup/parser/package-info.java b/src/org/jsoup/parser/package-info.java deleted file mode 100644 index 168fdf4086..0000000000 --- a/src/org/jsoup/parser/package-info.java +++ /dev/null @@ -1,4 +0,0 @@ -/** - Contains the HTML parser, tag specifications, and HTML tokeniser. - */ -package org.jsoup.parser; |