diff options
author | Artur Signell <artur@vaadin.com> | 2012-08-28 20:00:00 +0300 |
---|---|---|
committer | Artur Signell <artur@vaadin.com> | 2012-09-09 11:22:54 +0300 |
commit | 38212596d91e9e167253d7debb154d18e3ff38b0 (patch) | |
tree | 99775812644e3ef421cfa3a6039677bc4cdb8093 /server/src/org/jsoup/parser | |
parent | 0a77dae8b57a99cb5112a387b2a374c14e1fae1b (diff) | |
download | vaadin-framework-38212596d91e9e167253d7debb154d18e3ff38b0.tar.gz vaadin-framework-38212596d91e9e167253d7debb154d18e3ff38b0.zip |
Jsoup is now declared as a dependency (#9299)
Diffstat (limited to 'server/src/org/jsoup/parser')
-rw-r--r-- | server/src/org/jsoup/parser/CharacterReader.java | 244 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/HtmlTreeBuilder.java | 754 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/HtmlTreeBuilderState.java | 1671 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/ParseError.java | 43 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/ParseErrorList.java | 34 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/Parser.java | 198 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/Tag.java | 298 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/Token.java | 253 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/TokenQueue.java | 473 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/Tokeniser.java | 264 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/TokeniserState.java | 1870 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/TreeBuilder.java | 61 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/XmlTreeBuilder.java | 121 | ||||
-rw-r--r-- | server/src/org/jsoup/parser/package-info.java | 5 |
14 files changed, 0 insertions, 6289 deletions
diff --git a/server/src/org/jsoup/parser/CharacterReader.java b/server/src/org/jsoup/parser/CharacterReader.java deleted file mode 100644 index 30fbca07f1..0000000000 --- a/server/src/org/jsoup/parser/CharacterReader.java +++ /dev/null @@ -1,244 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; - -/** - * CharacterReader consumes tokens off a string. To replace the old TokenQueue. - */ -class CharacterReader { - static final char EOF = (char) -1; - - private final String input; - private final int length; - private int pos = 0; - private int mark = 0; - - CharacterReader(String input) { - Validate.notNull(input); - input = input.replaceAll("\r\n?", "\n"); // normalise carriage returns - // to newlines - - this.input = input; - length = input.length(); - } - - int pos() { - return pos; - } - - boolean isEmpty() { - return pos >= length; - } - - char current() { - return isEmpty() ? EOF : input.charAt(pos); - } - - char consume() { - char val = isEmpty() ? EOF : input.charAt(pos); - pos++; - return val; - } - - void unconsume() { - pos--; - } - - void advance() { - pos++; - } - - void mark() { - mark = pos; - } - - void rewindToMark() { - pos = mark; - } - - String consumeAsString() { - return input.substring(pos, pos++); - } - - String consumeTo(char c) { - int offset = input.indexOf(c, pos); - if (offset != -1) { - String consumed = input.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return consumeToEnd(); - } - } - - String consumeTo(String seq) { - int offset = input.indexOf(seq, pos); - if (offset != -1) { - String consumed = input.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return consumeToEnd(); - } - } - - String consumeToAny(char... seq) { - int start = pos; - - OUTER: while (!isEmpty()) { - char c = input.charAt(pos); - for (char seek : seq) { - if (seek == c) { - break OUTER; - } - } - pos++; - } - - return pos > start ? input.substring(start, pos) : ""; - } - - String consumeToEnd() { - String data = input.substring(pos, input.length()); - pos = input.length(); - return data; - } - - String consumeLetterSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { - pos++; - } else { - break; - } - } - - return input.substring(start, pos); - } - - String consumeLetterThenDigitSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { - pos++; - } else { - break; - } - } - while (!isEmpty()) { - char c = input.charAt(pos); - if (c >= '0' && c <= '9') { - pos++; - } else { - break; - } - } - - return input.substring(start, pos); - } - - String consumeHexSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') - || (c >= 'a' && c <= 'f')) { - pos++; - } else { - break; - } - } - return input.substring(start, pos); - } - - String consumeDigitSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if (c >= '0' && c <= '9') { - pos++; - } else { - break; - } - } - return input.substring(start, pos); - } - - boolean matches(char c) { - return !isEmpty() && input.charAt(pos) == c; - - } - - boolean matches(String seq) { - return input.startsWith(seq, pos); - } - - boolean matchesIgnoreCase(String seq) { - return input.regionMatches(true, pos, seq, 0, seq.length()); - } - - boolean matchesAny(char... seq) { - if (isEmpty()) { - return false; - } - - char c = input.charAt(pos); - for (char seek : seq) { - if (seek == c) { - return true; - } - } - return false; - } - - boolean matchesLetter() { - if (isEmpty()) { - return false; - } - char c = input.charAt(pos); - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - } - - boolean matchesDigit() { - if (isEmpty()) { - return false; - } - char c = input.charAt(pos); - return (c >= '0' && c <= '9'); - } - - boolean matchConsume(String seq) { - if (matches(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - boolean matchConsumeIgnoreCase(String seq) { - if (matchesIgnoreCase(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - boolean containsIgnoreCase(String seq) { - // used to check presence of </title>, </style>. only finds consistent - // case. - String loScan = seq.toLowerCase(); - String hiScan = seq.toUpperCase(); - return (input.indexOf(loScan, pos) > -1) - || (input.indexOf(hiScan, pos) > -1); - } - - @Override - public String toString() { - return input.substring(pos); - } -} diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilder.java b/server/src/org/jsoup/parser/HtmlTreeBuilder.java deleted file mode 100644 index f09ab8794c..0000000000 --- a/server/src/org/jsoup/parser/HtmlTreeBuilder.java +++ /dev/null @@ -1,754 +0,0 @@ -package org.jsoup.parser; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Comment; -import org.jsoup.nodes.DataNode; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; - -/** - * HTML Tree Builder; creates a DOM from Tokens. - */ -class HtmlTreeBuilder extends TreeBuilder { - - private HtmlTreeBuilderState state; // the current state - private HtmlTreeBuilderState originalState; // original / marked state - - private boolean baseUriSetFromDoc = false; - private Element headElement; // the current head element - private Element formElement; // the current form element - private Element contextElement; // fragment parse context -- could be null - // even if fragment parsing - private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active - // (open) - // formatting - // elements - private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars - // in - // table - // to - // be - // shifted - // out - - private boolean framesetOk = true; // if ok to go into frameset - private boolean fosterInserts = false; // if next inserts should be fostered - private boolean fragmentParsing = false; // if parsing a fragment of html - - HtmlTreeBuilder() { - } - - @Override - Document parse(String input, String baseUri, ParseErrorList errors) { - state = HtmlTreeBuilderState.Initial; - return super.parse(input, baseUri, errors); - } - - List<Node> parseFragment(String inputFragment, Element context, - String baseUri, ParseErrorList errors) { - // context may be null - state = HtmlTreeBuilderState.Initial; - initialiseParse(inputFragment, baseUri, errors); - contextElement = context; - fragmentParsing = true; - Element root = null; - - if (context != null) { - if (context.ownerDocument() != null) { - doc.quirksMode(context.ownerDocument().quirksMode()); - } - - // initialise the tokeniser state: - String contextTag = context.tagName(); - if (StringUtil.in(contextTag, "title", "textarea")) { - tokeniser.transition(TokeniserState.Rcdata); - } else if (StringUtil.in(contextTag, "iframe", "noembed", - "noframes", "style", "xmp")) { - tokeniser.transition(TokeniserState.Rawtext); - } else if (contextTag.equals("script")) { - tokeniser.transition(TokeniserState.ScriptData); - } else if (contextTag.equals(("noscript"))) { - tokeniser.transition(TokeniserState.Data); // if scripting - // enabled, rawtext - } else if (contextTag.equals("plaintext")) { - tokeniser.transition(TokeniserState.Data); - } else { - tokeniser.transition(TokeniserState.Data); // default - } - - root = new Element(Tag.valueOf("html"), baseUri); - doc.appendChild(root); - stack.push(root); - resetInsertionMode(); - // todo: setup form element to nearest form on context (up ancestor - // chain) - } - - runParser(); - if (context != null) { - return root.childNodes(); - } else { - return doc.childNodes(); - } - } - - @Override - protected boolean process(Token token) { - currentToken = token; - return state.process(token, this); - } - - boolean process(Token token, HtmlTreeBuilderState state) { - currentToken = token; - return state.process(token, this); - } - - void transition(HtmlTreeBuilderState state) { - this.state = state; - } - - HtmlTreeBuilderState state() { - return state; - } - - void markInsertionMode() { - originalState = state; - } - - HtmlTreeBuilderState originalState() { - return originalState; - } - - void framesetOk(boolean framesetOk) { - this.framesetOk = framesetOk; - } - - boolean framesetOk() { - return framesetOk; - } - - Document getDocument() { - return doc; - } - - String getBaseUri() { - return baseUri; - } - - void maybeSetBaseUri(Element base) { - if (baseUriSetFromDoc) { - return; - } - - String href = base.absUrl("href"); - if (href.length() != 0) { // ignore <base target> etc - baseUri = href; - baseUriSetFromDoc = true; - doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) - // will get updated base, and to update all - // descendants - } - } - - boolean isFragmentParsing() { - return fragmentParsing; - } - - void error(HtmlTreeBuilderState state) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), - "Unexpected token [%s] when in state [%s]", currentToken - .tokenType(), state)); - } - } - - Element insert(Token.StartTag startTag) { - // handle empty unknown tags - // when the spec expects an empty tag, will directly hit insertEmpty, so - // won't generate fake end tag. - if (startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name())) { - Element el = insertEmpty(startTag); - process(new Token.EndTag(el.tagName())); // ensure we get out of - // whatever state we are in - return el; - } - - Element el = new Element(Tag.valueOf(startTag.name()), baseUri, - startTag.attributes); - insert(el); - return el; - } - - Element insert(String startTagName) { - Element el = new Element(Tag.valueOf(startTagName), baseUri); - insert(el); - return el; - } - - void insert(Element el) { - insertNode(el); - stack.add(el); - } - - Element insertEmpty(Token.StartTag startTag) { - Tag tag = Tag.valueOf(startTag.name()); - Element el = new Element(tag, baseUri, startTag.attributes); - insertNode(el); - if (startTag.isSelfClosing()) { - tokeniser.acknowledgeSelfClosingFlag(); - if (!tag.isKnownTag()) { - tag.setSelfClosing(); - } - } - return el; - } - - void insert(Token.Comment commentToken) { - Comment comment = new Comment(commentToken.getData(), baseUri); - insertNode(comment); - } - - void insert(Token.Character characterToken) { - Node node; - // characters in script and style go in as datanodes, not text nodes - if (StringUtil.in(currentElement().tagName(), "script", "style")) { - node = new DataNode(characterToken.getData(), baseUri); - } else { - node = new TextNode(characterToken.getData(), baseUri); - } - currentElement().appendChild(node); // doesn't use insertNode, because - // we don't foster these; and will - // always have a stack. - } - - private void insertNode(Node node) { - // if the stack hasn't been set up yet, elements (doctype, comments) go - // into the doc - if (stack.size() == 0) { - doc.appendChild(node); - } else if (isFosterInserts()) { - insertInFosterParent(node); - } else { - currentElement().appendChild(node); - } - } - - Element pop() { - // todo - dev, remove validation check - if (stack.peekLast().nodeName().equals("td") - && !state.name().equals("InCell")) { - Validate.isFalse(true, "pop td not in cell"); - } - if (stack.peekLast().nodeName().equals("html")) { - Validate.isFalse(true, "popping html!"); - } - return stack.pollLast(); - } - - void push(Element element) { - stack.add(element); - } - - DescendableLinkedList<Element> getStack() { - return stack; - } - - boolean onStack(Element el) { - return isElementInQueue(stack, el); - } - - private boolean isElementInQueue(DescendableLinkedList<Element> queue, - Element element) { - Iterator<Element> it = queue.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == element) { - return true; - } - } - return false; - } - - Element getFromStack(String elName) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - return next; - } - } - return null; - } - - boolean removeFromStack(Element el) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - it.remove(); - return true; - } - } - return false; - } - - void popStackToClose(String elName) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - it.remove(); - break; - } else { - it.remove(); - } - } - } - - void popStackToClose(String... elNames) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (StringUtil.in(next.nodeName(), elNames)) { - it.remove(); - break; - } else { - it.remove(); - } - } - } - - void popStackToBefore(String elName) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - break; - } else { - it.remove(); - } - } - } - - void clearStackToTableContext() { - clearStackToContext("table"); - } - - void clearStackToTableBodyContext() { - clearStackToContext("tbody", "tfoot", "thead"); - } - - void clearStackToTableRowContext() { - clearStackToContext("tr"); - } - - private void clearStackToContext(String... nodeNames) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (StringUtil.in(next.nodeName(), nodeNames) - || next.nodeName().equals("html")) { - break; - } else { - it.remove(); - } - } - } - - Element aboveOnStack(Element el) { - assert onStack(el); - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - return it.next(); - } - } - return null; - } - - void insertOnStackAfter(Element after, Element in) { - int i = stack.lastIndexOf(after); - Validate.isTrue(i != -1); - stack.add(i + 1, in); - } - - void replaceOnStack(Element out, Element in) { - replaceInQueue(stack, out, in); - } - - private void replaceInQueue(LinkedList<Element> queue, Element out, - Element in) { - int i = queue.lastIndexOf(out); - Validate.isTrue(i != -1); - queue.remove(i); - queue.add(i, in); - } - - void resetInsertionMode() { - boolean last = false; - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element node = it.next(); - if (!it.hasNext()) { - last = true; - node = contextElement; - } - String name = node.nodeName(); - if ("select".equals(name)) { - transition(HtmlTreeBuilderState.InSelect); - break; // frag - } else if (("td".equals(name) || "td".equals(name) && !last)) { - transition(HtmlTreeBuilderState.InCell); - break; - } else if ("tr".equals(name)) { - transition(HtmlTreeBuilderState.InRow); - break; - } else if ("tbody".equals(name) || "thead".equals(name) - || "tfoot".equals(name)) { - transition(HtmlTreeBuilderState.InTableBody); - break; - } else if ("caption".equals(name)) { - transition(HtmlTreeBuilderState.InCaption); - break; - } else if ("colgroup".equals(name)) { - transition(HtmlTreeBuilderState.InColumnGroup); - break; // frag - } else if ("table".equals(name)) { - transition(HtmlTreeBuilderState.InTable); - break; - } else if ("head".equals(name)) { - transition(HtmlTreeBuilderState.InBody); - break; // frag - } else if ("body".equals(name)) { - transition(HtmlTreeBuilderState.InBody); - break; - } else if ("frameset".equals(name)) { - transition(HtmlTreeBuilderState.InFrameset); - break; // frag - } else if ("html".equals(name)) { - transition(HtmlTreeBuilderState.BeforeHead); - break; // frag - } else if (last) { - transition(HtmlTreeBuilderState.InBody); - break; // frag - } - } - } - - // todo: tidy up in specific scope methods - private boolean inSpecificScope(String targetName, String[] baseTypes, - String[] extraTypes) { - return inSpecificScope(new String[] { targetName }, baseTypes, - extraTypes); - } - - private boolean inSpecificScope(String[] targetNames, String[] baseTypes, - String[] extraTypes) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element el = it.next(); - String elName = el.nodeName(); - if (StringUtil.in(elName, targetNames)) { - return true; - } - if (StringUtil.in(elName, baseTypes)) { - return false; - } - if (extraTypes != null && StringUtil.in(elName, extraTypes)) { - return false; - } - } - Validate.fail("Should not be reachable"); - return false; - } - - boolean inScope(String[] targetNames) { - return inSpecificScope(targetNames, new String[] { "applet", "caption", - "html", "table", "td", "th", "marquee", "object" }, null); - } - - boolean inScope(String targetName) { - return inScope(targetName, null); - } - - boolean inScope(String targetName, String[] extras) { - return inSpecificScope(targetName, new String[] { "applet", "caption", - "html", "table", "td", "th", "marquee", "object" }, extras); - // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml - // todo: in svg namespace: forignOjbect, desc, title - } - - boolean inListItemScope(String targetName) { - return inScope(targetName, new String[] { "ol", "ul" }); - } - - boolean inButtonScope(String targetName) { - return inScope(targetName, new String[] { "button" }); - } - - boolean inTableScope(String targetName) { - return inSpecificScope(targetName, new String[] { "html", "table" }, - null); - } - - boolean inSelectScope(String targetName) { - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element el = it.next(); - String elName = el.nodeName(); - if (elName.equals(targetName)) { - return true; - } - if (!StringUtil.in(elName, "optgroup", "option")) { - return false; - } - } - Validate.fail("Should not be reachable"); - return false; - } - - void setHeadElement(Element headElement) { - this.headElement = headElement; - } - - Element getHeadElement() { - return headElement; - } - - boolean isFosterInserts() { - return fosterInserts; - } - - void setFosterInserts(boolean fosterInserts) { - this.fosterInserts = fosterInserts; - } - - Element getFormElement() { - return formElement; - } - - void setFormElement(Element formElement) { - this.formElement = formElement; - } - - void newPendingTableCharacters() { - pendingTableCharacters = new ArrayList<Token.Character>(); - } - - List<Token.Character> getPendingTableCharacters() { - return pendingTableCharacters; - } - - void setPendingTableCharacters(List<Token.Character> pendingTableCharacters) { - this.pendingTableCharacters = pendingTableCharacters; - } - - /** - * 11.2.5.2 Closing elements that have implied end tags - * <p/> - * When the steps below require the UA to generate implied end tags, then, - * while the current node is a dd element, a dt element, an li element, an - * option element, an optgroup element, a p element, an rp element, or an rt - * element, the UA must pop the current node off the stack of open elements. - * - * @param excludeTag - * If a step requires the UA to generate implied end tags but - * lists an element to exclude from the process, then the UA must - * perform the above steps as if that element was not in the - * above list. - */ - void generateImpliedEndTags(String excludeTag) { - while ((excludeTag != null && !currentElement().nodeName().equals( - excludeTag)) - && StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", - "option", "optgroup", "p", "rp", "rt")) { - pop(); - } - } - - void generateImpliedEndTags() { - generateImpliedEndTags(null); - } - - boolean isSpecial(Element el) { - // todo: mathml's mi, mo, mn - // todo: svg's foreigObject, desc, title - String name = el.nodeName(); - return StringUtil.in(name, "address", "applet", "area", "article", - "aside", "base", "basefont", "bgsound", "blockquote", "body", - "br", "button", "caption", "center", "col", "colgroup", - "command", "dd", "details", "dir", "div", "dl", "dt", "embed", - "fieldset", "figcaption", "figure", "footer", "form", "frame", - "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", - "header", "hgroup", "hr", "html", "iframe", "img", "input", - "isindex", "li", "link", "listing", "marquee", "menu", "meta", - "nav", "noembed", "noframes", "noscript", "object", "ol", "p", - "param", "plaintext", "pre", "script", "section", "select", - "style", "summary", "table", "tbody", "td", "textarea", - "tfoot", "th", "thead", "title", "tr", "ul", "wbr", "xmp"); - } - - // active formatting elements - void pushActiveFormattingElements(Element in) { - int numSeen = 0; - Iterator<Element> iter = formattingElements.descendingIterator(); - while (iter.hasNext()) { - Element el = iter.next(); - if (el == null) { - break; - } - - if (isSameFormattingElement(in, el)) { - numSeen++; - } - - if (numSeen == 3) { - iter.remove(); - break; - } - } - formattingElements.add(in); - } - - private boolean isSameFormattingElement(Element a, Element b) { - // same if: same namespace, tag, and attributes. Element.equals only - // checks tag, might in future check children - return a.nodeName().equals(b.nodeName()) && - // a.namespace().equals(b.namespace()) && - a.attributes().equals(b.attributes()); - // todo: namespaces - } - - void reconstructFormattingElements() { - int size = formattingElements.size(); - if (size == 0 || formattingElements.getLast() == null - || onStack(formattingElements.getLast())) { - return; - } - - Element entry = formattingElements.getLast(); - int pos = size - 1; - boolean skip = false; - while (true) { - if (pos == 0) { // step 4. if none before, skip to 8 - skip = true; - break; - } - entry = formattingElements.get(--pos); // step 5. one earlier than - // entry - if (entry == null || onStack(entry)) { - break; // jump to 8, else continue back to 4 - } - } - while (true) { - if (!skip) { - entry = formattingElements.get(++pos); - } - Validate.notNull(entry); // should not occur, as we break at last - // element - - // 8. create new element from element, 9 insert into current node, - // onto stack - skip = false; // can only skip increment from 4. - Element newEl = insert(entry.nodeName()); // todo: avoid fostering - // here? - // newEl.namespace(entry.namespace()); // todo: namespaces - newEl.attributes().addAll(entry.attributes()); - - // 10. replace entry with new entry - formattingElements.add(pos, newEl); - formattingElements.remove(pos + 1); - - // 11 - if (pos == size - 1) { - break; - } - } - } - - void clearFormattingElementsToLastMarker() { - while (!formattingElements.isEmpty()) { - Element el = formattingElements.peekLast(); - formattingElements.removeLast(); - if (el == null) { - break; - } - } - } - - void removeFromActiveFormattingElements(Element el) { - Iterator<Element> it = formattingElements.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - it.remove(); - break; - } - } - } - - boolean isInActiveFormattingElements(Element el) { - return isElementInQueue(formattingElements, el); - } - - Element getActiveFormattingElement(String nodeName) { - Iterator<Element> it = formattingElements.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == null) { - break; - } else if (next.nodeName().equals(nodeName)) { - return next; - } - } - return null; - } - - void replaceActiveFormattingElement(Element out, Element in) { - replaceInQueue(formattingElements, out, in); - } - - void insertMarkerToFormattingElements() { - formattingElements.add(null); - } - - void insertInFosterParent(Node in) { - Element fosterParent = null; - Element lastTable = getFromStack("table"); - boolean isLastTableParent = false; - if (lastTable != null) { - if (lastTable.parent() != null) { - fosterParent = lastTable.parent(); - isLastTableParent = true; - } else { - fosterParent = aboveOnStack(lastTable); - } - } else { // no table == frag - fosterParent = stack.get(0); - } - - if (isLastTableParent) { - Validate.notNull(lastTable); // last table cannot be null by this - // point. - lastTable.before(in); - } else { - fosterParent.appendChild(in); - } - } - - @Override - public String toString() { - return "TreeBuilder{" + "currentToken=" + currentToken + ", state=" - + state + ", currentElement=" + currentElement() + '}'; - } -} diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java b/server/src/org/jsoup/parser/HtmlTreeBuilderState.java deleted file mode 100644 index 258d547a49..0000000000 --- a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java +++ /dev/null @@ -1,1671 +0,0 @@ -package org.jsoup.parser; - -import java.util.Iterator; -import java.util.LinkedList; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.DocumentType; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; - -/** - * The Tree Builder's current state. Each state embodies the processing for the - * state, and transitions to other states. - */ -enum HtmlTreeBuilderState { - Initial { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return true; // ignore whitespace - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - // todo: parse error check on expected doctypes - // todo: quirk state check on doctype ids - Token.Doctype d = t.asDoctype(); - DocumentType doctype = new DocumentType(d.getName(), - d.getPublicIdentifier(), d.getSystemIdentifier(), - tb.getBaseUri()); - tb.getDocument().appendChild(doctype); - if (d.isForceQuirks()) { - tb.getDocument().quirksMode(Document.QuirksMode.quirks); - } - tb.transition(BeforeHtml); - } else { - // todo: check not iframe srcdoc - tb.transition(BeforeHtml); - return tb.process(t); // re-process token - } - return true; - } - }, - BeforeHtml { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (isWhitespace(t)) { - return true; // ignore whitespace - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - tb.insert(t.asStartTag()); - tb.transition(BeforeHead); - } else if (t.isEndTag() - && (StringUtil.in(t.asEndTag().name(), "head", "body", - "html", "br"))) { - return anythingElse(t, tb); - } else if (t.isEndTag()) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.insert("html"); - tb.transition(BeforeHead); - return tb.process(t); - } - }, - BeforeHead { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return true; - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return InBody.process(t, tb); // does not transition - } else if (t.isStartTag() && t.asStartTag().name().equals("head")) { - Element head = tb.insert(t.asStartTag()); - tb.setHeadElement(head); - tb.transition(InHead); - } else if (t.isEndTag() - && (StringUtil.in(t.asEndTag().name(), "head", "body", - "html", "br"))) { - tb.process(new Token.StartTag("head")); - return tb.process(t); - } else if (t.isEndTag()) { - tb.error(this); - return false; - } else { - tb.process(new Token.StartTag("head")); - return tb.process(t); - } - return true; - } - }, - InHead { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - return true; - } - switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return InBody.process(t, tb); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", - "command", "link")) { - Element el = tb.insertEmpty(start); - // jsoup special: update base the frist time it is seen - if (name.equals("base") && el.hasAttr("href")) { - tb.maybeSetBaseUri(el); - } - } else if (name.equals("meta")) { - Element meta = tb.insertEmpty(start); - // todo: charset switches - } else if (name.equals("title")) { - handleRcData(start, tb); - } else if (StringUtil.in(name, "noframes", "style")) { - handleRawtext(start, tb); - } else if (name.equals("noscript")) { - // else if noscript && scripting flag = true: rawtext (jsoup - // doesn't run script, to handle as noscript) - tb.insert(start); - tb.transition(InHeadNoscript); - } else if (name.equals("script")) { - // skips some script rules as won't execute them - tb.insert(start); - tb.tokeniser.transition(TokeniserState.ScriptData); - tb.markInsertionMode(); - tb.transition(Text); - } else if (name.equals("head")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("head")) { - tb.pop(); - tb.transition(AfterHead); - } else if (StringUtil.in(name, "body", "html", "br")) { - return anythingElse(t, tb); - } else { - tb.error(this); - return false; - } - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, TreeBuilder tb) { - tb.process(new Token.EndTag("head")); - return tb.process(t); - } - }, - InHeadNoscript { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isDoctype()) { - tb.error(this); - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("noscript")) { - tb.pop(); - tb.transition(InHead); - } else if (isWhitespace(t) - || t.isComment() - || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "basefont", "bgsound", "link", "meta", "noframes", - "style"))) { - return tb.process(t, InHead); - } else if (t.isEndTag() && t.asEndTag().name().equals("br")) { - return anythingElse(t, tb); - } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "head", "noscript")) || t.isEndTag()) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - tb.process(new Token.EndTag("noscript")); - return tb.process(t); - } - }, - AfterHead { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - } else if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - return tb.process(t, InBody); - } else if (name.equals("body")) { - tb.insert(startTag); - tb.framesetOk(false); - tb.transition(InBody); - } else if (name.equals("frameset")) { - tb.insert(startTag); - tb.transition(InFrameset); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", - "link", "meta", "noframes", "script", "style", "title")) { - tb.error(this); - Element head = tb.getHeadElement(); - tb.push(head); - tb.process(t, InHead); - tb.removeFromStack(head); - } else if (name.equals("head")) { - tb.error(this); - return false; - } else { - anythingElse(t, tb); - } - } else if (t.isEndTag()) { - if (StringUtil.in(t.asEndTag().name(), "body", "html")) { - anythingElse(t, tb); - } else { - tb.error(this); - return false; - } - } else { - anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.process(new Token.StartTag("body")); - tb.framesetOk(true); - return tb.process(t); - } - }, - InBody { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: { - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - // todo confirm that check - tb.error(this); - return false; - } else if (isWhitespace(c)) { - tb.reconstructFormattingElements(); - tb.insert(c); - } else { - tb.reconstructFormattingElements(); - tb.insert(c); - tb.framesetOk(false); - } - break; - } - case Comment: { - tb.insert(t.asComment()); - break; - } - case Doctype: { - tb.error(this); - return false; - } - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - tb.error(this); - // merge attributes onto real html - Element html = tb.getStack().getFirst(); - for (Attribute attribute : startTag.getAttributes()) { - if (!html.hasAttr(attribute.getKey())) { - html.attributes().put(attribute); - } - } - } else if (StringUtil.in(name, "base", "basefont", "bgsound", - "command", "link", "meta", "noframes", "script", - "style", "title")) { - return tb.process(t, InHead); - } else if (name.equals("body")) { - tb.error(this); - LinkedList<Element> stack = tb.getStack(); - if (stack.size() == 1 - || (stack.size() > 2 && !stack.get(1).nodeName() - .equals("body"))) { - // only in fragment case - return false; // ignore - } else { - tb.framesetOk(false); - Element body = stack.get(1); - for (Attribute attribute : startTag.getAttributes()) { - if (!body.hasAttr(attribute.getKey())) { - body.attributes().put(attribute); - } - } - } - } else if (name.equals("frameset")) { - tb.error(this); - LinkedList<Element> stack = tb.getStack(); - if (stack.size() == 1 - || (stack.size() > 2 && !stack.get(1).nodeName() - .equals("body"))) { - // only in fragment case - return false; // ignore - } else if (!tb.framesetOk()) { - return false; // ignore frameset - } else { - Element second = stack.get(1); - if (second.parent() != null) { - second.remove(); - } - // pop up to html element - while (stack.size() > 1) { - stack.removeLast(); - } - tb.insert(startTag); - tb.transition(InFrameset); - } - } else if (StringUtil.in(name, "address", "article", "aside", - "blockquote", "center", "details", "dir", "div", "dl", - "fieldset", "figcaption", "figure", "footer", "header", - "hgroup", "menu", "nav", "ol", "p", "section", - "summary", "ul")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", - "h6")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - if (StringUtil.in(tb.currentElement().nodeName(), "h1", - "h2", "h3", "h4", "h5", "h6")) { - tb.error(this); - tb.pop(); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "pre", "listing")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - // todo: ignore LF if next token - tb.framesetOk(false); - } else if (name.equals("form")) { - if (tb.getFormElement() != null) { - tb.error(this); - return false; - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - Element form = tb.insert(startTag); - tb.setFormElement(form); - } else if (name.equals("li")) { - tb.framesetOk(false); - LinkedList<Element> stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (el.nodeName().equals("li")) { - tb.process(new Token.EndTag("li")); - break; - } - if (tb.isSpecial(el) - && !StringUtil.in(el.nodeName(), "address", - "div", "p")) { - break; - } - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "dd", "dt")) { - tb.framesetOk(false); - LinkedList<Element> stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (StringUtil.in(el.nodeName(), "dd", "dt")) { - tb.process(new Token.EndTag(el.nodeName())); - break; - } - if (tb.isSpecial(el) - && !StringUtil.in(el.nodeName(), "address", - "div", "p")) { - break; - } - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (name.equals("plaintext")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once - // in, - // never - // gets - // out - } else if (name.equals("button")) { - if (tb.inButtonScope("button")) { - // close and reprocess - tb.error(this); - tb.process(new Token.EndTag("button")); - tb.process(startTag); - } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); - } - } else if (name.equals("a")) { - if (tb.getActiveFormattingElement("a") != null) { - tb.error(this); - tb.process(new Token.EndTag("a")); - - // still on stack? - Element remainingA = tb.getFromStack("a"); - if (remainingA != null) { - tb.removeFromActiveFormattingElements(remainingA); - tb.removeFromStack(remainingA); - } - } - tb.reconstructFormattingElements(); - Element a = tb.insert(startTag); - tb.pushActiveFormattingElements(a); - } else if (StringUtil.in(name, "b", "big", "code", "em", - "font", "i", "s", "small", "strike", "strong", "tt", - "u")) { - tb.reconstructFormattingElements(); - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (name.equals("nobr")) { - tb.reconstructFormattingElements(); - if (tb.inScope("nobr")) { - tb.error(this); - tb.process(new Token.EndTag("nobr")); - tb.reconstructFormattingElements(); - } - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (StringUtil.in(name, "applet", "marquee", "object")) { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.insertMarkerToFormattingElements(); - tb.framesetOk(false); - } else if (name.equals("table")) { - if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks - && tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - tb.framesetOk(false); - tb.transition(InTable); - } else if (StringUtil.in(name, "area", "br", "embed", "img", - "keygen", "wbr")) { - tb.reconstructFormattingElements(); - tb.insertEmpty(startTag); - tb.framesetOk(false); - } else if (name.equals("input")) { - tb.reconstructFormattingElements(); - Element el = tb.insertEmpty(startTag); - if (!el.attr("type").equalsIgnoreCase("hidden")) { - tb.framesetOk(false); - } - } else if (StringUtil.in(name, "param", "source", "track")) { - tb.insertEmpty(startTag); - } else if (name.equals("hr")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insertEmpty(startTag); - tb.framesetOk(false); - } else if (name.equals("image")) { - // we're not supposed to ask. - startTag.name("img"); - return tb.process(startTag); - } else if (name.equals("isindex")) { - // how much do we care about the early 90s? - tb.error(this); - if (tb.getFormElement() != null) { - return false; - } - - tb.tokeniser.acknowledgeSelfClosingFlag(); - tb.process(new Token.StartTag("form")); - if (startTag.attributes.hasKey("action")) { - Element form = tb.getFormElement(); - form.attr("action", startTag.attributes.get("action")); - } - tb.process(new Token.StartTag("hr")); - tb.process(new Token.StartTag("label")); - // hope you like english. - String prompt = startTag.attributes.hasKey("prompt") ? startTag.attributes - .get("prompt") - : "This is a searchable index. Enter search keywords: "; - - tb.process(new Token.Character(prompt)); - - // input - Attributes inputAttribs = new Attributes(); - for (Attribute attr : startTag.attributes) { - if (!StringUtil.in(attr.getKey(), "name", "action", - "prompt")) { - inputAttribs.put(attr); - } - } - inputAttribs.put("name", "isindex"); - tb.process(new Token.StartTag("input", inputAttribs)); - tb.process(new Token.EndTag("label")); - tb.process(new Token.StartTag("hr")); - tb.process(new Token.EndTag("form")); - } else if (name.equals("textarea")) { - tb.insert(startTag); - // todo: If the next token is a U+000A LINE FEED (LF) - // character token, then ignore that token and move on to - // the next one. (Newlines at the start of textarea elements - // are ignored as an authoring convenience.) - tb.tokeniser.transition(TokeniserState.Rcdata); - tb.markInsertionMode(); - tb.framesetOk(false); - tb.transition(Text); - } else if (name.equals("xmp")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.reconstructFormattingElements(); - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("iframe")) { - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("noembed")) { - // also handle noscript if script enabled - handleRawtext(startTag, tb); - } else if (name.equals("select")) { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); - - HtmlTreeBuilderState state = tb.state(); - if (state.equals(InTable) || state.equals(InCaption) - || state.equals(InTableBody) || state.equals(InRow) - || state.equals(InCell)) { - tb.transition(InSelectInTable); - } else { - tb.transition(InSelect); - } - } else if (StringUtil.in("optgroup", "option")) { - if (tb.currentElement().nodeName().equals("option")) { - tb.process(new Token.EndTag("option")); - } - tb.reconstructFormattingElements(); - tb.insert(startTag); - } else if (StringUtil.in("rp", "rt")) { - if (tb.inScope("ruby")) { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("ruby")) { - tb.error(this); - tb.popStackToBefore("ruby"); // i.e. close up to but - // not include name - } - tb.insert(startTag); - } - } else if (name.equals("math")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "math" (i.e. - // foreign, mathml) - tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (name.equals("svg")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "svg" (xlink, - // svg) - tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (StringUtil.in(name, "caption", "col", "colgroup", - "frame", "head", "tbody", "td", "tfoot", "th", "thead", - "tr")) { - tb.error(this); - return false; - } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); - } - break; - - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("body")) { - if (!tb.inScope("body")) { - tb.error(this); - return false; - } else { - // todo: error if stack contains something not dd, dt, - // li, optgroup, option, p, rp, rt, tbody, td, tfoot, - // th, thead, tr, body, html - tb.transition(AfterBody); - } - } else if (name.equals("html")) { - boolean notIgnored = tb.process(new Token.EndTag("body")); - if (notIgnored) { - return tb.process(endTag); - } - } else if (StringUtil.in(name, "address", "article", "aside", - "blockquote", "button", "center", "details", "dir", - "div", "dl", "fieldset", "figcaption", "figure", - "footer", "header", "hgroup", "listing", "menu", "nav", - "ol", "pre", "section", "summary", "ul")) { - // todo: refactor these lookups - if (!tb.inScope(name)) { - // nothing to close - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - } - } else if (name.equals("form")) { - Element currentForm = tb.getFormElement(); - tb.setFormElement(null); - if (currentForm == null || !tb.inScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - // remove currentForm from stack. will shift anything - // under up. - tb.removeFromStack(currentForm); - } - } else if (name.equals("p")) { - if (!tb.inButtonScope(name)) { - tb.error(this); - tb.process(new Token.StartTag(name)); // if no p to - // close, creates - // an empty - // <p></p> - return tb.process(endTag); - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - } - } else if (name.equals("li")) { - if (!tb.inListItemScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - } - } else if (StringUtil.in(name, "dd", "dt")) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - } - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", - "h6")) { - if (!tb.inScope(new String[] { "h1", "h2", "h3", "h4", - "h5", "h6" })) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); - } - } else if (name.equals("sarcasm")) { - // *sigh* - return anyOtherEndTag(t, tb); - } else if (StringUtil.in(name, "a", "b", "big", "code", "em", - "font", "i", "nobr", "s", "small", "strike", "strong", - "tt", "u")) { - // Adoption Agency Algorithm. - OUTER: for (int i = 0; i < 8; i++) { - Element formatEl = tb.getActiveFormattingElement(name); - if (formatEl == null) { - return anyOtherEndTag(t, tb); - } else if (!tb.onStack(formatEl)) { - tb.error(this); - tb.removeFromActiveFormattingElements(formatEl); - return true; - } else if (!tb.inScope(formatEl.nodeName())) { - tb.error(this); - return false; - } else if (tb.currentElement() != formatEl) { - tb.error(this); - } - - Element furthestBlock = null; - Element commonAncestor = null; - boolean seenFormattingElement = false; - LinkedList<Element> stack = tb.getStack(); - for (int si = 0; si < stack.size(); si++) { - Element el = stack.get(si); - if (el == formatEl) { - commonAncestor = stack.get(si - 1); - seenFormattingElement = true; - } else if (seenFormattingElement - && tb.isSpecial(el)) { - furthestBlock = el; - break; - } - } - if (furthestBlock == null) { - tb.popStackToClose(formatEl.nodeName()); - tb.removeFromActiveFormattingElements(formatEl); - return true; - } - - // todo: Let a bookmark note the position of the - // formatting element in the list of active formatting - // elements relative to the elements on either side of - // it in the list. - // does that mean: int pos of format el in list? - Element node = furthestBlock; - Element lastNode = furthestBlock; - INNER: for (int j = 0; j < 3; j++) { - if (tb.onStack(node)) { - node = tb.aboveOnStack(node); - } - if (!tb.isInActiveFormattingElements(node)) { // note - // no - // bookmark - // check - tb.removeFromStack(node); - continue INNER; - } else if (node == formatEl) { - break INNER; - } - - Element replacement = new Element(Tag.valueOf(node - .nodeName()), tb.getBaseUri()); - tb.replaceActiveFormattingElement(node, replacement); - tb.replaceOnStack(node, replacement); - node = replacement; - - if (lastNode == furthestBlock) { - // todo: move the aforementioned bookmark to be - // immediately after the new node in the list of - // active formatting elements. - // not getting how this bookmark both straddles - // the element above, but is inbetween here... - } - if (lastNode.parent() != null) { - lastNode.remove(); - } - node.appendChild(lastNode); - - lastNode = node; - } - - if (StringUtil.in(commonAncestor.nodeName(), "table", - "tbody", "tfoot", "thead", "tr")) { - if (lastNode.parent() != null) { - lastNode.remove(); - } - tb.insertInFosterParent(lastNode); - } else { - if (lastNode.parent() != null) { - lastNode.remove(); - } - commonAncestor.appendChild(lastNode); - } - - Element adopter = new Element(Tag.valueOf(name), - tb.getBaseUri()); - Node[] childNodes = furthestBlock.childNodes().toArray( - new Node[furthestBlock.childNodes().size()]); - for (Node childNode : childNodes) { - adopter.appendChild(childNode); // append will - // reparent. thus - // the clone to - // avoid concurrent - // mod. - } - furthestBlock.appendChild(adopter); - tb.removeFromActiveFormattingElements(formatEl); - // todo: insert the new element into the list of active - // formatting elements at the position of the - // aforementioned bookmark. - tb.removeFromStack(formatEl); - tb.insertOnStackAfter(furthestBlock, adopter); - } - } else if (StringUtil.in(name, "applet", "marquee", "object")) { - if (!tb.inScope("name")) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - tb.clearFormattingElementsToLastMarker(); - } - } else if (name.equals("br")) { - tb.error(this); - tb.process(new Token.StartTag("br")); - return false; - } else { - return anyOtherEndTag(t, tb); - } - - break; - case EOF: - // todo: error if stack contains something not dd, dt, li, p, - // tbody, td, tfoot, th, thead, tr, body, html - // stop parsing - break; - } - return true; - } - - boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) { - String name = t.asEndTag().name(); - DescendableLinkedList<Element> stack = tb.getStack(); - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element node = it.next(); - if (node.nodeName().equals(name)) { - tb.generateImpliedEndTags(name); - if (!name.equals(tb.currentElement().nodeName())) { - tb.error(this); - } - tb.popStackToClose(name); - break; - } else { - if (tb.isSpecial(node)) { - tb.error(this); - return false; - } - } - } - return true; - } - }, - Text { - // in script, style etc. normally treated as data tags - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isCharacter()) { - tb.insert(t.asCharacter()); - } else if (t.isEOF()) { - tb.error(this); - // if current node is script: already started - tb.pop(); - tb.transition(tb.originalState()); - return tb.process(t); - } else if (t.isEndTag()) { - // if: An end tag whose tag name is "script" -- scripting - // nesting level, if evaluating scripts - tb.pop(); - tb.transition(tb.originalState()); - } - return true; - } - }, - InTable { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isCharacter()) { - tb.newPendingTableCharacters(); - tb.markInsertionMode(); - tb.transition(InTableText); - return tb.process(t); - } else if (t.isComment()) { - tb.insert(t.asComment()); - return true; - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("caption")) { - tb.clearStackToTableContext(); - tb.insertMarkerToFormattingElements(); - tb.insert(startTag); - tb.transition(InCaption); - } else if (name.equals("colgroup")) { - tb.clearStackToTableContext(); - tb.insert(startTag); - tb.transition(InColumnGroup); - } else if (name.equals("col")) { - tb.process(new Token.StartTag("colgroup")); - return tb.process(t); - } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - tb.clearStackToTableContext(); - tb.insert(startTag); - tb.transition(InTableBody); - } else if (StringUtil.in(name, "td", "th", "tr")) { - tb.process(new Token.StartTag("tbody")); - return tb.process(t); - } else if (name.equals("table")) { - tb.error(this); - boolean processed = tb.process(new Token.EndTag("table")); - if (processed) { - return tb.process(t); - } - } else if (StringUtil.in(name, "style", "script")) { - return tb.process(t, InHead); - } else if (name.equals("input")) { - if (!startTag.attributes.get("type").equalsIgnoreCase( - "hidden")) { - return anythingElse(t, tb); - } else { - tb.insertEmpty(startTag); - } - } else if (name.equals("form")) { - tb.error(this); - if (tb.getFormElement() != null) { - return false; - } else { - Element form = tb.insertEmpty(startTag); - tb.setFormElement(form); - } - } else { - return anythingElse(t, tb); - } - } else if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (name.equals("table")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.popStackToClose("table"); - } - tb.resetInsertionMode(); - } else if (StringUtil.in(name, "body", "caption", "col", - "colgroup", "html", "tbody", "td", "tfoot", "th", - "thead", "tr")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - } else if (t.isEOF()) { - if (tb.currentElement().nodeName().equals("html")) { - tb.error(this); - } - return true; // stops parsing - } - return anythingElse(t, tb); - } - - boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - boolean processed = true; - if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", - "tfoot", "thead", "tr")) { - tb.setFosterInserts(true); - processed = tb.process(t, InBody); - tb.setFosterInserts(false); - } else { - processed = tb.process(t, InBody); - } - return processed; - } - }, - InTableText { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.getPendingTableCharacters().add(c); - } - break; - default: - if (tb.getPendingTableCharacters().size() > 0) { - for (Token.Character character : tb - .getPendingTableCharacters()) { - if (!isWhitespace(character)) { - // InTable anything else section: - tb.error(this); - if (StringUtil.in(tb.currentElement().nodeName(), - "table", "tbody", "tfoot", "thead", "tr")) { - tb.setFosterInserts(true); - tb.process(character, InBody); - tb.setFosterInserts(false); - } else { - tb.process(character, InBody); - } - } else { - tb.insert(character); - } - } - tb.newPendingTableCharacters(); - } - tb.transition(tb.originalState()); - return tb.process(t); - } - return true; - } - }, - InCaption { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isEndTag() && t.asEndTag().name().equals("caption")) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("caption")) { - tb.error(this); - } - tb.popStackToClose("caption"); - tb.clearFormattingElementsToLastMarker(); - tb.transition(InTable); - } - } else if ((t.isStartTag() - && StringUtil.in(t.asStartTag().name(), "caption", "col", - "colgroup", "tbody", "td", "tfoot", "th", "thead", - "tr") || t.isEndTag() - && t.asEndTag().name().equals("table"))) { - tb.error(this); - boolean processed = tb.process(new Token.EndTag("caption")); - if (processed) { - return tb.process(t); - } - } else if (t.isEndTag() - && StringUtil.in(t.asEndTag().name(), "body", "col", - "colgroup", "html", "tbody", "td", "tfoot", "th", - "thead", "tr")) { - tb.error(this); - return false; - } else { - return tb.process(t, InBody); - } - return true; - } - }, - InColumnGroup { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - return true; - } - switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - break; - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - return tb.process(t, InBody); - } else if (name.equals("col")) { - tb.insertEmpty(startTag); - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("colgroup")) { - if (tb.currentElement().nodeName().equals("html")) { // frag - // case - tb.error(this); - return false; - } else { - tb.pop(); - tb.transition(InTable); - } - } else { - return anythingElse(t, tb); - } - break; - case EOF: - if (tb.currentElement().nodeName().equals("html")) { - return true; // stop parsing; frag case - } else { - return anythingElse(t, tb); - } - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, TreeBuilder tb) { - boolean processed = tb.process(new Token.EndTag("colgroup")); - if (processed) { - return tb.process(t); - } - return true; - } - }, - InTableBody { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("tr")) { - tb.clearStackToTableBodyContext(); - tb.insert(startTag); - tb.transition(InRow); - } else if (StringUtil.in(name, "th", "td")) { - tb.error(this); - tb.process(new Token.StartTag("tr")); - return tb.process(startTag); - } else if (StringUtil.in(name, "caption", "col", "colgroup", - "tbody", "tfoot", "thead")) { - return exitTableBody(t, tb); - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.clearStackToTableBodyContext(); - tb.pop(); - tb.transition(InTable); - } - } else if (name.equals("table")) { - return exitTableBody(t, tb); - } else if (StringUtil.in(name, "body", "caption", "col", - "colgroup", "html", "td", "th", "tr")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { - if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb - .inScope("tfoot"))) { - // frag case - tb.error(this); - return false; - } - tb.clearStackToTableBodyContext(); - tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, - // tfoot, - // thead - return tb.process(t); - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InTable); - } - }, - InRow { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - - if (StringUtil.in(name, "th", "td")) { - tb.clearStackToTableRowContext(); - tb.insert(startTag); - tb.transition(InCell); - tb.insertMarkerToFormattingElements(); - } else if (StringUtil.in(name, "caption", "col", "colgroup", - "tbody", "tfoot", "thead", "tr")) { - return handleMissingTr(t, tb); - } else { - return anythingElse(t, tb); - } - } else if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (name.equals("tr")) { - if (!tb.inTableScope(name)) { - tb.error(this); // frag - return false; - } - tb.clearStackToTableRowContext(); - tb.pop(); // tr - tb.transition(InTableBody); - } else if (name.equals("table")) { - return handleMissingTr(t, tb); - } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } - tb.process(new Token.EndTag("tr")); - return tb.process(t); - } else if (StringUtil.in(name, "body", "caption", "col", - "colgroup", "html", "td", "th")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InTable); - } - - private boolean handleMissingTr(Token t, TreeBuilder tb) { - boolean processed = tb.process(new Token.EndTag("tr")); - if (processed) { - return tb.process(t); - } else { - return false; - } - } - }, - InCell { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (StringUtil.in(name, "td", "th")) { - if (!tb.inTableScope(name)) { - tb.error(this); - tb.transition(InRow); // might not be in scope if empty: - // <td /> and processing fake end - // tag - return false; - } - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - tb.clearFormattingElementsToLastMarker(); - tb.transition(InRow); - } else if (StringUtil.in(name, "body", "caption", "col", - "colgroup", "html")) { - tb.error(this); - return false; - } else if (StringUtil.in(name, "table", "tbody", "tfoot", - "thead", "tr")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } - closeCell(tb); - return tb.process(t); - } else { - return anythingElse(t, tb); - } - } else if (t.isStartTag() - && StringUtil.in(t.asStartTag().name(), "caption", "col", - "colgroup", "tbody", "td", "tfoot", "th", "thead", - "tr")) { - if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { - tb.error(this); - return false; - } - closeCell(tb); - return tb.process(t); - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InBody); - } - - private void closeCell(HtmlTreeBuilder tb) { - if (tb.inTableScope("td")) { - tb.process(new Token.EndTag("td")); - } else { - tb.process(new Token.EndTag("th")); // only here if th or td in - // scope - } - } - }, - InSelect { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.insert(c); - } - break; - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return tb.process(start, InBody); - } else if (name.equals("option")) { - tb.process(new Token.EndTag("option")); - tb.insert(start); - } else if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option")) { - tb.process(new Token.EndTag("option")); - } else if (tb.currentElement().nodeName() - .equals("optgroup")) { - tb.process(new Token.EndTag("optgroup")); - } - tb.insert(start); - } else if (name.equals("select")) { - tb.error(this); - return tb.process(new Token.EndTag("select")); - } else if (StringUtil.in(name, "input", "keygen", "textarea")) { - tb.error(this); - if (!tb.inSelectScope("select")) { - return false; // frag - } - tb.process(new Token.EndTag("select")); - return tb.process(start); - } else if (name.equals("script")) { - return tb.process(t, InHead); - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option") - && tb.aboveOnStack(tb.currentElement()) != null - && tb.aboveOnStack(tb.currentElement()).nodeName() - .equals("optgroup")) { - tb.process(new Token.EndTag("option")); - } - if (tb.currentElement().nodeName().equals("optgroup")) { - tb.pop(); - } else { - tb.error(this); - } - } else if (name.equals("option")) { - if (tb.currentElement().nodeName().equals("option")) { - tb.pop(); - } else { - tb.error(this); - } - } else if (name.equals("select")) { - if (!tb.inSelectScope(name)) { - tb.error(this); - return false; - } else { - tb.popStackToClose(name); - tb.resetInsertionMode(); - } - } else { - return anythingElse(t, tb); - } - break; - case EOF: - if (!tb.currentElement().nodeName().equals("html")) { - tb.error(this); - } - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - return false; - } - }, - InSelectInTable { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isStartTag() - && StringUtil.in(t.asStartTag().name(), "caption", "table", - "tbody", "tfoot", "thead", "tr", "td", "th")) { - tb.error(this); - tb.process(new Token.EndTag("select")); - return tb.process(t); - } else if (t.isEndTag() - && StringUtil.in(t.asEndTag().name(), "caption", "table", - "tbody", "tfoot", "thead", "tr", "td", "th")) { - tb.error(this); - if (tb.inTableScope(t.asEndTag().name())) { - tb.process(new Token.EndTag("select")); - return (tb.process(t)); - } else { - return false; - } - } else { - return tb.process(t, InSelect); - } - } - }, - AfterBody { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return tb.process(t, InBody); - } else if (t.isComment()) { - tb.insert(t.asComment()); // into html node - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { - if (tb.isFragmentParsing()) { - tb.error(this); - return false; - } else { - tb.transition(AfterAfterBody); - } - } else if (t.isEOF()) { - // chillax! we're done - } else { - tb.error(this); - tb.transition(InBody); - return tb.process(t); - } - return true; - } - }, - InFrameset { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag()) { - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return tb.process(start, InBody); - } else if (name.equals("frameset")) { - tb.insert(start); - } else if (name.equals("frame")) { - tb.insertEmpty(start); - } else if (name.equals("noframes")) { - return tb.process(start, InHead); - } else { - tb.error(this); - return false; - } - } else if (t.isEndTag() && t.asEndTag().name().equals("frameset")) { - if (tb.currentElement().nodeName().equals("html")) { // frag - tb.error(this); - return false; - } else { - tb.pop(); - if (!tb.isFragmentParsing() - && !tb.currentElement().nodeName() - .equals("frameset")) { - tb.transition(AfterFrameset); - } - } - } else if (t.isEOF()) { - if (!tb.currentElement().nodeName().equals("html")) { - tb.error(this); - return true; - } - } else { - tb.error(this); - return false; - } - return true; - } - }, - AfterFrameset { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { - tb.transition(AfterAfterFrameset); - } else if (t.isStartTag() - && t.asStartTag().name().equals("noframes")) { - return tb.process(t, InHead); - } else if (t.isEOF()) { - // cool your heels, we're complete - } else { - tb.error(this); - return false; - } - return true; - } - }, - AfterAfterBody { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) - || (t.isStartTag() && t.asStartTag().name().equals("html"))) { - return tb.process(t, InBody); - } else if (t.isEOF()) { - // nice work chuck - } else { - tb.error(this); - tb.transition(InBody); - return tb.process(t); - } - return true; - } - }, - AfterAfterFrameset { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) - || (t.isStartTag() && t.asStartTag().name().equals("html"))) { - return tb.process(t, InBody); - } else if (t.isEOF()) { - // nice work chuck - } else if (t.isStartTag() - && t.asStartTag().name().equals("noframes")) { - return tb.process(t, InHead); - } else { - tb.error(this); - return false; - } - return true; - } - }, - ForeignContent { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - return true; - // todo: implement. Also; how do we get here? - } - }; - - private static String nullString = String.valueOf('\u0000'); - - abstract boolean process(Token t, HtmlTreeBuilder tb); - - private static boolean isWhitespace(Token t) { - if (t.isCharacter()) { - String data = t.asCharacter().getData(); - // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " - for (int i = 0; i < data.length(); i++) { - char c = data.charAt(i); - if (!StringUtil.isWhitespace(c)) { - return false; - } - } - return true; - } - return false; - } - - private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) { - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.Rcdata); - tb.markInsertionMode(); - tb.transition(Text); - } - - private static void handleRawtext(Token.StartTag startTag, - HtmlTreeBuilder tb) { - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.Rawtext); - tb.markInsertionMode(); - tb.transition(Text); - } -} diff --git a/server/src/org/jsoup/parser/ParseError.java b/server/src/org/jsoup/parser/ParseError.java deleted file mode 100644 index eb3c240a59..0000000000 --- a/server/src/org/jsoup/parser/ParseError.java +++ /dev/null @@ -1,43 +0,0 @@ -package org.jsoup.parser; - -/** - * A Parse Error records an error in the input HTML that occurs in either the - * tokenisation or the tree building phase. - */ -public class ParseError { - private int pos; - private String errorMsg; - - ParseError(int pos, String errorMsg) { - this.pos = pos; - this.errorMsg = errorMsg; - } - - ParseError(int pos, String errorFormat, Object... args) { - errorMsg = String.format(errorFormat, args); - this.pos = pos; - } - - /** - * Retrieve the error message. - * - * @return the error message. - */ - public String getErrorMessage() { - return errorMsg; - } - - /** - * Retrieves the offset of the error. - * - * @return error offset within input - */ - public int getPosition() { - return pos; - } - - @Override - public String toString() { - return pos + ": " + errorMsg; - } -} diff --git a/server/src/org/jsoup/parser/ParseErrorList.java b/server/src/org/jsoup/parser/ParseErrorList.java deleted file mode 100644 index 773dfcae24..0000000000 --- a/server/src/org/jsoup/parser/ParseErrorList.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.jsoup.parser; - -import java.util.ArrayList; - -/** - * A container for ParseErrors. - * - * @author Jonathan Hedley - */ -class ParseErrorList extends ArrayList<ParseError> { - private static final int INITIAL_CAPACITY = 16; - private final int maxSize; - - ParseErrorList(int initialCapacity, int maxSize) { - super(initialCapacity); - this.maxSize = maxSize; - } - - boolean canAddError() { - return size() < maxSize; - } - - int getMaxSize() { - return maxSize; - } - - static ParseErrorList noTracking() { - return new ParseErrorList(0, 0); - } - - static ParseErrorList tracking(int maxSize) { - return new ParseErrorList(INITIAL_CAPACITY, maxSize); - } -} diff --git a/server/src/org/jsoup/parser/Parser.java b/server/src/org/jsoup/parser/Parser.java deleted file mode 100644 index a1f6fd5184..0000000000 --- a/server/src/org/jsoup/parser/Parser.java +++ /dev/null @@ -1,198 +0,0 @@ -package org.jsoup.parser; - -import java.util.List; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; - -/** - * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use - * one of the more convenient parse methods in {@link org.jsoup.Jsoup}. - */ -public class Parser { - private static final int DEFAULT_MAX_ERRORS = 0; // by default, error - // tracking is disabled. - - private TreeBuilder treeBuilder; - private int maxErrors = DEFAULT_MAX_ERRORS; - private ParseErrorList errors; - - /** - * Create a new Parser, using the specified TreeBuilder - * - * @param treeBuilder - * TreeBuilder to use to parse input into Documents. - */ - public Parser(TreeBuilder treeBuilder) { - this.treeBuilder = treeBuilder; - } - - public Document parseInput(String html, String baseUri) { - errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) - : ParseErrorList.noTracking(); - Document doc = treeBuilder.parse(html, baseUri, errors); - return doc; - } - - // gets & sets - /** - * Get the TreeBuilder currently in use. - * - * @return current TreeBuilder. - */ - public TreeBuilder getTreeBuilder() { - return treeBuilder; - } - - /** - * Update the TreeBuilder used when parsing content. - * - * @param treeBuilder - * current TreeBuilder - * @return this, for chaining - */ - public Parser setTreeBuilder(TreeBuilder treeBuilder) { - this.treeBuilder = treeBuilder; - return this; - } - - /** - * Check if parse error tracking is enabled. - * - * @return current track error state. - */ - public boolean isTrackErrors() { - return maxErrors > 0; - } - - /** - * Enable or disable parse error tracking for the next parse. - * - * @param maxErrors - * the maximum number of errors to track. Set to 0 to disable. - * @return this, for chaining - */ - public Parser setTrackErrors(int maxErrors) { - this.maxErrors = maxErrors; - return this; - } - - /** - * Retrieve the parse errors, if any, from the last parse. - * - * @return list of parse errors, up to the size of the maximum errors - * tracked. - */ - public List<ParseError> getErrors() { - return errors; - } - - // static parse functions below - /** - * Parse HTML into a Document. - * - * @param html - * HTML to parse - * @param baseUri - * base URI of document (i.e. original fetch location), for - * resolving relative URLs. - * - * @return parsed Document - */ - public static Document parse(String html, String baseUri) { - TreeBuilder treeBuilder = new HtmlTreeBuilder(); - return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking()); - } - - /** - * Parse a fragment of HTML into a list of nodes. The context element, if - * supplied, supplies parsing context. - * - * @param fragmentHtml - * the fragment of HTML to parse - * @param context - * (optional) the element that this HTML fragment is being parsed - * for (i.e. for inner HTML). This provides stack context (for - * implicit element creation). - * @param baseUri - * base URI of document (i.e. original fetch location), for - * resolving relative URLs. - * - * @return list of nodes parsed from the input HTML. Note that the context - * element, if supplied, is not modified. - */ - public static List<Node> parseFragment(String fragmentHtml, - Element context, String baseUri) { - HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); - return treeBuilder.parseFragment(fragmentHtml, context, baseUri, - ParseErrorList.noTracking()); - } - - /** - * Parse a fragment of HTML into the {@code body} of a Document. - * - * @param bodyHtml - * fragment of HTML - * @param baseUri - * base URI of document (i.e. original fetch location), for - * resolving relative URLs. - * - * @return Document, with empty head, and HTML parsed into body - */ - public static Document parseBodyFragment(String bodyHtml, String baseUri) { - Document doc = Document.createShell(baseUri); - Element body = doc.body(); - List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); - Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node - // list gets - // modified - // when - // re-parented - for (Node node : nodes) { - body.appendChild(node); - } - return doc; - } - - /** - * @param bodyHtml - * HTML to parse - * @param baseUri - * baseUri base URI of document (i.e. original fetch location), - * for resolving relative URLs. - * - * @return parsed Document - * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} - * instead. - */ - @Deprecated - public static Document parseBodyFragmentRelaxed(String bodyHtml, - String baseUri) { - return parse(bodyHtml, baseUri); - } - - // builders - - /** - * Create a new HTML parser. This parser treats input as HTML5, and enforces - * the creation of a normalised document, based on a knowledge of the - * semantics of the incoming tags. - * - * @return a new HTML parser. - */ - public static Parser htmlParser() { - return new Parser(new HtmlTreeBuilder()); - } - - /** - * Create a new XML parser. This parser assumes no knowledge of the incoming - * tags and does not treat it as HTML, rather creates a simple tree directly - * from the input. - * - * @return a new simple XML parser. - */ - public static Parser xmlParser() { - return new Parser(new XmlTreeBuilder()); - } -} diff --git a/server/src/org/jsoup/parser/Tag.java b/server/src/org/jsoup/parser/Tag.java deleted file mode 100644 index c43f27aff3..0000000000 --- a/server/src/org/jsoup/parser/Tag.java +++ /dev/null @@ -1,298 +0,0 @@ -package org.jsoup.parser; - -import java.util.HashMap; -import java.util.Map; - -import org.jsoup.helper.Validate; - -/** - * HTML Tag capabilities. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Tag { - private static final Map<String, Tag> tags = new HashMap<String, Tag>(); // map - // of - // known - // tags - - private String tagName; - private boolean isBlock = true; // block or inline - private boolean formatAsBlock = true; // should be formatted as a block - private boolean canContainBlock = true; // Can this tag hold block level - // tags? - private boolean canContainInline = true; // only pcdata if not - private boolean empty = false; // can hold nothing; e.g. img - private boolean selfClosing = false; // can self close (<foo />). used for - // unknown tags that self close, - // without forcing them as empty. - private boolean preserveWhitespace = false; // for pre, textarea, script etc - - private Tag(String tagName) { - this.tagName = tagName.toLowerCase(); - } - - /** - * Get this tag's name. - * - * @return the tag's name - */ - public String getName() { - return tagName; - } - - /** - * Get a Tag by name. If not previously defined (unknown), returns a new - * generic tag, that can do anything. - * <p/> - * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not - * registered and will only .equals(). - * - * @param tagName - * Name of tag, e.g. "p". Case insensitive. - * @return The tag, either defined or new generic. - */ - public static Tag valueOf(String tagName) { - Validate.notNull(tagName); - tagName = tagName.trim().toLowerCase(); - Validate.notEmpty(tagName); - - synchronized (tags) { - Tag tag = tags.get(tagName); - if (tag == null) { - // not defined: create default; go anywhere, do anything! (incl - // be inside a <p>) - tag = new Tag(tagName); - tag.isBlock = false; - tag.canContainBlock = true; - } - return tag; - } - } - - /** - * Gets if this is a block tag. - * - * @return if block tag - */ - public boolean isBlock() { - return isBlock; - } - - /** - * Gets if this tag should be formatted as a block (or as inline) - * - * @return if should be formatted as block or inline - */ - public boolean formatAsBlock() { - return formatAsBlock; - } - - /** - * Gets if this tag can contain block tags. - * - * @return if tag can contain block tags - */ - public boolean canContainBlock() { - return canContainBlock; - } - - /** - * Gets if this tag is an inline tag. - * - * @return if this tag is an inline tag. - */ - public boolean isInline() { - return !isBlock; - } - - /** - * Gets if this tag is a data only tag. - * - * @return if this tag is a data only tag - */ - public boolean isData() { - return !canContainInline && !isEmpty(); - } - - /** - * Get if this is an empty tag - * - * @return if this is an empty tag - */ - public boolean isEmpty() { - return empty; - } - - /** - * Get if this tag is self closing. - * - * @return if this tag should be output as self closing. - */ - public boolean isSelfClosing() { - return empty || selfClosing; - } - - /** - * Get if this is a pre-defined tag, or was auto created on parsing. - * - * @return if a known tag - */ - public boolean isKnownTag() { - return tags.containsKey(tagName); - } - - /** - * Check if this tagname is a known tag. - * - * @param tagName - * name of tag - * @return if known HTML tag - */ - public static boolean isKnownTag(String tagName) { - return tags.containsKey(tagName); - } - - /** - * Get if this tag should preserve whitespace within child text nodes. - * - * @return if preserve whitepace - */ - public boolean preserveWhitespace() { - return preserveWhitespace; - } - - Tag setSelfClosing() { - selfClosing = true; - return this; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof Tag)) { - return false; - } - - Tag tag = (Tag) o; - - if (canContainBlock != tag.canContainBlock) { - return false; - } - if (canContainInline != tag.canContainInline) { - return false; - } - if (empty != tag.empty) { - return false; - } - if (formatAsBlock != tag.formatAsBlock) { - return false; - } - if (isBlock != tag.isBlock) { - return false; - } - if (preserveWhitespace != tag.preserveWhitespace) { - return false; - } - if (selfClosing != tag.selfClosing) { - return false; - } - if (!tagName.equals(tag.tagName)) { - return false; - } - - return true; - } - - @Override - public int hashCode() { - int result = tagName.hashCode(); - result = 31 * result + (isBlock ? 1 : 0); - result = 31 * result + (formatAsBlock ? 1 : 0); - result = 31 * result + (canContainBlock ? 1 : 0); - result = 31 * result + (canContainInline ? 1 : 0); - result = 31 * result + (empty ? 1 : 0); - result = 31 * result + (selfClosing ? 1 : 0); - result = 31 * result + (preserveWhitespace ? 1 : 0); - return result; - } - - @Override - public String toString() { - return tagName; - } - - // internal static initialisers: - // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other - // sources - private static final String[] blockTags = { "html", "head", "body", - "frameset", "script", "noscript", "style", "meta", "link", "title", - "frame", "noframes", "section", "nav", "aside", "hgroup", "header", - "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", - "pre", "div", "blockquote", "hr", "address", "figure", - "figcaption", "form", "fieldset", "ins", "del", "dl", "dt", "dd", - "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", - "col", "tr", "th", "td", "video", "audio", "canvas", "details", - "menu", "plaintext" }; - private static final String[] inlineTags = { "object", "base", "font", - "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", - "samp", "kbd", "var", "cite", "abbr", "time", "acronym", "mark", - "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q", "sub", - "sup", "bdo", "iframe", "embed", "span", "input", "select", - "textarea", "label", "button", "optgroup", "option", "legend", - "datalist", "keygen", "output", "progress", "meter", "area", - "param", "source", "track", "summary", "command", "device" }; - private static final String[] emptyTags = { "meta", "link", "base", - "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", - "col", "command", "device" }; - private static final String[] formatAsInlineTags = { "title", "a", "p", - "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", - "td", "script", "style" }; - private static final String[] preserveWhitespaceTags = { "pre", - "plaintext", "title" }; - - static { - // creates - for (String tagName : blockTags) { - Tag tag = new Tag(tagName); - register(tag); - } - for (String tagName : inlineTags) { - Tag tag = new Tag(tagName); - tag.isBlock = false; - tag.canContainBlock = false; - tag.formatAsBlock = false; - register(tag); - } - - // mods: - for (String tagName : emptyTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.canContainBlock = false; - tag.canContainInline = false; - tag.empty = true; - } - - for (String tagName : formatAsInlineTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.formatAsBlock = false; - } - - for (String tagName : preserveWhitespaceTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.preserveWhitespace = true; - } - } - - private static Tag register(Tag tag) { - synchronized (tags) { - tags.put(tag.tagName, tag); - } - return tag; - } -} diff --git a/server/src/org/jsoup/parser/Token.java b/server/src/org/jsoup/parser/Token.java deleted file mode 100644 index e465eb74e3..0000000000 --- a/server/src/org/jsoup/parser/Token.java +++ /dev/null @@ -1,253 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; - -/** - * Parse tokens for the Tokeniser. - */ -abstract class Token { - TokenType type; - - private Token() { - } - - String tokenType() { - return this.getClass().getSimpleName(); - } - - static class Doctype extends Token { - final StringBuilder name = new StringBuilder(); - final StringBuilder publicIdentifier = new StringBuilder(); - final StringBuilder systemIdentifier = new StringBuilder(); - boolean forceQuirks = false; - - Doctype() { - type = TokenType.Doctype; - } - - String getName() { - return name.toString(); - } - - String getPublicIdentifier() { - return publicIdentifier.toString(); - } - - public String getSystemIdentifier() { - return systemIdentifier.toString(); - } - - public boolean isForceQuirks() { - return forceQuirks; - } - } - - static abstract class Tag extends Token { - protected String tagName; - private String pendingAttributeName; - private String pendingAttributeValue; - - boolean selfClosing = false; - Attributes attributes = new Attributes(); // todo: allow nodes to not - // have attributes - - void newAttribute() { - if (pendingAttributeName != null) { - if (pendingAttributeValue == null) { - pendingAttributeValue = ""; - } - Attribute attribute = new Attribute(pendingAttributeName, - pendingAttributeValue); - attributes.put(attribute); - } - pendingAttributeName = null; - pendingAttributeValue = null; - } - - void finaliseTag() { - // finalises for emit - if (pendingAttributeName != null) { - // todo: check if attribute name exists; if so, drop and error - newAttribute(); - } - } - - String name() { - Validate.isFalse(tagName.length() == 0); - return tagName; - } - - Tag name(String name) { - tagName = name; - return this; - } - - boolean isSelfClosing() { - return selfClosing; - } - - @SuppressWarnings({ "TypeMayBeWeakened" }) - Attributes getAttributes() { - return attributes; - } - - // these appenders are rarely hit in not null state-- caused by null - // chars. - void appendTagName(String append) { - tagName = tagName == null ? append : tagName.concat(append); - } - - void appendTagName(char append) { - appendTagName(String.valueOf(append)); - } - - void appendAttributeName(String append) { - pendingAttributeName = pendingAttributeName == null ? append - : pendingAttributeName.concat(append); - } - - void appendAttributeName(char append) { - appendAttributeName(String.valueOf(append)); - } - - void appendAttributeValue(String append) { - pendingAttributeValue = pendingAttributeValue == null ? append - : pendingAttributeValue.concat(append); - } - - void appendAttributeValue(char append) { - appendAttributeValue(String.valueOf(append)); - } - } - - static class StartTag extends Tag { - StartTag() { - super(); - type = TokenType.StartTag; - } - - StartTag(String name) { - this(); - tagName = name; - } - - StartTag(String name, Attributes attributes) { - this(); - tagName = name; - this.attributes = attributes; - } - - @Override - public String toString() { - return "<" + name() + " " + attributes.toString() + ">"; - } - } - - static class EndTag extends Tag { - EndTag() { - super(); - type = TokenType.EndTag; - } - - EndTag(String name) { - this(); - tagName = name; - } - - @Override - public String toString() { - return "</" + name() + " " + attributes.toString() + ">"; - } - } - - static class Comment extends Token { - final StringBuilder data = new StringBuilder(); - - Comment() { - type = TokenType.Comment; - } - - String getData() { - return data.toString(); - } - - @Override - public String toString() { - return "<!--" + getData() + "-->"; - } - } - - static class Character extends Token { - private final String data; - - Character(String data) { - type = TokenType.Character; - this.data = data; - } - - String getData() { - return data; - } - - @Override - public String toString() { - return getData(); - } - } - - static class EOF extends Token { - EOF() { - type = Token.TokenType.EOF; - } - } - - boolean isDoctype() { - return type == TokenType.Doctype; - } - - Doctype asDoctype() { - return (Doctype) this; - } - - boolean isStartTag() { - return type == TokenType.StartTag; - } - - StartTag asStartTag() { - return (StartTag) this; - } - - boolean isEndTag() { - return type == TokenType.EndTag; - } - - EndTag asEndTag() { - return (EndTag) this; - } - - boolean isComment() { - return type == TokenType.Comment; - } - - Comment asComment() { - return (Comment) this; - } - - boolean isCharacter() { - return type == TokenType.Character; - } - - Character asCharacter() { - return (Character) this; - } - - boolean isEOF() { - return type == TokenType.EOF; - } - - enum TokenType { - Doctype, StartTag, EndTag, Comment, Character, EOF - } -} diff --git a/server/src/org/jsoup/parser/TokenQueue.java b/server/src/org/jsoup/parser/TokenQueue.java deleted file mode 100644 index 3e7127e640..0000000000 --- a/server/src/org/jsoup/parser/TokenQueue.java +++ /dev/null @@ -1,473 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; - -/** - * A character queue with parsing helpers. - * - * @author Jonathan Hedley - */ -public class TokenQueue { - private String queue; - private int pos = 0; - - private static final char ESC = '\\'; // escape char for chomp balanced. - - /** - * Create a new TokenQueue. - * - * @param data - * string of data to back queue. - */ - public TokenQueue(String data) { - Validate.notNull(data); - queue = data; - } - - /** - * Is the queue empty? - * - * @return true if no data left in queue. - */ - public boolean isEmpty() { - return remainingLength() == 0; - } - - private int remainingLength() { - return queue.length() - pos; - } - - /** - * Retrieves but does not remove the first character from the queue. - * - * @return First character, or 0 if empty. - */ - public char peek() { - return isEmpty() ? 0 : queue.charAt(pos); - } - - /** - * Add a character to the start of the queue (will be the next character - * retrieved). - * - * @param c - * character to add - */ - public void addFirst(Character c) { - addFirst(c.toString()); - } - - /** - * Add a string to the start of the queue. - * - * @param seq - * string to add. - */ - public void addFirst(String seq) { - // not very performant, but an edge case - queue = seq + queue.substring(pos); - pos = 0; - } - - /** - * Tests if the next characters on the queue match the sequence. Case - * insensitive. - * - * @param seq - * String to check queue for. - * @return true if the next characters match. - */ - public boolean matches(String seq) { - return queue.regionMatches(true, pos, seq, 0, seq.length()); - } - - /** - * Case sensitive match test. - * - * @param seq - * string to case sensitively check for - * @return true if matched, false if not - */ - public boolean matchesCS(String seq) { - return queue.startsWith(seq, pos); - } - - /** - * Tests if the next characters match any of the sequences. Case - * insensitive. - * - * @param seq - * list of strings to case insensitively check for - * @return true of any matched, false if none did - */ - public boolean matchesAny(String... seq) { - for (String s : seq) { - if (matches(s)) { - return true; - } - } - return false; - } - - public boolean matchesAny(char... seq) { - if (isEmpty()) { - return false; - } - - for (char c : seq) { - if (queue.charAt(pos) == c) { - return true; - } - } - return false; - } - - public boolean matchesStartTag() { - // micro opt for matching "<x" - return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character - .isLetter(queue.charAt(pos + 1))); - } - - /** - * Tests if the queue matches the sequence (as with match), and if they do, - * removes the matched string from the queue. - * - * @param seq - * String to search for, and if found, remove from queue. - * @return true if found and removed, false if not found. - */ - public boolean matchChomp(String seq) { - if (matches(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - /** - * Tests if queue starts with a whitespace character. - * - * @return if starts with whitespace - */ - public boolean matchesWhitespace() { - return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos)); - } - - /** - * Test if the queue matches a word character (letter or digit). - * - * @return if matches a word character - */ - public boolean matchesWord() { - return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); - } - - /** - * Drops the next character off the queue. - */ - public void advance() { - if (!isEmpty()) { - pos++; - } - } - - /** - * Consume one character off queue. - * - * @return first character on queue. - */ - public char consume() { - return queue.charAt(pos++); - } - - /** - * Consumes the supplied sequence of the queue. If the queue does not start - * with the supplied sequence, will throw an illegal state exception -- but - * you should be running match() against that condition. - * <p> - * Case insensitive. - * - * @param seq - * sequence to remove from head of queue. - */ - public void consume(String seq) { - if (!matches(seq)) { - throw new IllegalStateException( - "Queue did not match expected sequence"); - } - int len = seq.length(); - if (len > remainingLength()) { - throw new IllegalStateException( - "Queue not long enough to consume sequence"); - } - - pos += len; - } - - /** - * Pulls a string off the queue, up to but exclusive of the match sequence, - * or to the queue running out. - * - * @param seq - * String to end on (and not include in return, but leave on - * queue). <b>Case sensitive.</b> - * @return The matched data consumed from queue. - */ - public String consumeTo(String seq) { - int offset = queue.indexOf(seq, pos); - if (offset != -1) { - String consumed = queue.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return remainder(); - } - } - - public String consumeToIgnoreCase(String seq) { - int start = pos; - String first = seq.substring(0, 1); - boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if - // first - // is - // not - // cased, - // use - // index - // of - while (!isEmpty()) { - if (matches(seq)) { - break; - } - - if (canScan) { - int skip = queue.indexOf(first, pos) - pos; - if (skip == 0) { - pos++; - } else if (skip < 0) { - pos = queue.length(); - } else { - pos += skip; - } - } else { - pos++; - } - } - - String data = queue.substring(start, pos); - return data; - } - - /** - * Consumes to the first sequence provided, or to the end of the queue. - * Leaves the terminator on the queue. - * - * @param seq - * any number of terminators to consume to. <b>Case - * insensitive.</b> - * @return consumed string - */ - // todo: method name. not good that consumeTo cares for case, and consume to - // any doesn't. And the only use for this - // is is a case sensitive time... - public String consumeToAny(String... seq) { - int start = pos; - while (!isEmpty() && !matchesAny(seq)) { - pos++; - } - - String data = queue.substring(start, pos); - return data; - } - - /** - * Pulls a string off the queue (like consumeTo), and then pulls off the - * matched string (but does not return it). - * <p> - * If the queue runs out of characters before finding the seq, will return - * as much as it can (and queue will go isEmpty() == true). - * - * @param seq - * String to match up to, and not include in return, and to pull - * off queue. <b>Case sensitive.</b> - * @return Data matched from queue. - */ - public String chompTo(String seq) { - String data = consumeTo(seq); - matchChomp(seq); - return data; - } - - public String chompToIgnoreCase(String seq) { - String data = consumeToIgnoreCase(seq); // case insensitive scan - matchChomp(seq); - return data; - } - - /** - * Pulls a balanced string off the queue. E.g. if queue is - * "(one (two) three) four", (,) will return "one (two) three", and leave - * " four" on the queue. Unbalanced openers and closers can be escaped (with - * \). Those escapes will be left in the returned string, which is suitable - * for regexes (where we need to preserve the escape), but unsuitable for - * contains text strings; use unescape for that. - * - * @param open - * opener - * @param close - * closer - * @return data matched from the queue - */ - public String chompBalanced(char open, char close) { - StringBuilder accum = new StringBuilder(); - int depth = 0; - char last = 0; - - do { - if (isEmpty()) { - break; - } - Character c = consume(); - if (last == 0 || last != ESC) { - if (c.equals(open)) { - depth++; - } else if (c.equals(close)) { - depth--; - } - } - - if (depth > 0 && last != 0) { - accum.append(c); // don't include the outer match pair in the - // return - } - last = c; - } while (depth > 0); - return accum.toString(); - } - - /** - * Unescaped a \ escaped string. - * - * @param in - * backslash escaped string - * @return unescaped string - */ - public static String unescape(String in) { - StringBuilder out = new StringBuilder(); - char last = 0; - for (char c : in.toCharArray()) { - if (c == ESC) { - if (last != 0 && last == ESC) { - out.append(c); - } - } else { - out.append(c); - } - last = c; - } - return out.toString(); - } - - /** - * Pulls the next run of whitespace characters of the queue. - */ - public boolean consumeWhitespace() { - boolean seen = false; - while (matchesWhitespace()) { - pos++; - seen = true; - } - return seen; - } - - /** - * Retrieves the next run of word type (letter or digit) off the queue. - * - * @return String of word characters from queue, or empty string if none. - */ - public String consumeWord() { - int start = pos; - while (matchesWord()) { - pos++; - } - return queue.substring(start, pos); - } - - /** - * Consume an tag name off the queue (word or :, _, -) - * - * @return tag name - */ - public String consumeTagName() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) { - pos++; - } - - return queue.substring(start, pos); - } - - /** - * Consume a CSS element selector (tag name, but | instead of : for - * namespaces, to not conflict with :pseudo selects). - * - * @return tag name - */ - public String consumeElementSelector() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) { - pos++; - } - - return queue.substring(start, pos); - } - - /** - * Consume a CSS identifier (ID or class) off the queue (letter, digit, -, - * _) http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier - * - * @return identifier - */ - public String consumeCssIdentifier() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) { - pos++; - } - - return queue.substring(start, pos); - } - - /** - * Consume an attribute key off the queue (letter, digit, -, _, :") - * - * @return attribute key - */ - public String consumeAttributeKey() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) { - pos++; - } - - return queue.substring(start, pos); - } - - /** - * Consume and return whatever is left on the queue. - * - * @return remained of queue. - */ - public String remainder() { - StringBuilder accum = new StringBuilder(); - while (!isEmpty()) { - accum.append(consume()); - } - return accum.toString(); - } - - @Override - public String toString() { - return queue.substring(pos); - } -} diff --git a/server/src/org/jsoup/parser/Tokeniser.java b/server/src/org/jsoup/parser/Tokeniser.java deleted file mode 100644 index f46c962281..0000000000 --- a/server/src/org/jsoup/parser/Tokeniser.java +++ /dev/null @@ -1,264 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Entities; - -/** - * Readers the input stream into tokens. - */ -class Tokeniser { - static final char replacementChar = '\uFFFD'; // replaces null character - - private CharacterReader reader; // html input - private ParseErrorList errors; // errors found while tokenising - - private TokeniserState state = TokeniserState.Data; // current tokenisation - // state - private Token emitPending; // the token we are about to emit on next read - private boolean isEmitPending = false; - private StringBuilder charBuffer = new StringBuilder(); // buffers - // characters to - // output as one - // token - StringBuilder dataBuffer; // buffers data looking for </script> - - Token.Tag tagPending; // tag we are building up - Token.Doctype doctypePending; // doctype building up - Token.Comment commentPending; // comment building up - private Token.StartTag lastStartTag; // the last start tag emitted, to test - // appropriate end tag - private boolean selfClosingFlagAcknowledged = true; - - Tokeniser(CharacterReader reader, ParseErrorList errors) { - this.reader = reader; - this.errors = errors; - } - - Token read() { - if (!selfClosingFlagAcknowledged) { - error("Self closing flag not acknowledged"); - selfClosingFlagAcknowledged = true; - } - - while (!isEmitPending) { - state.read(this, reader); - } - - // if emit is pending, a non-character token was found: return any chars - // in buffer, and leave token for next read: - if (charBuffer.length() > 0) { - String str = charBuffer.toString(); - charBuffer.delete(0, charBuffer.length()); - return new Token.Character(str); - } else { - isEmitPending = false; - return emitPending; - } - } - - void emit(Token token) { - Validate.isFalse(isEmitPending, "There is an unread token pending!"); - - emitPending = token; - isEmitPending = true; - - if (token.type == Token.TokenType.StartTag) { - Token.StartTag startTag = (Token.StartTag) token; - lastStartTag = startTag; - if (startTag.selfClosing) { - selfClosingFlagAcknowledged = false; - } - } else if (token.type == Token.TokenType.EndTag) { - Token.EndTag endTag = (Token.EndTag) token; - if (endTag.attributes.size() > 0) { - error("Attributes incorrectly present on end tag"); - } - } - } - - void emit(String str) { - // buffer strings up until last string token found, to emit only one - // token for a run of character refs etc. - // does not set isEmitPending; read checks that - charBuffer.append(str); - } - - void emit(char c) { - charBuffer.append(c); - } - - TokeniserState getState() { - return state; - } - - void transition(TokeniserState state) { - this.state = state; - } - - void advanceTransition(TokeniserState state) { - reader.advance(); - this.state = state; - } - - void acknowledgeSelfClosingFlag() { - selfClosingFlagAcknowledged = true; - } - - Character consumeCharacterReference(Character additionalAllowedCharacter, - boolean inAttribute) { - if (reader.isEmpty()) { - return null; - } - if (additionalAllowedCharacter != null - && additionalAllowedCharacter == reader.current()) { - return null; - } - if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) { - return null; - } - - reader.mark(); - if (reader.matchConsume("#")) { // numbered - boolean isHexMode = reader.matchConsumeIgnoreCase("X"); - String numRef = isHexMode ? reader.consumeHexSequence() : reader - .consumeDigitSequence(); - if (numRef.length() == 0) { // didn't match anything - characterReferenceError("numeric reference with no numerals"); - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) { - characterReferenceError("missing semicolon"); // missing semi - } - int charval = -1; - try { - int base = isHexMode ? 16 : 10; - charval = Integer.valueOf(numRef, base); - } catch (NumberFormatException e) { - } // skip - if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) - || charval > 0x10FFFF) { - characterReferenceError("character outside of valid range"); - return replacementChar; - } else { - // todo: implement number replacement table - // todo: check for extra illegal unicode points as parse errors - return (char) charval; - } - } else { // named - // get as many letters as possible, and look for matching entities. - // unconsume backwards till a match is found - String nameRef = reader.consumeLetterThenDigitSequence(); - String origNameRef = new String(nameRef); // for error reporting. - // nameRef gets chomped - // looking for matches - boolean looksLegit = reader.matches(';'); - boolean found = false; - while (nameRef.length() > 0 && !found) { - if (Entities.isNamedEntity(nameRef)) { - found = true; - } else { - nameRef = nameRef.substring(0, nameRef.length() - 1); - reader.unconsume(); - } - } - if (!found) { - if (looksLegit) { - characterReferenceError(String.format( - "invalid named referenece '%s'", origNameRef)); - } - reader.rewindToMark(); - return null; - } - if (inAttribute - && (reader.matchesLetter() || reader.matchesDigit() || reader - .matchesAny('=', '-', '_'))) { - // don't want that to match - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) { - characterReferenceError("missing semicolon"); // missing semi - } - return Entities.getCharacterByName(nameRef); - } - } - - Token.Tag createTagPending(boolean start) { - tagPending = start ? new Token.StartTag() : new Token.EndTag(); - return tagPending; - } - - void emitTagPending() { - tagPending.finaliseTag(); - emit(tagPending); - } - - void createCommentPending() { - commentPending = new Token.Comment(); - } - - void emitCommentPending() { - emit(commentPending); - } - - void createDoctypePending() { - doctypePending = new Token.Doctype(); - } - - void emitDoctypePending() { - emit(doctypePending); - } - - void createTempBuffer() { - dataBuffer = new StringBuilder(); - } - - boolean isAppropriateEndTagToken() { - if (lastStartTag == null) { - return false; - } - return tagPending.tagName.equals(lastStartTag.tagName); - } - - String appropriateEndTagName() { - return lastStartTag.tagName; - } - - void error(TokeniserState state) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), - "Unexpected character '%s' in input state [%s]", reader - .current(), state)); - } - } - - void eofError(TokeniserState state) { - if (errors.canAddError()) { - errors.add(new ParseError( - reader.pos(), - "Unexpectedly reached end of file (EOF) in input state [%s]", - state)); - } - } - - private void characterReferenceError(String message) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), - "Invalid character reference: %s", message)); - } - } - - private void error(String errorMsg) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), errorMsg)); - } - } - - boolean currentNodeInHtmlNS() { - // todo: implement namespaces correctly - return true; - // Element currentNode = currentNode(); - // return currentNode != null && currentNode.namespace().equals("HTML"); - } -} diff --git a/server/src/org/jsoup/parser/TokeniserState.java b/server/src/org/jsoup/parser/TokeniserState.java deleted file mode 100644 index 7f7315d769..0000000000 --- a/server/src/org/jsoup/parser/TokeniserState.java +++ /dev/null @@ -1,1870 +0,0 @@ -package org.jsoup.parser; - -/** - * States and transition activations for the Tokeniser. - */ -enum TokeniserState { - Data { - // in data state, gather characters until a character reference or tag - // is found - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '&': - t.advanceTransition(CharacterReferenceInData); - break; - case '<': - t.advanceTransition(TagOpen); - break; - case nullChar: - t.error(this); // NOT replacement character (oddly?) - t.emit(r.consume()); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('&', '<', nullChar); - t.emit(data); - break; - } - } - }, - CharacterReferenceInData { - // from & in data - @Override - void read(Tokeniser t, CharacterReader r) { - Character c = t.consumeCharacterReference(null, false); - if (c == null) { - t.emit('&'); - } else { - t.emit(c); - } - t.transition(Data); - } - }, - Rcdata { - // / handles data in title, textarea etc - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '&': - t.advanceTransition(CharacterReferenceInRcdata); - break; - case '<': - t.advanceTransition(RcdataLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('&', '<', nullChar); - t.emit(data); - break; - } - } - }, - CharacterReferenceInRcdata { - @Override - void read(Tokeniser t, CharacterReader r) { - Character c = t.consumeCharacterReference(null, false); - if (c == null) { - t.emit('&'); - } else { - t.emit(c); - } - t.transition(Rcdata); - } - }, - Rawtext { - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '<': - t.advanceTransition(RawtextLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('<', nullChar); - t.emit(data); - break; - } - } - }, - ScriptData { - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '<': - t.advanceTransition(ScriptDataLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('<', nullChar); - t.emit(data); - break; - } - } - }, - PLAINTEXT { - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeTo(nullChar); - t.emit(data); - break; - } - } - }, - TagOpen { - // from < in data - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '!': - t.advanceTransition(MarkupDeclarationOpen); - break; - case '/': - t.advanceTransition(EndTagOpen); - break; - case '?': - t.advanceTransition(BogusComment); - break; - default: - if (r.matchesLetter()) { - t.createTagPending(true); - t.transition(TagName); - } else { - t.error(this); - t.emit('<'); // char that got us here - t.transition(Data); - } - break; - } - } - }, - EndTagOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.emit("</"); - t.transition(Data); - } else if (r.matchesLetter()) { - t.createTagPending(false); - t.transition(TagName); - } else if (r.matches('>')) { - t.error(this); - t.advanceTransition(Data); - } else { - t.error(this); - t.advanceTransition(BogusComment); - } - } - }, - TagName { - // from < or </ in data, will have start or end tag pending - @Override - void read(Tokeniser t, CharacterReader r) { - // previous TagOpen state did NOT consume, will have a letter char - // in current - String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', - nullChar).toLowerCase(); - t.tagPending.appendTagName(tagName); - - switch (r.consume()) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: // replacement - t.tagPending.appendTagName(replacementStr); - break; - case eof: // should emit pending tag? - t.eofError(this); - t.transition(Data); - // no default, as covered with above consumeToAny - } - } - }, - RcdataLessthanSign { - // from < in rcdata - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matches('/')) { - t.createTempBuffer(); - t.advanceTransition(RCDATAEndTagOpen); - } else if (r.matchesLetter() - && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) { - // diverge from spec: got a start tag, but there's no - // appropriate end tag (</title>), so rather than - // consuming to EOF; break out here - t.tagPending = new Token.EndTag(t.appropriateEndTagName()); - t.emitTagPending(); - r.unconsume(); // undo "<" - t.transition(Data); - } else { - t.emit("<"); - t.transition(Rcdata); - } - } - }, - RCDATAEndTagOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.tagPending.appendTagName(Character.toLowerCase(r.current())); - t.dataBuffer.append(Character.toLowerCase(r.current())); - t.advanceTransition(RCDATAEndTagName); - } else { - t.emit("</"); - t.transition(Rcdata); - } - } - }, - RCDATAEndTagName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.tagPending.appendTagName(name.toLowerCase()); - t.dataBuffer.append(name); - return; - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - if (t.isAppropriateEndTagToken()) { - t.transition(BeforeAttributeName); - } else { - anythingElse(t, r); - } - break; - case '/': - if (t.isAppropriateEndTagToken()) { - t.transition(SelfClosingStartTag); - } else { - anythingElse(t, r); - } - break; - case '>': - if (t.isAppropriateEndTagToken()) { - t.emitTagPending(); - t.transition(Data); - } else { - anythingElse(t, r); - } - break; - default: - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("</" + t.dataBuffer.toString()); - t.transition(Rcdata); - } - }, - RawtextLessthanSign { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matches('/')) { - t.createTempBuffer(); - t.advanceTransition(RawtextEndTagOpen); - } else { - t.emit('<'); - t.transition(Rawtext); - } - } - }, - RawtextEndTagOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.transition(RawtextEndTagName); - } else { - t.emit("</"); - t.transition(Rawtext); - } - } - }, - RawtextEndTagName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.tagPending.appendTagName(name.toLowerCase()); - t.dataBuffer.append(name); - return; - } - - if (t.isAppropriateEndTagToken() && !r.isEmpty()) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - } - } else { - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("</" + t.dataBuffer.toString()); - t.transition(Rawtext); - } - }, - ScriptDataLessthanSign { - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.consume()) { - case '/': - t.createTempBuffer(); - t.transition(ScriptDataEndTagOpen); - break; - case '!': - t.emit("<!"); - t.transition(ScriptDataEscapeStart); - break; - default: - t.emit("<"); - r.unconsume(); - t.transition(ScriptData); - } - } - }, - ScriptDataEndTagOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.transition(ScriptDataEndTagName); - } else { - t.emit("</"); - t.transition(ScriptData); - } - - } - }, - ScriptDataEndTagName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.tagPending.appendTagName(name.toLowerCase()); - t.dataBuffer.append(name); - return; - } - - if (t.isAppropriateEndTagToken() && !r.isEmpty()) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - } - } else { - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("</" + t.dataBuffer.toString()); - t.transition(ScriptData); - } - }, - ScriptDataEscapeStart { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matches('-')) { - t.emit('-'); - t.advanceTransition(ScriptDataEscapeStartDash); - } else { - t.transition(ScriptData); - } - } - }, - ScriptDataEscapeStartDash { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matches('-')) { - t.emit('-'); - t.advanceTransition(ScriptDataEscapedDashDash); - } else { - t.transition(ScriptData); - } - } - }, - ScriptDataEscaped { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.transition(Data); - return; - } - - switch (r.current()) { - case '-': - t.emit('-'); - t.advanceTransition(ScriptDataEscapedDash); - break; - case '<': - t.advanceTransition(ScriptDataEscapedLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - default: - String data = r.consumeToAny('-', '<', nullChar); - t.emit(data); - } - } - }, - ScriptDataEscapedDash { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.transition(Data); - return; - } - - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - t.transition(ScriptDataEscapedDashDash); - break; - case '<': - t.transition(ScriptDataEscapedLessthanSign); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataEscaped); - break; - default: - t.emit(c); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedDashDash { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.transition(Data); - return; - } - - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - break; - case '<': - t.transition(ScriptDataEscapedLessthanSign); - break; - case '>': - t.emit(c); - t.transition(ScriptData); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataEscaped); - break; - default: - t.emit(c); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedLessthanSign { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTempBuffer(); - t.dataBuffer.append(Character.toLowerCase(r.current())); - t.emit("<" + r.current()); - t.advanceTransition(ScriptDataDoubleEscapeStart); - } else if (r.matches('/')) { - t.createTempBuffer(); - t.advanceTransition(ScriptDataEscapedEndTagOpen); - } else { - t.emit('<'); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedEndTagOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.tagPending.appendTagName(Character.toLowerCase(r.current())); - t.dataBuffer.append(r.current()); - t.advanceTransition(ScriptDataEscapedEndTagName); - } else { - t.emit("</"); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedEndTagName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.tagPending.appendTagName(name.toLowerCase()); - t.dataBuffer.append(name); - return; - } - - if (t.isAppropriateEndTagToken() && !r.isEmpty()) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - break; - } - } else { - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("</" + t.dataBuffer.toString()); - t.transition(ScriptDataEscaped); - } - }, - ScriptDataDoubleEscapeStart { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.dataBuffer.append(name.toLowerCase()); - t.emit(name); - return; - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - case '/': - case '>': - if (t.dataBuffer.toString().equals("script")) { - t.transition(ScriptDataDoubleEscaped); - } else { - t.transition(ScriptDataEscaped); - } - t.emit(c); - break; - default: - r.unconsume(); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataDoubleEscaped { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.current(); - switch (c) { - case '-': - t.emit(c); - t.advanceTransition(ScriptDataDoubleEscapedDash); - break; - case '<': - t.emit(c); - t.advanceTransition(ScriptDataDoubleEscapedLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - String data = r.consumeToAny('-', '<', nullChar); - t.emit(data); - } - } - }, - ScriptDataDoubleEscapedDash { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - t.transition(ScriptDataDoubleEscapedDashDash); - break; - case '<': - t.emit(c); - t.transition(ScriptDataDoubleEscapedLessthanSign); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataDoubleEscaped); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.emit(c); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapedDashDash { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - break; - case '<': - t.emit(c); - t.transition(ScriptDataDoubleEscapedLessthanSign); - break; - case '>': - t.emit(c); - t.transition(ScriptData); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataDoubleEscaped); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.emit(c); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapedLessthanSign { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matches('/')) { - t.emit('/'); - t.createTempBuffer(); - t.advanceTransition(ScriptDataDoubleEscapeEnd); - } else { - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapeEnd { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.dataBuffer.append(name.toLowerCase()); - t.emit(name); - return; - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - case '/': - case '>': - if (t.dataBuffer.toString().equals("script")) { - t.transition(ScriptDataEscaped); - } else { - t.transition(ScriptDataDoubleEscaped); - } - t.emit(c); - break; - default: - r.unconsume(); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - BeforeAttributeName { - // from tagname <xxx - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; // ignore whitespace - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - case '=': - t.error(this); - t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); - t.transition(AttributeName); - break; - default: // A-Z, anything else - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - } - } - }, - AttributeName { - // from before attribute name - @Override - void read(Tokeniser t, CharacterReader r) { - String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', - nullChar, '"', '\'', '<'); - t.tagPending.appendAttributeName(name.toLowerCase()); - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(AfterAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '=': - t.transition(BeforeAttributeValue); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeName(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - t.error(this); - t.tagPending.appendAttributeName(c); - // no default, as covered in consumeToAny - } - } - }, - AfterAttributeName { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - // ignore - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '=': - t.transition(BeforeAttributeValue); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeName(replacementChar); - t.transition(AttributeName); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - t.error(this); - t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); - t.transition(AttributeName); - break; - default: // A-Z, anything else - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - } - } - }, - BeforeAttributeValue { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - // ignore - break; - case '"': - t.transition(AttributeValue_doubleQuoted); - break; - case '&': - r.unconsume(); - t.transition(AttributeValue_unquoted); - break; - case '\'': - t.transition(AttributeValue_singleQuoted); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - t.transition(AttributeValue_unquoted); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '>': - t.error(this); - t.emitTagPending(); - t.transition(Data); - break; - case '<': - case '=': - case '`': - t.error(this); - t.tagPending.appendAttributeValue(c); - t.transition(AttributeValue_unquoted); - break; - default: - r.unconsume(); - t.transition(AttributeValue_unquoted); - } - } - }, - AttributeValue_doubleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('"', '&', nullChar); - if (value.length() > 0) { - t.tagPending.appendAttributeValue(value); - } - - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterAttributeValue_quoted); - break; - case '&': - Character ref = t.consumeCharacterReference('"', true); - if (ref != null) { - t.tagPending.appendAttributeValue(ref); - } else { - t.tagPending.appendAttributeValue('&'); - } - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - // no default, handled in consume to any above - } - } - }, - AttributeValue_singleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('\'', '&', nullChar); - if (value.length() > 0) { - t.tagPending.appendAttributeValue(value); - } - - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterAttributeValue_quoted); - break; - case '&': - Character ref = t.consumeCharacterReference('\'', true); - if (ref != null) { - t.tagPending.appendAttributeValue(ref); - } else { - t.tagPending.appendAttributeValue('&'); - } - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - // no default, handled in consume to any above - } - } - }, - AttributeValue_unquoted { - @Override - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', - nullChar, '"', '\'', '<', '=', '`'); - if (value.length() > 0) { - t.tagPending.appendAttributeValue(value); - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '&': - Character ref = t.consumeCharacterReference('>', true); - if (ref != null) { - t.tagPending.appendAttributeValue(ref); - } else { - t.tagPending.appendAttributeValue('&'); - } - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - case '=': - case '`': - t.error(this); - t.tagPending.appendAttributeValue(c); - break; - // no default, handled in consume to any above - } - - } - }, - // CharacterReferenceInAttributeValue state handled inline - AfterAttributeValue_quoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.error(this); - r.unconsume(); - t.transition(BeforeAttributeName); - } - - } - }, - SelfClosingStartTag { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.tagPending.selfClosing = true; - t.emitTagPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BeforeAttributeName); - } - } - }, - BogusComment { - @Override - void read(Tokeniser t, CharacterReader r) { - // todo: handle bogus comment starting from eof. when does that - // trigger? - // rewind to capture character that lead us here - r.unconsume(); - Token.Comment comment = new Token.Comment(); - comment.data.append(r.consumeTo('>')); - // todo: replace nullChar with replaceChar - t.emit(comment); - t.advanceTransition(Data); - } - }, - MarkupDeclarationOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchConsume("--")) { - t.createCommentPending(); - t.transition(CommentStart); - } else if (r.matchConsumeIgnoreCase("DOCTYPE")) { - t.transition(Doctype); - } else if (r.matchConsume("[CDATA[")) { - // todo: should actually check current namepspace, and only - // non-html allows cdata. until namespace - // is implemented properly, keep handling as cdata - // } else if (!t.currentNodeInHtmlNS() && - // r.matchConsume("[CDATA[")) { - t.transition(CdataSection); - } else { - t.error(this); - t.advanceTransition(BogusComment); // advance so this character - // gets in bogus comment - // data's rewind - } - } - }, - CommentStart { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentStartDash); - break; - case nullChar: - t.error(this); - t.commentPending.data.append(replacementChar); - t.transition(Comment); - break; - case '>': - t.error(this); - t.emitCommentPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(c); - t.transition(Comment); - } - } - }, - CommentStartDash { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentStartDash); - break; - case nullChar: - t.error(this); - t.commentPending.data.append(replacementChar); - t.transition(Comment); - break; - case '>': - t.error(this); - t.emitCommentPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(c); - t.transition(Comment); - } - } - }, - Comment { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.current(); - switch (c) { - case '-': - t.advanceTransition(CommentEndDash); - break; - case nullChar: - t.error(this); - r.advance(); - t.commentPending.data.append(replacementChar); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(r.consumeToAny('-', nullChar)); - } - } - }, - CommentEndDash { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentEnd); - break; - case nullChar: - t.error(this); - t.commentPending.data.append('-').append(replacementChar); - t.transition(Comment); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append('-').append(c); - t.transition(Comment); - } - } - }, - CommentEnd { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.emitCommentPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.commentPending.data.append("--").append(replacementChar); - t.transition(Comment); - break; - case '!': - t.error(this); - t.transition(CommentEndBang); - break; - case '-': - t.error(this); - t.commentPending.data.append('-'); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.error(this); - t.commentPending.data.append("--").append(c); - t.transition(Comment); - } - } - }, - CommentEndBang { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.commentPending.data.append("--!"); - t.transition(CommentEndDash); - break; - case '>': - t.emitCommentPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.commentPending.data.append("--!").append(replacementChar); - t.transition(Comment); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append("--!").append(c); - t.transition(Comment); - } - } - }, - Doctype { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypeName); - break; - case eof: - t.eofError(this); - t.createDoctypePending(); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BeforeDoctypeName); - } - } - }, - BeforeDoctypeName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createDoctypePending(); - t.transition(DoctypeName); - return; - } - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; // ignore whitespace - case nullChar: - t.error(this); - t.doctypePending.name.append(replacementChar); - t.transition(DoctypeName); - break; - case eof: - t.eofError(this); - t.createDoctypePending(); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.createDoctypePending(); - t.doctypePending.name.append(c); - t.transition(DoctypeName); - } - } - }, - DoctypeName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.doctypePending.name.append(name.toLowerCase()); - return; - } - char c = r.consume(); - switch (c) { - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(AfterDoctypeName); - break; - case nullChar: - t.error(this); - t.doctypePending.name.append(replacementChar); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.name.append(c); - } - } - }, - AfterDoctypeName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - return; - } - if (r.matchesAny('\t', '\n', '\f', ' ')) { - r.advance(); // ignore whitespace - } else if (r.matches('>')) { - t.emitDoctypePending(); - t.advanceTransition(Data); - } else if (r.matchConsumeIgnoreCase("PUBLIC")) { - t.transition(AfterDoctypePublicKeyword); - } else if (r.matchConsumeIgnoreCase("SYSTEM")) { - t.transition(AfterDoctypeSystemKeyword); - } else { - t.error(this); - t.doctypePending.forceQuirks = true; - t.advanceTransition(BogusDoctype); - } - - } - }, - AfterDoctypePublicKeyword { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypePublicIdentifier); - break; - case '"': - t.error(this); - // set public id to empty string - t.transition(DoctypePublicIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // set public id to empty string - t.transition(DoctypePublicIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - BeforeDoctypePublicIdentifier { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '"': - // set public id to empty string - t.transition(DoctypePublicIdentifier_doubleQuoted); - break; - case '\'': - // set public id to empty string - t.transition(DoctypePublicIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - DoctypePublicIdentifier_doubleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterDoctypePublicIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.publicIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.publicIdentifier.append(c); - } - } - }, - DoctypePublicIdentifier_singleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterDoctypePublicIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.publicIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.publicIdentifier.append(c); - } - } - }, - AfterDoctypePublicIdentifier { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BetweenDoctypePublicAndSystemIdentifiers); - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - BetweenDoctypePublicAndSystemIdentifiers { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - AfterDoctypeSystemKeyword { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypeSystemIdentifier); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - } - } - }, - BeforeDoctypeSystemIdentifier { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '"': - // set system id to empty string - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - // set public id to empty string - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - DoctypeSystemIdentifier_doubleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterDoctypeSystemIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.systemIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.systemIdentifier.append(c); - } - } - }, - DoctypeSystemIdentifier_singleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterDoctypeSystemIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.systemIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.systemIdentifier.append(c); - } - } - }, - AfterDoctypeSystemIdentifier { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BogusDoctype); - // NOT force quirks - } - } - }, - BogusDoctype { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.emitDoctypePending(); - t.transition(Data); - break; - default: - // ignore char - break; - } - } - }, - CdataSection { - @Override - void read(Tokeniser t, CharacterReader r) { - String data = r.consumeTo("]]>"); - t.emit(data); - r.matchConsume("]]>"); - t.transition(Data); - } - }; - - abstract void read(Tokeniser t, CharacterReader r); - - private static final char nullChar = '\u0000'; - private static final char replacementChar = Tokeniser.replacementChar; - private static final String replacementStr = String - .valueOf(Tokeniser.replacementChar); - private static final char eof = CharacterReader.EOF; -} diff --git a/server/src/org/jsoup/parser/TreeBuilder.java b/server/src/org/jsoup/parser/TreeBuilder.java deleted file mode 100644 index 5e2dbebc66..0000000000 --- a/server/src/org/jsoup/parser/TreeBuilder.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -/** - * @author Jonathan Hedley - */ -abstract class TreeBuilder { - CharacterReader reader; - Tokeniser tokeniser; - protected Document doc; // current doc we are building into - protected DescendableLinkedList<Element> stack; // the stack of open - // elements - protected String baseUri; // current base uri, for creating new elements - protected Token currentToken; // currentToken is used only for error - // tracking. - protected ParseErrorList errors; // null when not tracking errors - - protected void initialiseParse(String input, String baseUri, - ParseErrorList errors) { - Validate.notNull(input, "String input must not be null"); - Validate.notNull(baseUri, "BaseURI must not be null"); - - doc = new Document(baseUri); - reader = new CharacterReader(input); - this.errors = errors; - tokeniser = new Tokeniser(reader, errors); - stack = new DescendableLinkedList<Element>(); - this.baseUri = baseUri; - } - - Document parse(String input, String baseUri) { - return parse(input, baseUri, ParseErrorList.noTracking()); - } - - Document parse(String input, String baseUri, ParseErrorList errors) { - initialiseParse(input, baseUri, errors); - runParser(); - return doc; - } - - protected void runParser() { - while (true) { - Token token = tokeniser.read(); - process(token); - - if (token.type == Token.TokenType.EOF) { - break; - } - } - } - - protected abstract boolean process(Token token); - - protected Element currentElement() { - return stack.getLast(); - } -} diff --git a/server/src/org/jsoup/parser/XmlTreeBuilder.java b/server/src/org/jsoup/parser/XmlTreeBuilder.java deleted file mode 100644 index c2a3635b3d..0000000000 --- a/server/src/org/jsoup/parser/XmlTreeBuilder.java +++ /dev/null @@ -1,121 +0,0 @@ -package org.jsoup.parser; - -import java.util.Iterator; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Comment; -import org.jsoup.nodes.DocumentType; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; - -/** - * @author Jonathan Hedley - */ -public class XmlTreeBuilder extends TreeBuilder { - @Override - protected void initialiseParse(String input, String baseUri, - ParseErrorList errors) { - super.initialiseParse(input, baseUri, errors); - stack.add(doc); // place the document onto the stack. differs from - // HtmlTreeBuilder (not on stack) - } - - @Override - protected boolean process(Token token) { - // start tag, end tag, doctype, comment, character, eof - switch (token.type) { - case StartTag: - insert(token.asStartTag()); - break; - case EndTag: - popStackToClose(token.asEndTag()); - break; - case Comment: - insert(token.asComment()); - break; - case Character: - insert(token.asCharacter()); - break; - case Doctype: - insert(token.asDoctype()); - break; - case EOF: // could put some normalisation here if desired - break; - default: - Validate.fail("Unexpected token type: " + token.type); - } - return true; - } - - private void insertNode(Node node) { - currentElement().appendChild(node); - } - - Element insert(Token.StartTag startTag) { - Tag tag = Tag.valueOf(startTag.name()); - // todo: wonder if for xml parsing, should treat all tags as unknown? - // because it's not html. - Element el = new Element(tag, baseUri, startTag.attributes); - insertNode(el); - if (startTag.isSelfClosing()) { - tokeniser.acknowledgeSelfClosingFlag(); - if (!tag.isKnownTag()) { - tag.setSelfClosing(); - } - } else { - stack.add(el); - } - return el; - } - - void insert(Token.Comment commentToken) { - Comment comment = new Comment(commentToken.getData(), baseUri); - insertNode(comment); - } - - void insert(Token.Character characterToken) { - Node node = new TextNode(characterToken.getData(), baseUri); - insertNode(node); - } - - void insert(Token.Doctype d) { - DocumentType doctypeNode = new DocumentType(d.getName(), - d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri); - insertNode(doctypeNode); - } - - /** - * If the stack contains an element with this tag's name, pop up the stack - * to remove the first occurrence. If not found, skips. - * - * @param endTag - */ - private void popStackToClose(Token.EndTag endTag) { - String elName = endTag.name(); - Element firstFound = null; - - Iterator<Element> it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - firstFound = next; - break; - } - } - if (firstFound == null) { - return; // not found, skip - } - - it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == firstFound) { - it.remove(); - break; - } else { - it.remove(); - } - } - } -} diff --git a/server/src/org/jsoup/parser/package-info.java b/server/src/org/jsoup/parser/package-info.java deleted file mode 100644 index c6c3d9a029..0000000000 --- a/server/src/org/jsoup/parser/package-info.java +++ /dev/null @@ -1,5 +0,0 @@ -/** - Contains the HTML parser, tag specifications, and HTML tokeniser. - */ -package org.jsoup.parser; - |