about | summary | refs | log | tree | commit | diff | stats
path: root/server/src/org/jsoup/parser
diff options
context:
space:
mode:
author: Artur Signell <artur@vaadin.com> 2012-08-28 20:00:00 +0300
committer: Artur Signell <artur@vaadin.com> 2012-09-09 11:22:54 +0300
commit: 38212596d91e9e167253d7debb154d18e3ff38b0 (patch)
tree: 99775812644e3ef421cfa3a6039677bc4cdb8093 /server/src/org/jsoup/parser
parent: 0a77dae8b57a99cb5112a387b2a374c14e1fae1b (diff)
download: vaadin-framework-38212596d91e9e167253d7debb154d18e3ff38b0.tar.gz
download: vaadin-framework-38212596d91e9e167253d7debb154d18e3ff38b0.zip
Jsoup is now declared as a dependency (#9299)
Diffstat (limited to 'server/src/org/jsoup/parser')
-rw-r--r-- server/src/org/jsoup/parser/CharacterReader.java 244
-rw-r--r-- server/src/org/jsoup/parser/HtmlTreeBuilder.java 754
-rw-r--r-- server/src/org/jsoup/parser/HtmlTreeBuilderState.java 1671
-rw-r--r-- server/src/org/jsoup/parser/ParseError.java 43
-rw-r--r-- server/src/org/jsoup/parser/ParseErrorList.java 34
-rw-r--r-- server/src/org/jsoup/parser/Parser.java 198
-rw-r--r-- server/src/org/jsoup/parser/Tag.java 298
-rw-r--r-- server/src/org/jsoup/parser/Token.java 253
-rw-r--r-- server/src/org/jsoup/parser/TokenQueue.java 473
-rw-r--r-- server/src/org/jsoup/parser/Tokeniser.java 264
-rw-r--r-- server/src/org/jsoup/parser/TokeniserState.java 1870
-rw-r--r-- server/src/org/jsoup/parser/TreeBuilder.java 61
-rw-r--r-- server/src/org/jsoup/parser/XmlTreeBuilder.java 121
-rw-r--r-- server/src/org/jsoup/parser/package-info.java 5
14 files changed, 0 insertions, 6289 deletions
diff --git a/server/src/org/jsoup/parser/CharacterReader.java b/server/src/org/jsoup/parser/CharacterReader.java
deleted file mode 100644
index 30fbca07f1..0000000000
--- a/server/src/org/jsoup/parser/CharacterReader.java
+++ /dev/null
@@ -1,244 +0,0 @@
-package org.jsoup.parser;
-
-import org.jsoup.helper.Validate;
-
-/**
- * CharacterReader consumes tokens off a string. To replace the old TokenQueue.
- */
-class CharacterReader {
- static final char EOF = (char) -1;
-
- private final String input;
- private final int length;
- private int pos = 0;
- private int mark = 0;
-
- CharacterReader(String input) {
- Validate.notNull(input);
- input = input.replaceAll("\r\n?", "\n"); // normalise carriage returns
- // to newlines
-
- this.input = input;
- length = input.length();
- }
-
- int pos() {
- return pos;
- }
-
- boolean isEmpty() {
- return pos >= length;
- }
-
- char current() {
- return isEmpty() ? EOF : input.charAt(pos);
- }
-
- char consume() {
- char val = isEmpty() ? EOF : input.charAt(pos);
- pos++;
- return val;
- }
-
- void unconsume() {
- pos--;
- }
-
- void advance() {
- pos++;
- }
-
- void mark() {
- mark = pos;
- }
-
- void rewindToMark() {
- pos = mark;
- }
-
- String consumeAsString() {
- return input.substring(pos, pos++);
- }
-
- String consumeTo(char c) {
- int offset = input.indexOf(c, pos);
- if (offset != -1) {
- String consumed = input.substring(pos, offset);
- pos += consumed.length();
- return consumed;
- } else {
- return consumeToEnd();
- }
- }
-
- String consumeTo(String seq) {
- int offset = input.indexOf(seq, pos);
- if (offset != -1) {
- String consumed = input.substring(pos, offset);
- pos += consumed.length();
- return consumed;
- } else {
- return consumeToEnd();
- }
- }
-
- String consumeToAny(char... seq) {
- int start = pos;
-
- OUTER: while (!isEmpty()) {
- char c = input.charAt(pos);
- for (char seek : seq) {
- if (seek == c) {
- break OUTER;
- }
- }
- pos++;
- }
-
- return pos > start ? input.substring(start, pos) : "";
- }
-
- String consumeToEnd() {
- String data = input.substring(pos, input.length());
- pos = input.length();
- return data;
- }
-
- String consumeLetterSequence() {
- int start = pos;
- while (!isEmpty()) {
- char c = input.charAt(pos);
- if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
- pos++;
- } else {
- break;
- }
- }
-
- return input.substring(start, pos);
- }
-
- String consumeLetterThenDigitSequence() {
- int start = pos;
- while (!isEmpty()) {
- char c = input.charAt(pos);
- if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
- pos++;
- } else {
- break;
- }
- }
- while (!isEmpty()) {
- char c = input.charAt(pos);
- if (c >= '0' && c <= '9') {
- pos++;
- } else {
- break;
- }
- }
-
- return input.substring(start, pos);
- }
-
- String consumeHexSequence() {
- int start = pos;
- while (!isEmpty()) {
- char c = input.charAt(pos);
- if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')
- || (c >= 'a' && c <= 'f')) {
- pos++;
- } else {
- break;
- }
- }
- return input.substring(start, pos);
- }
-
- String consumeDigitSequence() {
- int start = pos;
- while (!isEmpty()) {
- char c = input.charAt(pos);
- if (c >= '0' && c <= '9') {
- pos++;
- } else {
- break;
- }
- }
- return input.substring(start, pos);
- }
-
- boolean matches(char c) {
- return !isEmpty() && input.charAt(pos) == c;
-
- }
-
- boolean matches(String seq) {
- return input.startsWith(seq, pos);
- }
-
- boolean matchesIgnoreCase(String seq) {
- return input.regionMatches(true, pos, seq, 0, seq.length());
- }
-
- boolean matchesAny(char... seq) {
- if (isEmpty()) {
- return false;
- }
-
- char c = input.charAt(pos);
- for (char seek : seq) {
- if (seek == c) {
- return true;
- }
- }
- return false;
- }
-
- boolean matchesLetter() {
- if (isEmpty()) {
- return false;
- }
- char c = input.charAt(pos);
- return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
- }
-
- boolean matchesDigit() {
- if (isEmpty()) {
- return false;
- }
- char c = input.charAt(pos);
- return (c >= '0' && c <= '9');
- }
-
- boolean matchConsume(String seq) {
- if (matches(seq)) {
- pos += seq.length();
- return true;
- } else {
- return false;
- }
- }
-
- boolean matchConsumeIgnoreCase(String seq) {
- if (matchesIgnoreCase(seq)) {
- pos += seq.length();
- return true;
- } else {
- return false;
- }
- }
-
- boolean containsIgnoreCase(String seq) {
- // used to check presence of </title>, </style>. only finds consistent
- // case.
- String loScan = seq.toLowerCase();
- String hiScan = seq.toUpperCase();
- return (input.indexOf(loScan, pos) > -1)
- || (input.indexOf(hiScan, pos) > -1);
- }
-
- @Override
- public String toString() {
- return input.substring(pos);
- }
-}
diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilder.java b/server/src/org/jsoup/parser/HtmlTreeBuilder.java
deleted file mode 100644
index f09ab8794c..0000000000
--- a/server/src/org/jsoup/parser/HtmlTreeBuilder.java
+++ /dev/null
@@ -1,754 +0,0 @@
-package org.jsoup.parser;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-
-import org.jsoup.helper.DescendableLinkedList;
-import org.jsoup.helper.StringUtil;
-import org.jsoup.helper.Validate;
-import org.jsoup.nodes.Comment;
-import org.jsoup.nodes.DataNode;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Node;
-import org.jsoup.nodes.TextNode;
-
-/**
- * HTML Tree Builder; creates a DOM from Tokens.
- */
-class HtmlTreeBuilder extends TreeBuilder {
-
- private HtmlTreeBuilderState state; // the current state
- private HtmlTreeBuilderState originalState; // original / marked state
-
- private boolean baseUriSetFromDoc = false;
- private Element headElement; // the current head element
- private Element formElement; // the current form element
- private Element contextElement; // fragment parse context -- could be null
- // even if fragment parsing
- private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active
- // (open)
- // formatting
- // elements
- private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars
- // in
- // table
- // to
- // be
- // shifted
- // out
-
- private boolean framesetOk = true; // if ok to go into frameset
- private boolean fosterInserts = false; // if next inserts should be fostered
- private boolean fragmentParsing = false; // if parsing a fragment of html
-
- HtmlTreeBuilder() {
- }
-
- @Override
- Document parse(String input, String baseUri, ParseErrorList errors) {
- state = HtmlTreeBuilderState.Initial;
- return super.parse(input, baseUri, errors);
- }
-
- List<Node> parseFragment(String inputFragment, Element context,
- String baseUri, ParseErrorList errors) {
- // context may be null
- state = HtmlTreeBuilderState.Initial;
- initialiseParse(inputFragment, baseUri, errors);
- contextElement = context;
- fragmentParsing = true;
- Element root = null;
-
- if (context != null) {
- if (context.ownerDocument() != null) {
- doc.quirksMode(context.ownerDocument().quirksMode());
- }
-
- // initialise the tokeniser state:
- String contextTag = context.tagName();
- if (StringUtil.in(contextTag, "title", "textarea")) {
- tokeniser.transition(TokeniserState.Rcdata);
- } else if (StringUtil.in(contextTag, "iframe", "noembed",
- "noframes", "style", "xmp")) {
- tokeniser.transition(TokeniserState.Rawtext);
- } else if (contextTag.equals("script")) {
- tokeniser.transition(TokeniserState.ScriptData);
- } else if (contextTag.equals(("noscript"))) {
- tokeniser.transition(TokeniserState.Data); // if scripting
- // enabled, rawtext
- } else if (contextTag.equals("plaintext")) {
- tokeniser.transition(TokeniserState.Data);
- } else {
- tokeniser.transition(TokeniserState.Data); // default
- }
-
- root = new Element(Tag.valueOf("html"), baseUri);
- doc.appendChild(root);
- stack.push(root);
- resetInsertionMode();
- // todo: setup form element to nearest form on context (up ancestor
- // chain)
- }
-
- runParser();
- if (context != null) {
- return root.childNodes();
- } else {
- return doc.childNodes();
- }
- }
-
- @Override
- protected boolean process(Token token) {
- currentToken = token;
- return state.process(token, this);
- }
-
- boolean process(Token token, HtmlTreeBuilderState state) {
- currentToken = token;
- return state.process(token, this);
- }
-
- void transition(HtmlTreeBuilderState state) {
- this.state = state;
- }
-
- HtmlTreeBuilderState state() {
- return state;
- }
-
- void markInsertionMode() {
- originalState = state;
- }
-
- HtmlTreeBuilderState originalState() {
- return originalState;
- }
-
- void framesetOk(boolean framesetOk) {
- this.framesetOk = framesetOk;
- }
-
- boolean framesetOk() {
- return framesetOk;
- }
-
- Document getDocument() {
- return doc;
- }
-
- String getBaseUri() {
- return baseUri;
- }
-
- void maybeSetBaseUri(Element base) {
- if (baseUriSetFromDoc) {
- return;
- }
-
- String href = base.absUrl("href");
- if (href.length() != 0) { // ignore <base target> etc
- baseUri = href;
- baseUriSetFromDoc = true;
- doc.setBaseUri(href); // set on the doc so doc.createElement(Tag)
- // will get updated base, and to update all
- // descendants
- }
- }
-
- boolean isFragmentParsing() {
- return fragmentParsing;
- }
-
- void error(HtmlTreeBuilderState state) {
- if (errors.canAddError()) {
- errors.add(new ParseError(reader.pos(),
- "Unexpected token [%s] when in state [%s]", currentToken
- .tokenType(), state));
- }
- }
-
- Element insert(Token.StartTag startTag) {
- // handle empty unknown tags
- // when the spec expects an empty tag, will directly hit insertEmpty, so
- // won't generate fake end tag.
- if (startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name())) {
- Element el = insertEmpty(startTag);
- process(new Token.EndTag(el.tagName())); // ensure we get out of
- // whatever state we are in
- return el;
- }
-
- Element el = new Element(Tag.valueOf(startTag.name()), baseUri,
- startTag.attributes);
- insert(el);
- return el;
- }
-
- Element insert(String startTagName) {
- Element el = new Element(Tag.valueOf(startTagName), baseUri);
- insert(el);
- return el;
- }
-
- void insert(Element el) {
- insertNode(el);
- stack.add(el);
- }
-
- Element insertEmpty(Token.StartTag startTag) {
- Tag tag = Tag.valueOf(startTag.name());
- Element el = new Element(tag, baseUri, startTag.attributes);
- insertNode(el);
- if (startTag.isSelfClosing()) {
- tokeniser.acknowledgeSelfClosingFlag();
- if (!tag.isKnownTag()) {
- tag.setSelfClosing();
- }
- }
- return el;
- }
-
- void insert(Token.Comment commentToken) {
- Comment comment = new Comment(commentToken.getData(), baseUri);
- insertNode(comment);
- }
-
- void insert(Token.Character characterToken) {
- Node node;
- // characters in script and style go in as datanodes, not text nodes
- if (StringUtil.in(currentElement().tagName(), "script", "style")) {
- node = new DataNode(characterToken.getData(), baseUri);
- } else {
- node = new TextNode(characterToken.getData(), baseUri);
- }
- currentElement().appendChild(node); // doesn't use insertNode, because
- // we don't foster these; and will
- // always have a stack.
- }
-
- private void insertNode(Node node) {
- // if the stack hasn't been set up yet, elements (doctype, comments) go
- // into the doc
- if (stack.size() == 0) {
- doc.appendChild(node);
- } else if (isFosterInserts()) {
- insertInFosterParent(node);
- } else {
- currentElement().appendChild(node);
- }
- }
-
- Element pop() {
- // todo - dev, remove validation check
- if (stack.peekLast().nodeName().equals("td")
- && !state.name().equals("InCell")) {
- Validate.isFalse(true, "pop td not in cell");
- }
- if (stack.peekLast().nodeName().equals("html")) {
- Validate.isFalse(true, "popping html!");
- }
- return stack.pollLast();
- }
-
- void push(Element element) {
- stack.add(element);
- }
-
- DescendableLinkedList<Element> getStack() {
- return stack;
- }
-
- boolean onStack(Element el) {
- return isElementInQueue(stack, el);
- }
-
- private boolean isElementInQueue(DescendableLinkedList<Element> queue,
- Element element) {
- Iterator<Element> it = queue.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (next == element) {
- return true;
- }
- }
- return false;
- }
-
- Element getFromStack(String elName) {
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (next.nodeName().equals(elName)) {
- return next;
- }
- }
- return null;
- }
-
- boolean removeFromStack(Element el) {
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (next == el) {
- it.remove();
- return true;
- }
- }
- return false;
- }
-
- void popStackToClose(String elName) {
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (next.nodeName().equals(elName)) {
- it.remove();
- break;
- } else {
- it.remove();
- }
- }
- }
-
- void popStackToClose(String... elNames) {
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (StringUtil.in(next.nodeName(), elNames)) {
- it.remove();
- break;
- } else {
- it.remove();
- }
- }
- }
-
- void popStackToBefore(String elName) {
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (next.nodeName().equals(elName)) {
- break;
- } else {
- it.remove();
- }
- }
- }
-
- void clearStackToTableContext() {
- clearStackToContext("table");
- }
-
- void clearStackToTableBodyContext() {
- clearStackToContext("tbody", "tfoot", "thead");
- }
-
- void clearStackToTableRowContext() {
- clearStackToContext("tr");
- }
-
- private void clearStackToContext(String... nodeNames) {
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (StringUtil.in(next.nodeName(), nodeNames)
- || next.nodeName().equals("html")) {
- break;
- } else {
- it.remove();
- }
- }
- }
-
- Element aboveOnStack(Element el) {
- assert onStack(el);
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (next == el) {
- return it.next();
- }
- }
- return null;
- }
-
- void insertOnStackAfter(Element after, Element in) {
- int i = stack.lastIndexOf(after);
- Validate.isTrue(i != -1);
- stack.add(i + 1, in);
- }
-
- void replaceOnStack(Element out, Element in) {
- replaceInQueue(stack, out, in);
- }
-
- private void replaceInQueue(LinkedList<Element> queue, Element out,
- Element in) {
- int i = queue.lastIndexOf(out);
- Validate.isTrue(i != -1);
- queue.remove(i);
- queue.add(i, in);
- }
-
- void resetInsertionMode() {
- boolean last = false;
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element node = it.next();
- if (!it.hasNext()) {
- last = true;
- node = contextElement;
- }
- String name = node.nodeName();
- if ("select".equals(name)) {
- transition(HtmlTreeBuilderState.InSelect);
- break; // frag
- } else if (("td".equals(name) || "td".equals(name) && !last)) {
- transition(HtmlTreeBuilderState.InCell);
- break;
- } else if ("tr".equals(name)) {
- transition(HtmlTreeBuilderState.InRow);
- break;
- } else if ("tbody".equals(name) || "thead".equals(name)
- || "tfoot".equals(name)) {
- transition(HtmlTreeBuilderState.InTableBody);
- break;
- } else if ("caption".equals(name)) {
- transition(HtmlTreeBuilderState.InCaption);
- break;
- } else if ("colgroup".equals(name)) {
- transition(HtmlTreeBuilderState.InColumnGroup);
- break; // frag
- } else if ("table".equals(name)) {
- transition(HtmlTreeBuilderState.InTable);
- break;
- } else if ("head".equals(name)) {
- transition(HtmlTreeBuilderState.InBody);
- break; // frag
- } else if ("body".equals(name)) {
- transition(HtmlTreeBuilderState.InBody);
- break;
- } else if ("frameset".equals(name)) {
- transition(HtmlTreeBuilderState.InFrameset);
- break; // frag
- } else if ("html".equals(name)) {
- transition(HtmlTreeBuilderState.BeforeHead);
- break; // frag
- } else if (last) {
- transition(HtmlTreeBuilderState.InBody);
- break; // frag
- }
- }
- }
-
- // todo: tidy up in specific scope methods
- private boolean inSpecificScope(String targetName, String[] baseTypes,
- String[] extraTypes) {
- return inSpecificScope(new String[] { targetName }, baseTypes,
- extraTypes);
- }
-
- private boolean inSpecificScope(String[] targetNames, String[] baseTypes,
- String[] extraTypes) {
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element el = it.next();
- String elName = el.nodeName();
- if (StringUtil.in(elName, targetNames)) {
- return true;
- }
- if (StringUtil.in(elName, baseTypes)) {
- return false;
- }
- if (extraTypes != null && StringUtil.in(elName, extraTypes)) {
- return false;
- }
- }
- Validate.fail("Should not be reachable");
- return false;
- }
-
- boolean inScope(String[] targetNames) {
- return inSpecificScope(targetNames, new String[] { "applet", "caption",
- "html", "table", "td", "th", "marquee", "object" }, null);
- }
-
- boolean inScope(String targetName) {
- return inScope(targetName, null);
- }
-
- boolean inScope(String targetName, String[] extras) {
- return inSpecificScope(targetName, new String[] { "applet", "caption",
- "html", "table", "td", "th", "marquee", "object" }, extras);
- // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml
- // todo: in svg namespace: forignOjbect, desc, title
- }
-
- boolean inListItemScope(String targetName) {
- return inScope(targetName, new String[] { "ol", "ul" });
- }
-
- boolean inButtonScope(String targetName) {
- return inScope(targetName, new String[] { "button" });
- }
-
- boolean inTableScope(String targetName) {
- return inSpecificScope(targetName, new String[] { "html", "table" },
- null);
- }
-
- boolean inSelectScope(String targetName) {
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element el = it.next();
- String elName = el.nodeName();
- if (elName.equals(targetName)) {
- return true;
- }
- if (!StringUtil.in(elName, "optgroup", "option")) {
- return false;
- }
- }
- Validate.fail("Should not be reachable");
- return false;
- }
-
- void setHeadElement(Element headElement) {
- this.headElement = headElement;
- }
-
- Element getHeadElement() {
- return headElement;
- }
-
- boolean isFosterInserts() {
- return fosterInserts;
- }
-
- void setFosterInserts(boolean fosterInserts) {
- this.fosterInserts = fosterInserts;
- }
-
- Element getFormElement() {
- return formElement;
- }
-
- void setFormElement(Element formElement) {
- this.formElement = formElement;
- }
-
- void newPendingTableCharacters() {
- pendingTableCharacters = new ArrayList<Token.Character>();
- }
-
- List<Token.Character> getPendingTableCharacters() {
- return pendingTableCharacters;
- }
-
- void setPendingTableCharacters(List<Token.Character> pendingTableCharacters) {
- this.pendingTableCharacters = pendingTableCharacters;
- }
-
- /**
- * 11.2.5.2 Closing elements that have implied end tags
- * <p/>
- * When the steps below require the UA to generate implied end tags, then,
- * while the current node is a dd element, a dt element, an li element, an
- * option element, an optgroup element, a p element, an rp element, or an rt
- * element, the UA must pop the current node off the stack of open elements.
- *
- * @param excludeTag
- * If a step requires the UA to generate implied end tags but
- * lists an element to exclude from the process, then the UA must
- * perform the above steps as if that element was not in the
- * above list.
- */
- void generateImpliedEndTags(String excludeTag) {
- while ((excludeTag != null && !currentElement().nodeName().equals(
- excludeTag))
- && StringUtil.in(currentElement().nodeName(), "dd", "dt", "li",
- "option", "optgroup", "p", "rp", "rt")) {
- pop();
- }
- }
-
- void generateImpliedEndTags() {
- generateImpliedEndTags(null);
- }
-
- boolean isSpecial(Element el) {
- // todo: mathml's mi, mo, mn
- // todo: svg's foreigObject, desc, title
- String name = el.nodeName();
- return StringUtil.in(name, "address", "applet", "area", "article",
- "aside", "base", "basefont", "bgsound", "blockquote", "body",
- "br", "button", "caption", "center", "col", "colgroup",
- "command", "dd", "details", "dir", "div", "dl", "dt", "embed",
- "fieldset", "figcaption", "figure", "footer", "form", "frame",
- "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head",
- "header", "hgroup", "hr", "html", "iframe", "img", "input",
- "isindex", "li", "link", "listing", "marquee", "menu", "meta",
- "nav", "noembed", "noframes", "noscript", "object", "ol", "p",
- "param", "plaintext", "pre", "script", "section", "select",
- "style", "summary", "table", "tbody", "td", "textarea",
- "tfoot", "th", "thead", "title", "tr", "ul", "wbr", "xmp");
- }
-
- // active formatting elements
- void pushActiveFormattingElements(Element in) {
- int numSeen = 0;
- Iterator<Element> iter = formattingElements.descendingIterator();
- while (iter.hasNext()) {
- Element el = iter.next();
- if (el == null) {
- break;
- }
-
- if (isSameFormattingElement(in, el)) {
- numSeen++;
- }
-
- if (numSeen == 3) {
- iter.remove();
- break;
- }
- }
- formattingElements.add(in);
- }
-
- private boolean isSameFormattingElement(Element a, Element b) {
- // same if: same namespace, tag, and attributes. Element.equals only
- // checks tag, might in future check children
- return a.nodeName().equals(b.nodeName()) &&
- // a.namespace().equals(b.namespace()) &&
- a.attributes().equals(b.attributes());
- // todo: namespaces
- }
-
- void reconstructFormattingElements() {
- int size = formattingElements.size();
- if (size == 0 || formattingElements.getLast() == null
- || onStack(formattingElements.getLast())) {
- return;
- }
-
- Element entry = formattingElements.getLast();
- int pos = size - 1;
- boolean skip = false;
- while (true) {
- if (pos == 0) { // step 4. if none before, skip to 8
- skip = true;
- break;
- }
- entry = formattingElements.get(--pos); // step 5. one earlier than
- // entry
- if (entry == null || onStack(entry)) {
- break; // jump to 8, else continue back to 4
- }
- }
- while (true) {
- if (!skip) {
- entry = formattingElements.get(++pos);
- }
- Validate.notNull(entry); // should not occur, as we break at last
- // element
-
- // 8. create new element from element, 9 insert into current node,
- // onto stack
- skip = false; // can only skip increment from 4.
- Element newEl = insert(entry.nodeName()); // todo: avoid fostering
- // here?
- // newEl.namespace(entry.namespace()); // todo: namespaces
- newEl.attributes().addAll(entry.attributes());
-
- // 10. replace entry with new entry
- formattingElements.add(pos, newEl);
- formattingElements.remove(pos + 1);
-
- // 11
- if (pos == size - 1) {
- break;
- }
- }
- }
-
- void clearFormattingElementsToLastMarker() {
- while (!formattingElements.isEmpty()) {
- Element el = formattingElements.peekLast();
- formattingElements.removeLast();
- if (el == null) {
- break;
- }
- }
- }
-
- void removeFromActiveFormattingElements(Element el) {
- Iterator<Element> it = formattingElements.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (next == el) {
- it.remove();
- break;
- }
- }
- }
-
- boolean isInActiveFormattingElements(Element el) {
- return isElementInQueue(formattingElements, el);
- }
-
- Element getActiveFormattingElement(String nodeName) {
- Iterator<Element> it = formattingElements.descendingIterator();
- while (it.hasNext()) {
- Element next = it.next();
- if (next == null) {
- break;
- } else if (next.nodeName().equals(nodeName)) {
- return next;
- }
- }
- return null;
- }
-
- void replaceActiveFormattingElement(Element out, Element in) {
- replaceInQueue(formattingElements, out, in);
- }
-
- void insertMarkerToFormattingElements() {
- formattingElements.add(null);
- }
-
- void insertInFosterParent(Node in) {
- Element fosterParent = null;
- Element lastTable = getFromStack("table");
- boolean isLastTableParent = false;
- if (lastTable != null) {
- if (lastTable.parent() != null) {
- fosterParent = lastTable.parent();
- isLastTableParent = true;
- } else {
- fosterParent = aboveOnStack(lastTable);
- }
- } else { // no table == frag
- fosterParent = stack.get(0);
- }
-
- if (isLastTableParent) {
- Validate.notNull(lastTable); // last table cannot be null by this
- // point.
- lastTable.before(in);
- } else {
- fosterParent.appendChild(in);
- }
- }
-
- @Override
- public String toString() {
- return "TreeBuilder{" + "currentToken=" + currentToken + ", state="
- + state + ", currentElement=" + currentElement() + '}';
- }
-}
diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java b/server/src/org/jsoup/parser/HtmlTreeBuilderState.java
deleted file mode 100644
index 258d547a49..0000000000
--- a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java
+++ /dev/null
@@ -1,1671 +0,0 @@
-package org.jsoup.parser;
-
-import java.util.Iterator;
-import java.util.LinkedList;
-
-import org.jsoup.helper.DescendableLinkedList;
-import org.jsoup.helper.StringUtil;
-import org.jsoup.nodes.Attribute;
-import org.jsoup.nodes.Attributes;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.DocumentType;
-import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Node;
-
-/**
- * The Tree Builder's current state. Each state embodies the processing for the
- * state, and transitions to other states.
- */
-enum HtmlTreeBuilderState {
-    /**
-     * First state: skips whitespace, inserts comments, records a doctype on
-     * the document, and hands anything else to {@code BeforeHtml}.
-     */
-    Initial {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (isWhitespace(t)) {
-                return true; // ignore whitespace
-            }
-            if (t.isComment()) {
-                tb.insert(t.asComment());
-                return true;
-            }
-            if (!t.isDoctype()) {
-                // todo: check not iframe srcdoc
-                tb.transition(BeforeHtml);
-                return tb.process(t); // re-process token
-            }
-
-            // todo: parse error check on expected doctypes
-            // todo: quirk state check on doctype ids
-            Token.Doctype token = t.asDoctype();
-            DocumentType node = new DocumentType(token.getName(),
-                    token.getPublicIdentifier(), token.getSystemIdentifier(),
-                    tb.getBaseUri());
-            tb.getDocument().appendChild(node);
-            if (token.isForceQuirks()) {
-                tb.getDocument().quirksMode(Document.QuirksMode.quirks);
-            }
-            tb.transition(BeforeHtml);
-            return true;
-        }
-    },
-    /**
-     * State before the html element exists: rejects doctypes and most end
-     * tags, inserts comments, ignores whitespace, and otherwise makes sure an
-     * "html" element is inserted before moving on to {@code BeforeHead}.
-     */
-    BeforeHtml {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (t.isDoctype()) {
-                tb.error(this);
-                return false;
-            }
-            if (t.isComment()) {
-                tb.insert(t.asComment());
-                return true;
-            }
-            if (isWhitespace(t)) {
-                return true; // ignore whitespace
-            }
-            if (t.isStartTag() && t.asStartTag().name().equals("html")) {
-                tb.insert(t.asStartTag());
-                tb.transition(BeforeHead);
-                return true;
-            }
-            // only </head>, </body>, </html> and </br> are let through;
-            // every other end tag is an error
-            if (t.isEndTag()
-                    && !StringUtil.in(t.asEndTag().name(), "head", "body",
-                            "html", "br")) {
-                tb.error(this);
-                return false;
-            }
-            return anythingElse(t, tb);
-        }
-
-        /**
-         * Inserts an implied "html" element, switches to BeforeHead and
-         * re-processes the token.
-         */
-        private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
-            tb.insert("html");
-            tb.transition(BeforeHead);
-            return tb.process(t);
-        }
-    },
-    /**
-     * State before the head element exists: ignores whitespace, inserts
-     * comments, rejects doctypes and most end tags, handles explicit "html"
-     * and "head" start tags, and for everything else synthesises a "head"
-     * start tag and re-processes the token.
-     */
-    BeforeHead {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (isWhitespace(t)) {
-                return true;
-            }
-            if (t.isComment()) {
-                tb.insert(t.asComment());
-                return true;
-            }
-            if (t.isDoctype()) {
-                tb.error(this);
-                return false;
-            }
-
-            if (t.isStartTag()) {
-                String tagName = t.asStartTag().name();
-                if (tagName.equals("html")) {
-                    return InBody.process(t, tb); // does not transition
-                }
-                if (tagName.equals("head")) {
-                    Element head = tb.insert(t.asStartTag());
-                    tb.setHeadElement(head);
-                    tb.transition(InHead);
-                    return true;
-                }
-            } else if (t.isEndTag()
-                    && !StringUtil.in(t.asEndTag().name(), "head", "body",
-                            "html", "br")) {
-                tb.error(this);
-                return false;
-            }
-
-            // anything else: open an implied head, then re-process the token
-            tb.process(new Token.StartTag("head"));
-            return tb.process(t);
-        }
-    },
-    /**
-     * State while inside the head element. Whitespace and comments are
-     * inserted, doctypes are rejected, head-level start tags are handled
-     * directly, and anything not belonging in the head closes it (via a
-     * synthesised "head" end tag) before the token is re-processed.
-     */
-    InHead {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (isWhitespace(t)) {
-                tb.insert(t.asCharacter());
-                return true;
-            }
-            switch (t.type) {
-            case Comment:
-                tb.insert(t.asComment());
-                break;
-            case Doctype:
-                tb.error(this);
-                return false;
-            case StartTag:
-                Token.StartTag start = t.asStartTag();
-                String name = start.name();
-                if (name.equals("html")) {
-                    return InBody.process(t, tb);
-                } else if (StringUtil.in(name, "base", "basefont", "bgsound",
-                        "command", "link")) {
-                    Element el = tb.insertEmpty(start);
-                    // jsoup special: update base the first time it is seen
-                    if (name.equals("base") && el.hasAttr("href")) {
-                        tb.maybeSetBaseUri(el);
-                    }
-                } else if (name.equals("meta")) {
-                    // the inserted element is not needed here, so the
-                    // return value is deliberately not kept
-                    tb.insertEmpty(start);
-                    // todo: charset switches
-                } else if (name.equals("title")) {
-                    handleRcData(start, tb);
-                } else if (StringUtil.in(name, "noframes", "style")) {
-                    handleRawtext(start, tb);
-                } else if (name.equals("noscript")) {
-                    // else if noscript && scripting flag = true: rawtext (jsoup
-                    // doesn't run script, to handle as noscript)
-                    tb.insert(start);
-                    tb.transition(InHeadNoscript);
-                } else if (name.equals("script")) {
-                    // skips some script rules as won't execute them
-                    tb.insert(start);
-                    tb.tokeniser.transition(TokeniserState.ScriptData);
-                    tb.markInsertionMode();
-                    tb.transition(Text);
-                } else if (name.equals("head")) {
-                    tb.error(this);
-                    return false;
-                } else {
-                    return anythingElse(t, tb);
-                }
-                break;
-            case EndTag:
-                Token.EndTag end = t.asEndTag();
-                name = end.name();
-                if (name.equals("head")) {
-                    tb.pop();
-                    tb.transition(AfterHead);
-                } else if (StringUtil.in(name, "body", "html", "br")) {
-                    return anythingElse(t, tb);
-                } else {
-                    tb.error(this);
-                    return false;
-                }
-                break;
-            default:
-                return anythingElse(t, tb);
-            }
-            return true;
-        }
-
-        /**
-         * Closes the head by processing a synthesised "head" end tag, then
-         * re-processes the token.
-         */
-        private boolean anythingElse(Token t, TreeBuilder tb) {
-            tb.process(new Token.EndTag("head"));
-            return tb.process(t);
-        }
-    },
-    /**
-     * State while inside a noscript element in the head. Head-level content
-     * is delegated to {@code InHead}; anything unexpected raises an error,
-     * closes the noscript with a synthesised end tag and is re-processed.
-     */
-    InHeadNoscript {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (t.isDoctype()) {
-                tb.error(this);
-                return true;
-            }
-            if (t.isStartTag() && t.asStartTag().name().equals("html")) {
-                return tb.process(t, InBody);
-            }
-            if (t.isEndTag() && t.asEndTag().name().equals("noscript")) {
-                tb.pop();
-                tb.transition(InHead);
-                return true;
-            }
-            if (isWhitespace(t)
-                    || t.isComment()
-                    || (t.isStartTag() && StringUtil.in(t.asStartTag().name(),
-                            "basefont", "bgsound", "link", "meta", "noframes",
-                            "style"))) {
-                return tb.process(t, InHead);
-            }
-            if (t.isEndTag() && t.asEndTag().name().equals("br")) {
-                return anythingElse(t, tb);
-            }
-            if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(),
-                    "head", "noscript")) || t.isEndTag()) {
-                tb.error(this);
-                return false;
-            }
-            return anythingElse(t, tb);
-        }
-
-        /**
-         * Records an error, closes the noscript element through a synthesised
-         * end tag and re-processes the token.
-         */
-        private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
-            tb.error(this);
-            tb.process(new Token.EndTag("noscript"));
-            return tb.process(t);
-        }
-    },
-    /**
-     * State after the head has been closed and before the body is opened.
-     * Explicit "body" and "frameset" start tags are inserted directly,
-     * head-level start tags are processed with the head element temporarily
-     * pushed back on the stack, and anything else opens an implied body.
-     */
-    AfterHead {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (isWhitespace(t)) {
-                tb.insert(t.asCharacter());
-                return true;
-            }
-            if (t.isComment()) {
-                tb.insert(t.asComment());
-                return true;
-            }
-            if (t.isDoctype()) {
-                tb.error(this);
-                return true;
-            }
-
-            if (t.isEndTag()) {
-                if (!StringUtil.in(t.asEndTag().name(), "body", "html")) {
-                    tb.error(this);
-                    return false;
-                }
-                anythingElse(t, tb);
-                return true;
-            }
-
-            if (!t.isStartTag()) {
-                anythingElse(t, tb);
-                return true;
-            }
-
-            Token.StartTag tag = t.asStartTag();
-            String tagName = tag.name();
-            if (tagName.equals("html")) {
-                return tb.process(t, InBody);
-            }
-            if (tagName.equals("body")) {
-                tb.insert(tag);
-                tb.framesetOk(false);
-                tb.transition(InBody);
-            } else if (tagName.equals("frameset")) {
-                tb.insert(tag);
-                tb.transition(InFrameset);
-            } else if (StringUtil.in(tagName, "base", "basefont", "bgsound",
-                    "link", "meta", "noframes", "script", "style", "title")) {
-                tb.error(this);
-                // process under InHead rules with the head element pushed
-                // onto the stack, then take it off the stack again
-                Element headEl = tb.getHeadElement();
-                tb.push(headEl);
-                tb.process(t, InHead);
-                tb.removeFromStack(headEl);
-            } else if (tagName.equals("head")) {
-                tb.error(this);
-                return false;
-            } else {
-                anythingElse(t, tb);
-            }
-            return true;
-        }
-
-        /**
-         * Opens an implied body via a synthesised "body" start tag, sets the
-         * frameset-ok flag to true and re-processes the token.
-         */
-        private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
-            tb.process(new Token.StartTag("body"));
-            tb.framesetOk(true);
-            return tb.process(t);
-        }
-    },
- InBody {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- switch (t.type) {
- case Character: {
- Token.Character c = t.asCharacter();
- if (c.getData().equals(nullString)) {
- // todo confirm that check
- tb.error(this);
- return false;
- } else if (isWhitespace(c)) {
- tb.reconstructFormattingElements();
- tb.insert(c);
- } else {
- tb.reconstructFormattingElements();
- tb.insert(c);
- tb.framesetOk(false);
- }
- break;
- }
- case Comment: {
- tb.insert(t.asComment());
- break;
- }
- case Doctype: {
- tb.error(this);
- return false;
- }
- case StartTag:
- Token.StartTag startTag = t.asStartTag();
- String name = startTag.name();
- if (name.equals("html")) {
- tb.error(this);
- // merge attributes onto real html
- Element html = tb.getStack().getFirst();
- for (Attribute attribute : startTag.getAttributes()) {
- if (!html.hasAttr(attribute.getKey())) {
- html.attributes().put(attribute);
- }
- }
- } else if (StringUtil.in(name, "base", "basefont", "bgsound",
- "command", "link", "meta", "noframes", "script",
- "style", "title")) {
- return tb.process(t, InHead);
- } else if (name.equals("body")) {
- tb.error(this);
- LinkedList<Element> stack = tb.getStack();
- if (stack.size() == 1
- || (stack.size() > 2 && !stack.get(1).nodeName()
- .equals("body"))) {
- // only in fragment case
- return false; // ignore
- } else {
- tb.framesetOk(false);
- Element body = stack.get(1);
- for (Attribute attribute : startTag.getAttributes()) {
- if (!body.hasAttr(attribute.getKey())) {
- body.attributes().put(attribute);
- }
- }
- }
- } else if (name.equals("frameset")) {
- tb.error(this);
- LinkedList<Element> stack = tb.getStack();
- if (stack.size() == 1
- || (stack.size() > 2 && !stack.get(1).nodeName()
- .equals("body"))) {
- // only in fragment case
- return false; // ignore
- } else if (!tb.framesetOk()) {
- return false; // ignore frameset
- } else {
- Element second = stack.get(1);
- if (second.parent() != null) {
- second.remove();
- }
- // pop up to html element
- while (stack.size() > 1) {
- stack.removeLast();
- }
- tb.insert(startTag);
- tb.transition(InFrameset);
- }
- } else if (StringUtil.in(name, "address", "article", "aside",
- "blockquote", "center", "details", "dir", "div", "dl",
- "fieldset", "figcaption", "figure", "footer", "header",
- "hgroup", "menu", "nav", "ol", "p", "section",
- "summary", "ul")) {
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- tb.insert(startTag);
- } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5",
- "h6")) {
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- if (StringUtil.in(tb.currentElement().nodeName(), "h1",
- "h2", "h3", "h4", "h5", "h6")) {
- tb.error(this);
- tb.pop();
- }
- tb.insert(startTag);
- } else if (StringUtil.in(name, "pre", "listing")) {
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- tb.insert(startTag);
- // todo: ignore LF if next token
- tb.framesetOk(false);
- } else if (name.equals("form")) {
- if (tb.getFormElement() != null) {
- tb.error(this);
- return false;
- }
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- Element form = tb.insert(startTag);
- tb.setFormElement(form);
- } else if (name.equals("li")) {
- tb.framesetOk(false);
- LinkedList<Element> stack = tb.getStack();
- for (int i = stack.size() - 1; i > 0; i--) {
- Element el = stack.get(i);
- if (el.nodeName().equals("li")) {
- tb.process(new Token.EndTag("li"));
- break;
- }
- if (tb.isSpecial(el)
- && !StringUtil.in(el.nodeName(), "address",
- "div", "p")) {
- break;
- }
- }
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- tb.insert(startTag);
- } else if (StringUtil.in(name, "dd", "dt")) {
- tb.framesetOk(false);
- LinkedList<Element> stack = tb.getStack();
- for (int i = stack.size() - 1; i > 0; i--) {
- Element el = stack.get(i);
- if (StringUtil.in(el.nodeName(), "dd", "dt")) {
- tb.process(new Token.EndTag(el.nodeName()));
- break;
- }
- if (tb.isSpecial(el)
- && !StringUtil.in(el.nodeName(), "address",
- "div", "p")) {
- break;
- }
- }
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- tb.insert(startTag);
- } else if (name.equals("plaintext")) {
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- tb.insert(startTag);
- tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once
- // in,
- // never
- // gets
- // out
- } else if (name.equals("button")) {
- if (tb.inButtonScope("button")) {
- // close and reprocess
- tb.error(this);
- tb.process(new Token.EndTag("button"));
- tb.process(startTag);
- } else {
- tb.reconstructFormattingElements();
- tb.insert(startTag);
- tb.framesetOk(false);
- }
- } else if (name.equals("a")) {
- if (tb.getActiveFormattingElement("a") != null) {
- tb.error(this);
- tb.process(new Token.EndTag("a"));
-
- // still on stack?
- Element remainingA = tb.getFromStack("a");
- if (remainingA != null) {
- tb.removeFromActiveFormattingElements(remainingA);
- tb.removeFromStack(remainingA);
- }
- }
- tb.reconstructFormattingElements();
- Element a = tb.insert(startTag);
- tb.pushActiveFormattingElements(a);
- } else if (StringUtil.in(name, "b", "big", "code", "em",
- "font", "i", "s", "small", "strike", "strong", "tt",
- "u")) {
- tb.reconstructFormattingElements();
- Element el = tb.insert(startTag);
- tb.pushActiveFormattingElements(el);
- } else if (name.equals("nobr")) {
- tb.reconstructFormattingElements();
- if (tb.inScope("nobr")) {
- tb.error(this);
- tb.process(new Token.EndTag("nobr"));
- tb.reconstructFormattingElements();
- }
- Element el = tb.insert(startTag);
- tb.pushActiveFormattingElements(el);
- } else if (StringUtil.in(name, "applet", "marquee", "object")) {
- tb.reconstructFormattingElements();
- tb.insert(startTag);
- tb.insertMarkerToFormattingElements();
- tb.framesetOk(false);
- } else if (name.equals("table")) {
- if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks
- && tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- tb.insert(startTag);
- tb.framesetOk(false);
- tb.transition(InTable);
- } else if (StringUtil.in(name, "area", "br", "embed", "img",
- "keygen", "wbr")) {
- tb.reconstructFormattingElements();
- tb.insertEmpty(startTag);
- tb.framesetOk(false);
- } else if (name.equals("input")) {
- tb.reconstructFormattingElements();
- Element el = tb.insertEmpty(startTag);
- if (!el.attr("type").equalsIgnoreCase("hidden")) {
- tb.framesetOk(false);
- }
- } else if (StringUtil.in(name, "param", "source", "track")) {
- tb.insertEmpty(startTag);
- } else if (name.equals("hr")) {
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- tb.insertEmpty(startTag);
- tb.framesetOk(false);
- } else if (name.equals("image")) {
- // we're not supposed to ask.
- startTag.name("img");
- return tb.process(startTag);
- } else if (name.equals("isindex")) {
- // how much do we care about the early 90s?
- tb.error(this);
- if (tb.getFormElement() != null) {
- return false;
- }
-
- tb.tokeniser.acknowledgeSelfClosingFlag();
- tb.process(new Token.StartTag("form"));
- if (startTag.attributes.hasKey("action")) {
- Element form = tb.getFormElement();
- form.attr("action", startTag.attributes.get("action"));
- }
- tb.process(new Token.StartTag("hr"));
- tb.process(new Token.StartTag("label"));
- // hope you like english.
- String prompt = startTag.attributes.hasKey("prompt") ? startTag.attributes
- .get("prompt")
- : "This is a searchable index. Enter search keywords: ";
-
- tb.process(new Token.Character(prompt));
-
- // input
- Attributes inputAttribs = new Attributes();
- for (Attribute attr : startTag.attributes) {
- if (!StringUtil.in(attr.getKey(), "name", "action",
- "prompt")) {
- inputAttribs.put(attr);
- }
- }
- inputAttribs.put("name", "isindex");
- tb.process(new Token.StartTag("input", inputAttribs));
- tb.process(new Token.EndTag("label"));
- tb.process(new Token.StartTag("hr"));
- tb.process(new Token.EndTag("form"));
- } else if (name.equals("textarea")) {
- tb.insert(startTag);
- // todo: If the next token is a U+000A LINE FEED (LF)
- // character token, then ignore that token and move on to
- // the next one. (Newlines at the start of textarea elements
- // are ignored as an authoring convenience.)
- tb.tokeniser.transition(TokeniserState.Rcdata);
- tb.markInsertionMode();
- tb.framesetOk(false);
- tb.transition(Text);
- } else if (name.equals("xmp")) {
- if (tb.inButtonScope("p")) {
- tb.process(new Token.EndTag("p"));
- }
- tb.reconstructFormattingElements();
- tb.framesetOk(false);
- handleRawtext(startTag, tb);
- } else if (name.equals("iframe")) {
- tb.framesetOk(false);
- handleRawtext(startTag, tb);
- } else if (name.equals("noembed")) {
- // also handle noscript if script enabled
- handleRawtext(startTag, tb);
- } else if (name.equals("select")) {
- tb.reconstructFormattingElements();
- tb.insert(startTag);
- tb.framesetOk(false);
-
- HtmlTreeBuilderState state = tb.state();
- if (state.equals(InTable) || state.equals(InCaption)
- || state.equals(InTableBody) || state.equals(InRow)
- || state.equals(InCell)) {
- tb.transition(InSelectInTable);
- } else {
- tb.transition(InSelect);
- }
- } else if (StringUtil.in("optgroup", "option")) {
- if (tb.currentElement().nodeName().equals("option")) {
- tb.process(new Token.EndTag("option"));
- }
- tb.reconstructFormattingElements();
- tb.insert(startTag);
- } else if (StringUtil.in("rp", "rt")) {
- if (tb.inScope("ruby")) {
- tb.generateImpliedEndTags();
- if (!tb.currentElement().nodeName().equals("ruby")) {
- tb.error(this);
- tb.popStackToBefore("ruby"); // i.e. close up to but
- // not include name
- }
- tb.insert(startTag);
- }
- } else if (name.equals("math")) {
- tb.reconstructFormattingElements();
- // todo: handle A start tag whose tag name is "math" (i.e.
- // foreign, mathml)
- tb.insert(startTag);
- tb.tokeniser.acknowledgeSelfClosingFlag();
- } else if (name.equals("svg")) {
- tb.reconstructFormattingElements();
- // todo: handle A start tag whose tag name is "svg" (xlink,
- // svg)
- tb.insert(startTag);
- tb.tokeniser.acknowledgeSelfClosingFlag();
- } else if (StringUtil.in(name, "caption", "col", "colgroup",
- "frame", "head", "tbody", "td", "tfoot", "th", "thead",
- "tr")) {
- tb.error(this);
- return false;
- } else {
- tb.reconstructFormattingElements();
- tb.insert(startTag);
- }
- break;
-
- case EndTag:
- Token.EndTag endTag = t.asEndTag();
- name = endTag.name();
- if (name.equals("body")) {
- if (!tb.inScope("body")) {
- tb.error(this);
- return false;
- } else {
- // todo: error if stack contains something not dd, dt,
- // li, optgroup, option, p, rp, rt, tbody, td, tfoot,
- // th, thead, tr, body, html
- tb.transition(AfterBody);
- }
- } else if (name.equals("html")) {
- boolean notIgnored = tb.process(new Token.EndTag("body"));
- if (notIgnored) {
- return tb.process(endTag);
- }
- } else if (StringUtil.in(name, "address", "article", "aside",
- "blockquote", "button", "center", "details", "dir",
- "div", "dl", "fieldset", "figcaption", "figure",
- "footer", "header", "hgroup", "listing", "menu", "nav",
- "ol", "pre", "section", "summary", "ul")) {
- // todo: refactor these lookups
- if (!tb.inScope(name)) {
- // nothing to close
- tb.error(this);
- return false;
- } else {
- tb.generateImpliedEndTags();
- if (!tb.currentElement().nodeName().equals(name)) {
- tb.error(this);
- }
- tb.popStackToClose(name);
- }
- } else if (name.equals("form")) {
- Element currentForm = tb.getFormElement();
- tb.setFormElement(null);
- if (currentForm == null || !tb.inScope(name)) {
- tb.error(this);
- return false;
- } else {
- tb.generateImpliedEndTags();
- if (!tb.currentElement().nodeName().equals(name)) {
- tb.error(this);
- }
- // remove currentForm from stack. will shift anything
- // under up.
- tb.removeFromStack(currentForm);
- }
- } else if (name.equals("p")) {
- if (!tb.inButtonScope(name)) {
- tb.error(this);
- tb.process(new Token.StartTag(name)); // if no p to
- // close, creates
- // an empty
- // <p></p>
- return tb.process(endTag);
- } else {
- tb.generateImpliedEndTags(name);
- if (!tb.currentElement().nodeName().equals(name)) {
- tb.error(this);
- }
- tb.popStackToClose(name);
- }
- } else if (name.equals("li")) {
- if (!tb.inListItemScope(name)) {
- tb.error(this);
- return false;
- } else {
- tb.generateImpliedEndTags(name);
- if (!tb.currentElement().nodeName().equals(name)) {
- tb.error(this);
- }
- tb.popStackToClose(name);
- }
- } else if (StringUtil.in(name, "dd", "dt")) {
- if (!tb.inScope(name)) {
- tb.error(this);
- return false;
- } else {
- tb.generateImpliedEndTags(name);
- if (!tb.currentElement().nodeName().equals(name)) {
- tb.error(this);
- }
- tb.popStackToClose(name);
- }
- } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5",
- "h6")) {
- if (!tb.inScope(new String[] { "h1", "h2", "h3", "h4",
- "h5", "h6" })) {
- tb.error(this);
- return false;
- } else {
- tb.generateImpliedEndTags(name);
- if (!tb.currentElement().nodeName().equals(name)) {
- tb.error(this);
- }
- tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6");
- }
- } else if (name.equals("sarcasm")) {
- // *sigh*
- return anyOtherEndTag(t, tb);
- } else if (StringUtil.in(name, "a", "b", "big", "code", "em",
- "font", "i", "nobr", "s", "small", "strike", "strong",
- "tt", "u")) {
- // Adoption Agency Algorithm.
- OUTER: for (int i = 0; i < 8; i++) {
- Element formatEl = tb.getActiveFormattingElement(name);
- if (formatEl == null) {
- return anyOtherEndTag(t, tb);
- } else if (!tb.onStack(formatEl)) {
- tb.error(this);
- tb.removeFromActiveFormattingElements(formatEl);
- return true;
- } else if (!tb.inScope(formatEl.nodeName())) {
- tb.error(this);
- return false;
- } else if (tb.currentElement() != formatEl) {
- tb.error(this);
- }
-
- Element furthestBlock = null;
- Element commonAncestor = null;
- boolean seenFormattingElement = false;
- LinkedList<Element> stack = tb.getStack();
- for (int si = 0; si < stack.size(); si++) {
- Element el = stack.get(si);
- if (el == formatEl) {
- commonAncestor = stack.get(si - 1);
- seenFormattingElement = true;
- } else if (seenFormattingElement
- && tb.isSpecial(el)) {
- furthestBlock = el;
- break;
- }
- }
- if (furthestBlock == null) {
- tb.popStackToClose(formatEl.nodeName());
- tb.removeFromActiveFormattingElements(formatEl);
- return true;
- }
-
- // todo: Let a bookmark note the position of the
- // formatting element in the list of active formatting
- // elements relative to the elements on either side of
- // it in the list.
- // does that mean: int pos of format el in list?
- Element node = furthestBlock;
- Element lastNode = furthestBlock;
- INNER: for (int j = 0; j < 3; j++) {
- if (tb.onStack(node)) {
- node = tb.aboveOnStack(node);
- }
- if (!tb.isInActiveFormattingElements(node)) { // note
- // no
- // bookmark
- // check
- tb.removeFromStack(node);
- continue INNER;
- } else if (node == formatEl) {
- break INNER;
- }
-
- Element replacement = new Element(Tag.valueOf(node
- .nodeName()), tb.getBaseUri());
- tb.replaceActiveFormattingElement(node, replacement);
- tb.replaceOnStack(node, replacement);
- node = replacement;
-
- if (lastNode == furthestBlock) {
- // todo: move the aforementioned bookmark to be
- // immediately after the new node in the list of
- // active formatting elements.
- // not getting how this bookmark both straddles
- // the element above, but is inbetween here...
- }
- if (lastNode.parent() != null) {
- lastNode.remove();
- }
- node.appendChild(lastNode);
-
- lastNode = node;
- }
-
- if (StringUtil.in(commonAncestor.nodeName(), "table",
- "tbody", "tfoot", "thead", "tr")) {
- if (lastNode.parent() != null) {
- lastNode.remove();
- }
- tb.insertInFosterParent(lastNode);
- } else {
- if (lastNode.parent() != null) {
- lastNode.remove();
- }
- commonAncestor.appendChild(lastNode);
- }
-
- Element adopter = new Element(Tag.valueOf(name),
- tb.getBaseUri());
- Node[] childNodes = furthestBlock.childNodes().toArray(
- new Node[furthestBlock.childNodes().size()]);
- for (Node childNode : childNodes) {
- adopter.appendChild(childNode); // append will
- // reparent. thus
- // the clone to
- // avoid concurrent
- // mod.
- }
- furthestBlock.appendChild(adopter);
- tb.removeFromActiveFormattingElements(formatEl);
- // todo: insert the new element into the list of active
- // formatting elements at the position of the
- // aforementioned bookmark.
- tb.removeFromStack(formatEl);
- tb.insertOnStackAfter(furthestBlock, adopter);
- }
- } else if (StringUtil.in(name, "applet", "marquee", "object")) {
- if (!tb.inScope("name")) {
- if (!tb.inScope(name)) {
- tb.error(this);
- return false;
- }
- tb.generateImpliedEndTags();
- if (!tb.currentElement().nodeName().equals(name)) {
- tb.error(this);
- }
- tb.popStackToClose(name);
- tb.clearFormattingElementsToLastMarker();
- }
- } else if (name.equals("br")) {
- tb.error(this);
- tb.process(new Token.StartTag("br"));
- return false;
- } else {
- return anyOtherEndTag(t, tb);
- }
-
- break;
- case EOF:
- // todo: error if stack contains something not dd, dt, li, p,
- // tbody, td, tfoot, th, thead, tr, body, html
- // stop parsing
- break;
- }
- return true;
- }
-
- boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) {
- String name = t.asEndTag().name();
- DescendableLinkedList<Element> stack = tb.getStack();
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element node = it.next();
- if (node.nodeName().equals(name)) {
- tb.generateImpliedEndTags(name);
- if (!name.equals(tb.currentElement().nodeName())) {
- tb.error(this);
- }
- tb.popStackToClose(name);
- break;
- } else {
- if (tb.isSpecial(node)) {
- tb.error(this);
- return false;
- }
- }
- }
- return true;
- }
- },
-    /**
-     * State used while reading text content (in script, style etc. normally
-     * treated as data tags). Characters are inserted; an end tag or EOF pops
-     * the current element and returns to the state recorded as the original
-     * state.
-     */
-    Text {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (t.isCharacter()) {
-                tb.insert(t.asCharacter());
-                return true;
-            }
-            if (t.isEOF()) {
-                tb.error(this);
-                // if current node is script: already started
-                tb.pop();
-                tb.transition(tb.originalState());
-                return tb.process(t);
-            }
-            if (t.isEndTag()) {
-                // if: An end tag whose tag name is "script" -- scripting
-                // nesting level, if evaluating scripts
-                tb.pop();
-                tb.transition(tb.originalState());
-            }
-            return true;
-        }
-    },
-    /**
-     * State while directly inside a table element. Table structure tags are
-     * handled here; character data is routed through {@code InTableText};
-     * anything that does not belong in a table is processed with the InBody
-     * rules, with foster inserts switched on when the current element is a
-     * table section.
-     */
-    InTable {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (t.isCharacter()) {
-                tb.newPendingTableCharacters();
-                tb.markInsertionMode();
-                tb.transition(InTableText);
-                return tb.process(t);
-            } else if (t.isComment()) {
-                tb.insert(t.asComment());
-                return true;
-            } else if (t.isDoctype()) {
-                tb.error(this);
-                return false;
-            } else if (t.isStartTag()) {
-                Token.StartTag startTag = t.asStartTag();
-                String name = startTag.name();
-                if (name.equals("caption")) {
-                    tb.clearStackToTableContext();
-                    tb.insertMarkerToFormattingElements();
-                    tb.insert(startTag);
-                    tb.transition(InCaption);
-                } else if (name.equals("colgroup")) {
-                    tb.clearStackToTableContext();
-                    tb.insert(startTag);
-                    tb.transition(InColumnGroup);
-                } else if (name.equals("col")) {
-                    tb.process(new Token.StartTag("colgroup"));
-                    return tb.process(t);
-                } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) {
-                    tb.clearStackToTableContext();
-                    tb.insert(startTag);
-                    tb.transition(InTableBody);
-                } else if (StringUtil.in(name, "td", "th", "tr")) {
-                    tb.process(new Token.StartTag("tbody"));
-                    return tb.process(t);
-                } else if (name.equals("table")) {
-                    tb.error(this);
-                    boolean processed = tb.process(new Token.EndTag("table"));
-                    if (processed) {
-                        return tb.process(t);
-                    }
-                } else if (StringUtil.in(name, "style", "script")) {
-                    return tb.process(t, InHead);
-                } else if (name.equals("input")) {
-                    if (!startTag.attributes.get("type").equalsIgnoreCase(
-                            "hidden")) {
-                        return anythingElse(t, tb);
-                    } else {
-                        tb.insertEmpty(startTag);
-                    }
-                } else if (name.equals("form")) {
-                    tb.error(this);
-                    if (tb.getFormElement() != null) {
-                        return false;
-                    } else {
-                        Element form = tb.insertEmpty(startTag);
-                        tb.setFormElement(form);
-                    }
-                } else {
-                    return anythingElse(t, tb);
-                }
-                // The start tag was fully handled above. Returning here keeps
-                // it from also reaching the trailing anythingElse() call,
-                // which would process the same token again with InBody rules.
-                return true;
-            } else if (t.isEndTag()) {
-                Token.EndTag endTag = t.asEndTag();
-                String name = endTag.name();
-
-                if (name.equals("table")) {
-                    if (!tb.inTableScope(name)) {
-                        tb.error(this);
-                        return false;
-                    } else {
-                        tb.popStackToClose("table");
-                    }
-                    tb.resetInsertionMode();
-                } else if (StringUtil.in(name, "body", "caption", "col",
-                        "colgroup", "html", "tbody", "td", "tfoot", "th",
-                        "thead", "tr")) {
-                    tb.error(this);
-                    return false;
-                } else {
-                    return anythingElse(t, tb);
-                }
-                // handled end tag: do not fall through to anythingElse()
-                return true;
-            } else if (t.isEOF()) {
-                if (tb.currentElement().nodeName().equals("html")) {
-                    tb.error(this);
-                }
-                return true; // stops parsing
-            }
-            return anythingElse(t, tb);
-        }
-
-        /**
-         * Records an error and processes the token using the InBody rules.
-         * While the current element is a table, tbody, tfoot, thead or tr,
-         * foster inserts are enabled for the duration of that processing.
-         *
-         * @return the result of processing the token as InBody
-         */
-        boolean anythingElse(Token t, HtmlTreeBuilder tb) {
-            tb.error(this);
-            boolean processed;
-            if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody",
-                    "tfoot", "thead", "tr")) {
-                tb.setFosterInserts(true);
-                processed = tb.process(t, InBody);
-                tb.setFosterInserts(false);
-            } else {
-                processed = tb.process(t, InBody);
-            }
-            return processed;
-        }
-    },
-    /**
-     * State collecting character tokens that appear inside a table.
-     * Characters are queued as pending table characters; the first
-     * non-character token flushes the queue, returns to the original state
-     * and is then re-processed there.
-     */
-    InTableText {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            switch (t.type) {
-            case Character:
-                Token.Character ch = t.asCharacter();
-                if (ch.getData().equals(nullString)) {
-                    tb.error(this);
-                    return false;
-                }
-                tb.getPendingTableCharacters().add(ch);
-                return true;
-            default:
-                if (tb.getPendingTableCharacters().size() > 0) {
-                    flushPending(tb);
-                    tb.newPendingTableCharacters();
-                }
-                tb.transition(tb.originalState());
-                return tb.process(t);
-            }
-        }
-
-        /**
-         * Emits every pending table character: whitespace is inserted as is,
-         * anything else is an error and goes through the InBody rules (with
-         * foster inserts enabled while the current element is a table
-         * section).
-         */
-        private void flushPending(HtmlTreeBuilder tb) {
-            for (Token.Character pending : tb.getPendingTableCharacters()) {
-                if (isWhitespace(pending)) {
-                    tb.insert(pending);
-                    continue;
-                }
-                // InTable anything else section:
-                tb.error(this);
-                boolean foster = StringUtil.in(tb.currentElement().nodeName(),
-                        "table", "tbody", "tfoot", "thead", "tr");
-                if (foster) {
-                    tb.setFosterInserts(true);
-                    tb.process(pending, InBody);
-                    tb.setFosterInserts(false);
-                } else {
-                    tb.process(pending, InBody);
-                }
-            }
-        }
-    },
-    /**
-     * State while inside a table caption. A caption end tag closes the
-     * caption and returns to {@code InTable}; table structure start tags and
-     * a table end tag close the caption implicitly and are re-processed;
-     * everything else is handled with the InBody rules.
-     */
-    InCaption {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (t.isEndTag() && t.asEndTag().name().equals("caption")) {
-                if (!tb.inTableScope(t.asEndTag().name())) {
-                    tb.error(this);
-                    return false;
-                }
-                tb.generateImpliedEndTags();
-                if (!tb.currentElement().nodeName().equals("caption")) {
-                    tb.error(this);
-                }
-                tb.popStackToClose("caption");
-                tb.clearFormattingElementsToLastMarker();
-                tb.transition(InTable);
-                return true;
-            }
-
-            if ((t.isStartTag()
-                    && StringUtil.in(t.asStartTag().name(), "caption", "col",
-                            "colgroup", "tbody", "td", "tfoot", "th", "thead",
-                            "tr") || t.isEndTag()
-                    && t.asEndTag().name().equals("table"))) {
-                tb.error(this);
-                // close the caption first; only re-process the token when
-                // that end tag was not ignored
-                boolean closed = tb.process(new Token.EndTag("caption"));
-                return closed ? tb.process(t) : true;
-            }
-
-            if (t.isEndTag()
-                    && StringUtil.in(t.asEndTag().name(), "body", "col",
-                            "colgroup", "html", "tbody", "td", "tfoot", "th",
-                            "thead", "tr")) {
-                tb.error(this);
-                return false;
-            }
-
-            return tb.process(t, InBody);
-        }
-    },
-    /**
-     * State while inside a colgroup element. Whitespace, comments and "col"
-     * start tags are inserted; a colgroup end tag returns to {@code InTable};
-     * anything else closes the colgroup implicitly and is re-processed.
-     */
-    InColumnGroup {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            if (isWhitespace(t)) {
-                tb.insert(t.asCharacter());
-                return true;
-            }
-            switch (t.type) {
-            case Comment:
-                tb.insert(t.asComment());
-                return true;
-            case Doctype:
-                tb.error(this);
-                return true;
-            case StartTag: {
-                Token.StartTag open = t.asStartTag();
-                String openName = open.name();
-                if (openName.equals("html")) {
-                    return tb.process(t, InBody);
-                }
-                if (!openName.equals("col")) {
-                    return anythingElse(t, tb);
-                }
-                tb.insertEmpty(open);
-                return true;
-            }
-            case EndTag: {
-                String closeName = t.asEndTag().name();
-                if (!closeName.equals("colgroup")) {
-                    return anythingElse(t, tb);
-                }
-                if (tb.currentElement().nodeName().equals("html")) {
-                    // frag case
-                    tb.error(this);
-                    return false;
-                }
-                tb.pop();
-                tb.transition(InTable);
-                return true;
-            }
-            case EOF:
-                if (tb.currentElement().nodeName().equals("html")) {
-                    return true; // stop parsing; frag case
-                }
-                return anythingElse(t, tb);
-            default:
-                return anythingElse(t, tb);
-            }
-        }
-
-        /**
-         * Closes the colgroup through a synthesised end tag and, if that end
-         * tag was not ignored, re-processes the token.
-         */
-        private boolean anythingElse(Token t, TreeBuilder tb) {
-            boolean closed = tb.process(new Token.EndTag("colgroup"));
-            return closed ? tb.process(t) : true;
-        }
-    },
-    /**
-     * State while inside a tbody, thead or tfoot element. Row start tags move
-     * to {@code InRow} (creating an implied "tr" for stray cells); tags that
-     * end the section close it and are re-processed; everything else is
-     * handled with the InTable rules.
-     */
-    InTableBody {
-        @Override
-        boolean process(Token t, HtmlTreeBuilder tb) {
-            switch (t.type) {
-            case StartTag:
-                Token.StartTag startTag = t.asStartTag();
-                String name = startTag.name();
-                if (name.equals("tr")) {
-                    tb.clearStackToTableBodyContext();
-                    tb.insert(startTag);
-                    tb.transition(InRow);
-                } else if (StringUtil.in(name, "th", "td")) {
-                    // a cell without a row: open an implied tr first
-                    tb.error(this);
-                    tb.process(new Token.StartTag("tr"));
-                    return tb.process(startTag);
-                } else if (StringUtil.in(name, "caption", "col", "colgroup",
-                        "tbody", "tfoot", "thead")) {
-                    return exitTableBody(t, tb);
-                } else {
-                    return anythingElse(t, tb);
-                }
-                break;
-            case EndTag:
-                Token.EndTag endTag = t.asEndTag();
-                name = endTag.name();
-                if (StringUtil.in(name, "tbody", "tfoot", "thead")) {
-                    if (!tb.inTableScope(name)) {
-                        tb.error(this);
-                        return false;
-                    } else {
-                        tb.clearStackToTableBodyContext();
-                        tb.pop();
-                        tb.transition(InTable);
-                    }
-                } else if (name.equals("table")) {
-                    return exitTableBody(t, tb);
-                } else if (StringUtil.in(name, "body", "caption", "col",
-                        "colgroup", "html", "td", "th", "tr")) {
-                    tb.error(this);
-                    return false;
-                } else {
-                    return anythingElse(t, tb);
-                }
-                break;
-            default:
-                return anythingElse(t, tb);
-            }
-            return true;
-        }
-
-        /**
-         * Closes the current table section and re-processes the token. If no
-         * tbody, thead or tfoot is in table scope (fragment case) an error is
-         * recorded and the token is ignored.
-         */
-        private boolean exitTableBody(Token t, HtmlTreeBuilder tb) {
-            // all three section kinds are checked against table scope
-            if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb
-                    .inTableScope("tfoot"))) {
-                // frag case
-                tb.error(this);
-                return false;
-            }
-            tb.clearStackToTableBodyContext();
-            // the current element is now the tbody, tfoot or thead
-            tb.process(new Token.EndTag(tb.currentElement().nodeName()));
-            return tb.process(t);
-        }
-
-        /** Handles the token using the InTable rules. */
-        private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
-            return tb.process(t, InTable);
-        }
-    },
- InRow {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- if (t.isStartTag()) {
- Token.StartTag startTag = t.asStartTag();
- String name = startTag.name();
-
- if (StringUtil.in(name, "th", "td")) {
- tb.clearStackToTableRowContext();
- tb.insert(startTag);
- tb.transition(InCell);
- tb.insertMarkerToFormattingElements();
- } else if (StringUtil.in(name, "caption", "col", "colgroup",
- "tbody", "tfoot", "thead", "tr")) {
- return handleMissingTr(t, tb);
- } else {
- return anythingElse(t, tb);
- }
- } else if (t.isEndTag()) {
- Token.EndTag endTag = t.asEndTag();
- String name = endTag.name();
-
- if (name.equals("tr")) {
- if (!tb.inTableScope(name)) {
- tb.error(this); // frag
- return false;
- }
- tb.clearStackToTableRowContext();
- tb.pop(); // tr
- tb.transition(InTableBody);
- } else if (name.equals("table")) {
- return handleMissingTr(t, tb);
- } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) {
- if (!tb.inTableScope(name)) {
- tb.error(this);
- return false;
- }
- tb.process(new Token.EndTag("tr"));
- return tb.process(t);
- } else if (StringUtil.in(name, "body", "caption", "col",
- "colgroup", "html", "td", "th")) {
- tb.error(this);
- return false;
- } else {
- return anythingElse(t, tb);
- }
- } else {
- return anythingElse(t, tb);
- }
- return true;
- }
-
- private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
- return tb.process(t, InTable);
- }
-
- private boolean handleMissingTr(Token t, TreeBuilder tb) {
- boolean processed = tb.process(new Token.EndTag("tr"));
- if (processed) {
- return tb.process(t);
- } else {
- return false;
- }
- }
- },
- InCell {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- if (t.isEndTag()) {
- Token.EndTag endTag = t.asEndTag();
- String name = endTag.name();
-
- if (StringUtil.in(name, "td", "th")) {
- if (!tb.inTableScope(name)) {
- tb.error(this);
- tb.transition(InRow); // might not be in scope if empty:
- // <td /> and processing fake end
- // tag
- return false;
- }
- tb.generateImpliedEndTags();
- if (!tb.currentElement().nodeName().equals(name)) {
- tb.error(this);
- }
- tb.popStackToClose(name);
- tb.clearFormattingElementsToLastMarker();
- tb.transition(InRow);
- } else if (StringUtil.in(name, "body", "caption", "col",
- "colgroup", "html")) {
- tb.error(this);
- return false;
- } else if (StringUtil.in(name, "table", "tbody", "tfoot",
- "thead", "tr")) {
- if (!tb.inTableScope(name)) {
- tb.error(this);
- return false;
- }
- closeCell(tb);
- return tb.process(t);
- } else {
- return anythingElse(t, tb);
- }
- } else if (t.isStartTag()
- && StringUtil.in(t.asStartTag().name(), "caption", "col",
- "colgroup", "tbody", "td", "tfoot", "th", "thead",
- "tr")) {
- if (!(tb.inTableScope("td") || tb.inTableScope("th"))) {
- tb.error(this);
- return false;
- }
- closeCell(tb);
- return tb.process(t);
- } else {
- return anythingElse(t, tb);
- }
- return true;
- }
-
- private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
- return tb.process(t, InBody);
- }
-
- private void closeCell(HtmlTreeBuilder tb) {
- if (tb.inTableScope("td")) {
- tb.process(new Token.EndTag("td"));
- } else {
- tb.process(new Token.EndTag("th")); // only here if th or td in
- // scope
- }
- }
- },
- InSelect {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- switch (t.type) {
- case Character:
- Token.Character c = t.asCharacter();
- if (c.getData().equals(nullString)) {
- tb.error(this);
- return false;
- } else {
- tb.insert(c);
- }
- break;
- case Comment:
- tb.insert(t.asComment());
- break;
- case Doctype:
- tb.error(this);
- return false;
- case StartTag:
- Token.StartTag start = t.asStartTag();
- String name = start.name();
- if (name.equals("html")) {
- return tb.process(start, InBody);
- } else if (name.equals("option")) {
- tb.process(new Token.EndTag("option"));
- tb.insert(start);
- } else if (name.equals("optgroup")) {
- if (tb.currentElement().nodeName().equals("option")) {
- tb.process(new Token.EndTag("option"));
- } else if (tb.currentElement().nodeName()
- .equals("optgroup")) {
- tb.process(new Token.EndTag("optgroup"));
- }
- tb.insert(start);
- } else if (name.equals("select")) {
- tb.error(this);
- return tb.process(new Token.EndTag("select"));
- } else if (StringUtil.in(name, "input", "keygen", "textarea")) {
- tb.error(this);
- if (!tb.inSelectScope("select")) {
- return false; // frag
- }
- tb.process(new Token.EndTag("select"));
- return tb.process(start);
- } else if (name.equals("script")) {
- return tb.process(t, InHead);
- } else {
- return anythingElse(t, tb);
- }
- break;
- case EndTag:
- Token.EndTag end = t.asEndTag();
- name = end.name();
- if (name.equals("optgroup")) {
- if (tb.currentElement().nodeName().equals("option")
- && tb.aboveOnStack(tb.currentElement()) != null
- && tb.aboveOnStack(tb.currentElement()).nodeName()
- .equals("optgroup")) {
- tb.process(new Token.EndTag("option"));
- }
- if (tb.currentElement().nodeName().equals("optgroup")) {
- tb.pop();
- } else {
- tb.error(this);
- }
- } else if (name.equals("option")) {
- if (tb.currentElement().nodeName().equals("option")) {
- tb.pop();
- } else {
- tb.error(this);
- }
- } else if (name.equals("select")) {
- if (!tb.inSelectScope(name)) {
- tb.error(this);
- return false;
- } else {
- tb.popStackToClose(name);
- tb.resetInsertionMode();
- }
- } else {
- return anythingElse(t, tb);
- }
- break;
- case EOF:
- if (!tb.currentElement().nodeName().equals("html")) {
- tb.error(this);
- }
- break;
- default:
- return anythingElse(t, tb);
- }
- return true;
- }
-
- private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
- tb.error(this);
- return false;
- }
- },
- InSelectInTable {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- if (t.isStartTag()
- && StringUtil.in(t.asStartTag().name(), "caption", "table",
- "tbody", "tfoot", "thead", "tr", "td", "th")) {
- tb.error(this);
- tb.process(new Token.EndTag("select"));
- return tb.process(t);
- } else if (t.isEndTag()
- && StringUtil.in(t.asEndTag().name(), "caption", "table",
- "tbody", "tfoot", "thead", "tr", "td", "th")) {
- tb.error(this);
- if (tb.inTableScope(t.asEndTag().name())) {
- tb.process(new Token.EndTag("select"));
- return (tb.process(t));
- } else {
- return false;
- }
- } else {
- return tb.process(t, InSelect);
- }
- }
- },
- AfterBody {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- if (isWhitespace(t)) {
- return tb.process(t, InBody);
- } else if (t.isComment()) {
- tb.insert(t.asComment()); // into html node
- } else if (t.isDoctype()) {
- tb.error(this);
- return false;
- } else if (t.isStartTag() && t.asStartTag().name().equals("html")) {
- return tb.process(t, InBody);
- } else if (t.isEndTag() && t.asEndTag().name().equals("html")) {
- if (tb.isFragmentParsing()) {
- tb.error(this);
- return false;
- } else {
- tb.transition(AfterAfterBody);
- }
- } else if (t.isEOF()) {
- // chillax! we're done
- } else {
- tb.error(this);
- tb.transition(InBody);
- return tb.process(t);
- }
- return true;
- }
- },
- InFrameset {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- if (isWhitespace(t)) {
- tb.insert(t.asCharacter());
- } else if (t.isComment()) {
- tb.insert(t.asComment());
- } else if (t.isDoctype()) {
- tb.error(this);
- return false;
- } else if (t.isStartTag()) {
- Token.StartTag start = t.asStartTag();
- String name = start.name();
- if (name.equals("html")) {
- return tb.process(start, InBody);
- } else if (name.equals("frameset")) {
- tb.insert(start);
- } else if (name.equals("frame")) {
- tb.insertEmpty(start);
- } else if (name.equals("noframes")) {
- return tb.process(start, InHead);
- } else {
- tb.error(this);
- return false;
- }
- } else if (t.isEndTag() && t.asEndTag().name().equals("frameset")) {
- if (tb.currentElement().nodeName().equals("html")) { // frag
- tb.error(this);
- return false;
- } else {
- tb.pop();
- if (!tb.isFragmentParsing()
- && !tb.currentElement().nodeName()
- .equals("frameset")) {
- tb.transition(AfterFrameset);
- }
- }
- } else if (t.isEOF()) {
- if (!tb.currentElement().nodeName().equals("html")) {
- tb.error(this);
- return true;
- }
- } else {
- tb.error(this);
- return false;
- }
- return true;
- }
- },
- AfterFrameset {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- if (isWhitespace(t)) {
- tb.insert(t.asCharacter());
- } else if (t.isComment()) {
- tb.insert(t.asComment());
- } else if (t.isDoctype()) {
- tb.error(this);
- return false;
- } else if (t.isStartTag() && t.asStartTag().name().equals("html")) {
- return tb.process(t, InBody);
- } else if (t.isEndTag() && t.asEndTag().name().equals("html")) {
- tb.transition(AfterAfterFrameset);
- } else if (t.isStartTag()
- && t.asStartTag().name().equals("noframes")) {
- return tb.process(t, InHead);
- } else if (t.isEOF()) {
- // cool your heels, we're complete
- } else {
- tb.error(this);
- return false;
- }
- return true;
- }
- },
- AfterAfterBody {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- if (t.isComment()) {
- tb.insert(t.asComment());
- } else if (t.isDoctype() || isWhitespace(t)
- || (t.isStartTag() && t.asStartTag().name().equals("html"))) {
- return tb.process(t, InBody);
- } else if (t.isEOF()) {
- // nice work chuck
- } else {
- tb.error(this);
- tb.transition(InBody);
- return tb.process(t);
- }
- return true;
- }
- },
- AfterAfterFrameset {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- if (t.isComment()) {
- tb.insert(t.asComment());
- } else if (t.isDoctype() || isWhitespace(t)
- || (t.isStartTag() && t.asStartTag().name().equals("html"))) {
- return tb.process(t, InBody);
- } else if (t.isEOF()) {
- // nice work chuck
- } else if (t.isStartTag()
- && t.asStartTag().name().equals("noframes")) {
- return tb.process(t, InHead);
- } else {
- tb.error(this);
- return false;
- }
- return true;
- }
- },
- ForeignContent {
- @Override
- boolean process(Token t, HtmlTreeBuilder tb) {
- return true;
- // todo: implement. Also; how do we get here?
- }
- };
-
- private static String nullString = String.valueOf('\u0000');
-
- abstract boolean process(Token t, HtmlTreeBuilder tb);
-
- private static boolean isWhitespace(Token t) {
- if (t.isCharacter()) {
- String data = t.asCharacter().getData();
- // todo: this checks more than spec - "\t", "\n", "\f", "\r", " "
- for (int i = 0; i < data.length(); i++) {
- char c = data.charAt(i);
- if (!StringUtil.isWhitespace(c)) {
- return false;
- }
- }
- return true;
- }
- return false;
- }
-
- private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) {
- tb.insert(startTag);
- tb.tokeniser.transition(TokeniserState.Rcdata);
- tb.markInsertionMode();
- tb.transition(Text);
- }
-
- private static void handleRawtext(Token.StartTag startTag,
- HtmlTreeBuilder tb) {
- tb.insert(startTag);
- tb.tokeniser.transition(TokeniserState.Rawtext);
- tb.markInsertionMode();
- tb.transition(Text);
- }
-}
diff --git a/server/src/org/jsoup/parser/ParseError.java b/server/src/org/jsoup/parser/ParseError.java
deleted file mode 100644
index eb3c240a59..0000000000
--- a/server/src/org/jsoup/parser/ParseError.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package org.jsoup.parser;
-
-/**
- * A Parse Error records an error in the input HTML that occurs in either the
- * tokenisation or the tree building phase.
- */
-public class ParseError {
- private int pos;
- private String errorMsg;
-
- ParseError(int pos, String errorMsg) {
- this.pos = pos;
- this.errorMsg = errorMsg;
- }
-
- ParseError(int pos, String errorFormat, Object... args) {
- errorMsg = String.format(errorFormat, args);
- this.pos = pos;
- }
-
- /**
- * Retrieve the error message.
- *
- * @return the error message.
- */
- public String getErrorMessage() {
- return errorMsg;
- }
-
- /**
- * Retrieves the offset of the error.
- *
- * @return error offset within input
- */
- public int getPosition() {
- return pos;
- }
-
- @Override
- public String toString() {
- return pos + ": " + errorMsg;
- }
-}
diff --git a/server/src/org/jsoup/parser/ParseErrorList.java b/server/src/org/jsoup/parser/ParseErrorList.java
deleted file mode 100644
index 773dfcae24..0000000000
--- a/server/src/org/jsoup/parser/ParseErrorList.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package org.jsoup.parser;
-
-import java.util.ArrayList;
-
-/**
- * A container for ParseErrors.
- *
- * @author Jonathan Hedley
- */
-class ParseErrorList extends ArrayList<ParseError> {
- private static final int INITIAL_CAPACITY = 16;
- private final int maxSize;
-
- ParseErrorList(int initialCapacity, int maxSize) {
- super(initialCapacity);
- this.maxSize = maxSize;
- }
-
- boolean canAddError() {
- return size() < maxSize;
- }
-
- int getMaxSize() {
- return maxSize;
- }
-
- static ParseErrorList noTracking() {
- return new ParseErrorList(0, 0);
- }
-
- static ParseErrorList tracking(int maxSize) {
- return new ParseErrorList(INITIAL_CAPACITY, maxSize);
- }
-}
diff --git a/server/src/org/jsoup/parser/Parser.java b/server/src/org/jsoup/parser/Parser.java
deleted file mode 100644
index a1f6fd5184..0000000000
--- a/server/src/org/jsoup/parser/Parser.java
+++ /dev/null
@@ -1,198 +0,0 @@
-package org.jsoup.parser;
-
-import java.util.List;
-
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Node;
-
-/**
- * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use
- * one of the more convenient parse methods in {@link org.jsoup.Jsoup}.
- */
-public class Parser {
- private static final int DEFAULT_MAX_ERRORS = 0; // by default, error
- // tracking is disabled.
-
- private TreeBuilder treeBuilder;
- private int maxErrors = DEFAULT_MAX_ERRORS;
- private ParseErrorList errors;
-
- /**
- * Create a new Parser, using the specified TreeBuilder
- *
- * @param treeBuilder
- * TreeBuilder to use to parse input into Documents.
- */
- public Parser(TreeBuilder treeBuilder) {
- this.treeBuilder = treeBuilder;
- }
-
- public Document parseInput(String html, String baseUri) {
- errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors)
- : ParseErrorList.noTracking();
- Document doc = treeBuilder.parse(html, baseUri, errors);
- return doc;
- }
-
- // gets & sets
- /**
- * Get the TreeBuilder currently in use.
- *
- * @return current TreeBuilder.
- */
- public TreeBuilder getTreeBuilder() {
- return treeBuilder;
- }
-
- /**
- * Update the TreeBuilder used when parsing content.
- *
- * @param treeBuilder
- * current TreeBuilder
- * @return this, for chaining
- */
- public Parser setTreeBuilder(TreeBuilder treeBuilder) {
- this.treeBuilder = treeBuilder;
- return this;
- }
-
- /**
- * Check if parse error tracking is enabled.
- *
- * @return current track error state.
- */
- public boolean isTrackErrors() {
- return maxErrors > 0;
- }
-
- /**
- * Enable or disable parse error tracking for the next parse.
- *
- * @param maxErrors
- * the maximum number of errors to track. Set to 0 to disable.
- * @return this, for chaining
- */
- public Parser setTrackErrors(int maxErrors) {
- this.maxErrors = maxErrors;
- return this;
- }
-
- /**
- * Retrieve the parse errors, if any, from the last parse.
- *
- * @return list of parse errors, up to the size of the maximum errors
- * tracked.
- */
- public List<ParseError> getErrors() {
- return errors;
- }
-
- // static parse functions below
- /**
- * Parse HTML into a Document.
- *
- * @param html
- * HTML to parse
- * @param baseUri
- * base URI of document (i.e. original fetch location), for
- * resolving relative URLs.
- *
- * @return parsed Document
- */
- public static Document parse(String html, String baseUri) {
- TreeBuilder treeBuilder = new HtmlTreeBuilder();
- return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking());
- }
-
- /**
- * Parse a fragment of HTML into a list of nodes. The context element, if
- * supplied, supplies parsing context.
- *
- * @param fragmentHtml
- * the fragment of HTML to parse
- * @param context
- * (optional) the element that this HTML fragment is being parsed
- * for (i.e. for inner HTML). This provides stack context (for
- * implicit element creation).
- * @param baseUri
- * base URI of document (i.e. original fetch location), for
- * resolving relative URLs.
- *
- * @return list of nodes parsed from the input HTML. Note that the context
- * element, if supplied, is not modified.
- */
- public static List<Node> parseFragment(String fragmentHtml,
- Element context, String baseUri) {
- HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
- return treeBuilder.parseFragment(fragmentHtml, context, baseUri,
- ParseErrorList.noTracking());
- }
-
- /**
- * Parse a fragment of HTML into the {@code body} of a Document.
- *
- * @param bodyHtml
- * fragment of HTML
- * @param baseUri
- * base URI of document (i.e. original fetch location), for
- * resolving relative URLs.
- *
- * @return Document, with empty head, and HTML parsed into body
- */
- public static Document parseBodyFragment(String bodyHtml, String baseUri) {
- Document doc = Document.createShell(baseUri);
- Element body = doc.body();
- List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
- Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node
- // list gets
- // modified
- // when
- // re-parented
- for (Node node : nodes) {
- body.appendChild(node);
- }
- return doc;
- }
-
- /**
- * @param bodyHtml
- * HTML to parse
- * @param baseUri
- * baseUri base URI of document (i.e. original fetch location),
- * for resolving relative URLs.
- *
- * @return parsed Document
- * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment}
- * instead.
- */
- @Deprecated
- public static Document parseBodyFragmentRelaxed(String bodyHtml,
- String baseUri) {
- return parse(bodyHtml, baseUri);
- }
-
- // builders
-
- /**
- * Create a new HTML parser. This parser treats input as HTML5, and enforces
- * the creation of a normalised document, based on a knowledge of the
- * semantics of the incoming tags.
- *
- * @return a new HTML parser.
- */
- public static Parser htmlParser() {
- return new Parser(new HtmlTreeBuilder());
- }
-
- /**
- * Create a new XML parser. This parser assumes no knowledge of the incoming
- * tags and does not treat it as HTML, rather creates a simple tree directly
- * from the input.
- *
- * @return a new simple XML parser.
- */
- public static Parser xmlParser() {
- return new Parser(new XmlTreeBuilder());
- }
-}
diff --git a/server/src/org/jsoup/parser/Tag.java b/server/src/org/jsoup/parser/Tag.java
deleted file mode 100644
index c43f27aff3..0000000000
--- a/server/src/org/jsoup/parser/Tag.java
+++ /dev/null
@@ -1,298 +0,0 @@
-package org.jsoup.parser;
-
-import java.util.HashMap;
-import java.util.Map;
-
-import org.jsoup.helper.Validate;
-
-/**
- * HTML Tag capabilities.
- *
- * @author Jonathan Hedley, jonathan@hedley.net
- */
-public class Tag {
- private static final Map<String, Tag> tags = new HashMap<String, Tag>(); // map
- // of
- // known
- // tags
-
- private String tagName;
- private boolean isBlock = true; // block or inline
- private boolean formatAsBlock = true; // should be formatted as a block
- private boolean canContainBlock = true; // Can this tag hold block level
- // tags?
- private boolean canContainInline = true; // only pcdata if not
- private boolean empty = false; // can hold nothing; e.g. img
- private boolean selfClosing = false; // can self close (<foo />). used for
- // unknown tags that self close,
- // without forcing them as empty.
- private boolean preserveWhitespace = false; // for pre, textarea, script etc
-
- private Tag(String tagName) {
- this.tagName = tagName.toLowerCase();
- }
-
- /**
- * Get this tag's name.
- *
- * @return the tag's name
- */
- public String getName() {
- return tagName;
- }
-
- /**
- * Get a Tag by name. If not previously defined (unknown), returns a new
- * generic tag, that can do anything.
- * <p/>
- * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not
- * registered and will only .equals().
- *
- * @param tagName
- * Name of tag, e.g. "p". Case insensitive.
- * @return The tag, either defined or new generic.
- */
- public static Tag valueOf(String tagName) {
- Validate.notNull(tagName);
- tagName = tagName.trim().toLowerCase();
- Validate.notEmpty(tagName);
-
- synchronized (tags) {
- Tag tag = tags.get(tagName);
- if (tag == null) {
- // not defined: create default; go anywhere, do anything! (incl
- // be inside a <p>)
- tag = new Tag(tagName);
- tag.isBlock = false;
- tag.canContainBlock = true;
- }
- return tag;
- }
- }
-
- /**
- * Gets if this is a block tag.
- *
- * @return if block tag
- */
- public boolean isBlock() {
- return isBlock;
- }
-
- /**
- * Gets if this tag should be formatted as a block (or as inline)
- *
- * @return if should be formatted as block or inline
- */
- public boolean formatAsBlock() {
- return formatAsBlock;
- }
-
- /**
- * Gets if this tag can contain block tags.
- *
- * @return if tag can contain block tags
- */
- public boolean canContainBlock() {
- return canContainBlock;
- }
-
- /**
- * Gets if this tag is an inline tag.
- *
- * @return if this tag is an inline tag.
- */
- public boolean isInline() {
- return !isBlock;
- }
-
- /**
- * Gets if this tag is a data only tag.
- *
- * @return if this tag is a data only tag
- */
- public boolean isData() {
- return !canContainInline && !isEmpty();
- }
-
- /**
- * Get if this is an empty tag
- *
- * @return if this is an empty tag
- */
- public boolean isEmpty() {
- return empty;
- }
-
- /**
- * Get if this tag is self closing.
- *
- * @return if this tag should be output as self closing.
- */
- public boolean isSelfClosing() {
- return empty || selfClosing;
- }
-
- /**
- * Get if this is a pre-defined tag, or was auto created on parsing.
- *
- * @return if a known tag
- */
- public boolean isKnownTag() {
- return tags.containsKey(tagName);
- }
-
- /**
- * Check if this tagname is a known tag.
- *
- * @param tagName
- * name of tag
- * @return if known HTML tag
- */
- public static boolean isKnownTag(String tagName) {
- return tags.containsKey(tagName);
- }
-
- /**
- * Get if this tag should preserve whitespace within child text nodes.
- *
- * @return if preserve whitepace
- */
- public boolean preserveWhitespace() {
- return preserveWhitespace;
- }
-
- Tag setSelfClosing() {
- selfClosing = true;
- return this;
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (!(o instanceof Tag)) {
- return false;
- }
-
- Tag tag = (Tag) o;
-
- if (canContainBlock != tag.canContainBlock) {
- return false;
- }
- if (canContainInline != tag.canContainInline) {
- return false;
- }
- if (empty != tag.empty) {
- return false;
- }
- if (formatAsBlock != tag.formatAsBlock) {
- return false;
- }
- if (isBlock != tag.isBlock) {
- return false;
- }
- if (preserveWhitespace != tag.preserveWhitespace) {
- return false;
- }
- if (selfClosing != tag.selfClosing) {
- return false;
- }
- if (!tagName.equals(tag.tagName)) {
- return false;
- }
-
- return true;
- }
-
- @Override
- public int hashCode() {
- int result = tagName.hashCode();
- result = 31 * result + (isBlock ? 1 : 0);
- result = 31 * result + (formatAsBlock ? 1 : 0);
- result = 31 * result + (canContainBlock ? 1 : 0);
- result = 31 * result + (canContainInline ? 1 : 0);
- result = 31 * result + (empty ? 1 : 0);
- result = 31 * result + (selfClosing ? 1 : 0);
- result = 31 * result + (preserveWhitespace ? 1 : 0);
- return result;
- }
-
- @Override
- public String toString() {
- return tagName;
- }
-
- // internal static initialisers:
- // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other
- // sources
- private static final String[] blockTags = { "html", "head", "body",
- "frameset", "script", "noscript", "style", "meta", "link", "title",
- "frame", "noframes", "section", "nav", "aside", "hgroup", "header",
- "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol",
- "pre", "div", "blockquote", "hr", "address", "figure",
- "figcaption", "form", "fieldset", "ins", "del", "dl", "dt", "dd",
- "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup",
- "col", "tr", "th", "td", "video", "audio", "canvas", "details",
- "menu", "plaintext" };
- private static final String[] inlineTags = { "object", "base", "font",
- "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code",
- "samp", "kbd", "var", "cite", "abbr", "time", "acronym", "mark",
- "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q", "sub",
- "sup", "bdo", "iframe", "embed", "span", "input", "select",
- "textarea", "label", "button", "optgroup", "option", "legend",
- "datalist", "keygen", "output", "progress", "meter", "area",
- "param", "source", "track", "summary", "command", "device" };
- private static final String[] emptyTags = { "meta", "link", "base",
- "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen",
- "col", "command", "device" };
- private static final String[] formatAsInlineTags = { "title", "a", "p",
- "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th",
- "td", "script", "style" };
- private static final String[] preserveWhitespaceTags = { "pre",
- "plaintext", "title" };
-
- static {
- // creates
- for (String tagName : blockTags) {
- Tag tag = new Tag(tagName);
- register(tag);
- }
- for (String tagName : inlineTags) {
- Tag tag = new Tag(tagName);
- tag.isBlock = false;
- tag.canContainBlock = false;
- tag.formatAsBlock = false;
- register(tag);
- }
-
- // mods:
- for (String tagName : emptyTags) {
- Tag tag = tags.get(tagName);
- Validate.notNull(tag);
- tag.canContainBlock = false;
- tag.canContainInline = false;
- tag.empty = true;
- }
-
- for (String tagName : formatAsInlineTags) {
- Tag tag = tags.get(tagName);
- Validate.notNull(tag);
- tag.formatAsBlock = false;
- }
-
- for (String tagName : preserveWhitespaceTags) {
- Tag tag = tags.get(tagName);
- Validate.notNull(tag);
- tag.preserveWhitespace = true;
- }
- }
-
- private static Tag register(Tag tag) {
- synchronized (tags) {
- tags.put(tag.tagName, tag);
- }
- return tag;
- }
-}
diff --git a/server/src/org/jsoup/parser/Token.java b/server/src/org/jsoup/parser/Token.java
deleted file mode 100644
index e465eb74e3..0000000000
--- a/server/src/org/jsoup/parser/Token.java
+++ /dev/null
@@ -1,253 +0,0 @@
-package org.jsoup.parser;
-
-import org.jsoup.helper.Validate;
-import org.jsoup.nodes.Attribute;
-import org.jsoup.nodes.Attributes;
-
-/**
- * Parse tokens for the Tokeniser.
- */
-abstract class Token {
- TokenType type;
-
- private Token() {
- }
-
- String tokenType() {
- return this.getClass().getSimpleName();
- }
-
- static class Doctype extends Token {
- final StringBuilder name = new StringBuilder();
- final StringBuilder publicIdentifier = new StringBuilder();
- final StringBuilder systemIdentifier = new StringBuilder();
- boolean forceQuirks = false;
-
- Doctype() {
- type = TokenType.Doctype;
- }
-
- String getName() {
- return name.toString();
- }
-
- String getPublicIdentifier() {
- return publicIdentifier.toString();
- }
-
- public String getSystemIdentifier() {
- return systemIdentifier.toString();
- }
-
- public boolean isForceQuirks() {
- return forceQuirks;
- }
- }
-
- static abstract class Tag extends Token {
- protected String tagName;
- private String pendingAttributeName;
- private String pendingAttributeValue;
-
- boolean selfClosing = false;
- Attributes attributes = new Attributes(); // todo: allow nodes to not
- // have attributes
-
- void newAttribute() {
- if (pendingAttributeName != null) {
- if (pendingAttributeValue == null) {
- pendingAttributeValue = "";
- }
- Attribute attribute = new Attribute(pendingAttributeName,
- pendingAttributeValue);
- attributes.put(attribute);
- }
- pendingAttributeName = null;
- pendingAttributeValue = null;
- }
-
- void finaliseTag() {
- // finalises for emit
- if (pendingAttributeName != null) {
- // todo: check if attribute name exists; if so, drop and error
- newAttribute();
- }
- }
-
- String name() {
- Validate.isFalse(tagName.length() == 0);
- return tagName;
- }
-
- Tag name(String name) {
- tagName = name;
- return this;
- }
-
- boolean isSelfClosing() {
- return selfClosing;
- }
-
- @SuppressWarnings({ "TypeMayBeWeakened" })
- Attributes getAttributes() {
- return attributes;
- }
-
- // these appenders are rarely hit in not null state-- caused by null
- // chars.
- void appendTagName(String append) {
- tagName = tagName == null ? append : tagName.concat(append);
- }
-
- void appendTagName(char append) {
- appendTagName(String.valueOf(append));
- }
-
- void appendAttributeName(String append) {
- pendingAttributeName = pendingAttributeName == null ? append
- : pendingAttributeName.concat(append);
- }
-
- void appendAttributeName(char append) {
- appendAttributeName(String.valueOf(append));
- }
-
- void appendAttributeValue(String append) {
- pendingAttributeValue = pendingAttributeValue == null ? append
- : pendingAttributeValue.concat(append);
- }
-
- void appendAttributeValue(char append) {
- appendAttributeValue(String.valueOf(append));
- }
- }
-
- static class StartTag extends Tag {
- StartTag() {
- super();
- type = TokenType.StartTag;
- }
-
- StartTag(String name) {
- this();
- tagName = name;
- }
-
- StartTag(String name, Attributes attributes) {
- this();
- tagName = name;
- this.attributes = attributes;
- }
-
- @Override
- public String toString() {
- return "<" + name() + " " + attributes.toString() + ">";
- }
- }
-
- static class EndTag extends Tag {
- EndTag() {
- super();
- type = TokenType.EndTag;
- }
-
- EndTag(String name) {
- this();
- tagName = name;
- }
-
- @Override
- public String toString() {
- return "</" + name() + " " + attributes.toString() + ">";
- }
- }
-
- static class Comment extends Token {
- final StringBuilder data = new StringBuilder();
-
- Comment() {
- type = TokenType.Comment;
- }
-
- String getData() {
- return data.toString();
- }
-
- @Override
- public String toString() {
- return "<!--" + getData() + "-->";
- }
- }
-
- static class Character extends Token {
- private final String data;
-
- Character(String data) {
- type = TokenType.Character;
- this.data = data;
- }
-
- String getData() {
- return data;
- }
-
- @Override
- public String toString() {
- return getData();
- }
- }
-
- static class EOF extends Token {
- EOF() {
- type = Token.TokenType.EOF;
- }
- }
-
- boolean isDoctype() {
- return type == TokenType.Doctype;
- }
-
- Doctype asDoctype() {
- return (Doctype) this;
- }
-
- boolean isStartTag() {
- return type == TokenType.StartTag;
- }
-
- StartTag asStartTag() {
- return (StartTag) this;
- }
-
- boolean isEndTag() {
- return type == TokenType.EndTag;
- }
-
- EndTag asEndTag() {
- return (EndTag) this;
- }
-
- boolean isComment() {
- return type == TokenType.Comment;
- }
-
- Comment asComment() {
- return (Comment) this;
- }
-
- boolean isCharacter() {
- return type == TokenType.Character;
- }
-
- Character asCharacter() {
- return (Character) this;
- }
-
- boolean isEOF() {
- return type == TokenType.EOF;
- }
-
- enum TokenType {
- Doctype, StartTag, EndTag, Comment, Character, EOF
- }
-}
diff --git a/server/src/org/jsoup/parser/TokenQueue.java b/server/src/org/jsoup/parser/TokenQueue.java
deleted file mode 100644
index 3e7127e640..0000000000
--- a/server/src/org/jsoup/parser/TokenQueue.java
+++ /dev/null
@@ -1,473 +0,0 @@
-package org.jsoup.parser;
-
-import org.jsoup.helper.StringUtil;
-import org.jsoup.helper.Validate;
-
-/**
- * A character queue with parsing helpers.
- *
- * @author Jonathan Hedley
- */
-public class TokenQueue {
- private String queue;
- private int pos = 0;
-
- private static final char ESC = '\\'; // escape char for chomp balanced.
-
- /**
- * Create a new TokenQueue.
- *
- * @param data
- * string of data to back queue.
- */
- public TokenQueue(String data) {
- Validate.notNull(data);
- queue = data;
- }
-
- /**
- * Is the queue empty?
- *
- * @return true if no data left in queue.
- */
- public boolean isEmpty() {
- return remainingLength() == 0;
- }
-
- private int remainingLength() {
- return queue.length() - pos;
- }
-
- /**
- * Retrieves but does not remove the first character from the queue.
- *
- * @return First character, or 0 if empty.
- */
- public char peek() {
- return isEmpty() ? 0 : queue.charAt(pos);
- }
-
- /**
- * Add a character to the start of the queue (will be the next character
- * retrieved).
- *
- * @param c
- * character to add
- */
- public void addFirst(Character c) {
- addFirst(c.toString());
- }
-
- /**
- * Add a string to the start of the queue.
- *
- * @param seq
- * string to add.
- */
- public void addFirst(String seq) {
- // not very performant, but an edge case
- queue = seq + queue.substring(pos);
- pos = 0;
- }
-
- /**
- * Tests if the next characters on the queue match the sequence. Case
- * insensitive.
- *
- * @param seq
- * String to check queue for.
- * @return true if the next characters match.
- */
- public boolean matches(String seq) {
- return queue.regionMatches(true, pos, seq, 0, seq.length());
- }
-
- /**
- * Case sensitive match test.
- *
- * @param seq
- * string to case sensitively check for
- * @return true if matched, false if not
- */
- public boolean matchesCS(String seq) {
- return queue.startsWith(seq, pos);
- }
-
- /**
- * Tests if the next characters match any of the sequences. Case
- * insensitive.
- *
- * @param seq
- * list of strings to case insensitively check for
- * @return true if any matched, false if none did
- */
- public boolean matchesAny(String... seq) {
- for (String s : seq) {
- if (matches(s)) {
- return true;
- }
- }
- return false;
- }
-
-    /**
-     * Tests if the next character on the queue is any of the given characters.
-     * Case sensitive.
-     *
-     * @param seq
-     *            characters to check for
-     * @return true if the queue is not empty and its next character equals
-     *         one of seq, false otherwise
-     */
-    public boolean matchesAny(char... seq) {
-        if (!isEmpty()) {
-            for (int i = 0; i < seq.length; i++) {
-                if (seq[i] == queue.charAt(pos)) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
- public boolean matchesStartTag() {
- // micro opt for matching "<x"
- return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character
- .isLetter(queue.charAt(pos + 1)));
- }
-
- /**
- * Tests if the queue matches the sequence (as with match), and if they do,
- * removes the matched string from the queue.
- *
- * @param seq
- * String to search for, and if found, remove from queue.
- * @return true if found and removed, false if not found.
- */
- public boolean matchChomp(String seq) {
- if (matches(seq)) {
- pos += seq.length();
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Tests if queue starts with a whitespace character.
- *
- * @return if starts with whitespace
- */
- public boolean matchesWhitespace() {
- return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos));
- }
-
- /**
- * Test if the queue matches a word character (letter or digit).
- *
- * @return if matches a word character
- */
- public boolean matchesWord() {
- return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos));
- }
-
- /**
- * Drops the next character off the queue.
- */
- public void advance() {
- if (!isEmpty()) {
- pos++;
- }
- }
-
- /**
- * Consume one character off queue.
- *
- * @return first character on queue.
- */
- public char consume() {
- return queue.charAt(pos++);
- }
-
-    /**
-     * Consumes the supplied sequence off the queue. If the queue does not start
-     * with the supplied sequence, throws an illegal state exception -- but
-     * you should be running matches() against that condition.
-     * <p>
-     * Case insensitive.
-     *
-     * @param seq
-     *            sequence to remove from head of queue.
-     * @throws IllegalStateException
-     *             if the queue does not start with seq
-     */
-    public void consume(String seq) {
-        // matches() delegates to String.regionMatches, which already returns
-        // false when seq is longer than what remains on the queue, so a
-        // separate "not long enough" check could never fire.
-        if (!matches(seq)) {
-            throw new IllegalStateException(
-                    "Queue did not match expected sequence");
-        }
-        pos += seq.length();
-    }
-
-    /**
-     * Pulls a string off the queue, stopping just before the first occurrence
-     * of the match sequence, or at the end of the queue if it never occurs.
-     *
-     * @param seq
-     *            String to end on (not included in the return value, and left
-     *            on the queue). <b>Case sensitive.</b>
-     * @return The data consumed from the queue.
-     */
-    public String consumeTo(String seq) {
-        final int hit = queue.indexOf(seq, pos);
-        if (hit == -1) {
-            // terminator never appears: hand back everything that is left
-            return remainder();
-        }
-        final String head = queue.substring(pos, hit);
-        pos = hit;
-        return head;
-    }
-
-    /**
-     * Case insensitive counterpart of consumeTo: pulls a string off the queue
-     * up to, but not including, the first case insensitive occurrence of seq,
-     * or to the end of the queue if seq never occurs.
-     *
-     * @param seq
-     *            terminator to stop before (left on the queue)
-     * @return the data consumed from the queue
-     */
-    public String consumeToIgnoreCase(String seq) {
-        final int begin = pos;
-        final String lead = seq.substring(0, 1);
-        // If the first character has no case variants, indexOf can jump
-        // straight to the next candidate instead of stepping char by char.
-        final boolean jumpable = lead.toLowerCase().equals(lead.toUpperCase());
-
-        while (!isEmpty() && !matches(seq)) {
-            if (!jumpable) {
-                pos++;
-                continue;
-            }
-            final int next = queue.indexOf(lead, pos);
-            if (next == pos) {
-                pos++; // candidate at pos did not match seq; step past it
-            } else if (next < pos) {
-                pos = queue.length(); // no candidate left: consume the rest
-            } else {
-                pos = next;
-            }
-        }
-
-        return queue.substring(begin, pos);
-    }
-
-    /**
-     * Consumes up to the first of the provided terminators, or to the end of
-     * the queue. The terminator itself is left on the queue.
-     *
-     * @param seq
-     *            any number of terminators to consume to. <b>Case
-     *            insensitive.</b>
-     * @return consumed string
-     */
-    // todo: method name. not good that consumeTo cares for case, and
-    // consumeToAny doesn't. And the only use for this is a case sensitive
-    // one...
-    public String consumeToAny(String... seq) {
-        final int begin = pos;
-        for (; !isEmpty() && !matchesAny(seq); pos++) {
-            // advance until a terminator is next or the queue runs out
-        }
-        return queue.substring(begin, pos);
-    }
-
- /**
- * Pulls a string off the queue (like consumeTo), and then pulls off the
- * matched string (but does not return it).
- * <p>
- * If the queue runs out of characters before finding the seq, will return
- * as much as it can (and queue will go isEmpty() == true).
- *
- * @param seq
- * String to match up to, and not include in return, and to pull
- * off queue. <b>Case sensitive.</b>
- * @return Data matched from queue.
- */
- public String chompTo(String seq) {
- String data = consumeTo(seq);
- matchChomp(seq);
- return data;
- }
-
- public String chompToIgnoreCase(String seq) {
- String data = consumeToIgnoreCase(seq); // case insensitive scan
- matchChomp(seq);
- return data;
- }
-
-    /**
-     * Pulls a balanced string off the queue. E.g. if queue is
-     * "(one (two) three) four", (,) will return "one (two) three", and leave
-     * " four" on the queue. Unbalanced openers and closers can be escaped (with
-     * \). Those escapes will be left in the returned string, which is suitable
-     * for regexes (where we need to preserve the escape), but unsuitable for
-     * contains text strings; use unescape for that.
-     *
-     * @param open
-     *            opener
-     * @param close
-     *            closer
-     * @return data matched from the queue
-     */
-    public String chompBalanced(char open, char close) {
-        StringBuilder accum = new StringBuilder();
-        int depth = 0;
-        char last = 0;
-
-        while (!isEmpty()) {
-            // primitive char: no boxing per character, plain == comparisons
-            final char c = consume();
-            if (last != ESC) { // an escaped opener/closer does not change depth
-                if (c == open) {
-                    depth++;
-                } else if (c == close) {
-                    depth--;
-                }
-            }
-
-            if (depth > 0 && last != 0) {
-                // last == 0 only for the very first character, which keeps
-                // the outer opener out of the returned string
-                accum.append(c);
-            }
-            last = c;
-
-            if (depth <= 0) {
-                break;
-            }
-        }
-        return accum.toString();
-    }
-
-    /**
-     * Unescapes a backslash escaped string: each backslash is dropped and the
-     * character after it is kept literally, so an escaped backslash yields a
-     * single backslash. A trailing lone backslash is dropped.
-     *
-     * @param in
-     *            backslash escaped string
-     * @return unescaped string
-     */
-    public static String unescape(String in) {
-        StringBuilder out = new StringBuilder(in.length());
-        // true while the previous character was an escape that has not yet
-        // been applied to anything
-        boolean escaped = false;
-        for (int i = 0; i < in.length(); i++) {
-            final char c = in.charAt(i);
-            if (c == ESC && !escaped) {
-                escaped = true; // swallow the escape, keep what follows
-            } else {
-                out.append(c);
-                // reset so that an escaped backslash is not itself taken as
-                // the escape for the next character
-                escaped = false;
-            }
-        }
-        return out.toString();
-    }
-
- /**
- * Pulls the next run of whitespace characters off the queue.
- *
- * @return true if at least one whitespace character was consumed, false if
- * the queue did not start with whitespace.
- */
- public boolean consumeWhitespace() {
- boolean seen = false;
- while (matchesWhitespace()) {
- pos++;
- seen = true;
- }
- return seen;
- }
-
- /**
- * Retrieves the next run of word type (letter or digit) off the queue.
- *
- * @return String of word characters from queue, or empty string if none.
- */
- public String consumeWord() {
- int start = pos;
- while (matchesWord()) {
- pos++;
- }
- return queue.substring(start, pos);
- }
-
- /**
- * Consume a tag name off the queue (word or :, _, -)
- *
- * @return tag name
- */
- public String consumeTagName() {
- int start = pos;
- while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) {
- pos++;
- }
-
- return queue.substring(start, pos);
- }
-
- /**
- * Consume a CSS element selector (tag name, but | instead of : for
- * namespaces, to not conflict with :pseudo selects).
- *
- * @return tag name
- */
- public String consumeElementSelector() {
- int start = pos;
- while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) {
- pos++;
- }
-
- return queue.substring(start, pos);
- }
-
- /**
- * Consume a CSS identifier (ID or class) off the queue (letter, digit, -,
- * _) http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
- *
- * @return identifier
- */
- public String consumeCssIdentifier() {
- int start = pos;
- while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) {
- pos++;
- }
-
- return queue.substring(start, pos);
- }
-
- /**
- * Consume an attribute key off the queue (letter, digit, -, _, :)
- *
- * @return attribute key
- */
- public String consumeAttributeKey() {
- int start = pos;
- while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) {
- pos++;
- }
-
- return queue.substring(start, pos);
- }
-
-    /**
-     * Consume and return whatever is left on the queue.
-     *
-     * @return remainder of queue.
-     */
-    public String remainder() {
-        // a single substring replaces the char-by-char StringBuilder copy
-        final String rest = queue.substring(pos);
-        pos = queue.length();
-        return rest;
-    }
-
- @Override
- public String toString() {
- return queue.substring(pos);
- }
-}
diff --git a/server/src/org/jsoup/parser/Tokeniser.java b/server/src/org/jsoup/parser/Tokeniser.java
deleted file mode 100644
index f46c962281..0000000000
--- a/server/src/org/jsoup/parser/Tokeniser.java
+++ /dev/null
@@ -1,264 +0,0 @@
-package org.jsoup.parser;
-
-import org.jsoup.helper.Validate;
-import org.jsoup.nodes.Entities;
-
-/**
- * Reads the input stream into tokens.
- */
-class Tokeniser {
- static final char replacementChar = '\uFFFD'; // replaces null character
-
- private CharacterReader reader; // html input
- private ParseErrorList errors; // errors found while tokenising
-
- private TokeniserState state = TokeniserState.Data; // current tokenisation
- // state
- private Token emitPending; // the token we are about to emit on next read
- private boolean isEmitPending = false;
- private StringBuilder charBuffer = new StringBuilder(); // buffers
- // characters to
- // output as one
- // token
- StringBuilder dataBuffer; // buffers data looking for </script>
-
- Token.Tag tagPending; // tag we are building up
- Token.Doctype doctypePending; // doctype building up
- Token.Comment commentPending; // comment building up
- private Token.StartTag lastStartTag; // the last start tag emitted, to test
- // appropriate end tag
- private boolean selfClosingFlagAcknowledged = true;
-
- Tokeniser(CharacterReader reader, ParseErrorList errors) {
- this.reader = reader;
- this.errors = errors;
- }
-
- Token read() {
- if (!selfClosingFlagAcknowledged) {
- error("Self closing flag not acknowledged");
- selfClosingFlagAcknowledged = true;
- }
-
- while (!isEmitPending) {
- state.read(this, reader);
- }
-
- // if emit is pending, a non-character token was found: return any chars
- // in buffer, and leave token for next read:
- if (charBuffer.length() > 0) {
- String str = charBuffer.toString();
- charBuffer.delete(0, charBuffer.length());
- return new Token.Character(str);
- } else {
- isEmitPending = false;
- return emitPending;
- }
- }
-
- void emit(Token token) {
- Validate.isFalse(isEmitPending, "There is an unread token pending!");
-
- emitPending = token;
- isEmitPending = true;
-
- if (token.type == Token.TokenType.StartTag) {
- Token.StartTag startTag = (Token.StartTag) token;
- lastStartTag = startTag;
- if (startTag.selfClosing) {
- selfClosingFlagAcknowledged = false;
- }
- } else if (token.type == Token.TokenType.EndTag) {
- Token.EndTag endTag = (Token.EndTag) token;
- if (endTag.attributes.size() > 0) {
- error("Attributes incorrectly present on end tag");
- }
- }
- }
-
- void emit(String str) {
- // buffer strings up until last string token found, to emit only one
- // token for a run of character refs etc.
- // does not set isEmitPending; read checks that
- charBuffer.append(str);
- }
-
- void emit(char c) {
- charBuffer.append(c);
- }
-
- TokeniserState getState() {
- return state;
- }
-
- void transition(TokeniserState state) {
- this.state = state;
- }
-
- void advanceTransition(TokeniserState state) {
- reader.advance();
- this.state = state;
- }
-
- void acknowledgeSelfClosingFlag() {
- selfClosingFlagAcknowledged = true;
- }
-
- Character consumeCharacterReference(Character additionalAllowedCharacter,
- boolean inAttribute) {
- if (reader.isEmpty()) {
- return null;
- }
- if (additionalAllowedCharacter != null
- && additionalAllowedCharacter == reader.current()) {
- return null;
- }
- if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) {
- return null;
- }
-
- reader.mark();
- if (reader.matchConsume("#")) { // numbered
- boolean isHexMode = reader.matchConsumeIgnoreCase("X");
- String numRef = isHexMode ? reader.consumeHexSequence() : reader
- .consumeDigitSequence();
- if (numRef.length() == 0) { // didn't match anything
- characterReferenceError("numeric reference with no numerals");
- reader.rewindToMark();
- return null;
- }
- if (!reader.matchConsume(";")) {
- characterReferenceError("missing semicolon"); // missing semi
- }
- int charval = -1;
- try {
- int base = isHexMode ? 16 : 10;
- charval = Integer.valueOf(numRef, base);
- } catch (NumberFormatException e) {
- } // skip
- if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF)
- || charval > 0x10FFFF) {
- characterReferenceError("character outside of valid range");
- return replacementChar;
- } else {
- // todo: implement number replacement table
- // todo: check for extra illegal unicode points as parse errors
- return (char) charval;
- }
- } else { // named
- // get as many letters as possible, and look for matching entities.
- // unconsume backwards till a match is found
- String nameRef = reader.consumeLetterThenDigitSequence();
- String origNameRef = new String(nameRef); // for error reporting.
- // nameRef gets chomped
- // looking for matches
- boolean looksLegit = reader.matches(';');
- boolean found = false;
- while (nameRef.length() > 0 && !found) {
- if (Entities.isNamedEntity(nameRef)) {
- found = true;
- } else {
- nameRef = nameRef.substring(0, nameRef.length() - 1);
- reader.unconsume();
- }
- }
- if (!found) {
- if (looksLegit) {
- characterReferenceError(String.format(
- "invalid named referenece '%s'", origNameRef));
- }
- reader.rewindToMark();
- return null;
- }
- if (inAttribute
- && (reader.matchesLetter() || reader.matchesDigit() || reader
- .matchesAny('=', '-', '_'))) {
- // don't want that to match
- reader.rewindToMark();
- return null;
- }
- if (!reader.matchConsume(";")) {
- characterReferenceError("missing semicolon"); // missing semi
- }
- return Entities.getCharacterByName(nameRef);
- }
- }
-
- Token.Tag createTagPending(boolean start) {
- tagPending = start ? new Token.StartTag() : new Token.EndTag();
- return tagPending;
- }
-
- void emitTagPending() {
- tagPending.finaliseTag();
- emit(tagPending);
- }
-
- void createCommentPending() {
- commentPending = new Token.Comment();
- }
-
- void emitCommentPending() {
- emit(commentPending);
- }
-
- void createDoctypePending() {
- doctypePending = new Token.Doctype();
- }
-
- void emitDoctypePending() {
- emit(doctypePending);
- }
-
- void createTempBuffer() {
- dataBuffer = new StringBuilder();
- }
-
- boolean isAppropriateEndTagToken() {
- if (lastStartTag == null) {
- return false;
- }
- return tagPending.tagName.equals(lastStartTag.tagName);
- }
-
- String appropriateEndTagName() {
- return lastStartTag.tagName;
- }
-
- void error(TokeniserState state) {
- if (errors.canAddError()) {
- errors.add(new ParseError(reader.pos(),
- "Unexpected character '%s' in input state [%s]", reader
- .current(), state));
- }
- }
-
- void eofError(TokeniserState state) {
- if (errors.canAddError()) {
- errors.add(new ParseError(
- reader.pos(),
- "Unexpectedly reached end of file (EOF) in input state [%s]",
- state));
- }
- }
-
- private void characterReferenceError(String message) {
- if (errors.canAddError()) {
- errors.add(new ParseError(reader.pos(),
- "Invalid character reference: %s", message));
- }
- }
-
- private void error(String errorMsg) {
- if (errors.canAddError()) {
- errors.add(new ParseError(reader.pos(), errorMsg));
- }
- }
-
- boolean currentNodeInHtmlNS() {
- // todo: implement namespaces correctly
- return true;
- // Element currentNode = currentNode();
- // return currentNode != null && currentNode.namespace().equals("HTML");
- }
-}
diff --git a/server/src/org/jsoup/parser/TokeniserState.java b/server/src/org/jsoup/parser/TokeniserState.java
deleted file mode 100644
index 7f7315d769..0000000000
--- a/server/src/org/jsoup/parser/TokeniserState.java
+++ /dev/null
@@ -1,1870 +0,0 @@
-package org.jsoup.parser;
-
-/**
- * States and transition activations for the Tokeniser.
- */
-enum TokeniserState {
- Data {
- // in data state, gather characters until a character reference or tag
- // is found
- @Override
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '&':
- t.advanceTransition(CharacterReferenceInData);
- break;
- case '<':
- t.advanceTransition(TagOpen);
- break;
- case nullChar:
- t.error(this); // NOT replacement character (oddly?)
- t.emit(r.consume());
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeToAny('&', '<', nullChar);
- t.emit(data);
- break;
- }
- }
- },
- CharacterReferenceInData {
- // from & in data
- @Override
- void read(Tokeniser t, CharacterReader r) {
- Character c = t.consumeCharacterReference(null, false);
- if (c == null) {
- t.emit('&');
- } else {
- t.emit(c);
- }
- t.transition(Data);
- }
- },
- Rcdata {
- // / handles data in title, textarea etc
- @Override
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '&':
- t.advanceTransition(CharacterReferenceInRcdata);
- break;
- case '<':
- t.advanceTransition(RcdataLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeToAny('&', '<', nullChar);
- t.emit(data);
- break;
- }
- }
- },
- CharacterReferenceInRcdata {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- Character c = t.consumeCharacterReference(null, false);
- if (c == null) {
- t.emit('&');
- } else {
- t.emit(c);
- }
- t.transition(Rcdata);
- }
- },
- Rawtext {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '<':
- t.advanceTransition(RawtextLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeToAny('<', nullChar);
- t.emit(data);
- break;
- }
- }
- },
- ScriptData {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '<':
- t.advanceTransition(ScriptDataLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeToAny('<', nullChar);
- t.emit(data);
- break;
- }
- }
- },
- PLAINTEXT {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.emit(new Token.EOF());
- break;
- default:
- String data = r.consumeTo(nullChar);
- t.emit(data);
- break;
- }
- }
- },
- TagOpen {
- // from < in data
- @Override
- void read(Tokeniser t, CharacterReader r) {
- switch (r.current()) {
- case '!':
- t.advanceTransition(MarkupDeclarationOpen);
- break;
- case '/':
- t.advanceTransition(EndTagOpen);
- break;
- case '?':
- t.advanceTransition(BogusComment);
- break;
- default:
- if (r.matchesLetter()) {
- t.createTagPending(true);
- t.transition(TagName);
- } else {
- t.error(this);
- t.emit('<'); // char that got us here
- t.transition(Data);
- }
- break;
- }
- }
- },
- EndTagOpen {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.emit("</");
- t.transition(Data);
- } else if (r.matchesLetter()) {
- t.createTagPending(false);
- t.transition(TagName);
- } else if (r.matches('>')) {
- t.error(this);
- t.advanceTransition(Data);
- } else {
- t.error(this);
- t.advanceTransition(BogusComment);
- }
- }
- },
- TagName {
- // from < or </ in data, will have start or end tag pending
- @Override
- void read(Tokeniser t, CharacterReader r) {
- // previous TagOpen state did NOT consume, will have a letter char
- // in current
- String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>',
- nullChar).toLowerCase();
- t.tagPending.appendTagName(tagName);
-
- switch (r.consume()) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case nullChar: // replacement
- t.tagPending.appendTagName(replacementStr);
- break;
- case eof: // should emit pending tag?
- t.eofError(this);
- t.transition(Data);
- // no default, as covered with above consumeToAny
- }
- }
- },
- RcdataLessthanSign {
- // from < in rcdata
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matches('/')) {
- t.createTempBuffer();
- t.advanceTransition(RCDATAEndTagOpen);
- } else if (r.matchesLetter()
- && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
- // diverge from spec: got a start tag, but there's no
- // appropriate end tag (</title>), so rather than
- // consuming to EOF; break out here
- t.tagPending = new Token.EndTag(t.appropriateEndTagName());
- t.emitTagPending();
- r.unconsume(); // undo "<"
- t.transition(Data);
- } else {
- t.emit("<");
- t.transition(Rcdata);
- }
- }
- },
-    RCDATAEndTagOpen {
-        // after "</" in rcdata: a letter starts a candidate end tag, anything
-        // else means the "</" was just text
-        @Override
-        void read(Tokeniser t, CharacterReader r) {
-            if (r.matchesLetter()) {
-                t.createTagPending(false);
-                // tag names are compared in lower case...
-                t.tagPending.appendTagName(Character.toLowerCase(r.current()));
-                // ...but the buffer is re-emitted as text when the end tag
-                // turns out not to be the appropriate one, so it must keep
-                // the original case (as RCDATAEndTagName does for the rest
-                // of the name)
-                t.dataBuffer.append(r.current());
-                t.advanceTransition(RCDATAEndTagName);
-            } else {
-                t.emit("</");
-                t.transition(Rcdata);
-            }
-        }
-    },
-    RCDATAEndTagName {
-        // reads the name of a candidate end tag inside rcdata; only an end
-        // tag matching the last start tag actually closes the element
-        @Override
-        void read(Tokeniser t, CharacterReader r) {
-            if (r.matchesLetter()) {
-                String name = r.consumeLetterSequence();
-                t.tagPending.appendTagName(name.toLowerCase());
-                t.dataBuffer.append(name);
-                return;
-            }
-
-            char c = r.consume();
-            switch (c) {
-            case '\t':
-            case '\n':
-            case '\f':
-            case ' ':
-                if (t.isAppropriateEndTagToken()) {
-                    t.transition(BeforeAttributeName);
-                } else {
-                    anythingElse(t, r);
-                }
-                break;
-            case '/':
-                if (t.isAppropriateEndTagToken()) {
-                    t.transition(SelfClosingStartTag);
-                } else {
-                    anythingElse(t, r);
-                }
-                break;
-            case '>':
-                if (t.isAppropriateEndTagToken()) {
-                    t.emitTagPending();
-                    t.transition(Data);
-                } else {
-                    anythingElse(t, r);
-                }
-                break;
-            default:
-                anythingElse(t, r);
-            }
-        }
-
-        // Not an end tag after all: emit what was read as text and hand the
-        // just-consumed character back so the Rcdata state processes it
-        // instead of it being silently dropped.
-        private void anythingElse(Tokeniser t, CharacterReader r) {
-            t.emit("</" + t.dataBuffer.toString());
-            r.unconsume();
-            t.transition(Rcdata);
-        }
-    },
- RawtextLessthanSign {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matches('/')) {
- t.createTempBuffer();
- t.advanceTransition(RawtextEndTagOpen);
- } else {
- t.emit('<');
- t.transition(Rawtext);
- }
- }
- },
- RawtextEndTagOpen {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- t.createTagPending(false);
- t.transition(RawtextEndTagName);
- } else {
- t.emit("</");
- t.transition(Rawtext);
- }
- }
- },
- RawtextEndTagName {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- t.tagPending.appendTagName(name.toLowerCase());
- t.dataBuffer.append(name);
- return;
- }
-
- if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- default:
- t.dataBuffer.append(c);
- anythingElse(t, r);
- }
- } else {
- anythingElse(t, r);
- }
- }
-
- private void anythingElse(Tokeniser t, CharacterReader r) {
- t.emit("</" + t.dataBuffer.toString());
- t.transition(Rawtext);
- }
- },
- ScriptDataLessthanSign {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- switch (r.consume()) {
- case '/':
- t.createTempBuffer();
- t.transition(ScriptDataEndTagOpen);
- break;
- case '!':
- t.emit("<!");
- t.transition(ScriptDataEscapeStart);
- break;
- default:
- t.emit("<");
- r.unconsume();
- t.transition(ScriptData);
- }
- }
- },
- ScriptDataEndTagOpen {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- t.createTagPending(false);
- t.transition(ScriptDataEndTagName);
- } else {
- t.emit("</");
- t.transition(ScriptData);
- }
-
- }
- },
- ScriptDataEndTagName {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- t.tagPending.appendTagName(name.toLowerCase());
- t.dataBuffer.append(name);
- return;
- }
-
- if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- default:
- t.dataBuffer.append(c);
- anythingElse(t, r);
- }
- } else {
- anythingElse(t, r);
- }
- }
-
- private void anythingElse(Tokeniser t, CharacterReader r) {
- t.emit("</" + t.dataBuffer.toString());
- t.transition(ScriptData);
- }
- },
- ScriptDataEscapeStart {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matches('-')) {
- t.emit('-');
- t.advanceTransition(ScriptDataEscapeStartDash);
- } else {
- t.transition(ScriptData);
- }
- }
- },
- ScriptDataEscapeStartDash {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matches('-')) {
- t.emit('-');
- t.advanceTransition(ScriptDataEscapedDashDash);
- } else {
- t.transition(ScriptData);
- }
- }
- },
- ScriptDataEscaped {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.transition(Data);
- return;
- }
-
- switch (r.current()) {
- case '-':
- t.emit('-');
- t.advanceTransition(ScriptDataEscapedDash);
- break;
- case '<':
- t.advanceTransition(ScriptDataEscapedLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- default:
- String data = r.consumeToAny('-', '<', nullChar);
- t.emit(data);
- }
- }
- },
- ScriptDataEscapedDash {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.transition(Data);
- return;
- }
-
- char c = r.consume();
- switch (c) {
- case '-':
- t.emit(c);
- t.transition(ScriptDataEscapedDashDash);
- break;
- case '<':
- t.transition(ScriptDataEscapedLessthanSign);
- break;
- case nullChar:
- t.error(this);
- t.emit(replacementChar);
- t.transition(ScriptDataEscaped);
- break;
- default:
- t.emit(c);
- t.transition(ScriptDataEscaped);
- }
- }
- },
- ScriptDataEscapedDashDash {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.transition(Data);
- return;
- }
-
- char c = r.consume();
- switch (c) {
- case '-':
- t.emit(c);
- break;
- case '<':
- t.transition(ScriptDataEscapedLessthanSign);
- break;
- case '>':
- t.emit(c);
- t.transition(ScriptData);
- break;
- case nullChar:
- t.error(this);
- t.emit(replacementChar);
- t.transition(ScriptDataEscaped);
- break;
- default:
- t.emit(c);
- t.transition(ScriptDataEscaped);
- }
- }
- },
- ScriptDataEscapedLessthanSign {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- t.createTempBuffer();
- t.dataBuffer.append(Character.toLowerCase(r.current()));
- t.emit("<" + r.current());
- t.advanceTransition(ScriptDataDoubleEscapeStart);
- } else if (r.matches('/')) {
- t.createTempBuffer();
- t.advanceTransition(ScriptDataEscapedEndTagOpen);
- } else {
- t.emit('<');
- t.transition(ScriptDataEscaped);
- }
- }
- },
- ScriptDataEscapedEndTagOpen {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- t.createTagPending(false);
- t.tagPending.appendTagName(Character.toLowerCase(r.current()));
- t.dataBuffer.append(r.current());
- t.advanceTransition(ScriptDataEscapedEndTagName);
- } else {
- t.emit("</");
- t.transition(ScriptDataEscaped);
- }
- }
- },
- ScriptDataEscapedEndTagName {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- t.tagPending.appendTagName(name.toLowerCase());
- t.dataBuffer.append(name);
- return;
- }
-
- if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- default:
- t.dataBuffer.append(c);
- anythingElse(t, r);
- break;
- }
- } else {
- anythingElse(t, r);
- }
- }
-
- private void anythingElse(Tokeniser t, CharacterReader r) {
- t.emit("</" + t.dataBuffer.toString());
- t.transition(ScriptDataEscaped);
- }
- },
- ScriptDataDoubleEscapeStart {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- t.dataBuffer.append(name.toLowerCase());
- t.emit(name);
- return;
- }
-
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- case '/':
- case '>':
- if (t.dataBuffer.toString().equals("script")) {
- t.transition(ScriptDataDoubleEscaped);
- } else {
- t.transition(ScriptDataEscaped);
- }
- t.emit(c);
- break;
- default:
- r.unconsume();
- t.transition(ScriptDataEscaped);
- }
- }
- },
- ScriptDataDoubleEscaped {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.current();
- switch (c) {
- case '-':
- t.emit(c);
- t.advanceTransition(ScriptDataDoubleEscapedDash);
- break;
- case '<':
- t.emit(c);
- t.advanceTransition(ScriptDataDoubleEscapedLessthanSign);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.emit(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- default:
- String data = r.consumeToAny('-', '<', nullChar);
- t.emit(data);
- }
- }
- },
- // Entered from ScriptDataDoubleEscaped after one '-'. A second '-' moves to
- // the dash-dash state, '<' to the less-than state; anything else is
- // emitted and returns to ScriptDataDoubleEscaped.
- ScriptDataDoubleEscapedDash {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '-':
- t.emit(c);
- t.transition(ScriptDataDoubleEscapedDashDash);
- break;
- case '<':
- t.emit(c);
- t.transition(ScriptDataDoubleEscapedLessthanSign);
- break;
- case nullChar:
- t.error(this);
- t.emit(replacementChar);
- t.transition(ScriptDataDoubleEscaped);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- default:
- t.emit(c);
- t.transition(ScriptDataDoubleEscaped);
- }
- }
- },
- // Entered after "--" in double-escaped script data. Further dashes stay
- // here; '>' is emitted and returns to ScriptData; '<' goes to the less-than
- // state; anything else returns to ScriptDataDoubleEscaped.
- ScriptDataDoubleEscapedDashDash {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '-':
- // stay in this state
- t.emit(c);
- break;
- case '<':
- t.emit(c);
- t.transition(ScriptDataDoubleEscapedLessthanSign);
- break;
- case '>':
- t.emit(c);
- t.transition(ScriptData);
- break;
- case nullChar:
- t.error(this);
- t.emit(replacementChar);
- t.transition(ScriptDataDoubleEscaped);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- default:
- t.emit(c);
- t.transition(ScriptDataDoubleEscaped);
- }
- }
- },
- // Entered after '<' in double-escaped script data. If a '/' follows it is
- // emitted, the temp buffer is reset and ScriptDataDoubleEscapeEnd takes
- // over; otherwise control returns to ScriptDataDoubleEscaped without
- // consuming anything.
- ScriptDataDoubleEscapedLessthanSign {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matches('/')) {
- t.emit('/');
- t.createTempBuffer();
- t.advanceTransition(ScriptDataDoubleEscapeEnd);
- } else {
- t.transition(ScriptDataDoubleEscaped);
- }
- }
- },
- // Mirror of ScriptDataDoubleEscapeStart: collects letters (lower-cased)
- // into the data buffer while emitting them. On a delimiter, a buffer equal
- // to "script" drops back to ScriptDataEscaped, otherwise the tokeniser
- // stays double-escaped.
- ScriptDataDoubleEscapeEnd {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- t.dataBuffer.append(name.toLowerCase());
- t.emit(name);
- return;
- }
-
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- case '/':
- case '>':
- if (t.dataBuffer.toString().equals("script")) {
- t.transition(ScriptDataEscaped);
- } else {
- t.transition(ScriptDataDoubleEscaped);
- }
- t.emit(c);
- break;
- default:
- // not a delimiter: hand the character back
- r.unconsume();
- t.transition(ScriptDataDoubleEscaped);
- }
- }
- },
- // Skips whitespace inside a tag and starts a new attribute on the pending
- // tag when an attribute-name character is found. '/' goes to
- // SelfClosingStartTag and '>' emits the pending tag.
- BeforeAttributeName {
- // from tagname <xxx
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break; // ignore whitespace
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- t.tagPending.newAttribute();
- // hand the NUL back; AttributeName appends the replacement char
- r.unconsume();
- t.transition(AttributeName);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case '"':
- case '\'':
- case '<':
- case '=':
- // error, but the character still becomes the start of the name
- t.error(this);
- t.tagPending.newAttribute();
- t.tagPending.appendAttributeName(c);
- t.transition(AttributeName);
- break;
- default: // A-Z, anything else
- t.tagPending.newAttribute();
- r.unconsume();
- t.transition(AttributeName);
- }
- }
- },
- // Appends a run of name characters (lower-cased) to the pending attribute
- // name, then dispatches on the delimiter that ended the run.
- AttributeName {
- // from before attribute name
- @Override
- void read(Tokeniser t, CharacterReader r) {
- String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>',
- nullChar, '"', '\'', '<');
- t.tagPending.appendAttributeName(name.toLowerCase());
-
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(AfterAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '=':
- t.transition(BeforeAttributeValue);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case nullChar:
- // stay in this state; the name continues after the replacement
- t.error(this);
- t.tagPending.appendAttributeName(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case '"':
- case '\'':
- case '<':
- // error, but the character is kept as part of the name
- t.error(this);
- t.tagPending.appendAttributeName(c);
- // no default, as covered in consumeToAny
- }
- }
- },
- // Entered from AttributeName after whitespace. Skips further whitespace,
- // then either moves on to the value ('='), finishes the tag ('/' or '>'),
- // or starts a NEW attribute for any other character.
- AfterAttributeName {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- // ignore
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '=':
- t.transition(BeforeAttributeValue);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- // the previous name already ended at the whitespace, so the
- // replacement char must open a new attribute rather than being
- // glued onto the previous attribute's name
- t.tagPending.newAttribute();
- t.tagPending.appendAttributeName(replacementChar);
- t.transition(AttributeName);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case '"':
- case '\'':
- case '<':
- // error, but the character still starts the next attribute name
- t.error(this);
- t.tagPending.newAttribute();
- t.tagPending.appendAttributeName(c);
- t.transition(AttributeName);
- break;
- default: // A-Z, anything else
- t.tagPending.newAttribute();
- r.unconsume();
- t.transition(AttributeName);
- }
- }
- },
- // Entered after '='. Skips whitespace and picks the double-quoted,
- // single-quoted or unquoted attribute value state from the first
- // significant character.
- BeforeAttributeValue {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- // ignore
- break;
- case '"':
- t.transition(AttributeValue_doubleQuoted);
- break;
- case '&':
- // hand back so the unquoted state handles the character reference
- r.unconsume();
- t.transition(AttributeValue_unquoted);
- break;
- case '\'':
- t.transition(AttributeValue_singleQuoted);
- break;
- case nullChar:
- t.error(this);
- t.tagPending.appendAttributeValue(replacementChar);
- t.transition(AttributeValue_unquoted);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case '>':
- // "<a b=>": error, tag is emitted with what has been collected
- t.error(this);
- t.emitTagPending();
- t.transition(Data);
- break;
- case '<':
- case '=':
- case '`':
- // error, but the character is kept as the start of the value
- t.error(this);
- t.tagPending.appendAttributeValue(c);
- t.transition(AttributeValue_unquoted);
- break;
- default:
- r.unconsume();
- t.transition(AttributeValue_unquoted);
- }
- }
- },
- // Accumulates a double-quoted attribute value up to the closing '"',
- // resolving character references inline.
- AttributeValue_doubleQuoted {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- String value = r.consumeToAny('"', '&', nullChar);
- if (value.length() > 0) {
- t.tagPending.appendAttributeValue(value);
- }
-
- char c = r.consume();
- switch (c) {
- case '"':
- t.transition(AfterAttributeValue_quoted);
- break;
- case '&':
- Character ref = t.consumeCharacterReference('"', true);
- if (ref != null) {
- t.tagPending.appendAttributeValue(ref);
- } else {
- // no reference was returned: keep the literal ampersand
- t.tagPending.appendAttributeValue('&');
- }
- break;
- case nullChar:
- t.error(this);
- t.tagPending.appendAttributeValue(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- // no default, handled in consume to any above
- }
- }
- },
- // Accumulates a single-quoted attribute value up to the closing '\'',
- // resolving character references inline.
- AttributeValue_singleQuoted {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- String value = r.consumeToAny('\'', '&', nullChar);
- if (value.length() > 0) {
- t.tagPending.appendAttributeValue(value);
- }
-
- char c = r.consume();
- switch (c) {
- case '\'':
- t.transition(AfterAttributeValue_quoted);
- break;
- case '&':
- Character ref = t.consumeCharacterReference('\'', true);
- if (ref != null) {
- t.tagPending.appendAttributeValue(ref);
- } else {
- // no reference was returned: keep the literal ampersand
- t.tagPending.appendAttributeValue('&');
- }
- break;
- case nullChar:
- t.error(this);
- t.tagPending.appendAttributeValue(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- // no default, handled in consume to any above
- }
- }
- },
- // Accumulates an unquoted attribute value. Whitespace ends the value and
- // '>' ends the tag; quote, '<', '=' and '`' are errors but are kept in
- // the value.
- AttributeValue_unquoted {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>',
- nullChar, '"', '\'', '<', '=', '`');
- if (value.length() > 0) {
- t.tagPending.appendAttributeValue(value);
- }
-
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '&':
- Character ref = t.consumeCharacterReference('>', true);
- if (ref != null) {
- t.tagPending.appendAttributeValue(ref);
- } else {
- // no reference was returned: keep the literal ampersand
- t.tagPending.appendAttributeValue('&');
- }
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- t.tagPending.appendAttributeValue(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- case '"':
- case '\'':
- case '<':
- case '=':
- case '`':
- t.error(this);
- t.tagPending.appendAttributeValue(c);
- break;
- // no default, handled in consume to any above
- }
-
- }
- },
- // CharacterReferenceInAttributeValue state handled inline
- // Entered after the closing quote of an attribute value. Expects
- // whitespace, '/' or '>'; any other character is an error and is
- // reprocessed by BeforeAttributeName.
- AfterAttributeValue_quoted {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeAttributeName);
- break;
- case '/':
- t.transition(SelfClosingStartTag);
- break;
- case '>':
- t.emitTagPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- default:
- t.error(this);
- // hand the character back so it starts the next attribute
- r.unconsume();
- t.transition(BeforeAttributeName);
- }
-
- }
- },
- // Reached after a '/' inside a tag. '>' completes a self-closing tag; any
- // other character is a parse error and is handed back to the reader so
- // that BeforeAttributeName processes it.
- SelfClosingStartTag {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '>':
- t.tagPending.selfClosing = true;
- t.emitTagPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.transition(Data);
- break;
- default:
- t.error(this);
- // reconsume: without this the character after the stray '/'
- // (e.g. the 'b' in <a/b>) would be silently dropped
- r.unconsume();
- t.transition(BeforeAttributeName);
- }
- }
- },
- // Turns everything up to the next '>' into a comment token, then returns
- // to Data. The state that transitions here has already advanced past the
- // first character, hence the initial rewind.
- BogusComment {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- // todo: handle bogus comment starting from eof. when does that
- // trigger?
- // rewind to capture character that lead us here
- r.unconsume();
- Token.Comment comment = new Token.Comment();
- comment.data.append(r.consumeTo('>'));
- // todo: replace nullChar with replaceChar
- t.emit(comment);
- // advanceTransition steps over the terminating '>'
- t.advanceTransition(Data);
- }
- },
- // Decides what a markup declaration is by looking ahead: "--" starts a
- // comment, "DOCTYPE" (any case) a doctype, "[CDATA[" a CDATA section.
- // Anything else is an error and is handled as a bogus comment.
- MarkupDeclarationOpen {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchConsume("--")) {
- t.createCommentPending();
- t.transition(CommentStart);
- } else if (r.matchConsumeIgnoreCase("DOCTYPE")) {
- t.transition(Doctype);
- } else if (r.matchConsume("[CDATA[")) {
- // todo: should actually check current namespace, and only
- // non-html allows cdata. until namespace
- // is implemented properly, keep handling as cdata
- // } else if (!t.currentNodeInHtmlNS() &&
- // r.matchConsume("[CDATA[")) {
- t.transition(CdataSection);
- } else {
- t.error(this);
- t.advanceTransition(BogusComment); // advance so this character
- // gets in bogus comment
- // data's rewind
- }
- }
- },
- // First character after the opening of a comment. A '-' goes to
- // CommentStartDash; '>' and EOF emit the (empty) pending comment; anything
- // else becomes comment data.
- CommentStart {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '-':
- t.transition(CommentStartDash);
- break;
- case nullChar:
- t.error(this);
- t.commentPending.data.append(replacementChar);
- t.transition(Comment);
- break;
- case '>':
- t.error(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- t.commentPending.data.append(c);
- t.transition(Comment);
- }
- }
- },
- // Entered from CommentStart after a single '-'. A second '-' means the
- // comment may be closing, so control moves to CommentEnd. Any other data
- // character shows the first dash was comment text, so that dash is
- // written to the comment data before the character itself.
- CommentStartDash {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '-':
- // "<!---" + "-": go to CommentEnd (previously looped back here,
- // which lost dashes and raised a spurious error on "<!---->")
- t.transition(CommentEnd);
- break;
- case nullChar:
- t.error(this);
- // keep the dash that led here, as CommentEndDash does
- t.commentPending.data.append('-').append(replacementChar);
- t.transition(Comment);
- break;
- case '>':
- t.error(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- // keep the dash that led here, as CommentEndDash does
- t.commentPending.data.append('-').append(c);
- t.transition(Comment);
- }
- }
- },
- // Accumulates comment data until a '-' (possible end of comment), NUL
- // or EOF is seen.
- Comment {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- // peek only: each branch decides how far to advance
- char c = r.current();
- switch (c) {
- case '-':
- t.advanceTransition(CommentEndDash);
- break;
- case nullChar:
- t.error(this);
- r.advance();
- t.commentPending.data.append(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- t.commentPending.data.append(r.consumeToAny('-', nullChar));
- }
- }
- },
- // Entered after one '-' inside a comment. A second '-' goes to CommentEnd;
- // otherwise the dash was data and is appended along with the character.
- CommentEndDash {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '-':
- t.transition(CommentEnd);
- break;
- case nullChar:
- t.error(this);
- t.commentPending.data.append('-').append(replacementChar);
- t.transition(Comment);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- t.commentPending.data.append('-').append(c);
- t.transition(Comment);
- }
- }
- },
- // Entered after "--" inside a comment. '>' emits the comment; every other
- // non-EOF character is an error, with the pending "--" written back into
- // the comment data where applicable.
- CommentEnd {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '>':
- t.emitCommentPending();
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- t.commentPending.data.append("--").append(replacementChar);
- t.transition(Comment);
- break;
- case '!':
- t.error(this);
- t.transition(CommentEndBang);
- break;
- case '-':
- // a third dash: one dash becomes data, state is unchanged
- t.error(this);
- t.commentPending.data.append('-');
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.commentPending.data.append("--").append(c);
- t.transition(Comment);
- }
- }
- },
- // Entered from CommentEnd after "--!". '>' emits the comment; otherwise
- // "--!" is written back into the comment data and parsing of the comment
- // continues.
- CommentEndBang {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '-':
- t.commentPending.data.append("--!");
- t.transition(CommentEndDash);
- break;
- case '>':
- t.emitCommentPending();
- t.transition(Data);
- break;
- case nullChar:
- t.error(this);
- t.commentPending.data.append("--!").append(replacementChar);
- t.transition(Comment);
- break;
- case eof:
- t.eofError(this);
- t.emitCommentPending();
- t.transition(Data);
- break;
- default:
- t.commentPending.data.append("--!").append(c);
- t.transition(Comment);
- }
- }
- },
- // Entered from MarkupDeclarationOpen after the DOCTYPE keyword. Whitespace
- // is expected next. A missing space is an error, but the character is
- // handed back so it is not lost from the doctype name; "<!DOCTYPE>" emits
- // an empty, force-quirks doctype immediately.
- Doctype {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeDoctypeName);
- break;
- case '>':
- // "<!DOCTYPE>": close the doctype here instead of letting the
- // name states run on into the rest of the document
- t.error(this);
- t.createDoctypePending();
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.createDoctypePending();
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- // reconsume: "<!DOCTYPEhtml>" must not lose the 'h'
- r.unconsume();
- t.transition(BeforeDoctypeName);
- }
- }
- },
- // Skips whitespace before the doctype name and creates the pending
- // doctype token as soon as the name starts. Every branch that touches
- // t.doctypePending creates it first, since nothing on the path into this
- // state has done so.
- BeforeDoctypeName {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- // nothing consumed: DoctypeName reads the letters itself
- t.createDoctypePending();
- t.transition(DoctypeName);
- return;
- }
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break; // ignore whitespace
- case nullChar:
- t.error(this);
- // create the token before writing to it (was missing, so this
- // dereferenced a null or stale doctypePending)
- t.createDoctypePending();
- t.doctypePending.name.append(replacementChar);
- t.transition(DoctypeName);
- break;
- case '>':
- // "<!DOCTYPE >": no name at all; emit a force-quirks doctype
- // rather than treating '>' as the first name character
- t.error(this);
- t.createDoctypePending();
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.createDoctypePending();
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.createDoctypePending();
- t.doctypePending.name.append(c);
- t.transition(DoctypeName);
- }
- }
- },
- // Accumulates the doctype name (letters lower-cased) until whitespace,
- // '>' or EOF.
- DoctypeName {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.matchesLetter()) {
- String name = r.consumeLetterSequence();
- t.doctypePending.name.append(name.toLowerCase());
- return;
- }
- char c = r.consume();
- switch (c) {
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(AfterDoctypeName);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.name.append(replacementChar);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- // non-letter name character: appended as is
- t.doctypePending.name.append(c);
- }
- }
- },
- // After the doctype name: skips whitespace, then expects '>' or the
- // PUBLIC / SYSTEM keyword (any case). Anything else forces quirks mode
- // and goes to BogusDoctype.
- AfterDoctypeName {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- if (r.isEmpty()) {
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- return;
- }
- if (r.matchesAny('\t', '\n', '\f', ' ')) {
- r.advance(); // ignore whitespace
- } else if (r.matches('>')) {
- t.emitDoctypePending();
- t.advanceTransition(Data);
- } else if (r.matchConsumeIgnoreCase("PUBLIC")) {
- t.transition(AfterDoctypePublicKeyword);
- } else if (r.matchConsumeIgnoreCase("SYSTEM")) {
- t.transition(AfterDoctypeSystemKeyword);
- } else {
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.advanceTransition(BogusDoctype);
- }
-
- }
- },
- // Entered after the PUBLIC keyword. Whitespace is expected; a quote
- // directly after the keyword is an error but still opens the public
- // identifier.
- AfterDoctypePublicKeyword {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeDoctypePublicIdentifier);
- break;
- case '"':
- t.error(this);
- // set public id to empty string
- t.transition(DoctypePublicIdentifier_doubleQuoted);
- break;
- case '\'':
- t.error(this);
- // set public id to empty string
- t.transition(DoctypePublicIdentifier_singleQuoted);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- // Skips whitespace between the PUBLIC keyword and the opening quote of
- // the public identifier.
- BeforeDoctypePublicIdentifier {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break;
- case '"':
- // set public id to empty string
- t.transition(DoctypePublicIdentifier_doubleQuoted);
- break;
- case '\'':
- // set public id to empty string
- t.transition(DoctypePublicIdentifier_singleQuoted);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- // Accumulates the double-quoted public identifier one character at a
- // time. A '>' before the closing quote ends the doctype in quirks mode.
- DoctypePublicIdentifier_doubleQuoted {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '"':
- t.transition(AfterDoctypePublicIdentifier);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.publicIdentifier.append(replacementChar);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.doctypePending.publicIdentifier.append(c);
- }
- }
- },
- // Accumulates the single-quoted public identifier one character at a
- // time. A '>' before the closing quote ends the doctype in quirks mode.
- DoctypePublicIdentifier_singleQuoted {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\'':
- t.transition(AfterDoctypePublicIdentifier);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.publicIdentifier.append(replacementChar);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.doctypePending.publicIdentifier.append(c);
- }
- }
- },
- // Entered after the closing quote of the public identifier. A quote
- // directly after it (no separating whitespace) is an error but still
- // opens the system identifier.
- AfterDoctypePublicIdentifier {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BetweenDoctypePublicAndSystemIdentifiers);
- break;
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case '"':
- t.error(this);
- // system id empty
- t.transition(DoctypeSystemIdentifier_doubleQuoted);
- break;
- case '\'':
- t.error(this);
- // system id empty
- t.transition(DoctypeSystemIdentifier_singleQuoted);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- // Entered from AfterDoctypePublicIdentifier once whitespace has been
- // seen. Skips further whitespace; a quote here legitimately opens the
- // system identifier, so unlike AfterDoctypePublicIdentifier no parse
- // error is reported for it.
- BetweenDoctypePublicAndSystemIdentifiers {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break;
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case '"':
- // system id empty; well-formed, so no t.error(this) here
- t.transition(DoctypeSystemIdentifier_doubleQuoted);
- break;
- case '\'':
- // system id empty; well-formed, so no t.error(this) here
- t.transition(DoctypeSystemIdentifier_singleQuoted);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- // Entered after the SYSTEM keyword. Whitespace is expected; a quote
- // directly after the keyword is an error but still opens the system
- // identifier. Any other character marks the doctype as quirks and skips
- // to its end via BogusDoctype.
- AfterDoctypeSystemKeyword {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- t.transition(BeforeDoctypeSystemIdentifier);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case '"':
- t.error(this);
- // system id empty
- t.transition(DoctypeSystemIdentifier_doubleQuoted);
- break;
- case '\'':
- t.error(this);
- // system id empty
- t.transition(DoctypeSystemIdentifier_singleQuoted);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- // Was: emitDoctypePending() with no transition, which left the
- // tokeniser in this state and could emit the doctype again for
- // each following junk character. BogusDoctype emits it exactly
- // once, at '>' or EOF, like the sibling doctype states do.
- t.transition(BogusDoctype);
- }
- }
- },
- // Skips whitespace between the SYSTEM keyword and the opening quote of
- // the system identifier.
- BeforeDoctypeSystemIdentifier {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break;
- case '"':
- // set system id to empty string
- t.transition(DoctypeSystemIdentifier_doubleQuoted);
- break;
- case '\'':
- // set system id to empty string
- t.transition(DoctypeSystemIdentifier_singleQuoted);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.transition(BogusDoctype);
- }
- }
- },
- // Accumulates the double-quoted system identifier one character at a
- // time. A '>' before the closing quote ends the doctype in quirks mode.
- DoctypeSystemIdentifier_doubleQuoted {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '"':
- t.transition(AfterDoctypeSystemIdentifier);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.systemIdentifier.append(replacementChar);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.doctypePending.systemIdentifier.append(c);
- }
- }
- },
- // Accumulates the single-quoted system identifier one character at a
- // time. A '>' before the closing quote ends the doctype in quirks mode.
- DoctypeSystemIdentifier_singleQuoted {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\'':
- t.transition(AfterDoctypeSystemIdentifier);
- break;
- case nullChar:
- t.error(this);
- t.doctypePending.systemIdentifier.append(replacementChar);
- break;
- case '>':
- t.error(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.doctypePending.systemIdentifier.append(c);
- }
- }
- },
- // Entered after the closing quote of the system identifier. Skips
- // whitespace and expects '>'. Trailing junk is an error and goes to
- // BogusDoctype, deliberately without forcing quirks mode.
- AfterDoctypeSystemIdentifier {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '\t':
- case '\n':
- case '\f':
- case ' ':
- break;
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- t.eofError(this);
- t.doctypePending.forceQuirks = true;
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- t.error(this);
- t.transition(BogusDoctype);
- // NOT force quirks
- }
- }
- },
- // Discards characters of a malformed doctype until '>' or EOF, then
- // emits the pending doctype and returns to Data.
- BogusDoctype {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- char c = r.consume();
- switch (c) {
- case '>':
- t.emitDoctypePending();
- t.transition(Data);
- break;
- case eof:
- // same handling as '>'; no eofError is raised in this state
- t.emitDoctypePending();
- t.transition(Data);
- break;
- default:
- // ignore char
- break;
- }
- }
- },
- // Emits everything up to the next "]]>" as character data, steps over
- // the terminator if it is present, and returns to Data.
- CdataSection {
- @Override
- void read(Tokeniser t, CharacterReader r) {
- String data = r.consumeTo("]]>");
- t.emit(data);
- // result ignored: presumably a no-op when the input ended without
- // "]]>" (TODO confirm against CharacterReader.matchConsume)
- r.matchConsume("]]>");
- t.transition(Data);
- }
- };
-
- // Implemented by every state constant: reads from r and drives the
- // tokeniser t (emitting tokens, reporting errors, changing state).
- abstract void read(Tokeniser t, CharacterReader r);
-
- // Shared character constants used by the state implementations.
- private static final char nullChar = '\u0000';
- private static final char replacementChar = Tokeniser.replacementChar;
- private static final String replacementStr = String
- .valueOf(Tokeniser.replacementChar);
- private static final char eof = CharacterReader.EOF;
-}
diff --git a/server/src/org/jsoup/parser/TreeBuilder.java b/server/src/org/jsoup/parser/TreeBuilder.java
deleted file mode 100644
index 5e2dbebc66..0000000000
--- a/server/src/org/jsoup/parser/TreeBuilder.java
+++ /dev/null
@@ -1,61 +0,0 @@
-package org.jsoup.parser;
-
-import org.jsoup.helper.DescendableLinkedList;
-import org.jsoup.helper.Validate;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-
-/**
- * @author Jonathan Hedley
- */
-abstract class TreeBuilder {
- CharacterReader reader;
- Tokeniser tokeniser;
- protected Document doc; // current doc we are building into
- protected DescendableLinkedList<Element> stack; // the stack of open
- // elements
- protected String baseUri; // current base uri, for creating new elements
- protected Token currentToken; // currentToken is used only for error
- // tracking.
- protected ParseErrorList errors; // null when not tracking errors
-
- protected void initialiseParse(String input, String baseUri,
- ParseErrorList errors) {
- Validate.notNull(input, "String input must not be null");
- Validate.notNull(baseUri, "BaseURI must not be null");
-
- doc = new Document(baseUri);
- reader = new CharacterReader(input);
- this.errors = errors;
- tokeniser = new Tokeniser(reader, errors);
- stack = new DescendableLinkedList<Element>();
- this.baseUri = baseUri;
- }
-
- Document parse(String input, String baseUri) {
- return parse(input, baseUri, ParseErrorList.noTracking());
- }
-
- Document parse(String input, String baseUri, ParseErrorList errors) {
- initialiseParse(input, baseUri, errors);
- runParser();
- return doc;
- }
-
- protected void runParser() {
- while (true) {
- Token token = tokeniser.read();
- process(token);
-
- if (token.type == Token.TokenType.EOF) {
- break;
- }
- }
- }
-
- protected abstract boolean process(Token token);
-
- protected Element currentElement() {
- return stack.getLast();
- }
-}
diff --git a/server/src/org/jsoup/parser/XmlTreeBuilder.java b/server/src/org/jsoup/parser/XmlTreeBuilder.java
deleted file mode 100644
index c2a3635b3d..0000000000
--- a/server/src/org/jsoup/parser/XmlTreeBuilder.java
+++ /dev/null
@@ -1,121 +0,0 @@
-package org.jsoup.parser;
-
-import java.util.Iterator;
-
-import org.jsoup.helper.Validate;
-import org.jsoup.nodes.Comment;
-import org.jsoup.nodes.DocumentType;
-import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Node;
-import org.jsoup.nodes.TextNode;
-
-/**
- * Tree builder for XML input: every token is inserted directly under the
- * current element, and end tags close the nearest open element with the
- * same name.
- *
- * @author Jonathan Hedley
- */
-public class XmlTreeBuilder extends TreeBuilder {
- @Override
- protected void initialiseParse(String input, String baseUri,
- ParseErrorList errors) {
- super.initialiseParse(input, baseUri, errors);
- // the document itself goes on the stack; differs from HtmlTreeBuilder
- // (not on stack)
- stack.add(doc);
- }
-
- /** Dispatches one token by type; always returns true. */
- @Override
- protected boolean process(Token token) {
- // start tag, end tag, doctype, comment, character, eof
- switch (token.type) {
- case StartTag:
- insert(token.asStartTag());
- break;
- case EndTag:
- popStackToClose(token.asEndTag());
- break;
- case Comment:
- insert(token.asComment());
- break;
- case Character:
- insert(token.asCharacter());
- break;
- case Doctype:
- insert(token.asDoctype());
- break;
- case EOF:
- // could put some normalisation here if desired
- break;
- default:
- Validate.fail("Unexpected token type: " + token.type);
- }
- return true;
- }
-
- private void insertNode(Node node) {
- Element parent = currentElement();
- parent.appendChild(node);
- }
-
- /**
- * Creates an element for the start tag and appends it to the current
- * element. Elements that are not self-closing are pushed onto the stack.
- */
- Element insert(Token.StartTag startTag) {
- // todo: wonder if for xml parsing, should treat all tags as unknown?
- // because it's not html.
- Tag tag = Tag.valueOf(startTag.name());
- Element el = new Element(tag, baseUri, startTag.attributes);
- insertNode(el);
-
- if (!startTag.isSelfClosing()) {
- stack.add(el);
- return el;
- }
-
- tokeniser.acknowledgeSelfClosingFlag();
- if (!tag.isKnownTag()) {
- tag.setSelfClosing();
- }
- return el;
- }
-
- void insert(Token.Comment commentToken) {
- insertNode(new Comment(commentToken.getData(), baseUri));
- }
-
- void insert(Token.Character characterToken) {
- insertNode(new TextNode(characterToken.getData(), baseUri));
- }
-
- void insert(Token.Doctype d) {
- insertNode(new DocumentType(d.getName(), d.getPublicIdentifier(),
- d.getSystemIdentifier(), baseUri));
- }
-
- /**
- * If the stack contains an element with this tag's name, pops the stack
- * down to and including the nearest such element. If none is found, the
- * stack is left untouched.
- *
- * @param endTag
- */
- private void popStackToClose(Token.EndTag endTag) {
- String elName = endTag.name();
-
- // first pass: find the nearest open element with a matching name
- Element target = null;
- Iterator<Element> search = stack.descendingIterator();
- while (target == null && search.hasNext()) {
- Element candidate = search.next();
- if (candidate.nodeName().equals(elName)) {
- target = candidate;
- }
- }
- if (target == null) {
- return; // not found, skip
- }
-
- // second pass: remove from the top of the stack through the target
- Iterator<Element> it = stack.descendingIterator();
- while (it.hasNext()) {
- Element popped = it.next();
- it.remove();
- if (popped == target) {
- break;
- }
- }
- }
-}
diff --git a/server/src/org/jsoup/parser/package-info.java b/server/src/org/jsoup/parser/package-info.java
deleted file mode 100644
index c6c3d9a029..0000000000
--- a/server/src/org/jsoup/parser/package-info.java
+++ /dev/null
@@ -1,5 +0,0 @@
-/**
- Contains the HTML parser, tag specifications, and HTML tokeniser.
- */
-package org.jsoup.parser;
-