Include jsoup library for modifying bootstrap page DOM (#9274)

author Leif Åstrand <leif@vaadin.com>

Thu, 9 Aug 2012 13:25:06 +0000 (16:25 +0300)

committer Leif Åstrand <leif@vaadin.com>

Thu, 9 Aug 2012 13:39:36 +0000 (16:39 +0300)
author Leif Åstrand <leif@vaadin.com>
Thu, 9 Aug 2012 13:25:06 +0000 (16:25 +0300)
committer Leif Åstrand <leif@vaadin.com>
Thu, 9 Aug 2012 13:39:36 +0000 (16:39 +0300)
diff --git a/src/org/jsoup/Connection.java b/src/org/jsoup/Connection.java

new file mode 100644 (file)

index 0000000..564eeb8
--- /dev/null
+++ b/src/org/jsoup/Connection.java
@@ -0,0 +1,481 @@
+package org.jsoup;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+
+import java.net.URL;
+import java.util.Map;
+import java.util.Collection;
+import java.io.IOException;
+
+/**
+ * A Connection provides a convenient interface to fetch content from the web, and parse it into Documents.
+ * <p>
+ * To get a new Connection, use {@link org.jsoup.Jsoup#connect(String)}. Connections contain {@link Connection.Request}
+ * and {@link Connection.Response} objects. The request objects are reusable as prototype requests.
+ * <p>
+ * Request configuration can be made using either the shortcut methods in Connection (e.g. {@link #userAgent(String)}),
+ * or by methods in the Connection.Request object directly. All request configuration must be made before the request
+ * is executed.
+ * <p>
+ * The Connection interface is <b>currently in beta</b> and subject to change. Comments, suggestions, and bug reports are welcome.
+ */
+public interface Connection {
+
+    /**
+     * GET and POST http methods.
+     */
+    public enum Method {
+        GET, POST
+    }
+
+    /**
+     * Set the request URL to fetch. The protocol must be HTTP or HTTPS.
+     * @param url URL to connect to
+     * @return this Connection, for chaining
+     */
+    public Connection url(URL url);
+
+    /**
+     * Set the request URL to fetch. The protocol must be HTTP or HTTPS.
+     * @param url URL to connect to
+     * @return this Connection, for chaining
+     */
+    public Connection url(String url);
+
+    /**
+     * Set the request user-agent header.
+     * @param userAgent user-agent to use
+     * @return this Connection, for chaining
+     */
+    public Connection userAgent(String userAgent);
+
+    /**
+     * Set the request timeouts (connect and read). If a timeout occurs, an IOException will be thrown. The default
+     * timeout is 3 seconds (3000 millis). A timeout of zero is treated as an infinite timeout.
+     * @param millis number of milliseconds (thousandths of a second) before timing out connects or reads.
+     * @return this Connection, for chaining
+     */
+    public Connection timeout(int millis);
+
+    /**
+     * Set the request referrer (aka "referer") header.
+     * @param referrer referrer to use
+     * @return this Connection, for chaining
+     */
+    public Connection referrer(String referrer);
+
+    /**
+     * Configures the connection to (not) follow server redirects. By default this is <b>true</b>.
+     * @param followRedirects true if server redirects should be followed.
+     * @return this Connection, for chaining
+     */
+    public Connection followRedirects(boolean followRedirects);
+
+    /**
+     * Set the request method to use, GET or POST. Default is GET.
+     * @param method HTTP request method
+     * @return this Connection, for chaining
+     */
+    public Connection method(Method method);
+
+    /**
+     * Configures the connection to not throw exceptions when a HTTP error occurs. (4xx - 5xx, e.g. 404 or 500). By
+     * default this is <b>false</b>; an IOException is thrown if an error is encountered. If set to <b>true</b>, the
+     * response is populated with the error body, and the status message will reflect the error.
+     * @param ignoreHttpErrors true if HTTP errors should be ignored; false (default) if they should cause an IOException.
+     * @return this Connection, for chaining
+     */
+    public Connection ignoreHttpErrors(boolean ignoreHttpErrors);
+
+    /**
+     * Ignore the document's Content-Type when parsing the response. By default this is <b>false</b>, an unrecognised
+     * content-type will cause an IOException to be thrown. (This is to prevent producing garbage by attempting to parse
+     * a JPEG binary image, for example.) Set to true to force a parse attempt regardless of content type.
+     * @param ignoreContentType set to true if you would like the content type ignored on parsing the response into a
+     * Document.
+     * @return this Connection, for chaining
+     */
+    public Connection ignoreContentType(boolean ignoreContentType);
+
+    /**
+     * Add a request data parameter. Request parameters are sent in the request query string for GETs, and in the request
+     * body for POSTs. A request may have multiple values of the same name.
+     * @param key data key
+     * @param value data value
+     * @return this Connection, for chaining
+     */
+    public Connection data(String key, String value);
+
+    /**
+     * Adds all of the supplied data to the request data parameters
+     * @param data map of data parameters
+     * @return this Connection, for chaining
+     */
+    public Connection data(Map<String, String> data);
+
+    /**
+     * Add a number of request data parameters. Multiple parameters may be set at once, e.g.:
+     * <code>.data("name", "jsoup", "language", "Java", "language", "English");</code> creates a query string like:
+     * <code>?name=jsoup&language=Java&language=English</code>
+     * @param keyvals a set of key value pairs.
+     * @return this Connection, for chaining
+     */
+    public Connection data(String... keyvals);
+
+    /**
+     * Set a request header.
+     * @param name header name
+     * @param value header value
+     * @return this Connection, for chaining
+     * @see org.jsoup.Connection.Request#headers()
+     */
+    public Connection header(String name, String value);
+
+    /**
+     * Set a cookie to be sent in the request.
+     * @param name name of cookie
+     * @param value value of cookie
+     * @return this Connection, for chaining
+     */
+    public Connection cookie(String name, String value);
+
+    /**
+     * Adds each of the supplied cookies to the request.
+     * @param cookies map of cookie name -> value pairs
+     * @return this Connection, for chaining
+     */
+    public Connection cookies(Map<String, String> cookies);
+
+    /**
+     * Provide an alternate parser to use when parsing the response to a Document.
+     * @param parser alternate parser
+     * @return this Connection, for chaining
+     */
+    public Connection parser(Parser parser);
+
+    /**
+     * Execute the request as a GET, and parse the result.
+     * @return parsed Document
+     * @throws IOException on error
+     */
+    public Document get() throws IOException;
+
+    /**
+     * Execute the request as a POST, and parse the result.
+     * @return parsed Document
+     * @throws IOException on error
+     */
+    public Document post() throws IOException;
+
+    /**
+     * Execute the request.
+     * @return a response object
+     * @throws IOException on error
+     */
+    public Response execute() throws IOException;
+
+    /**
+     * Get the request object associated with this connection
+     * @return request
+     */
+    public Request request();
+
+    /**
+     * Set the connection's request
+     * @param request new request object
+     * @return this Connection, for chaining
+     */
+    public Connection request(Request request);
+
+    /**
+     * Get the response, once the request has been executed
+     * @return response
+     */
+    public Response response();
+
+    /**
+     * Set the connection's response
+     * @param response new response
+     * @return this Connection, for chaining
+     */
+    public Connection response(Response response);
+
+
+    /**
+     * Common methods for Requests and Responses
+     * @param <T> Type of Base, either Request or Response
+     */
+    interface Base<T extends Base> {
+
+        /**
+         * Get the URL
+         * @return URL
+         */
+        public URL url();
+
+        /**
+         * Set the URL
+         * @param url new URL
+         * @return this, for chaining
+         */
+        public T url(URL url);
+
+        /**
+         * Get the request method
+         * @return method
+         */
+        public Method method();
+
+        /**
+         * Set the request method
+         * @param method new method
+         * @return this, for chaining
+         */
+        public T method(Method method);
+
+        /**
+         * Get the value of a header. This is a simplified header model, where a header may only have one value.
+         * <p>
+         * Header names are case insensitive.
+         * @param name name of header (case insensitive)
+         * @return value of header, or null if not set.
+         * @see #hasHeader(String)
+         * @see #cookie(String)
+         */
+        public String header(String name);
+
+        /**
+         * Set a header. This method will overwrite any existing header with the same case insensitive name. 
+         * @param name Name of header
+         * @param value Value of header
+         * @return this, for chaining
+         */
+        public T header(String name, String value);
+
+        /**
+         * Check if a header is present
+         * @param name name of header (case insensitive)
+         * @return if the header is present in this request/response
+         */
+        public boolean hasHeader(String name);
+
+        /**
+         * Remove a header by name
+         * @param name name of header to remove (case insensitive)
+         * @return this, for chaining
+         */
+        public T removeHeader(String name);
+
+        /**
+         * Retrieve all of the request/response headers as a map
+         * @return headers
+         */
+        public Map<String, String> headers();
+
+        /**
+         * Get a cookie value by name from this request/response.
+         * <p>
+         * Response objects have a simplified cookie model. Each cookie set in the response is added to the response
+         * object's cookie key=value map. The cookie's path, domain, and expiry date are ignored.
+         * @param name name of cookie to retrieve.
+         * @return value of cookie, or null if not set
+         */
+        public String cookie(String name);
+
+        /**
+         * Set a cookie in this request/response.
+         * @param name name of cookie
+         * @param value value of cookie
+         * @return this, for chaining
+         */
+        public T cookie(String name, String value);
+
+        /**
+         * Check if a cookie is present
+         * @param name name of cookie
+         * @return if the cookie is present in this request/response
+         */
+        public boolean hasCookie(String name);
+
+        /**
+         * Remove a cookie by name
+         * @param name name of cookie to remove
+         * @return this, for chaining
+         */
+        public T removeCookie(String name);
+
+        /**
+         * Retrieve all of the request/response cookies as a map
+         * @return cookies
+         */
+        public Map<String, String> cookies();
+
+    }
+
+    /**
+     * Represents a HTTP request.
+     */
+    public interface Request extends Base<Request> {
+        /**
+         * Get the request timeout, in milliseconds.
+         * @return the timeout in milliseconds.
+         */
+        public int timeout();
+
+        /**
+         * Update the request timeout.
+         * @param millis timeout, in milliseconds
+         * @return this Request, for chaining
+         */
+        public Request timeout(int millis);
+
+        /**
+         * Get the current followRedirects configuration.
+         * @return true if followRedirects is enabled.
+         */
+        public boolean followRedirects();
+
+        /**
+         * Configures the request to (not) follow server redirects. By default this is <b>true</b>.
+         *
+         * @param followRedirects true if server redirects should be followed.
+         * @return this Request, for chaining
+         */
+        public Request followRedirects(boolean followRedirects);
+
+        /**
+         * Get the current ignoreHttpErrors configuration.
+         * @return true if errors will be ignored; false (default) if HTTP errors will cause an IOException to be thrown.
+         */
+        public boolean ignoreHttpErrors();
+
+       /**
+        * Configures the request to ignore HTTP errors in the response.
+        * @param ignoreHttpErrors set to true to ignore HTTP errors.
+         * @return this Request, for chaining
+        */
+        public Request ignoreHttpErrors(boolean ignoreHttpErrors);
+
+        /**
+         * Get the current ignoreContentType configuration.
+         * @return true if invalid content-types will be ignored; false (default) if they will cause an IOException to be thrown.
+         */
+        public boolean ignoreContentType();
+
+        /**
+        * Configures the request to ignore the Content-Type of the response.
+        * @param ignoreContentType set to true to ignore the content type.
+         * @return this Request, for chaining
+        */
+        public Request ignoreContentType(boolean ignoreContentType);
+
+        /**
+         * Add a data parameter to the request
+         * @param keyval data to add.
+         * @return this Request, for chaining
+         */
+        public Request data(KeyVal keyval);
+
+        /**
+         * Get all of the request's data parameters
+         * @return collection of keyvals
+         */
+        public Collection<KeyVal> data();
+
+        /**
+         * Specify the parser to use when parsing the document.
+         * @param parser parser to use.
+         * @return this Request, for chaining
+         */
+        public Request parser(Parser parser);
+
+        /**
+         * Get the current parser to use when parsing the document.
+         * @return current Parser
+         */
+        public Parser parser();
+    }
+
+    /**
+     * Represents a HTTP response.
+     */
+    public interface Response extends Base<Response> {
+       
+       /**
+         * Get the status code of the response.
+         * @return status code
+         */
+        public int statusCode();
+
+        /**
+         * Get the status message of the response.
+         * @return status message
+         */
+        public String statusMessage();
+
+        /**
+         * Get the character set name of the response.
+         * @return character set name
+         */
+        public String charset();
+
+        /**
+         * Get the response content type (e.g. "text/html").
+         * @return the response content type
+         */
+        public String contentType();
+
+        /**
+         * Parse the body of the response as a Document.
+         * @return a parsed Document
+         * @throws IOException on error
+         */
+        public Document parse() throws IOException;
+
+        /**
+         * Get the body of the response as a plain string.
+         * @return body
+         */
+        public String body();
+
+        /**
+         * Get the body of the response as an array of bytes.
+         * @return body bytes
+         */
+        public byte[] bodyAsBytes();
+    }
+
+    /**
+     * A Key Value tuple.
+     */
+    public interface KeyVal {
+
+        /**
+         * Update the key of a keyval
+         * @param key new key
+         * @return this KeyVal, for chaining
+         */
+        public KeyVal key(String key);
+
+        /**
+         * Get the key of a keyval
+         * @return the key
+         */
+        public String key();
+
+        /**
+         * Update the value of a keyval
+         * @param value the new value
+         * @return this KeyVal, for chaining
+         */
+        public KeyVal value(String value);
+
+        /**
+         * Get the value of a keyval
+         * @return the value
+         */
+        public String value();
+    }
+}
+
diff --git a/src/org/jsoup/Jsoup.java b/src/org/jsoup/Jsoup.java

new file mode 100644 (file)

index 0000000..8c6afce
--- /dev/null
+++ b/src/org/jsoup/Jsoup.java
@@ -0,0 +1,229 @@
+package org.jsoup;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+import org.jsoup.safety.Cleaner;
+import org.jsoup.safety.Whitelist;
+import org.jsoup.helper.DataUtil;
+import org.jsoup.helper.HttpConnection;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+/**
+ The core public access point to the jsoup functionality.
+
+ @author Jonathan Hedley */
+public class Jsoup {
+    private Jsoup() {}
+
+    /**
+     Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
+
+     @param html    HTML to parse
+     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
+     before the HTML declares a {@code <base href>} tag.
+     @return sane HTML
+     */
+    public static Document parse(String html, String baseUri) {
+        return Parser.parse(html, baseUri);
+    }
+
+    /**
+     Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
+     (non-HTML) parser.
+
+     @param html    HTML to parse
+     @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
+     before the HTML declares a {@code <base href>} tag.
+     @param parser alternate {@link Parser#xmlParser() parser} to use.
+     @return sane HTML
+     */
+    public static Document parse(String html, String baseUri, Parser parser) {
+        return parser.parseInput(html, baseUri);
+    }
+
+    /**
+     Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
+     {@code <base href>} tag.
+
+     @param html HTML to parse
+     @return sane HTML
+
+     @see #parse(String, String)
+     */
+    public static Document parse(String html) {
+        return Parser.parse(html, "");
+    }
+
+    /**
+     * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page.
+     * <p>
+     * Use examples:
+     * <ul>
+     *  <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
+     *  <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
+     * </ul>
+     * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
+     * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute.
+     */
+    public static Connection connect(String url) {
+        return HttpConnection.connect(url);
+    }
+
+    /**
+     Parse the contents of a file as HTML.
+
+     @param in          file to load HTML from
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     */
+    public static Document parse(File in, String charsetName, String baseUri) throws IOException {
+        return DataUtil.load(in, charsetName, baseUri);
+    }
+
+    /**
+     Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
+
+     @param in          file to load HTML from
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     @see #parse(File, String, String)
+     */
+    public static Document parse(File in, String charsetName) throws IOException {
+        return DataUtil.load(in, charsetName, in.getAbsolutePath());
+    }
+
+     /**
+     Read an input stream, and parse it to a Document.
+
+     @param in          input stream to read. Make sure to close it after parsing.
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     */
+    public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException {
+        return DataUtil.load(in, charsetName, baseUri);
+    }
+
+    /**
+     Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
+     (non-HTML) parser.
+
+     @param in          input stream to read. Make sure to close it after parsing.
+     @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
+     present, or fall back to {@code UTF-8} (which is often safe to do).
+     @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
+     @param parser alternate {@link Parser#xmlParser() parser} to use.
+     @return sane HTML
+
+     @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+     */
+    public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
+        return DataUtil.load(in, charsetName, baseUri, parser);
+    }
+
+    /**
+     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
+
+     @param bodyHtml body HTML fragment
+     @param baseUri  URL to resolve relative URLs against.
+     @return sane HTML document
+
+     @see Document#body()
+     */
+    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
+        return Parser.parseBodyFragment(bodyHtml, baseUri);
+    }
+
+    /**
+     Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
+
+     @param bodyHtml body HTML fragment
+     @return sane HTML document
+
+     @see Document#body()
+     */
+    public static Document parseBodyFragment(String bodyHtml) {
+        return Parser.parseBodyFragment(bodyHtml, "");
+    }
+
+    /**
+     Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
+     <p>
+     The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}.
+
+     @param url           URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
+     @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
+     @return The parsed HTML.
+
+     @throws IOException If the final server response != 200 OK (redirects are followed), or if there's an error reading
+     the response stream.
+
+     @see #connect(String)
+     */
+    public static Document parse(URL url, int timeoutMillis) throws IOException {
+        Connection con = HttpConnection.connect(url);
+        con.timeout(timeoutMillis);
+        return con.get();
+    }
+
+    /**
+     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
+     tags and attributes.
+
+     @param bodyHtml  input untrusted HTML
+     @param baseUri   URL to resolve relative URLs against
+     @param whitelist white-list of permitted HTML elements
+     @return safe HTML
+
+     @see Cleaner#clean(Document)
+     */
+    public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) {
+        Document dirty = parseBodyFragment(bodyHtml, baseUri);
+        Cleaner cleaner = new Cleaner(whitelist);
+        Document clean = cleaner.clean(dirty);
+        return clean.body().html();
+    }
+
+    /**
+     Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
+     tags and attributes.
+
+     @param bodyHtml  input untrusted HTML
+     @param whitelist white-list of permitted HTML elements
+     @return safe HTML
+
+     @see Cleaner#clean(Document)
+     */
+    public static String clean(String bodyHtml, Whitelist whitelist) {
+        return clean(bodyHtml, "", whitelist);
+    }
+
+    /**
+     Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. The input HTML should
+     still be run through the cleaner to set up enforced attributes, and to tidy the output.
+     @param bodyHtml HTML to test
+     @param whitelist whitelist to test against
+     @return true if no tags or attributes were removed; false otherwise
+     @see #clean(String, org.jsoup.safety.Whitelist) 
+     */
+    public static boolean isValid(String bodyHtml, Whitelist whitelist) {
+        Document dirty = parseBodyFragment(bodyHtml, "");
+        Cleaner cleaner = new Cleaner(whitelist);
+        return cleaner.isValid(dirty);
+    }
+    
+}
diff --git a/src/org/jsoup/examples/HtmlToPlainText.java b/src/org/jsoup/examples/HtmlToPlainText.java

new file mode 100644 (file)

index 0000000..8f563e9
--- /dev/null
+++ b/src/org/jsoup/examples/HtmlToPlainText.java
@@ -0,0 +1,109 @@
+package org.jsoup.examples;
+
+import org.jsoup.Jsoup;
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeTraversor;
+import org.jsoup.select.NodeVisitor;
+
+import java.io.IOException;
+
+/**
+ * HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted
+ * plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a
+ * scrape.
+ * <p/>
+ * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend.
+ *
+ * @author Jonathan Hedley, jonathan@hedley.net
+ */
+public class HtmlToPlainText {
+    public static void main(String... args) throws IOException {
+        Validate.isTrue(args.length == 1, "usage: supply url to fetch");
+        String url = args[0];
+
+        // fetch the specified URL and parse to a HTML DOM
+        Document doc = Jsoup.connect(url).get();
+
+        HtmlToPlainText formatter = new HtmlToPlainText();
+        String plainText = formatter.getPlainText(doc);
+        System.out.println(plainText);
+    }
+
+    /**
+     * Format an Element to plain-text
+     * @param element the root element to format
+     * @return formatted text
+     */
+    public String getPlainText(Element element) {
+        FormattingVisitor formatter = new FormattingVisitor();
+        NodeTraversor traversor = new NodeTraversor(formatter);
+        traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node
+
+        return formatter.toString();
+    }
+
+    // the formatting rules, implemented in a depth-first DOM traverse
+    private class FormattingVisitor implements NodeVisitor {
+        private static final int maxWidth = 80;
+        private int width = 0;
+        private StringBuilder accum = new StringBuilder(); // holds the accumulated text
+
+        // hit when the node is first seen
+        public void head(Node node, int depth) {
+            String name = node.nodeName();
+            if (node instanceof TextNode)
+                append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
+            else if (name.equals("li"))
+                append("\n * ");
+        }
+
+        // hit when all of the node's children (if any) have been visited
+        public void tail(Node node, int depth) {
+            String name = node.nodeName();
+            if (name.equals("br"))
+                append("\n");
+            else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5"))
+                append("\n\n");
+            else if (name.equals("a"))
+                append(String.format(" <%s>", node.absUrl("href")));
+        }
+
+        // appends text to the string builder with a simple word wrap method
+        private void append(String text) {
+            if (text.startsWith("\n"))
+                width = 0; // reset counter if starts with a newline. only from formats above, not in natural text
+            if (text.equals(" ") &&
+                    (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n")))
+                return; // don't accumulate long runs of empty spaces
+
+            if (text.length() + width > maxWidth) { // won't fit, needs to wrap
+                String words[] = text.split("\\s+");
+                for (int i = 0; i < words.length; i++) {
+                    String word = words[i];
+                    boolean last = i == words.length - 1;
+                    if (!last) // insert a space if not the last word
+                        word = word + " ";
+                    if (word.length() + width > maxWidth) { // wrap and reset counter
+                        accum.append("\n").append(word);
+                        width = word.length();
+                    } else {
+                        accum.append(word);
+                        width += word.length();
+                    }
+                }
+            } else { // fits as is, without need to wrap text
+                accum.append(text);
+                width += text.length();
+            }
+        }
+
+        public String toString() {
+            return accum.toString();
+        }
+    }
+}
diff --git a/src/org/jsoup/examples/ListLinks.java b/src/org/jsoup/examples/ListLinks.java

new file mode 100644 (file)

index 0000000..64b29ba
--- /dev/null
+++ b/src/org/jsoup/examples/ListLinks.java
@@ -0,0 +1,56 @@
+package org.jsoup.examples;
+
+import org.jsoup.Jsoup;
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+
+/**
+ * Example program to list links from a URL.
+ */
+public class ListLinks {
+    public static void main(String[] args) throws IOException {
+        Validate.isTrue(args.length == 1, "usage: supply url to fetch");
+        String url = args[0];
+        print("Fetching %s...", url);
+
+        Document doc = Jsoup.connect(url).get();
+        Elements links = doc.select("a[href]");
+        Elements media = doc.select("[src]");
+        Elements imports = doc.select("link[href]");
+
+        print("\nMedia: (%d)", media.size());
+        for (Element src : media) {
+            if (src.tagName().equals("img"))
+                print(" * %s: <%s> %sx%s (%s)",
+                        src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"),
+                        trim(src.attr("alt"), 20));
+            else
+                print(" * %s: <%s>", src.tagName(), src.attr("abs:src"));
+        }
+
+        print("\nImports: (%d)", imports.size());
+        for (Element link : imports) {
+            print(" * %s <%s> (%s)", link.tagName(),link.attr("abs:href"), link.attr("rel"));
+        }
+
+        print("\nLinks: (%d)", links.size());
+        for (Element link : links) {
+            print(" * a: <%s>  (%s)", link.attr("abs:href"), trim(link.text(), 35));
+        }
+    }
+
+    private static void print(String msg, Object... args) {
+        System.out.println(String.format(msg, args));
+    }
+
+    private static String trim(String s, int width) {
+        if (s.length() > width)
+            return s.substring(0, width-1) + ".";
+        else
+            return s;
+    }
+}
diff --git a/src/org/jsoup/examples/package-info.java b/src/org/jsoup/examples/package-info.java

new file mode 100644 (file)

index 0000000..c312f43
--- /dev/null
+++ b/src/org/jsoup/examples/package-info.java
@@ -0,0 +1,4 @@
+/**
+ Contains example programs demonstrating the use of jsoup. See the <a href="http://jsoup.org/cookbook/">jsoup cookbook</a>.
+ */
+package org.jsoup.examples;
+\ No newline at end of file
diff --git a/src/org/jsoup/helper/DataUtil.java b/src/org/jsoup/helper/DataUtil.java

new file mode 100644 (file)

index 0000000..9adfe42
--- /dev/null
+++ b/src/org/jsoup/helper/DataUtil.java
@@ -0,0 +1,135 @@
+package org.jsoup.helper;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.parser.Parser;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Internal static utilities for handling data.
+ *
+ */
+public class DataUtil {
+    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
+    static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
+    private static final int bufferSize = 0x20000; // ~130K.
+    private static final int byteOrderMark = 65279; // U+FEFF
+
+    private DataUtil() {}
+
+    /**
+     * Loads a file to a Document.
+     * @param in file to load
+     * @param charsetName character set of input; null to attempt detection from the HTML itself
+     * @param baseUri base URI of document, to resolve relative links against
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(File in, String charsetName, String baseUri) throws IOException {
+        FileInputStream inStream = null;
+        try {
+            inStream = new FileInputStream(in);
+            ByteBuffer byteData = readToByteBuffer(inStream);
+            return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
+        } finally {
+            if (inStream != null)
+                inStream.close();
+        }
+    }
+
+    /**
+     * Parses a Document from an input stream.
+     * @param in input stream to parse. You will need to close it.
+     * @param charsetName character set of input; null to attempt detection from the HTML itself
+     * @param baseUri base URI of document, to resolve relative links against
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(InputStream in, String charsetName, String baseUri) throws IOException {
+        ByteBuffer byteData = readToByteBuffer(in);
+        return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
+    }
+
+    /**
+     * Parses a Document from an input stream, using the provided Parser.
+     * @param in input stream to parse. You will need to close it.
+     * @param charsetName character set of input; null to attempt detection from the HTML itself
+     * @param baseUri base URI of document, to resolve relative links against
+     * @param parser alternate {@link Parser#xmlParser() parser} to use.
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
+        ByteBuffer byteData = readToByteBuffer(in);
+        return parseByteData(byteData, charsetName, baseUri, parser);
+    }
+
+    // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
+    // switching the charset midstream when a meta http-equiv tag defines the charset.
+    static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
+        String docData;
+        Document doc = null;
+        if (charsetName == null) { // determine from meta. safe parse as UTF-8
+            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
+            docData = Charset.forName(defaultCharset).decode(byteData).toString();
+            doc = parser.parseInput(docData, baseUri);
+            Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
+            if (meta != null) { // if not found, will keep utf-8 as best attempt
+                String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset");
+                // a charset the JVM cannot decode is treated like "not found": keep the utf-8 parse rather
+                // than letting Charset.forName throw on a bogus meta tag
+                if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)
+                        && isSupportedCharset(foundCharset)) { // need to re-decode
+                    charsetName = foundCharset;
+                    byteData.rewind();
+                    docData = Charset.forName(foundCharset).decode(byteData).toString();
+                    doc = null;
+                }
+            }
+        } else { // specified by content type header (or by user on file load)
+            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
+            docData = Charset.forName(charsetName).decode(byteData).toString();
+        }
+        if (doc == null) {
+            // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
+            // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
+            // into head mode. The length check keeps an empty input from failing in charAt(0).
+            if (docData.length() > 0 && docData.charAt(0) == byteOrderMark)
+                docData = docData.substring(1);
+
+            doc = parser.parseInput(docData, baseUri);
+            doc.outputSettings().charset(charsetName);
+        }
+        return doc;
+    }
+
+    // true if the named charset can be decoded by this JVM; false for unknown or syntactically illegal names
+    private static boolean isSupportedCharset(String name) {
+        try {
+            return Charset.isSupported(name);
+        } catch (java.nio.charset.IllegalCharsetNameException ignored) {
+            // Charset.isSupported rejects malformed names by throwing; for our purposes that is just "no"
+            return false;
+        }
+    }
+
+    /**
+     * Reads the stream until end-of-stream and returns everything read, wrapped in a ByteBuffer.
+     * The stream is not closed.
+     * @param inStream stream to drain
+     * @return buffer wrapping all bytes read
+     * @throws IOException on IO error
+     */
+    static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
+        byte[] buffer = new byte[bufferSize];
+        ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
+        int read;
+        while(true) {
+            read  = inStream.read(buffer);
+            if (read == -1) break;
+            outStream.write(buffer, 0, read);
+        }
+        ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());
+        return byteData;
+    }
+
+    /**
+     * Parse out a charset from a content type header.
+     * @param contentType e.g. "text/html; charset=EUC-JP"
+     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
+     */
+    static String getCharsetFromContentType(String contentType) {
+        if (contentType == null) return null;
+        Matcher m = charsetPattern.matcher(contentType);
+        if (m.find()) {
+            // fixed locale: the default-locale toUpperCase() mangles "i" under e.g. a Turkish locale
+            return m.group(1).trim().toUpperCase(java.util.Locale.ENGLISH);
+        }
+        return null;
+    }
+
+
+}
diff --git a/src/org/jsoup/helper/DescendableLinkedList.java b/src/org/jsoup/helper/DescendableLinkedList.java

new file mode 100644 (file)

index 0000000..28ca197
--- /dev/null
+++ b/src/org/jsoup/helper/DescendableLinkedList.java
@@ -0,0 +1,82 @@
+package org.jsoup.helper;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.ListIterator;
+
+/**
+ * Provides a descending iterator and other 1.6 methods to allow support on the 1.5 JRE.
+ */
+public class DescendableLinkedList<E> extends LinkedList<E> {
+
+    /**
+     * Create a new DescendableLinkedList.
+     */
+    public DescendableLinkedList() {
+        super();
+    }
+
+    /**
+     * Add a new element to the start of the list.
+     * @param e element to add
+     */
+    public void push(E e) {
+        addFirst(e);
+    }
+
+    /**
+     * Look at the last element, if there is one.
+     * @return the last element, or null
+     */
+    public E peekLast() {
+        return size() == 0 ? null : getLast();
+    }
+
+    /**
+     * Remove and return the last element, if there is one
+     * @return the last element, or null
+     */
+    public E pollLast() {
+        return size() == 0 ? null : removeLast();
+    }
+
+    /**
+     * Get an iterator that starts at the end of the list and works towards the start.
+     * @return an iterator that starts at the end of the list and works towards the start.
+     */
+    public Iterator<E> descendingIterator() {
+        return new DescendingIterator(size());
+    }
+
+    /**
+     * Iterator that walks the enclosing list backwards by driving a ListIterator in reverse.
+     * Deliberately not generic: it uses the enclosing list's own element type E, so no unchecked
+     * cast is needed.
+     */
+    private class DescendingIterator implements Iterator<E> {
+        private final ListIterator<E> iter;
+
+        // index is the position the underlying list iterator starts from; size() means "past the last element"
+        private DescendingIterator(int index) {
+            iter = listIterator(index);
+        }
+
+        /**
+         * Check if there is another element on the list.
+         * @return if another element
+         */
+        public boolean hasNext() {
+            return iter.hasPrevious();
+        }
+
+        /**
+         * Get the next element.
+         * @return the next element.
+         */
+        public E next() {
+            return iter.previous();
+        }
+
+        /**
+         * Remove the current element.
+         */
+        public void remove() {
+            iter.remove();
+        }
+    }
+}
diff --git a/src/org/jsoup/helper/HttpConnection.java b/src/org/jsoup/helper/HttpConnection.java

new file mode 100644 (file)

index 0000000..06200a2
--- /dev/null
+++ b/src/org/jsoup/helper/HttpConnection.java
@@ -0,0 +1,658 @@
+package org.jsoup.helper;
+
+import org.jsoup.Connection;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.TokenQueue;
+
+import java.io.*;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.*;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Implementation of {@link Connection}.
+ * @see org.jsoup.Jsoup#connect(String) 
+ */
+public class HttpConnection implements Connection {
+    /**
+     * Create a new Connection for the given URL.
+     * @param url URL to connect to, as a string
+     * @return a new Connection with its request URL set
+     * @throws IllegalArgumentException if the URL is malformed
+     */
+    public static Connection connect(String url) {
+        Connection con = new HttpConnection();
+        con.url(url);
+        return con;
+    }
+
+    /**
+     * Create a new Connection for the given URL.
+     * @param url URL to connect to
+     * @return a new Connection with its request URL set
+     */
+    public static Connection connect(URL url) {
+        Connection con = new HttpConnection();
+        con.url(url);
+        return con;
+    }
+
+    private Connection.Request req;
+    private Connection.Response res;
+
+    private HttpConnection() {
+        req = new Request();
+        res = new Response();
+    }
+
+    public Connection url(URL url) {
+        req.url(url);
+        return this;
+    }
+
+    public Connection url(String url) {
+        Validate.notEmpty(url, "Must supply a valid URL");
+        try {
+            req.url(new URL(url));
+        } catch (MalformedURLException e) {
+            throw new IllegalArgumentException("Malformed URL: " + url, e);
+        }
+        return this;
+    }
+
+    public Connection userAgent(String userAgent) {
+        Validate.notNull(userAgent, "User agent must not be null");
+        req.header("User-Agent", userAgent);
+        return this;
+    }
+
+    public Connection timeout(int millis) {
+        req.timeout(millis);
+        return this;
+    }
+
+    public Connection followRedirects(boolean followRedirects) {
+        req.followRedirects(followRedirects);
+        return this;
+    }
+
+    public Connection referrer(String referrer) {
+        Validate.notNull(referrer, "Referrer must not be null");
+        req.header("Referer", referrer);
+        return this;
+    }
+
+    public Connection method(Method method) {
+        req.method(method);
+        return this;
+    }
+
+    public Connection ignoreHttpErrors(boolean ignoreHttpErrors) {
+        req.ignoreHttpErrors(ignoreHttpErrors);
+        return this;
+    }
+
+    public Connection ignoreContentType(boolean ignoreContentType) {
+        req.ignoreContentType(ignoreContentType);
+        return this;
+    }
+
+    public Connection data(String key, String value) {
+        req.data(KeyVal.create(key, value));
+        return this;
+    }
+
+    public Connection data(Map<String, String> data) {
+        Validate.notNull(data, "Data map must not be null");
+        for (Map.Entry<String, String> entry : data.entrySet()) {
+            req.data(KeyVal.create(entry.getKey(), entry.getValue()));
+        }
+        return this;
+    }
+
+    public Connection data(String... keyvals) {
+        Validate.notNull(keyvals, "Data key value pairs must not be null");
+        Validate.isTrue(keyvals.length %2 == 0, "Must supply an even number of key value pairs");
+        // keyvals is a flat list: key at each even index, its value at the following odd index
+        for (int i = 0; i < keyvals.length; i += 2) {
+            String key = keyvals[i];
+            String value = keyvals[i+1];
+            Validate.notEmpty(key, "Data key must not be empty");
+            Validate.notNull(value, "Data value must not be null");
+            req.data(KeyVal.create(key, value));
+        }
+        return this;
+    }
+
+    public Connection header(String name, String value) {
+        req.header(name, value);
+        return this;
+    }
+
+    public Connection cookie(String name, String value) {
+        req.cookie(name, value);
+        return this;
+    }
+
+    public Connection cookies(Map<String, String> cookies) {
+        Validate.notNull(cookies, "Cookie map must not be null");
+        for (Map.Entry<String, String> entry : cookies.entrySet()) {
+            req.cookie(entry.getKey(), entry.getValue());
+        }
+        return this;
+    }
+
+    public Connection parser(Parser parser) {
+        req.parser(parser);
+        return this;
+    }
+
+    public Document get() throws IOException {
+        req.method(Method.GET);
+        execute();
+        return res.parse();
+    }
+
+    public Document post() throws IOException {
+        req.method(Method.POST);
+        execute();
+        return res.parse();
+    }
+
+    public Connection.Response execute() throws IOException {
+        res = Response.execute(req);
+        return res;
+    }
+
+    public Connection.Request request() {
+        return req;
+    }
+
+    public Connection request(Connection.Request request) {
+        req = request;
+        return this;
+    }
+
+    public Connection.Response response() {
+        return res;
+    }
+
+    public Connection response(Connection.Response response) {
+        res = response;
+        return this;
+    }
+
+    /**
+     * State shared by requests and responses: URL, method, headers and cookies. Header lookups and
+     * removals are case insensitive; cookie names are matched exactly.
+     */
+    @SuppressWarnings({"unchecked"})
+    private static abstract class Base<T extends Connection.Base> implements Connection.Base<T> {
+        URL url;
+        Method method;
+        Map<String, String> headers;
+        Map<String, String> cookies;
+
+        private Base() {
+            headers = new LinkedHashMap<String, String>();
+            cookies = new LinkedHashMap<String, String>();
+        }
+
+        public URL url() {
+            return url;
+        }
+
+        public T url(URL url) {
+            Validate.notNull(url, "URL must not be null");
+            this.url = url;
+            return (T) this;
+        }
+
+        public Method method() {
+            return method;
+        }
+
+        public T method(Method method) {
+            Validate.notNull(method, "Method must not be null");
+            this.method = method;
+            return (T) this;
+        }
+
+        public String header(String name) {
+            Validate.notNull(name, "Header name must not be null");
+            return getHeaderCaseInsensitive(name);
+        }
+
+        public T header(String name, String value) {
+            Validate.notEmpty(name, "Header name must not be empty");
+            Validate.notNull(value, "Header value must not be null");
+            removeHeader(name); // ensures we don't get an "accept-encoding" and a "Accept-Encoding"
+            headers.put(name, value);
+            return (T) this;
+        }
+
+        public boolean hasHeader(String name) {
+            Validate.notEmpty(name, "Header name must not be empty");
+            return getHeaderCaseInsensitive(name) != null;
+        }
+
+        public T removeHeader(String name) {
+            Validate.notEmpty(name, "Header name must not be empty");
+            Map.Entry<String, String> entry = scanHeaders(name); // remove is case insensitive too
+            if (entry != null)
+                headers.remove(entry.getKey()); // ensures correct case
+            return (T) this;
+        }
+
+        public Map<String, String> headers() {
+            return headers;
+        }
+
+        private String getHeaderCaseInsensitive(String name) {
+            Validate.notNull(name, "Header name must not be null");
+            // quick evals for common case of title case, lower case, then scan for mixed
+            String value = headers.get(name);
+            if (value == null)
+                value = headers.get(name.toLowerCase(Locale.ENGLISH));
+            if (value == null) {
+                Map.Entry<String, String> entry = scanHeaders(name);
+                if (entry != null)
+                    value = entry.getValue();
+            }
+            return value;
+        }
+
+        // finds the header entry whose name matches ignoring case, or null. Lower-casing uses a fixed
+        // locale so that e.g. "Content-ID" and "content-id" still match under a Turkish default locale.
+        private Map.Entry<String, String> scanHeaders(String name) {
+            String lc = name.toLowerCase(Locale.ENGLISH);
+            for (Map.Entry<String, String> entry : headers.entrySet()) {
+                if (entry.getKey().toLowerCase(Locale.ENGLISH).equals(lc))
+                    return entry;
+            }
+            return null;
+        }
+
+        public String cookie(String name) {
+            Validate.notNull(name, "Cookie name must not be null");
+            return cookies.get(name);
+        }
+
+        public T cookie(String name, String value) {
+            Validate.notEmpty(name, "Cookie name must not be empty");
+            Validate.notNull(value, "Cookie value must not be null");
+            cookies.put(name, value);
+            return (T) this;
+        }
+
+        public boolean hasCookie(String name) {
+            // validate the argument itself (previously only the message literal was being checked)
+            Validate.notEmpty(name, "Cookie name must not be empty");
+            return cookies.containsKey(name);
+        }
+
+        public T removeCookie(String name) {
+            // validate the argument itself (previously only the message literal was being checked)
+            Validate.notEmpty(name, "Cookie name must not be empty");
+            cookies.remove(name);
+            return (T) this;
+        }
+
+        public Map<String, String> cookies() {
+            return cookies;
+        }
+    }
+
+    /**
+     * Request settings. Defaults set by the constructor: GET method, 3000 ms timeout, redirects
+     * followed, an "Accept-Encoding: gzip" header, and the HTML parser.
+     */
+    public static class Request extends Base<Connection.Request> implements Connection.Request {
+        private int timeoutMilliseconds;
+        private boolean followRedirects;
+        private Collection<Connection.KeyVal> data;
+        private boolean ignoreHttpErrors = false;
+        private boolean ignoreContentType = false;
+        private Parser parser;
+
+        private Request() {
+            timeoutMilliseconds = 3000;
+            followRedirects = true;
+            data = new ArrayList<Connection.KeyVal>();
+            method = Connection.Method.GET;
+            headers.put("Accept-Encoding", "gzip");
+            parser = Parser.htmlParser();
+        }
+
+        public int timeout() {
+            return timeoutMilliseconds;
+        }
+
+        public Request timeout(int millis) {
+            Validate.isTrue(millis >= 0, "Timeout milliseconds must be 0 (infinite) or greater");
+            timeoutMilliseconds = millis;
+            return this;
+        }
+
+        public boolean followRedirects() {
+            return followRedirects;
+        }
+
+        public Connection.Request followRedirects(boolean followRedirects) {
+            this.followRedirects = followRedirects;
+            return this;
+        }
+
+        public boolean ignoreHttpErrors() {
+            return ignoreHttpErrors;
+        }
+
+        public Connection.Request ignoreHttpErrors(boolean ignoreHttpErrors) {
+            this.ignoreHttpErrors = ignoreHttpErrors;
+            return this;
+        }
+
+        public boolean ignoreContentType() {
+            return ignoreContentType;
+        }
+
+        public Connection.Request ignoreContentType(boolean ignoreContentType) {
+            this.ignoreContentType = ignoreContentType;
+            return this;
+        }
+
+        public Request data(Connection.KeyVal keyval) {
+            Validate.notNull(keyval, "Key val must not be null");
+            data.add(keyval);
+            return this;
+        }
+
+        public Collection<Connection.KeyVal> data() {
+            return data;
+        }
+
+        public Request parser(Parser parser) {
+            this.parser = parser;
+            return this;
+        }
+
+        public Parser parser() {
+            return parser;
+        }
+    }
+
+    /**
+     * Response to an executed request. The body is read fully into memory during execute();
+     * parse(), body() and bodyAsBytes() then work from that buffer.
+     */
+    public static class Response extends Base<Connection.Response> implements Connection.Response {
+        private static final int MAX_REDIRECTS = 20;
+        private int statusCode;
+        private String statusMessage;
+        private ByteBuffer byteData;
+        private String charset;
+        private String contentType;
+        private boolean executed = false;
+        private int numRedirects = 0;
+        private Connection.Request req;
+
+        Response() {
+            super();
+        }
+
+        // previousResponse is non-null when this response follows a redirect; used to count the redirect chain
+        private Response(Response previousResponse) throws IOException {
+            super();
+            if (previousResponse != null) {
+                numRedirects = previousResponse.numRedirects + 1;
+                if (numRedirects >= MAX_REDIRECTS)
+                    throw new IOException(String.format("Too many redirects occurred trying to load URL %s", previousResponse.url()));
+            }
+        }
+
+        static Response execute(Connection.Request req) throws IOException {
+            return execute(req, null);
+        }
+
+        /**
+         * Executes the request and reads the whole response body into memory.
+         * <p>
+         * Note that the request is modified in place: GET data is moved into the URL's query string, and
+         * when a redirect is followed the method is reset to GET, the data is cleared, the URL is replaced
+         * with the redirect target and the response's cookies are copied onto the request.
+         * @param req request to execute; must have an http or https URL
+         * @param previousResponse the response that redirected here, or null for the initial request
+         * @return the executed response (of the final request, if redirects were followed)
+         * @throws IOException on IO error, on too many redirects, or on a non-OK, non-redirect status
+         * when the request does not ignore HTTP errors
+         */
+        static Response execute(Connection.Request req, Response previousResponse) throws IOException {
+            Validate.notNull(req, "Request must not be null");
+            String protocol = req.url().getProtocol();
+            Validate
+                .isTrue(protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported");
+
+            // set up the request for execution
+            if (req.method() == Connection.Method.GET && req.data().size() > 0)
+                serialiseRequestUrl(req); // appends query string
+            HttpURLConnection conn = createConnection(req);
+            Response res;
+            try {
+                conn.connect();
+                if (req.method() == Connection.Method.POST)
+                    writePost(req.data(), conn.getOutputStream());
+
+                int status = conn.getResponseCode();
+                boolean needsRedirect = false;
+                if (status != HttpURLConnection.HTTP_OK) {
+                    if (status == HttpURLConnection.HTTP_MOVED_TEMP || status == HttpURLConnection.HTTP_MOVED_PERM || status == HttpURLConnection.HTTP_SEE_OTHER)
+                        needsRedirect = true;
+                    else if (!req.ignoreHttpErrors())
+                        throw new IOException(status + " error loading URL " + req.url().toString());
+                }
+                res = new Response(previousResponse);
+                res.setupFromConnection(conn, previousResponse);
+                if (needsRedirect && req.followRedirects()) {
+                    req.method(Method.GET); // always redirect with a get. any data param from original req are dropped.
+                    req.data().clear();
+                    req.url(new URL(req.url(), res.header("Location")));
+                    for (Map.Entry<String, String> cookie : res.cookies.entrySet()) { // add response cookies to request (for e.g. login posts)
+                        req.cookie(cookie.getKey(), cookie.getValue());
+                    }
+                    return execute(req, res);
+                }
+                res.req = req;
+
+                InputStream bodyStream = null;
+                InputStream dataStream = null;
+                try {
+                    dataStream = conn.getErrorStream() != null ? conn.getErrorStream() : conn.getInputStream();
+                    bodyStream = res.hasHeader("Content-Encoding") && res.header("Content-Encoding").equalsIgnoreCase("gzip") ?
+                            new BufferedInputStream(new GZIPInputStream(dataStream)) :
+                            new BufferedInputStream(dataStream);
+
+                    res.byteData = DataUtil.readToByteBuffer(bodyStream);
+                    res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it
+                } finally {
+                    if (bodyStream != null) bodyStream.close();
+                    if (dataStream != null) dataStream.close();
+                }
+            } finally {
+                // the body (if any) is fully buffered by now, so the connection is no longer needed. Releasing
+                // it here also covers the error and redirect paths, where the streams are never opened/closed.
+                conn.disconnect();
+            }
+
+            res.executed = true;
+            return res;
+        }
+
+        public int statusCode() {
+            return statusCode;
+        }
+
+        public String statusMessage() {
+            return statusMessage;
+        }
+
+        public String charset() {
+            return charset;
+        }
+
+        public String contentType() {
+            return contentType;
+        }
+
+        public Document parse() throws IOException {
+            Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");
+            if (!req.ignoreContentType() && (contentType == null || !(contentType.startsWith("text/") || contentType.startsWith("application/xml") || contentType.startsWith("application/xhtml+xml"))))
+                throw new IOException(String.format("Unhandled content type \"%s\" on URL %s. Must be text/*, application/xml, or application/xhtml+xml",
+                    contentType, url.toString()));
+            Document doc = DataUtil.parseByteData(byteData, charset, url.toExternalForm(), req.parser());
+            byteData.rewind(); // leave the buffer readable for later parse() / body() calls
+            charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
+            return doc;
+        }
+
+        public String body() {
+            Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body");
+            // charset gets set from header on execute, and from meta-equiv on parse. parse may not have happened yet
+            String body;
+            if (charset == null)
+                body = Charset.forName(DataUtil.defaultCharset).decode(byteData).toString();
+            else
+                body = Charset.forName(charset).decode(byteData).toString();
+            byteData.rewind(); // leave the buffer readable for later parse() / body() calls
+            return body;
+        }
+
+        public byte[] bodyAsBytes() {
+            Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body");
+            return byteData.array();
+        }
+
+        // set up connection defaults, and details from request
+        private static HttpURLConnection createConnection(Connection.Request req) throws IOException {
+            HttpURLConnection conn = (HttpURLConnection) req.url().openConnection();
+            conn.setRequestMethod(req.method().name());
+            conn.setInstanceFollowRedirects(false); // don't rely on native redirection support
+            conn.setConnectTimeout(req.timeout());
+            conn.setReadTimeout(req.timeout());
+            if (req.method() == Method.POST)
+                conn.setDoOutput(true);
+            if (req.cookies().size() > 0)
+                conn.addRequestProperty("Cookie", getRequestCookieString(req));
+            for (Map.Entry<String, String> header : req.headers().entrySet()) {
+                conn.addRequestProperty(header.getKey(), header.getValue());
+            }
+            return conn;
+        }
+
+        // set up url, method, header, cookies
+        private void setupFromConnection(HttpURLConnection conn, Connection.Response previousResponse) throws IOException {
+            method = Connection.Method.valueOf(conn.getRequestMethod());
+            url = conn.getURL();
+            statusCode = conn.getResponseCode();
+            statusMessage = conn.getResponseMessage();
+            contentType = conn.getContentType();
+
+            Map<String, List<String>> resHeaders = conn.getHeaderFields();
+            processResponseHeaders(resHeaders);
+
+            // if from a redirect, map previous response cookies into this response
+            if (previousResponse != null) {
+                for (Map.Entry<String, String> prevCookie : previousResponse.cookies().entrySet()) {
+                    if (!hasCookie(prevCookie.getKey()))
+                        cookie(prevCookie.getKey(), prevCookie.getValue());
+                }
+            }
+        }
+
+        // copies response headers into this response; Set-Cookie headers are stored as cookies instead
+        void processResponseHeaders(Map<String, List<String>> resHeaders) {
+            for (Map.Entry<String, List<String>> entry : resHeaders.entrySet()) {
+                String name = entry.getKey();
+                if (name == null)
+                    continue; // http/1.1 line
+
+                List<String> values = entry.getValue();
+                if (name.equalsIgnoreCase("Set-Cookie")) {
+                    for (String value : values) {
+                        if (value == null)
+                            continue;
+                        TokenQueue cd = new TokenQueue(value);
+                        // trim() dereferences both results, so neither can be null past this point
+                        String cookieName = cd.chompTo("=").trim();
+                        String cookieVal = cd.consumeTo(";").trim();
+                        // ignores path, date, domain, secure et al. req'd?
+                        // name not blank
+                        if (cookieName.length() > 0)
+                            cookie(cookieName, cookieVal);
+                    }
+                } else { // only take the first instance of each header
+                    if (!values.isEmpty())
+                        header(name, values.get(0));
+                }
+            }
+        }
+
+        // writes the data as a url-encoded form body (key=value pairs joined by '&'), then closes the stream
+        private static void writePost(Collection<Connection.KeyVal> data, OutputStream outputStream) throws IOException {
+            OutputStreamWriter w = new OutputStreamWriter(outputStream, DataUtil.defaultCharset);
+            try {
+                boolean first = true;
+                for (Connection.KeyVal keyVal : data) {
+                    if (!first)
+                        w.append('&');
+                    else
+                        first = false;
+
+                    w.write(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset));
+                    w.write('=');
+                    w.write(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset));
+                }
+            } finally {
+                w.close(); // also on failure, so the output stream is not left open
+            }
+        }
+
+        // builds the Cookie request header value: name=value pairs joined by "; "
+        private static String getRequestCookieString(Connection.Request req) {
+            StringBuilder sb = new StringBuilder();
+            boolean first = true;
+            for (Map.Entry<String, String> cookie : req.cookies().entrySet()) {
+                if (!first)
+                    sb.append("; ");
+                else
+                    first = false;
+                sb.append(cookie.getKey()).append('=').append(cookie.getValue());
+                // todo: spec says only ascii, no escaping / encoding defined. validate on set? or escape somehow here?
+            }
+            return sb.toString();
+        }
+
+        // for get url reqs, serialise the data map into the url
+        private static void serialiseRequestUrl(Connection.Request req) throws IOException {
+            URL in = req.url();
+            StringBuilder url = new StringBuilder();
+            boolean first = true;
+            // reconstitute the query, ready for appends
+            url
+                .append(in.getProtocol())
+                .append("://")
+                .append(in.getAuthority()) // includes host, port
+                .append(in.getPath())
+                .append("?");
+            if (in.getQuery() != null) {
+                url.append(in.getQuery());
+                first = false;
+            }
+            for (Connection.KeyVal keyVal : req.data()) {
+                if (!first)
+                    url.append('&');
+                else
+                    first = false;
+                url
+                    .append(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset))
+                    .append('=')
+                    .append(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset));
+            }
+            req.url(new URL(url.toString()));
+            req.data().clear(); // moved into url as get params
+        }
+    }
+
+    /**
+     * A request data key/value pair. The key must not be empty and the value must not be null.
+     */
+    public static class KeyVal implements Connection.KeyVal {
+        private String key;
+        private String value;
+
+        public static KeyVal create(String key, String value) {
+            Validate.notEmpty(key, "Data key must not be empty");
+            Validate.notNull(value, "Data value must not be null");
+            return new KeyVal(key, value);
+        }
+
+        private KeyVal(String key, String value) {
+            this.key = key;
+            this.value = value;
+        }
+
+        public KeyVal key(String key) {
+            Validate.notEmpty(key, "Data key must not be empty");
+            this.key = key;
+            return this;
+        }
+
+        public String key() {
+            return key;
+        }
+
+        public KeyVal value(String value) {
+            Validate.notNull(value, "Data value must not be null");
+            this.value = value;
+            return this;
+        }
+
+        public String value() {
+            return value;
+        }
+
+        @Override
+        public String toString() {
+            return key + "=" + value;
+        }
+    }
+}
diff --git a/src/org/jsoup/helper/StringUtil.java b/src/org/jsoup/helper/StringUtil.java

new file mode 100644 (file)

index 0000000..071a92c
--- /dev/null
+++ b/src/org/jsoup/helper/StringUtil.java
@@ -0,0 +1,140 @@
+package org.jsoup.helper;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+/**
+ * A minimal String utility class. Designed for internal jsoup use only.
+ */
+public final class StringUtil {
+    // memoised padding up to 10
+    private static final String[] padding = {"", " ", "  ", "   ", "    ", "     ", "      ", "       ", "        ", "         ", "          "};
+
+    /**
+     * Join a collection of strings by a seperator
+     * @param strings collection of string objects
+     * @param sep string to place between strings
+     * @return joined string
+     */
+    public static String join(Collection strings, String sep) {
+        return join(strings.iterator(), sep);
+    }
+
+    /**
+     * Join a collection of strings by a seperator
+     * @param strings iterator of string objects
+     * @param sep string to place between strings
+     * @return joined string
+     */
+    public static String join(Iterator strings, String sep) {
+        if (!strings.hasNext())
+            return "";
+
+        String start = strings.next().toString();
+        if (!strings.hasNext()) // only one, avoid builder
+            return start;
+
+        StringBuilder sb = new StringBuilder(64).append(start);
+        while (strings.hasNext()) {
+            sb.append(sep);
+            sb.append(strings.next());
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns space padding
+     * @param width amount of padding desired
+     * @return string of spaces * width
+     */
+    public static String padding(int width) {
+        if (width < 0)
+            throw new IllegalArgumentException("width must be > 0");
+
+        if (width < padding.length)
+            return padding[width];
+
+        char[] out = new char[width];
+        for (int i = 0; i < width; i++)
+            out[i] = ' ';
+        return String.valueOf(out);
+    }
+
+    /**
+     * Tests if a string is blank: null, emtpy, or only whitespace (" ", \r\n, \t, etc)
+     * @param string string to test
+     * @return if string is blank
+     */
+    public static boolean isBlank(String string) {
+        if (string == null || string.length() == 0)
+            return true;
+
+        int l = string.length();
+        for (int i = 0; i < l; i++) {
+            if (!StringUtil.isWhitespace(string.codePointAt(i)))
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     * Tests if a string is numeric, i.e. contains only digit characters
+     * @param string string to test
+     * @return true if only digit chars, false if empty or null or contains non-digit chrs
+     */
+    public static boolean isNumeric(String string) {
+        if (string == null || string.length() == 0)
+            return false;
+
+        int l = string.length();
+        for (int i = 0; i < l; i++) {
+            if (!Character.isDigit(string.codePointAt(i)))
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     * Tests if a code point is "whitespace" as defined in the HTML spec.
+     * @param c code point to test
+     * @return true if code point is whitespace, false otherwise
+     */
+    public static boolean isWhitespace(int c){
+        return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
+    }
+
+    public static String normaliseWhitespace(String string) {
+        StringBuilder sb = new StringBuilder(string.length());
+
+        boolean lastWasWhite = false;
+        boolean modified = false;
+
+        int l = string.length();
+        for (int i = 0; i < l; i++) {
+            int c = string.codePointAt(i);
+            if (isWhitespace(c)) {
+                if (lastWasWhite) {
+                    modified = true;
+                    continue;
+                }
+                if (c != ' ')
+                    modified = true;
+                sb.append(' ');
+                lastWasWhite = true;
+            }
+            else {
+                sb.appendCodePoint(c);
+                lastWasWhite = false;
+            }
+        }
+        return modified ? sb.toString() : string;
+    }
+
+    public static boolean in(String needle, String... haystack) {
+        for (String hay : haystack) {
+            if (hay.equals(needle))
+            return true;
+        }
+        return false;
+    }
+}
diff --git a/src/org/jsoup/helper/Validate.java b/src/org/jsoup/helper/Validate.java

new file mode 100644 (file)

index 0000000..814bcc3
--- /dev/null
+++ b/src/org/jsoup/helper/Validate.java
@@ -0,0 +1,112 @@
+package org.jsoup.helper;
+
+/**
+ * Simple validation methods. Designed for jsoup internal use
+ */
+public final class Validate {
+    
+    private Validate() {}
+
+    /**
+     * Validates that the object is not null
+     * @param obj object to test
+     */
+    public static void notNull(Object obj) {
+        if (obj == null)
+            throw new IllegalArgumentException("Object must not be null");
+    }
+
+    /**
+     * Validates that the object is not null
+     * @param obj object to test
+     * @param msg message to output if validation fails
+     */
+    public static void notNull(Object obj, String msg) {
+        if (obj == null)
+            throw new IllegalArgumentException(msg);
+    }
+
+    /**
+     * Validates that the value is true
+     * @param val object to test
+     */
+    public static void isTrue(boolean val) {
+        if (!val)
+            throw new IllegalArgumentException("Must be true");
+    }
+
+    /**
+     * Validates that the value is true
+     * @param val object to test
+     * @param msg message to output if validation fails
+     */
+    public static void isTrue(boolean val, String msg) {
+        if (!val)
+            throw new IllegalArgumentException(msg);
+    }
+
+    /**
+     * Validates that the value is false
+     * @param val object to test
+     */
+    public static void isFalse(boolean val) {
+        if (val)
+            throw new IllegalArgumentException("Must be false");
+    }
+
+    /**
+     * Validates that the value is false
+     * @param val object to test
+     * @param msg message to output if validation fails
+     */
+    public static void isFalse(boolean val, String msg) {
+        if (val)
+            throw new IllegalArgumentException(msg);
+    }
+
+    /**
+     * Validates that the array contains no null elements
+     * @param objects the array to test
+     */
+    public static void noNullElements(Object[] objects) {
+        noNullElements(objects, "Array must not contain any null objects");
+    }
+
+    /**
+     * Validates that the array contains no null elements
+     * @param objects the array to test
+     * @param msg message to output if validation fails
+     */
+    public static void noNullElements(Object[] objects, String msg) {
+        for (Object obj : objects)
+            if (obj == null)
+                throw new IllegalArgumentException(msg);
+    }
+
+    /**
+     * Validates that the string is not empty
+     * @param string the string to test
+     */
+    public static void notEmpty(String string) {
+        if (string == null || string.length() == 0)
+            throw new IllegalArgumentException("String must not be empty");
+    }
+
+    /**
+     * Validates that the string is not empty
+     * @param string the string to test
+     * @param msg message to output if validation fails
+     */
+    public static void notEmpty(String string, String msg) {
+        if (string == null || string.length() == 0)
+            throw new IllegalArgumentException(msg);
+    }
+
+    /**
+     Cause a failure.
+     @param msg message to output.
+     */
+    public static void fail(String msg) {
+        throw new IllegalArgumentException(msg);
+    }
+}
diff --git a/src/org/jsoup/nodes/Attribute.java b/src/org/jsoup/nodes/Attribute.java

new file mode 100644 (file)

index 0000000..02eb29d
--- /dev/null
+++ b/src/org/jsoup/nodes/Attribute.java
@@ -0,0 +1,131 @@
+package org.jsoup.nodes;
+
+import org.jsoup.helper.Validate;
+
+import java.util.Map;
+
+/**
+ A single key + value attribute. Keys are trimmed and normalised to lower-case.
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public class Attribute implements Map.Entry<String, String>, Cloneable  {
+    private String key;
+    private String value;
+
+    /**
+     * Create a new attribute from unencoded (raw) key and value.
+     * @param key attribute key
+     * @param value attribute value
+     * @see #createFromEncoded
+     */
+    public Attribute(String key, String value) {
+        Validate.notEmpty(key);
+        Validate.notNull(value);
+        this.key = key.trim().toLowerCase();
+        this.value = value;
+    }
+
+    /**
+     Get the attribute key.
+     @return the attribute key
+     */
+    public String getKey() {
+        return key;
+    }
+
+    /**
+     Set the attribute key. Gets normalised as per the constructor method.
+     @param key the new key; must not be null
+     */
+    public void setKey(String key) {
+        Validate.notEmpty(key);
+        this.key = key.trim().toLowerCase();
+    }
+
+    /**
+     Get the attribute value.
+     @return the attribute value
+     */
+    public String getValue() {
+        return value;
+    }
+
+    /**
+     Set the attribute value.
+     @param value the new attribute value; must not be null
+     */
+    public String setValue(String value) {
+        Validate.notNull(value);
+        String old = this.value;
+        this.value = value;
+        return old;
+    }
+
+    /**
+     Get the HTML representation of this attribute; e.g. {@code href="index.html"}.
+     @return HTML
+     */
+    public String html() {
+        return key + "=\"" + Entities.escape(value, (new Document("")).outputSettings()) + "\"";
+    }
+    
+    protected void html(StringBuilder accum, Document.OutputSettings out) {
+        accum
+            .append(key)
+            .append("=\"")
+            .append(Entities.escape(value, out))
+            .append("\"");
+    }
+
+    /**
+     Get the string representation of this attribute, implemented as {@link #html()}.
+     @return string
+     */
+    public String toString() {
+        return html();
+    }
+
+    /**
+     * Create a new Attribute from an unencoded key and a HTML attribute encoded value.
+     * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars.
+     * @param encodedValue HTML attribute encoded value
+     * @return attribute
+     */
+    public static Attribute createFromEncoded(String unencodedKey, String encodedValue) {
+        String value = Entities.unescape(encodedValue, true);
+        return new Attribute(unencodedKey, value);
+    }
+
+    protected boolean isDataAttribute() {
+        return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length();
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof Attribute)) return false;
+
+        Attribute attribute = (Attribute) o;
+
+        if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false;
+        if (value != null ? !value.equals(attribute.value) : attribute.value != null) return false;
+
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        int result = key != null ? key.hashCode() : 0;
+        result = 31 * result + (value != null ? value.hashCode() : 0);
+        return result;
+    }
+
+    @Override
+    public Attribute clone() {
+        try {
+            return (Attribute) super.clone(); // only fields are immutable strings key and value, so no more deep copy required
+        } catch (CloneNotSupportedException e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
diff --git a/src/org/jsoup/nodes/Attributes.java b/src/org/jsoup/nodes/Attributes.java

new file mode 100644 (file)

index 0000000..9436750
--- /dev/null
+++ b/src/org/jsoup/nodes/Attributes.java
@@ -0,0 +1,249 @@
+package org.jsoup.nodes;
+
+import org.jsoup.helper.Validate;
+
+import java.util.*;
+
+/**
+ * The attributes of an Element.
+ * <p/>
+ * Attributes are treated as a map: there can be only one value associated with an attribute key.
+ * <p/>
+ * Attribute key and value comparisons are done case insensitively, and keys are normalised to
+ * lower-case.
+ * 
+ * @author Jonathan Hedley, jonathan@hedley.net
+ */
+public class Attributes implements Iterable<Attribute>, Cloneable {
+    protected static final String dataPrefix = "data-";
+    
+    private LinkedHashMap<String, Attribute> attributes = null;
+    // linked hash map to preserve insertion order.
+    // null be default as so many elements have no attributes -- saves a good chunk of memory
+
+    /**
+     Get an attribute value by key.
+     @param key the attribute key
+     @return the attribute value if set; or empty string if not set.
+     @see #hasKey(String)
+     */
+    public String get(String key) {
+        Validate.notEmpty(key);
+
+        if (attributes == null)
+            return "";
+
+        Attribute attr = attributes.get(key.toLowerCase());
+        return attr != null ? attr.getValue() : "";
+    }
+
+    /**
+     Set a new attribute, or replace an existing one by key.
+     @param key attribute key
+     @param value attribute value
+     */
+    public void put(String key, String value) {
+        Attribute attr = new Attribute(key, value);
+        put(attr);
+    }
+
+    /**
+     Set a new attribute, or replace an existing one by key.
+     @param attribute attribute
+     */
+    public void put(Attribute attribute) {
+        Validate.notNull(attribute);
+        if (attributes == null)
+             attributes = new LinkedHashMap<String, Attribute>(2);
+        attributes.put(attribute.getKey(), attribute);
+    }
+
+    /**
+     Remove an attribute by key.
+     @param key attribute key to remove
+     */
+    public void remove(String key) {
+        Validate.notEmpty(key);
+        if (attributes == null)
+            return;
+        attributes.remove(key.toLowerCase());
+    }
+
+    /**
+     Tests if these attributes contain an attribute with this key.
+     @param key key to check for
+     @return true if key exists, false otherwise
+     */
+    public boolean hasKey(String key) {
+        return attributes != null && attributes.containsKey(key.toLowerCase());
+    }
+
+    /**
+     Get the number of attributes in this set.
+     @return size
+     */
+    public int size() {
+        if (attributes == null)
+            return 0;
+        return attributes.size();
+    }
+
+    /**
+     Add all the attributes from the incoming set to this set.
+     @param incoming attributes to add to these attributes.
+     */
+    public void addAll(Attributes incoming) {
+        if (incoming.size() == 0)
+            return;
+        if (attributes == null)
+            attributes = new LinkedHashMap<String, Attribute>(incoming.size());
+        attributes.putAll(incoming.attributes);
+    }
+    
+    public Iterator<Attribute> iterator() {
+        return asList().iterator();
+    }
+
+    /**
+     Get the attributes as a List, for iteration. Do not modify the keys of the attributes via this view, as changes
+     to keys will not be recognised in the containing set.
+     @return an view of the attributes as a List.
+     */
+    public List<Attribute> asList() {
+        if (attributes == null)
+            return Collections.emptyList();
+
+        List<Attribute> list = new ArrayList<Attribute>(attributes.size());
+        for (Map.Entry<String, Attribute> entry : attributes.entrySet()) {
+            list.add(entry.getValue());
+        }
+        return Collections.unmodifiableList(list);
+    }
+
+    /**
+     * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys
+     * starting with {@code data-}.
+     * @return map of custom data attributes.
+     */
+    public Map<String, String> dataset() {
+        return new Dataset();
+    }
+
+    /**
+     Get the HTML representation of these attributes.
+     @return HTML
+     */
+    public String html() {
+        StringBuilder accum = new StringBuilder();
+        html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used
+        return accum.toString();
+    }
+    
+    void html(StringBuilder accum, Document.OutputSettings out) {
+        if (attributes == null)
+            return;
+        
+        for (Map.Entry<String, Attribute> entry : attributes.entrySet()) {
+            Attribute attribute = entry.getValue();
+            accum.append(" ");
+            attribute.html(accum, out);
+        }
+    }
+    
+    public String toString() {
+        return html();
+    }
+    
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof Attributes)) return false;
+        
+        Attributes that = (Attributes) o;
+        
+        if (attributes != null ? !attributes.equals(that.attributes) : that.attributes != null) return false;
+        
+        return true;
+    }
+    
+    @Override
+    public int hashCode() {
+        return attributes != null ? attributes.hashCode() : 0;
+    }
+
+    @Override
+    public Attributes clone() {
+        if (attributes == null)
+            return new Attributes();
+
+        Attributes clone;
+        try {
+            clone = (Attributes) super.clone();
+        } catch (CloneNotSupportedException e) {
+            throw new RuntimeException(e);
+        }
+        clone.attributes = new LinkedHashMap<String, Attribute>(attributes.size());
+        for (Attribute attribute: this)
+            clone.attributes.put(attribute.getKey(), attribute.clone());
+        return clone;
+    }
+
+    private class Dataset extends AbstractMap<String, String> {
+
+        private Dataset() {
+            if (attributes == null)
+                attributes = new LinkedHashMap<String, Attribute>(2);
+        }
+
+        public Set<Entry<String, String>> entrySet() {
+            return new EntrySet();
+        }
+
+        @Override
+        public String put(String key, String value) {
+            String dataKey = dataKey(key);
+            String oldValue = hasKey(dataKey) ? attributes.get(dataKey).getValue() : null;
+            Attribute attr = new Attribute(dataKey, value);
+            attributes.put(dataKey, attr);
+            return oldValue;
+        }
+
+        private class EntrySet extends AbstractSet<Map.Entry<String, String>> {
+            public Iterator<Map.Entry<String, String>> iterator() {
+                return new DatasetIterator();
+            }
+
+            public int size() {
+                int count = 0;
+                Iterator iter = new DatasetIterator();
+                while (iter.hasNext())
+                    count++;
+                return count;
+            }
+        }
+
+        private class DatasetIterator implements Iterator<Map.Entry<String, String>> {
+            private Iterator<Attribute> attrIter = attributes.values().iterator();
+            private Attribute attr;
+            public boolean hasNext() {
+                while (attrIter.hasNext()) {
+                    attr = attrIter.next();
+                    if (attr.isDataAttribute()) return true;
+                }
+                return false;
+            }
+
+            public Entry<String, String> next() {
+                return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue());
+            }
+
+            public void remove() {
+                attributes.remove(attr.getKey());
+            }
+        }
+    }
+
+    private static String dataKey(String key) {
+        return dataPrefix + key;
+    }
+}
diff --git a/src/org/jsoup/nodes/Comment.java b/src/org/jsoup/nodes/Comment.java

new file mode 100644 (file)

index 0000000..37fd436
--- /dev/null
+++ b/src/org/jsoup/nodes/Comment.java
@@ -0,0 +1,46 @@
+package org.jsoup.nodes;
+
+/**
+ A comment node.
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public class Comment extends Node {
+    private static final String COMMENT_KEY = "comment";
+
+    /**
+     Create a new comment node.
+     @param data The contents of the comment
+     @param baseUri base URI
+     */
+    public Comment(String data, String baseUri) {
+        super(baseUri);
+        attributes.put(COMMENT_KEY, data);
+    }
+
+    public String nodeName() {
+        return "#comment";
+    }
+
+    /**
+     Get the contents of the comment.
+     @return comment content
+     */
+    public String getData() {
+        return attributes.get(COMMENT_KEY);
+    }
+
+    void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
+        if (out.prettyPrint())
+            indent(accum, depth, out);
+        accum
+                .append("<!--")
+                .append(getData())
+                .append("-->");
+    }
+
+    void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}
+
+    public String toString() {
+        return outerHtml();
+    }
+}
diff --git a/src/org/jsoup/nodes/DataNode.java b/src/org/jsoup/nodes/DataNode.java

new file mode 100644 (file)

index 0000000..a64f56f
--- /dev/null
+++ b/src/org/jsoup/nodes/DataNode.java
@@ -0,0 +1,62 @@
+package org.jsoup.nodes;
+
+/**
+ A data node, for contents of style, script tags etc, where contents should not show in text().
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public class DataNode extends Node{
+    private static final String DATA_KEY = "data";
+
+    /**
+     Create a new DataNode.
+     @param data data contents
+     @param baseUri base URI
+     */
+    public DataNode(String data, String baseUri) {
+        super(baseUri);
+        attributes.put(DATA_KEY, data);
+    }
+
+    public String nodeName() {
+        return "#data";
+    }
+
+    /**
+     Get the data contents of this node. Will be unescaped and with original new lines, space etc.
+     @return data
+     */
+    public String getWholeData() {
+        return attributes.get(DATA_KEY);
+    }
+
+    /**
+     * Set the data contents of this node.
+     * @param data unencoded data
+     * @return this node, for chaining
+     */
+    public DataNode setWholeData(String data) {
+        attributes.put(DATA_KEY, data);
+        return this;
+    }
+
+    void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
+        accum.append(getWholeData()); // data is not escaped in return from data nodes, so " in script, style is plain
+    }
+
+    void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}
+
+    public String toString() {
+        return outerHtml();
+    }
+
+    /**
+     Create a new DataNode from HTML encoded data.
+     @param encodedData encoded data
+     @param baseUri bass URI
+     @return new DataNode
+     */
+    public static DataNode createFromEncoded(String encodedData, String baseUri) {
+        String data = Entities.unescape(encodedData);
+        return new DataNode(data, baseUri);
+    }
+}
diff --git a/src/org/jsoup/nodes/Document.java b/src/org/jsoup/nodes/Document.java

new file mode 100644 (file)

index 0000000..adb371c
--- /dev/null
+++ b/src/org/jsoup/nodes/Document.java
@@ -0,0 +1,350 @@
+package org.jsoup.nodes;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.parser.Tag;
+import org.jsoup.select.Elements;
+
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ A HTML Document: the root element of a parsed or programmatically built page, together with the settings
+ that control how it is written back out.
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public class Document extends Element {
+    private OutputSettings outputSettings = new OutputSettings();
+    private QuirksMode quirksMode = QuirksMode.noQuirks;
+
+    /**
+     Create a new, empty Document. No html, head or body elements are added.
+     @param baseUri base URI of document
+     @see org.jsoup.Jsoup#parse
+     @see #createShell
+     */
+    public Document(String baseUri) {
+        super(Tag.valueOf("#root"), baseUri);
+    }
+
+    /**
+     Create a valid, empty shell of a document, suitable for adding more elements to.
+     @param baseUri baseUri of document; must not be null
+     @return document with html, head, and body elements.
+     */
+    static public Document createShell(String baseUri) {
+        Validate.notNull(baseUri);
+
+        Document doc = new Document(baseUri);
+        Element html = doc.appendElement("html");
+        html.appendElement("head");
+        html.appendElement("body");
+
+        return doc;
+    }
+
+    /**
+     Accessor to the document's {@code head} element.
+     @return {@code head}, or null if the document has no such element
+     */
+    public Element head() {
+        return findFirstElementByTagName("head", this);
+    }
+
+    /**
+     Accessor to the document's {@code body} element.
+     @return {@code body}, or null if the document has no such element
+     */
+    public Element body() {
+        return findFirstElementByTagName("body", this);
+    }
+
+    /**
+     Get the string contents of the document's {@code title} element.
+     @return Trimmed title, or empty string if none set.
+     */
+    public String title() {
+        Element titleEl = getElementsByTag("title").first();
+        return titleEl != null ? titleEl.text().trim() : "";
+    }
+
+    /**
+     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
+     not present
+     @param title string to set as title; must not be null
+     */
+    public void title(String title) {
+        Validate.notNull(title);
+        Element titleEl = getElementsByTag("title").first();
+        if (titleEl == null) { // add to head
+            // NOTE(review): head() returns null when the document has no head element, which would make this
+            // line fail -- confirm callers only use this on shell or normalised documents
+            head().appendElement("title").text(title);
+        } else {
+            titleEl.text(title);
+        }
+    }
+
+    /**
+     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
+     @param tagName element tag name (e.g. {@code a})
+     @return new element
+     */
+    public Element createElement(String tagName) {
+        return new Element(Tag.valueOf(tagName), this.baseUri());
+    }
+
+    /**
+     Normalise the document. This happens after the parse phase so generally does not need to be called.
+     Ensures html, head and body elements exist, moves any non-blank text content that sits directly in the
+     root, html or head elements into the body, and merges duplicate head and body elements.
+     @return this document after normalisation
+     */
+    public Document normalise() {
+        Element htmlEl = findFirstElementByTagName("html", this);
+        if (htmlEl == null)
+            htmlEl = appendElement("html");
+        if (head() == null)
+            htmlEl.prependElement("head");
+        if (body() == null)
+            htmlEl.appendElement("body");
+
+        // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care
+        // of. do in inverse order to maintain text order.
+        normaliseTextNodes(head());
+        normaliseTextNodes(htmlEl);
+        normaliseTextNodes(this);
+
+        normaliseStructure("head", htmlEl);
+        normaliseStructure("body", htmlEl);
+        
+        return this;
+    }
+
+    // moves the non-blank text nodes that are direct children of the element to the front of the body.
+    // does not recurse.
+    private void normaliseTextNodes(Element element) {
+        // collect first, then move: the child list must not be changed while it is being iterated
+        List<Node> toMove = new ArrayList<Node>();
+        for (Node node: element.childNodes) {
+            if (node instanceof TextNode) {
+                TextNode tn = (TextNode) node;
+                if (!tn.isBlank())
+                    toMove.add(tn);
+            }
+        }
+
+        // walk backwards because each node is prepended: the last one moved ends up first in the body
+        for (int i = toMove.size()-1; i >= 0; i--) {
+            Node node = toMove.get(i);
+            element.removeChild(node);
+            // a single space is placed after each moved node to keep it apart from what follows
+            body().prependChild(new TextNode(" ", ""));
+            body().prependChild(node);
+        }
+    }
+
+    // merge multiple <head> or <body> contents into one, delete the remainder, and ensure they are owned by <html>
+    private void normaliseStructure(String tag, Element htmlEl) {
+        Elements elements = this.getElementsByTag(tag);
+        Element master = elements.first(); // will always be available as created above if not existent
+        if (elements.size() > 1) { // dupes, move contents to master
+            // gather the children of every duplicate before re-parenting any of them
+            List<Node> toMove = new ArrayList<Node>();
+            for (int i = 1; i < elements.size(); i++) {
+                Node dupe = elements.get(i);
+                for (Node node : dupe.childNodes)
+                    toMove.add(node);
+                dupe.remove();
+            }
+
+            for (Node dupe : toMove)
+                master.appendChild(dupe);
+        }
+        // ensure parented by <html>
+        if (!master.parent().equals(htmlEl)) {
+            htmlEl.appendChild(master); // includes remove()            
+        }
+    }
+
+    // fast method to get first by tag name, used for html, head, body finders.
+    // depth-first search that tests the node itself before its children; returns null when nothing matches.
+    private Element findFirstElementByTagName(String tag, Node node) {
+        if (node.nodeName().equals(tag))
+            return (Element) node; // assumes only Element nodes report a plain tag name -- TODO confirm
+        else {
+            for (Node child: node.childNodes) {
+                Element found = findFirstElementByTagName(tag, child);
+                if (found != null)
+                    return found;
+            }
+        }
+        return null;
+    }
+
+    /**
+     Get the HTML of the document. Delegates to the inner HTML of the root, so the root itself is not written.
+     @return document HTML
+     */
+    @Override
+    public String outerHtml() {
+        return super.html(); // no outer wrapper tag
+    }
+
+    /**
+     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
+     NOTE(review): {@link #body()} returns null for a document without a body element, in which case this
+     call would fail -- confirm intended usage.
+     @param text unencoded text
+     @return this document
+     */
+    @Override
+    public Element text(String text) {
+        body().text(text); // overridden to not nuke doc structure
+        return this;
+    }
+
+    /**
+     The node name of a document is always {@code #document}.
+     */
+    @Override
+    public String nodeName() {
+        return "#document";
+    }
+
+    /**
+     Clone the document; the clone receives its own copy of the output settings.
+     */
+    @Override
+    public Document clone() {
+        Document clone = (Document) super.clone();
+        clone.outputSettings = this.outputSettings.clone();
+        return clone;
+    }
+
+    /**
+     * A Document's output settings control the form of the text() and html() methods.
+     * Defaults: base escape mode, UTF-8 charset, pretty printing on, indent of one space per level.
+     */
+    public static class OutputSettings implements Cloneable {
+        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
+        private Charset charset = Charset.forName("UTF-8");
+        // kept in step with charset: replaced whenever the charset is changed
+        private CharsetEncoder charsetEncoder = charset.newEncoder();
+        private boolean prettyPrint = true;
+        private int indentAmount = 1;
+
+        public OutputSettings() {}
+
+        /**
+         * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
+         * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
+         * which uses the complete set of HTML named entities.
+         * <p>
+         * The default escape mode is <code>base</code>.
+         * @return the document's current escape mode
+         */
+        public Entities.EscapeMode escapeMode() {
+            return escapeMode;
+        }
+
+        /**
+         * Set the document's escape mode
+         * @param escapeMode the new escape mode to use
+         * @return the document's output settings, for chaining
+         */
+        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
+            this.escapeMode = escapeMode;
+            return this;
+        }
+
+        /**
+         * Get the document's current output charset, which is used to control which characters are escaped when
+         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
+         * <p>
+         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
+         * input charset. Otherwise, it defaults to UTF-8.
+         * @return the document's current charset.
+         */
+        public Charset charset() {
+            return charset;
+        }
+
+        /**
+         * Update the document's output charset. A fresh encoder for the new charset is created as well.
+         * @param charset the new charset to use.
+         * @return the document's output settings, for chaining
+         */
+        public OutputSettings charset(Charset charset) {
+            // todo: this should probably update the doc's meta charset
+            this.charset = charset;
+            charsetEncoder = charset.newEncoder();
+            return this;
+        }
+
+        /**
+         * Update the document's output charset.
+         * @param charset the new charset (by name) to use.
+         * @return the document's output settings, for chaining
+         */
+        public OutputSettings charset(String charset) {
+            charset(Charset.forName(charset));
+            return this;
+        }
+
+        // the encoder that belongs to the current charset
+        CharsetEncoder encoder() {
+            return charsetEncoder;
+        }
+
+        /**
+         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
+         * the output, and the output will generally look like the input.
+         * @return if pretty printing is enabled.
+         */
+        public boolean prettyPrint() {
+            return prettyPrint;
+        }
+
+        /**
+         * Enable or disable pretty printing.
+         * @param pretty new pretty print setting
+         * @return this, for chaining
+         */
+        public OutputSettings prettyPrint(boolean pretty) {
+            prettyPrint = pretty;
+            return this;
+        }
+
+        /**
+         * Get the current tag indent amount, used when pretty printing.
+         * @return the current indent amount
+         */
+        public int indentAmount() {
+            return indentAmount;
+        }
+
+        /**
+         * Set the indent amount for pretty printing
+         * @param indentAmount number of spaces to use for indenting each level. Must be >= 0.
+         * @return this, for chaining
+         */
+        public OutputSettings indentAmount(int indentAmount) {
+            Validate.isTrue(indentAmount >= 0);
+            this.indentAmount = indentAmount;
+            return this;
+        }
+
+        /**
+         * Copy these settings. The copy gets its own charset encoder rather than sharing this one.
+         */
+        @Override
+        public OutputSettings clone() {
+            OutputSettings clone;
+            try {
+                clone = (OutputSettings) super.clone();
+            } catch (CloneNotSupportedException e) {
+                throw new RuntimeException(e);
+            }
+            clone.charset(charset.name()); // new charset and charset encoder
+            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
+            // indentAmount, prettyPrint are primitives so object.clone() will handle
+            return clone;
+        }
+    }
+
+    /**
+     * Get the document's current output settings.
+     * @return the document's current output settings.
+     */
+    public OutputSettings outputSettings() {
+        return outputSettings;
+    }
+
+    /**
+     * The quirks mode recorded for a document. A new document starts as {@code noQuirks}.
+     */
+    public enum QuirksMode {
+        noQuirks, quirks, limitedQuirks;
+    }
+
+    /**
+     * Get the quirks mode recorded for this document.
+     * @return the current quirks mode
+     */
+    public QuirksMode quirksMode() {
+        return quirksMode;
+    }
+
+    /**
+     * Record the quirks mode for this document.
+     * @param quirksMode the new quirks mode
+     * @return this document, for chaining
+     */
+    public Document quirksMode(QuirksMode quirksMode) {
+        this.quirksMode = quirksMode;
+        return this;
+    }
+}
+
diff --git a/src/org/jsoup/nodes/DocumentType.java b/src/org/jsoup/nodes/DocumentType.java

new file mode 100644 (file)

index 0000000..f8c79f0
--- /dev/null
+++ b/src/org/jsoup/nodes/DocumentType.java
@@ -0,0 +1,46 @@
+package org.jsoup.nodes;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+
+/**
+ * A {@code <!DOCTPYE>} node.
+ */
+public class DocumentType extends Node {
+    // todo: quirk mode from publicId and systemId
+
+    /**
+     * Create a new doctype element.
+     * @param name the doctype's name
+     * @param publicId the doctype's public ID
+     * @param systemId the doctype's system ID
+     * @param baseUri the doctype's base URI
+     */
+    public DocumentType(String name, String publicId, String systemId, String baseUri) {
+        super(baseUri);
+
+        Validate.notEmpty(name);
+        attr("name", name);
+        attr("publicId", publicId);
+        attr("systemId", systemId);
+    }
+
+    @Override
+    public String nodeName() {
+        return "#doctype";
+    }
+
+    @Override
+    void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
+        accum.append("<!DOCTYPE ").append(attr("name"));
+        if (!StringUtil.isBlank(attr("publicId")))
+            accum.append(" PUBLIC \"").append(attr("publicId")).append("\"");
+        if (!StringUtil.isBlank(attr("systemId")))
+            accum.append(" \"").append(attr("systemId")).append("\"");
+        accum.append('>');
+    }
+
+    @Override
+    void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {
+    }
+}
diff --git a/src/org/jsoup/nodes/Element.java b/src/org/jsoup/nodes/Element.java

new file mode 100644 (file)

index 0000000..5c1894c
--- /dev/null
+++ b/src/org/jsoup/nodes/Element.java
@@ -0,0 +1,1119 @@
+package org.jsoup.nodes;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.Tag;
+import org.jsoup.select.Collector;
+import org.jsoup.select.Elements;
+import org.jsoup.select.Evaluator;
+import org.jsoup.select.Selector;
+
+import java.util.*;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * A HTML element consists of a tag name, attributes, and child nodes (including text nodes and
+ * other elements).
+ * 
+ * From an Element, you can extract data, traverse the node graph, and manipulate the HTML.
+ * 
+ * @author Jonathan Hedley, jonathan@hedley.net
+ */
+public class Element extends Node {
+    private Tag tag;
+    private Set<String> classNames;
+    
+    /**
+     * Create a new, standalone Element. (Standalone in that it has no parent.)
+     * 
+     * @param tag tag of this element; must not be null
+     * @param baseUri the base URI
+     * @param attributes initial attributes
+     * @see #appendChild(Node)
+     * @see #appendElement(String)
+     */
+    public Element(Tag tag, String baseUri, Attributes attributes) {
+        super(baseUri, attributes);
+        
+        // a null tag is rejected through Validate.notNull before it is stored
+        Validate.notNull(tag);    
+        this.tag = tag;
+    }
+    
+    /**
+     * Create a new Element from a tag and a base URI. Delegates to the three-argument constructor, passing a
+     * freshly constructed {@link Attributes} instance.
+     * 
+     * @param tag element tag
+     * @param baseUri the base URI of this element. It is acceptable for the base URI to be an empty
+     *            string, but not null.
+     * @see Tag#valueOf(String)
+     */
+    public Element(Tag tag, String baseUri) {
+        this(tag, baseUri, new Attributes());
+    }
+
+    @Override
+    public String nodeName() {
+        return tag.getName();
+    }
+
+    /**
+     * Get the name of the tag for this element. E.g. {@code div}
+     * 
+     * @return the tag name
+     */
+    public String tagName() {
+        return tag.getName();
+    }
+
+    /**
+     * Change the tag of this element. For example, convert a {@code <span>} to a {@code <div>} with
+     * {@code el.tagName("div");}.
+     *
+     * @param tagName new tag name for this element
+     * @return this element, for chaining
+     */
+    public Element tagName(String tagName) {
+        Validate.notEmpty(tagName, "Tag name must not be empty.");
+        tag = Tag.valueOf(tagName);
+        return this;
+    }
+
+    /**
+     * Get the Tag for this element.
+     * 
+     * @return the tag object
+     */
+    public Tag tag() {
+        return tag;
+    }
+    
+    /**
+     * Test if this element is a block-level element. (E.g. {@code <div> == true} or an inline element
+     * {@code <span> == false}).
+     * <p>
+     * The answer is taken from this element's {@link Tag}.
+     * 
+     * @return true if block, false if not (and thus inline)
+     */
+    public boolean isBlock() {
+        return tag.isBlock();
+    }
+
+    /**
+     * Get the {@code id} attribute of this element.
+     * 
+     * @return The id attribute, if present, or an empty string if not.
+     */
+    public String id() {
+        String id = attr("id");
+        return id == null ? "" : id;
+    }
+
+    /**
+     * Set an attribute value on this element. If this element already has an attribute with the
+     * key, its value is updated; otherwise, a new attribute is added.
+     * 
+     * @return this element
+     */
+    public Element attr(String attributeKey, String attributeValue) {
+        super.attr(attributeKey, attributeValue);
+        return this;
+    }
+
+    /**
+     * Get this element's HTML5 custom data attributes. Each attribute in the element that has a key
+     * starting with "data-" is included in the dataset.
+     * <p>
+     * E.g., the element {@code <div data-package="jsoup" data-language="Java" class="group">...} has the dataset
+     * {@code package=jsoup, language=Java}.
+     * <p>
+     * This map is a filtered view of the element's attribute map. Changes to one map (add, remove, update) are reflected
+     * in the other map.
+     * <p>
+     * You can find elements that have data attributes using the {@code [^data-]} attribute key prefix selector.
+     * @return a map of {@code key=value} custom data attributes.
+     */
+    public Map<String, String> dataset() {
+        return attributes.dataset();
+    }
+
+    @Override
+    public final Element parent() {
+        return (Element) parentNode;
+    }
+
+    /**
+     * Get this element's parent and ancestors, up to the document root.
+     * @return this element's stack of parents, closest first.
+     */
+    public Elements parents() {
+        Elements parents = new Elements();
+        accumulateParents(this, parents);
+        return parents;
+    }
+
+    /**
+     * Walk up from {@code el}, adding each ancestor to {@code parents} (closest first). The walk stops when
+     * there is no further parent, or when the next parent's tag name is {@code #root}; that parent is not added.
+     */
+    private static void accumulateParents(Element el, Elements parents) {
+        Element ancestor = el.parent();
+        while (ancestor != null && !ancestor.tagName().equals("#root")) {
+            parents.add(ancestor);
+            ancestor = ancestor.parent();
+        }
+    }
+
+    /**
+     * Get a child element of this element, by its 0-based index number.
+     * <p/>
+     * Note that an element can have both mixed Nodes and Elements as children. This method inspects
+     * a filtered list of children that are elements, and the index is based on that filtered list.
+     * 
+     * @param index the index number of the element to retrieve
+     * @return the child element at that position of the filtered list. NOTE(review): the lookup is a plain
+     * {@code children().get(index)}, so an out-of-range index presumably raises an
+     * {@code IndexOutOfBoundsException} rather than yielding {@code null} - confirm against {@code Elements}.
+     * @see #childNode(int)
+     */
+    public Element child(int index) {
+        return children().get(index);
+    }
+
+    /**
+     * Get this element's child elements.
+     * <p/>
+     * This is effectively a filter on {@link #childNodes()} to get Element nodes.
+     * @return child elements. If this element has no children, returns an
+     * empty list.
+     * @see #childNodes()
+     */
+    public Elements children() {
+        // create on the fly rather than maintaining two lists. if gets slow, memoize, and mark dirty on change
+        List<Element> elements = new ArrayList<Element>();
+        for (Node node : childNodes) {
+            if (node instanceof Element)
+                elements.add((Element) node);
+        }
+        return new Elements(elements);
+    }
+
+    /**
+     * Get this element's child text nodes. The list is unmodifiable but the text nodes may be manipulated.
+     * <p/>
+     * This is effectively a filter on {@link #childNodes()} to get Text nodes.
+     * @return child text nodes. If this element has no text nodes, returns an
+     * empty list.
+     * <p/>
+     * For example, with the input HTML: {@code <p>One <span>Two</span> Three <br> Four</p>} with the {@code p} element selected:
+     * <ul>
+     *     <li>{@code p.text()} = {@code "One Two Three Four"}</li>
+     *     <li>{@code p.ownText()} = {@code "One Three Four"}</li>
+     *     <li>{@code p.children()} = {@code Elements[<span>, <br>]}</li>
+     *     <li>{@code p.childNodes()} = {@code List<Node>["One ", <span>, " Three ", <br>, " Four"]}</li>
+     *     <li>{@code p.textNodes()} = {@code List<TextNode>["One ", " Three ", " Four"]}</li>
+     * </ul>
+     */
+    public List<TextNode> textNodes() {
+        List<TextNode> textNodes = new ArrayList<TextNode>();
+        for (Node node : childNodes) {
+            if (node instanceof TextNode)
+                textNodes.add((TextNode) node);
+        }
+        return Collections.unmodifiableList(textNodes);
+    }
+
+    /**
+     * Get this element's child data nodes. The list is unmodifiable but the data nodes may be manipulated.
+     * <p/>
+     * This is effectively a filter on {@link #childNodes()} to get Data nodes.
+     * @return child data nodes. If this element has no data nodes, returns an
+     * empty list.
+     * @see #data()
+     */
+    public List<DataNode> dataNodes() {
+        List<DataNode> dataNodes = new ArrayList<DataNode>();
+        for (Node node : childNodes) {
+            if (node instanceof DataNode)
+                dataNodes.add((DataNode) node);
+        }
+        return Collections.unmodifiableList(dataNodes);
+    }
+
+    /**
+     * Find elements that match the {@link Selector} CSS query, with this element as the starting context. Matched elements
+     * may include this element, or any of its children.
+     * <p/>
+     * This method is generally more powerful to use than the DOM-type {@code getElementBy*} methods, because
+     * multiple filters can be combined, e.g.:
+     * <ul>
+     * <li>{@code el.select("a[href]")} - finds links ({@code a} tags with {@code href} attributes)
+     * <li>{@code el.select("a[href*=example.com]")} - finds links pointing to example.com (loosely)
+     * </ul>
+     * <p/>
+     * See the query syntax documentation in {@link org.jsoup.select.Selector}.
+     *
+     * @param cssQuery a {@link Selector} CSS-like query
+     * @return elements that match the query (empty if none match)
+     * @see org.jsoup.select.Selector
+     */
+    public Elements select(String cssQuery) {
+        return Selector.select(cssQuery, this);
+    }
+    
+    /**
+     * Add a node child node to this element.
+     * 
+     * @param child node to add. Must not already have a parent.
+     * @return this element, so that you can add more child nodes or elements.
+     */
+    public Element appendChild(Node child) {
+        Validate.notNull(child);
+        
+        addChildren(child);
+        return this;
+    }
+    
+    /**
+     * Add a node to the start of this element's children.
+     * 
+     * @param child node to add. Must not already have a parent.
+     * @return this element, so that you can add more child nodes or elements.
+     */
+    public Element prependChild(Node child) {
+        Validate.notNull(child);
+        
+        addChildren(0, child);
+        return this;
+    }
+    
+    /**
+     * Create a new element by tag name, and add it as the last child.
+     * 
+     * @param tagName the name of the tag (e.g. {@code div}).
+     * @return the new element, to allow you to add content to it, e.g.:
+     *  {@code parent.appendElement("h1").attr("id", "header").text("Welcome");}
+     */
+    public Element appendElement(String tagName) {
+        Element child = new Element(Tag.valueOf(tagName), baseUri());
+        appendChild(child);
+        return child;
+    }
+    
+    /**
+     * Create a new element by tag name, and add it as the first child.
+     * 
+     * @param tagName the name of the tag (e.g. {@code div}).
+     * @return the new element, to allow you to add content to it, e.g.:
+     *  {@code parent.prependElement("h1").attr("id", "header").text("Welcome");}
+     */
+    public Element prependElement(String tagName) {
+        Element child = new Element(Tag.valueOf(tagName), baseUri());
+        prependChild(child);
+        return child;
+    }
+    
+    /**
+     * Create and append a new TextNode to this element.
+     * 
+     * @param text the unencoded text to add
+     * @return this element
+     */
+    public Element appendText(String text) {
+        TextNode node = new TextNode(text, baseUri());
+        appendChild(node);
+        return this;
+    }
+    
+    /**
+     * Create and prepend a new TextNode to this element.
+     * 
+     * @param text the unencoded text to add
+     * @return this element
+     */
+    public Element prependText(String text) {
+        TextNode node = new TextNode(text, baseUri());
+        prependChild(node);
+        return this;
+    }
+    
+    /**
+     * Add inner HTML to this element. The supplied HTML will be parsed, and each node appended to the end of the children.
+     * @param html HTML to add inside this element, after the existing HTML
+     * @return this element
+     * @see #html(String)
+     */
+    public Element append(String html) {
+        Validate.notNull(html);
+
+        List<Node> nodes = Parser.parseFragment(html, this, baseUri());
+        addChildren(nodes.toArray(new Node[nodes.size()]));
+        return this;
+    }
+    
+    /**
+     * Add inner HTML into this element. The supplied HTML will be parsed, and each node prepended to the start of the element's children.
+     * @param html HTML to add inside this element, before the existing HTML
+     * @return this element
+     * @see #html(String)
+     */
+    public Element prepend(String html) {
+        Validate.notNull(html);
+        
+        List<Node> nodes = Parser.parseFragment(html, this, baseUri());
+        addChildren(0, nodes.toArray(new Node[nodes.size()]));
+        return this;
+    }
+
+    /**
+     * Insert the specified HTML into the DOM before this element (i.e. as a preceding sibling).
+     *
+     * @param html HTML to add before this element
+     * @return this element, for chaining
+     * @see #after(String)
+     */
+    @Override
+    public Element before(String html) {
+        return (Element) super.before(html);
+    }
+
+    /**
+     * Insert the specified node into the DOM before this node (i.e. as a preceding sibling).
+     * @param node to add before this element
+     * @return this Element, for chaining
+     * @see #after(Node)
+     */
+    @Override
+    public Element before(Node node) {
+        return (Element) super.before(node);
+    }
+
+    /**
+     * Insert the specified HTML into the DOM after this element (i.e. as a following sibling).
+     *
+     * @param html HTML to add after this element
+     * @return this element, for chaining
+     * @see #before(String)
+     */
+    @Override
+    public Element after(String html) {
+        return (Element) super.after(html);
+    }
+
+    /**
+     * Insert the specified node into the DOM after this node (i.e. as a following sibling).
+     * @param node to add after this element
+     * @return this element, for chaining
+     * @see #before(Node)
+     */
+    @Override
+    public Element after(Node node) {
+        return (Element) super.after(node);
+    }
+
+    /**
+     * Remove all of the element's child nodes. Any attributes are left as-is.
+     * @return this element
+     */
+    public Element empty() {
+        childNodes.clear();
+        return this;
+    }
+
+    /**
+     * Wrap the supplied HTML around this element.
+     *
+     * @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep.
+     * @return this element, for chaining.
+     */
+    @Override
+    public Element wrap(String html) {
+        return (Element) super.wrap(html);
+    }
+
+    /**
+     * Get sibling elements. If the element has no sibling elements, returns an empty list. An element is not a sibling
+     * of itself, so will not be included in the returned list.
+     * @return sibling elements
+     */
+    public Elements siblingElements() {
+        if (parentNode == null)
+            return new Elements(0);
+
+        List<Element> elements = parent().children();
+        Elements siblings = new Elements(elements.size() - 1);
+        for (Element el: elements)
+            if (el != this)
+                siblings.add(el);
+        return siblings;
+    }
+
+    /**
+     * Gets the next sibling element of this element. E.g., if a {@code div} contains two {@code p}s, 
+     * the {@code nextElementSibling} of the first {@code p} is the second {@code p}.
+     * <p/>
+     * This is similar to {@link #nextSibling()}, but specifically finds only Elements
+     * @return the next element, or null if there is no next element
+     * @see #previousElementSibling()
+     */
+    public Element nextElementSibling() {
+        if (parentNode == null) return null;
+        List<Element> siblings = parent().children();
+        Integer index = indexInList(this, siblings);
+        Validate.notNull(index);
+        if (siblings.size() > index+1)
+            return siblings.get(index+1);
+        else
+            return null;
+    }
+
+    /**
+     * Gets the previous element sibling of this element.
+     * @return the previous element, or null if there is no previous element
+     * @see #nextElementSibling()
+     */
+    public Element previousElementSibling() {
+        if (parentNode == null) return null;
+        List<Element> siblings = parent().children();
+        Integer index = indexInList(this, siblings);
+        Validate.notNull(index);
+        if (index > 0)
+            return siblings.get(index-1);
+        else
+            return null;
+    }
+
+    /**
+     * Gets the first element sibling of this element.
+     * @return the first sibling that is an element (aka the parent's first element child) 
+     */
+    public Element firstElementSibling() {
+        // todo: should firstSibling() exclude this?
+        List<Element> siblings = parent().children();
+        return siblings.size() > 1 ? siblings.get(0) : null;
+    }
+    
+    /**
+     * Get the list index of this element in its element sibling list. I.e. if this is the first element
+     * sibling, returns 0.
+     * @return position in element sibling list
+     */
+    public Integer elementSiblingIndex() {
+       if (parent() == null) return 0;
+       return indexInList(this, parent().children()); 
+    }
+
+    /**
+     * Gets the last element sibling of this element
+     * @return the last sibling that is an element (aka the parent's last element child) 
+     */
+    public Element lastElementSibling() {
+        List<Element> siblings = parent().children();
+        return siblings.size() > 1 ? siblings.get(siblings.size() - 1) : null;
+    }
+    
+    /**
+     * Find the position of the first list entry that {@code equals} the searched element.
+     * @param search element to look for; must not be null
+     * @param elements list to scan; must not be null
+     * @return the 0-based position of the first match, or {@code null} when nothing matches
+     */
+    private static <E extends Element> Integer indexInList(Element search, List<E> elements) {
+        Validate.notNull(search);
+        Validate.notNull(elements);
+
+        int position = 0;
+        for (E candidate : elements) {
+            if (candidate.equals(search))
+                return position;
+            position++;
+        }
+        return null;
+    }
+
+    // DOM type methods
+
+    /**
+     * Finds elements, including and recursively under this element, with the specified tag name.
+     * @param tagName The tag name to search for (case insensitively).
+     * @return a matching unmodifiable list of elements. Will be empty if this element and none of its children match.
+     */
+    public Elements getElementsByTag(String tagName) {
+        Validate.notEmpty(tagName);
+        tagName = tagName.toLowerCase().trim();
+
+        return Collector.collect(new Evaluator.Tag(tagName), this);
+    }
+
+    /**
+     * Find an element by ID, including or under this element.
+     * <p>
+     * Note that this finds the first matching ID, starting with this element. If you search down from a different
+     * starting point, it is possible to find a different element by ID. For unique element by ID within a Document,
+     * use {@link Document#getElementById(String)}
+     * @param id The ID to search for.
+     * @return The first matching element by ID, starting with this element, or null if none found.
+     */
+    public Element getElementById(String id) {
+        Validate.notEmpty(id);
+        
+        Elements elements = Collector.collect(new Evaluator.Id(id), this);
+        if (elements.size() > 0)
+            return elements.get(0);
+        else
+            return null;
+    }
+
+    /**
+     * Find elements that have this class, including or under this element. Case insensitive.
+     * <p>
+     * Elements can have multiple classes (e.g. {@code <div class="header round first">}. This method
+     * checks each class, so you can find the above with {@code el.getElementsByClass("header");}.
+     * 
+     * @param className the name of the class to search for.
+     * @return elements with the supplied class name, empty if none
+     * @see #hasClass(String)
+     * @see #classNames()
+     */
+    public Elements getElementsByClass(String className) {
+        Validate.notEmpty(className);
+
+        return Collector.collect(new Evaluator.Class(className), this);
+    }
+
+    /**
+     * Find elements that have a named attribute set. Case insensitive.
+     *
+     * @param key name of the attribute, e.g. {@code href}
+     * @return elements that have this attribute, empty if none
+     */
+    public Elements getElementsByAttribute(String key) {
+        Validate.notEmpty(key);
+        key = key.trim().toLowerCase();
+
+        return Collector.collect(new Evaluator.Attribute(key), this);
+    }
+
+    /**
+     * Find elements that have an attribute name starting with the supplied prefix. Use {@code data-} to find elements
+     * that have HTML5 datasets.
+     * @param keyPrefix name prefix of the attribute e.g. {@code data-}
+     * @return elements that have attribute names that start with with the prefix, empty if none.
+     */
+    public Elements getElementsByAttributeStarting(String keyPrefix) {
+        Validate.notEmpty(keyPrefix);
+        keyPrefix = keyPrefix.trim().toLowerCase();
+
+        return Collector.collect(new Evaluator.AttributeStarting(keyPrefix), this);
+    }
+
+    /**
+     * Find elements that have an attribute with the specific value. Case insensitive.
+     * 
+     * @param key name of the attribute
+     * @param value value of the attribute
+     * @return elements that have this attribute with this value, empty if none
+     */
+    public Elements getElementsByAttributeValue(String key, String value) {
+        return Collector.collect(new Evaluator.AttributeWithValue(key, value), this);
+    }
+
+    /**
+     * Find elements that either do not have this attribute, or have it with a different value. Case insensitive.
+     * 
+     * @param key name of the attribute
+     * @param value value of the attribute
+     * @return elements that do not have a matching attribute
+     */
+    public Elements getElementsByAttributeValueNot(String key, String value) {
+        return Collector.collect(new Evaluator.AttributeWithValueNot(key, value), this);
+    }
+
+    /**
+     * Find elements that have attributes that start with the value prefix. Case insensitive.
+     * 
+     * @param key name of the attribute
+     * @param valuePrefix start of attribute value
+     * @return elements that have attributes that start with the value prefix
+     */
+    public Elements getElementsByAttributeValueStarting(String key, String valuePrefix) {
+        return Collector.collect(new Evaluator.AttributeWithValueStarting(key, valuePrefix), this);
+    }
+
+    /**
+     * Find elements that have attributes that end with the value suffix. Case insensitive.
+     * 
+     * @param key name of the attribute
+     * @param valueSuffix end of the attribute value
+     * @return elements that have attributes that end with the value suffix
+     */
+    public Elements getElementsByAttributeValueEnding(String key, String valueSuffix) {
+        return Collector.collect(new Evaluator.AttributeWithValueEnding(key, valueSuffix), this);
+    }
+
+    /**
+     * Find elements that have attributes whose value contains the match string. Case insensitive.
+     * 
+     * @param key name of the attribute
+     * @param match substring of value to search for
+     * @return elements that have attributes containing this text
+     */
+    public Elements getElementsByAttributeValueContaining(String key, String match) {
+        return Collector.collect(new Evaluator.AttributeWithValueContaining(key, match), this);
+    }
+    
+    /**
+     * Find elements that have attributes whose values match the supplied regular expression.
+     * @param key name of the attribute
+     * @param pattern compiled regular expression to match against attribute values
+     * @return elements that have attributes matching this regular expression
+     */
+    public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) {
+        return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);
+        
+    }
+    
+    /**
+     * Find elements that have attributes whose values match the supplied regular expression.
+     * @param key name of the attribute
+     * @param regex regular expression to match against attribute values. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options.
+     * @return elements that have attributes matching this regular expression
+     */
+    public Elements getElementsByAttributeValueMatching(String key, String regex) {
+        Pattern pattern;
+        try {
+            pattern = Pattern.compile(regex);
+        } catch (PatternSyntaxException e) {
+            throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
+        }
+        return getElementsByAttributeValueMatching(key, pattern);
+    }
+    
+    /**
+     * Find elements whose sibling index is less than the supplied index.
+     * @param index 0-based index
+     * @return elements less than index
+     */
+    public Elements getElementsByIndexLessThan(int index) {
+        return Collector.collect(new Evaluator.IndexLessThan(index), this);
+    }
+    
+    /**
+     * Find elements whose sibling index is greater than the supplied index.
+     * @param index 0-based index
+     * @return elements greater than index
+     */
+    public Elements getElementsByIndexGreaterThan(int index) {
+        return Collector.collect(new Evaluator.IndexGreaterThan(index), this);
+    }
+    
+    /**
+     * Find elements whose sibling index is equal to the supplied index.
+     * @param index 0-based index
+     * @return elements equal to index
+     */
+    public Elements getElementsByIndexEquals(int index) {
+        return Collector.collect(new Evaluator.IndexEquals(index), this);
+    }
+    
+    /**
+     * Find elements that contain the specified string. The search is case insensitive. The text may appear directly
+     * in the element, or in any of its descendants.
+     * @param searchText to look for in the element's text
+     * @return elements that contain the string, case insensitive.
+     * @see Element#text()
+     */
+    public Elements getElementsContainingText(String searchText) {
+        return Collector.collect(new Evaluator.ContainsText(searchText), this);
+    }
+    
+    /**
+     * Find elements that directly contain the specified string. The search is case insensitive. The text must appear directly
+     * in the element, not in any of its descendants.
+     * @param searchText to look for in the element's own text
+     * @return elements that contain the string, case insensitive.
+     * @see Element#ownText()
+     */
+    public Elements getElementsContainingOwnText(String searchText) {
+        return Collector.collect(new Evaluator.ContainsOwnText(searchText), this);
+    }
+    
+    /**
+     * Find elements whose text matches the supplied regular expression.
+     * @param pattern regular expression to match text against
+     * @return elements matching the supplied regular expression.
+     * @see Element#text()
+     */
+    public Elements getElementsMatchingText(Pattern pattern) {
+        return Collector.collect(new Evaluator.Matches(pattern), this);
+    }
+    
+    /**
+     * Find elements whose text matches the supplied regular expression.
+     * @param regex regular expression to match text against. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options.
+     * @return elements matching the supplied regular expression.
+     * @see Element#text()
+     */
+    public Elements getElementsMatchingText(String regex) {
+        Pattern pattern;
+        try {
+            pattern = Pattern.compile(regex);
+        } catch (PatternSyntaxException e) {
+            throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
+        }
+        return getElementsMatchingText(pattern);
+    }
+    
+    /**
+     * Find elements whose own text matches the supplied regular expression.
+     * @param pattern regular expression to match text against
+     * @return elements matching the supplied regular expression.
+     * @see Element#ownText()
+     */
+    public Elements getElementsMatchingOwnText(Pattern pattern) {
+        return Collector.collect(new Evaluator.MatchesOwn(pattern), this);
+    }
+    
+    /**
+     * Find elements whose text matches the supplied regular expression.
+     * @param regex regular expression to match text against. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options.
+     * @return elements matching the supplied regular expression.
+     * @see Element#ownText()
+     */
+    public Elements getElementsMatchingOwnText(String regex) {
+        Pattern pattern;
+        try {
+            pattern = Pattern.compile(regex);
+        } catch (PatternSyntaxException e) {
+            throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
+        }
+        return getElementsMatchingOwnText(pattern);
+    }
+    
+    /**
+     * Find all elements under this element (including self, and children of children).
+     * 
+     * @return all elements
+     */
+    public Elements getAllElements() {
+        return Collector.collect(new Evaluator.AllElements(), this);
+    }
+
+    /**
+     * Gets the combined text of this element and all its children.
+     * <p>
+     * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, {@code p.text()} returns {@code "Hello there now!"}
+     *
+     * @return unencoded text, or empty string if none.
+     * @see #ownText()
+     * @see #textNodes()
+     */
+    public String text() {
+        StringBuilder sb = new StringBuilder();
+        text(sb);
+        return sb.toString().trim();
+    }
+
+    private void text(StringBuilder accum) {
+        appendWhitespaceIfBr(this, accum);
+        
+        for (Node child : childNodes) {
+            if (child instanceof TextNode) {
+                TextNode textNode = (TextNode) child;
+                appendNormalisedText(accum, textNode);
+            } else if (child instanceof Element) {
+                Element element = (Element) child;
+                if (accum.length() > 0 && element.isBlock() && !TextNode.lastCharIsWhitespace(accum))
+                    accum.append(" ");
+                element.text(accum);
+            }
+        }
+    }
+
+    /**
+     * Gets the text owned by this element only; does not get the combined text of all children.
+     * <p>
+     * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, {@code p.ownText()} returns {@code "Hello now!"},
+     * whereas {@code p.text()} returns {@code "Hello there now!"}.
+     * Note that the text within the {@code b} element is not returned, as it is not a direct child of the {@code p} element.
+     *
+     * @return unencoded text, or empty string if none.
+     * @see #text()
+     * @see #textNodes()
+     */
+    public String ownText() {
+        StringBuilder sb = new StringBuilder();
+        ownText(sb);
+        return sb.toString().trim();
+    }
+
+    private void ownText(StringBuilder accum) {
+        for (Node child : childNodes) {
+            if (child instanceof TextNode) {
+                TextNode textNode = (TextNode) child;
+                appendNormalisedText(accum, textNode);
+            } else if (child instanceof Element) {
+                appendWhitespaceIfBr((Element) child, accum);
+            }
+        }
+    }
+
+    // Appends the text node's content to accum. Unless whitespace is preserved for this element, the text is
+    // whitespace-normalised first, and its leading whitespace is dropped when accum already ends in whitespace.
+    private void appendNormalisedText(StringBuilder accum, TextNode textNode) {
+        String raw = textNode.getWholeText();
+
+        if (preserveWhitespace()) {
+            accum.append(raw);
+            return;
+        }
+
+        String normalised = TextNode.normaliseWhitespace(raw);
+        if (TextNode.lastCharIsWhitespace(accum))
+            normalised = TextNode.stripLeadingWhitespace(normalised);
+        accum.append(normalised);
+    }
+
+    // Appends a single space for a br element, unless accum already ends in whitespace.
+    private static void appendWhitespaceIfBr(Element element, StringBuilder accum) {
+        boolean isLineBreak = element.tag.getName().equals("br");
+        if (isLineBreak && !TextNode.lastCharIsWhitespace(accum)) {
+            accum.append(" ");
+        }
+    }
+
+    // True when this element's tag preserves whitespace, or when any ancestor's does (checked recursively).
+    boolean preserveWhitespace() {
+        if (tag.preserveWhitespace())
+            return true;
+        if (parent() == null)
+            return false;
+        return parent().preserveWhitespace();
+    }
+
+    /**
+     * Set the text of this element. Any existing contents (text or elements) will be cleared
+     * @param text unencoded text
+     * @return this element
+     */
+    public Element text(String text) {
+        Validate.notNull(text);
+
+        empty();
+        TextNode textNode = new TextNode(text, baseUri);
+        appendChild(textNode);
+
+        return this;
+    }
+
+    /**
+     Test if this element has any text content (that is not just whitespace).
+     @return true if element has non-blank text content.
+     */
+    public boolean hasText() {
+        for (Node child: childNodes) {
+            if (child instanceof TextNode) {
+                TextNode textNode = (TextNode) child;
+                if (!textNode.isBlank())
+                    return true;
+            } else if (child instanceof Element) {
+                Element el = (Element) child;
+                if (el.hasText())
+                    return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Get the combined data of this element. Data is e.g. the inside of a {@code script} tag.
+     * @return the data, or empty string if none
+     *
+     * @see #dataNodes()
+     */
+    public String data() {
+        StringBuilder sb = new StringBuilder();
+
+        for (Node childNode : childNodes) {
+            if (childNode instanceof DataNode) {
+                DataNode data = (DataNode) childNode;
+                sb.append(data.getWholeData());
+            } else if (childNode instanceof Element) {
+                Element element = (Element) childNode;
+                String elementData = element.data();
+                sb.append(elementData);
+            }
+        }
+        return sb.toString();
+    }   
+
+    /**
+     * Gets the literal value of this element's "class" attribute, which may include multiple class names, space
+     * separated. (E.g. on <code>&lt;div class="header gray"&gt;</code> returns, "<code>header gray</code>")
+     * <p>
+     * This is a shortcut for {@code attr("class")}.
+     * @return The literal class attribute, or <b>empty string</b> if no class attribute set.
+     */
+    public String className() {
+        return attr("class");
+    }
+
+    /**
+     * Get all of the element's class names. E.g. on element {@code <div class="header gray"}>},
+     * returns a set of two elements {@code "header", "gray"}. Note that modifications to this set are not pushed to
+     * the backing {@code class} attribute; use the {@link #classNames(java.util.Set)} method to persist them.
+     * @return set of classnames, empty if no class attribute
+     */
+    public Set<String> classNames() {
+        if (classNames == null) {
+            String[] names = className().split("\\s+");
+            classNames = new LinkedHashSet<String>(Arrays.asList(names));
+        }
+        return classNames;
+    }
+
+    /**
+     Set the element's {@code class} attribute to the supplied class names.
+     @param classNames set of classes
+     @return this element, for chaining
+     */
+    public Element classNames(Set<String> classNames) {
+        Validate.notNull(classNames);
+        attributes.put("class", StringUtil.join(classNames, " "));
+        return this;
+    }
+
+    /**
+     * Tests if this element has a class. Case insensitive.
+     * @param className name of class to check for
+     * @return true if it does, false if not
+     */
+    public boolean hasClass(String className) {
+        Set<String> classNames = classNames();
+        for (String name : classNames) {
+            if (className.equalsIgnoreCase(name))
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     Add a class name to this element's {@code class} attribute.
+     @param className class name to add
+     @return this element
+     */
+    public Element addClass(String className) {
+        Validate.notNull(className);
+
+        Set<String> classes = classNames();
+        classes.add(className);
+        classNames(classes);
+
+        return this;
+    }
+
+    /**
+     Remove a class name from this element's {@code class} attribute.
+     @param className class name to remove
+     @return this element
+     */
+    public Element removeClass(String className) {
+        Validate.notNull(className);
+
+        Set<String> classes = classNames();
+        classes.remove(className);
+        classNames(classes);
+
+        return this;
+    }
+
+    /**
+     Toggle a class name on this element's {@code class} attribute: if present, remove it; otherwise add it.
+     @param className class name to toggle
+     @return this element
+     */
+    public Element toggleClass(String className) {
+        Validate.notNull(className);
+
+        Set<String> classes = classNames();
+        if (classes.contains(className))
+            classes.remove(className);
+        else
+            classes.add(className);
+        classNames(classes);
+
+        return this;
+    }
+    
+    /**
+     * Get the value of a form element (input, textarea, etc).
+     * @return the value of the form element, or empty string if not set.
+     */
+    public String val() {
+        if (tagName().equals("textarea"))
+            return text();
+        else
+            return attr("value");
+    }
+    
+    /**
+     * Set the value of a form element (input, textarea, etc).
+     * @param value value to set
+     * @return this element (for chaining)
+     */
+    public Element val(String value) {
+        if (tagName().equals("textarea"))
+            text(value);
+        else
+            attr("value", value);
+        return this;
+    }
+
+    // Writes the opening tag with its attributes. An element with no children whose tag is self closing is written
+    // as "<tag />"; everything else as "<tag>".
+    void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
+        if (accum.length() > 0 && out.prettyPrint()) {
+            // indent only when this tag, or its parent's tag, is formatted as a block
+            boolean blockContext = tag.formatAsBlock() || (parent() != null && parent().tag().formatAsBlock());
+            if (blockContext)
+                indent(accum, depth, out);
+        }
+
+        accum.append("<");
+        accum.append(tagName());
+        attributes.html(accum, out);
+
+        boolean emptySelfClosing = childNodes.isEmpty() && tag.isSelfClosing();
+        accum.append(emptySelfClosing ? " />" : ">");
+    }
+
+    // Writes the closing tag, unless the element has no children and its tag is self closing (nothing to close).
+    void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {
+        if (childNodes.isEmpty() && tag.isSelfClosing())
+            return;
+
+        boolean indentClose = out.prettyPrint() && !childNodes.isEmpty() && tag.formatAsBlock();
+        if (indentClose)
+            indent(accum, depth, out);
+        accum.append("</");
+        accum.append(tagName());
+        accum.append(">");
+    }
+
+    /**
+     * Retrieves the element's inner HTML. E.g. on a {@code <div>} with one empty {@code <p>}, would return
+     * {@code <p></p>}. (Whereas {@link #outerHtml()} would return {@code <div><p></p></div>}.)
+     * 
+     * @return String of HTML.
+     * @see #outerHtml()
+     */
+    public String html() {
+        StringBuilder accum = new StringBuilder();
+        html(accum); 
+        return accum.toString().trim();
+    }
+
+    private void html(StringBuilder accum) {
+        for (Node node : childNodes)
+            node.outerHtml(accum);
+    }
+    
+    /**
+     * Set this element's inner HTML. Clears the existing HTML first, then hands the markup to
+     * {@link #append(String)} to be parsed and added.
+     * @param html HTML to parse and set into this element
+     * @return this element
+     * @see #append(String)
+     */
+    public Element html(String html) {
+        empty();
+        append(html);
+        return this;
+    }
+
+    public String toString() {
+        return outerHtml();
+    }
+
+    /**
+     * Identity comparison: an element is only equal to itself, never to another element with the same content.
+     * @param o object to compare against
+     * @return true only if {@code o} is this very instance
+     */
+    @Override
+    public boolean equals(Object o) {
+        return this == o;
+    }
+
+    @Override
+    public int hashCode() {
+        // todo: fixup, not very useful
+        int result = super.hashCode();
+        result = 31 * result + (tag != null ? tag.hashCode() : 0);
+        return result;
+    }
+
+    @Override
+    public Element clone() {
+        Element clone = (Element) super.clone();
+        clone.classNames(); // creates linked set of class names from class attribute
+        return clone;
+    }
+}
diff --git a/src/org/jsoup/nodes/Entities.java b/src/org/jsoup/nodes/Entities.java

new file mode 100644 (file)

index 0000000..0ae83e1
--- /dev/null
+++ b/src/org/jsoup/nodes/Entities.java
@@ -0,0 +1,184 @@
+package org.jsoup.nodes;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.CharsetEncoder;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * HTML entities, and escape routines.
+ * Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML
+ * named character references</a>.
+ */
+public class Entities {
+    public enum EscapeMode {
+        /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */
+        xhtml(xhtmlByVal),
+        /** Default HTML output entities. */
+        base(baseByVal),
+        /** Complete HTML entities. */
+        extended(fullByVal);
+
+        private Map<Character, String> map;
+
+        EscapeMode(Map<Character, String> map) {
+            this.map = map;
+        }
+
+        public Map<Character, String> getMap() {
+            return map;
+        }
+    }
+
+    private static final Map<String, Character> full;
+    private static final Map<Character, String> xhtmlByVal;
+    private static final Map<Character, String> baseByVal;
+    private static final Map<Character, String> fullByVal;
+    private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?");
+    private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);");
+
+    private Entities() {}
+
+    /**
+     * Check if the input is a known named entity
+     * @param name the possible entity name (e.g. "lt" or "amp"
+     * @return true if a known named entity
+     */
+    public static boolean isNamedEntity(String name) {
+        return full.containsKey(name);
+    }
+
+    /**
+     * Get the Character value of the named entity
+     * @param name named entity (e.g. "lt" or "amp")
+     * @return the Character value of the named entity (e.g. '<' or '&')
+     */
+    public static Character getCharacterByName(String name) {
+        return full.get(name);
+    }
+    
+    static String escape(String string, Document.OutputSettings out) {
+        return escape(string, out.encoder(), out.escapeMode());
+    }
+
+    static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) {
+        StringBuilder accum = new StringBuilder(string.length() * 2);
+        Map<Character, String> map = escapeMode.getMap();
+
+        for (int pos = 0; pos < string.length(); pos++) {
+            Character c = string.charAt(pos);
+            if (map.containsKey(c))
+                accum.append('&').append(map.get(c)).append(';');
+            else if (encoder.canEncode(c))
+                accum.append(c.charValue());
+            else
+                accum.append("&#").append((int) c).append(';');
+        }
+
+        return accum.toString();
+    }
+
+    static String unescape(String string) {
+        return unescape(string, false);
+    }
+
+    /**
+     * Unescape the input string.
+     * @param string
+     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
+     * @return
+     */
+    static String unescape(String string, boolean strict) {
+        // todo: change this method to use Tokeniser.consumeCharacterReference
+        if (!string.contains("&"))
+            return string;
+
+        Matcher m = strict? strictUnescapePattern.matcher(string) : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);?
+        StringBuffer accum = new StringBuffer(string.length()); // pity matcher can't use stringbuilder, avoid syncs
+        // todo: replace m.appendReplacement with own impl, so StringBuilder and quoteReplacement not required
+
+        while (m.find()) {
+            int charval = -1;
+            String num = m.group(3);
+            if (num != null) {
+                try {
+                    int base = m.group(2) != null ? 16 : 10; // 2 is hex indicator
+                    charval = Integer.valueOf(num, base);
+                } catch (NumberFormatException e) {
+                } // skip
+            } else {
+                String name = m.group(1);
+                if (full.containsKey(name))
+                    charval = full.get(name);
+            }
+
+            if (charval != -1 || charval > 0xFFFF) { // out of range
+                String c = Character.toString((char) charval);
+                m.appendReplacement(accum, Matcher.quoteReplacement(c));
+            } else {
+                m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace with original string
+            }
+        }
+        m.appendTail(accum);
+        return accum.toString();
+    }
+
+    // xhtml has restricted entities
+    private static final Object[][] xhtmlArray = {
+            {"quot", 0x00022},
+            {"amp", 0x00026},
+            {"apos", 0x00027},
+            {"lt", 0x0003C},
+            {"gt", 0x0003E}
+    };
+
+    static {
+        xhtmlByVal = new HashMap<Character, String>();
+        baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most common / default
+        full = loadEntities("entities-full.properties"); // extended and overblown.
+        fullByVal = toCharacterKey(full);
+
+        for (Object[] entity : xhtmlArray) {
+            Character c = Character.valueOf((char) ((Integer) entity[1]).intValue());
+            xhtmlByVal.put(c, ((String) entity[0]));
+        }
+    }
+
+    private static Map<String, Character> loadEntities(String filename) {
+        Properties properties = new Properties();
+        Map<String, Character> entities = new HashMap<String, Character>();
+        try {
+            InputStream in = Entities.class.getResourceAsStream(filename);
+            properties.load(in);
+            in.close();
+        } catch (IOException e) {
+            throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename);
+        }
+
+        for (Map.Entry entry: properties.entrySet()) {
+            Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16));
+            String name = (String) entry.getKey();
+            entities.put(name, val);
+        }
+        return entities;
+    }
+
+    private static Map<Character, String> toCharacterKey(Map<String, Character> inMap) {
+        Map<Character, String> outMap = new HashMap<Character, String>();
+        for (Map.Entry<String, Character> entry: inMap.entrySet()) {
+            Character character = entry.getValue();
+            String name = entry.getKey();
+
+            if (outMap.containsKey(character)) {
+                // dupe, prefer the lower case version
+                if (name.toLowerCase().equals(name))
+                    outMap.put(character, name);
+            } else {
+                outMap.put(character, name);
+            }
+        }
+        return outMap;
+    }
+}
diff --git a/src/org/jsoup/nodes/Node.java b/src/org/jsoup/nodes/Node.java

new file mode 100644 (file)

index 0000000..eb2b40e
--- /dev/null
+++ b/src/org/jsoup/nodes/Node.java
@@ -0,0 +1,615 @@
+package org.jsoup.nodes;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+import org.jsoup.parser.Parser;
+import org.jsoup.select.NodeTraversor;
+import org.jsoup.select.NodeVisitor;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ The base, abstract Node model. Elements, Documents, Comments etc are all Node instances.
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public abstract class Node implements Cloneable {
+    Node parentNode;
+    List<Node> childNodes;
+    Attributes attributes;
+    String baseUri;
+    int siblingIndex;
+
+    /**
+     Create a new Node with an empty, growable child list.
+     @param baseUri base URI (not null); stored with surrounding whitespace trimmed
+     @param attributes attributes (not null, but may be empty)
+     */
+    protected Node(String baseUri, Attributes attributes) {
+        Validate.notNull(baseUri);
+        Validate.notNull(attributes);
+        
+        childNodes = new ArrayList<Node>(4);
+        this.baseUri = baseUri.trim();
+        this.attributes = attributes;
+    }
+
+    /**
+     Create a new Node with the given base URI and a fresh, empty set of attributes.
+     @param baseUri base URI (not null)
+     */
+    protected Node(String baseUri) {
+        this(baseUri, new Attributes());
+    }
+
+    /**
+     * Default constructor. Doesn't setup base uri, children, or attributes; use with caution.
+     * <p>
+     * The child list is the shared immutable empty list, so children cannot be added to a node built this way,
+     * and {@code attributes} is left null, so attribute accessors that dereference it will fail.
+     */
+    protected Node() {
+        childNodes = Collections.emptyList();
+        attributes = null;
+    }
+
+    /**
+     Get the node name of this node. Use for debugging purposes and not logic switching (for that, use instanceof).
+     @return node name
+     */
+    public abstract String nodeName();
+
+    /**
+     * Get an attribute's value by its key.
+     * <p/>
+     * To get an absolute URL from an attribute that may be a relative URL, prefix the key with <code><b>abs</b></code>,
+     * which is a shortcut to the {@link #absUrl} method.
+     * E.g.: <blockquote><code>String url = a.attr("abs:href");</code></blockquote>
+     * @param attributeKey The attribute key.
+     * @return The attribute, or empty string if not present (to avoid nulls).
+     * @see #attributes()
+     * @see #hasAttr(String)
+     * @see #absUrl(String)
+     */
+    public String attr(String attributeKey) {
+        Validate.notNull(attributeKey);
+
+        if (attributes.hasKey(attributeKey))
+            return attributes.get(attributeKey);
+        else if (attributeKey.toLowerCase().startsWith("abs:"))
+            return absUrl(attributeKey.substring("abs:".length()));
+        else return "";
+    }
+
+    /**
+     * Get all of the element's attributes. The node's own {@code Attributes} instance is returned, not a copy.
+     * @return attributes (which implements iterable, in same order as presented in original HTML).
+     */
+    public Attributes attributes() {
+        return attributes;
+    }
+
+    /**
+     * Set an attribute (key=value). If the attribute already exists, it is replaced.
+     * The key and value are passed straight to {@code attributes.put}; this method does no validation of its own.
+     * @param attributeKey The attribute key.
+     * @param attributeValue The attribute value.
+     * @return this (for chaining)
+     */
+    public Node attr(String attributeKey, String attributeValue) {
+        attributes.put(attributeKey, attributeValue);
+        return this;
+    }
+
+    /**
+     * Test if this element has an attribute.
+     * @param attributeKey The attribute key to check.
+     * @return true if the attribute exists, false if not.
+     */
+    public boolean hasAttr(String attributeKey) {
+        Validate.notNull(attributeKey);
+
+        if (attributeKey.toLowerCase().startsWith("abs:")) {
+            String key = attributeKey.substring("abs:".length());
+            if (attributes.hasKey(key) && !absUrl(key).equals(""))
+                return true;
+        }
+        return attributes.hasKey(attributeKey);
+    }
+
+    /**
+     * Remove an attribute from this element.
+     * @param attributeKey The attribute to remove.
+     * @return this (for chaining)
+     */
+    public Node removeAttr(String attributeKey) {
+        Validate.notNull(attributeKey);
+        attributes.remove(attributeKey);
+        return this;
+    }
+
+    /**
+     Get the base URI of this node.
+     @return base URI
+     */
+    public String baseUri() {
+        return baseUri;
+    }
+
+    /**
+     Update the base URI of this node and all of its descendants.
+     @param baseUri base URI to set
+     */
+    public void setBaseUri(final String baseUri) {
+        Validate.notNull(baseUri);
+
+        traverse(new NodeVisitor() {
+            public void head(Node node, int depth) {
+                node.baseUri = baseUri;
+            }
+
+            public void tail(Node node, int depth) {
+            }
+        });
+    }
+
+    /**
+     * Get an absolute URL from a URL attribute that may be relative (i.e. an <code>&lt;a href></code> or
+     * <code>&lt;img src></code>).
+     * <p/>
+     * E.g.: <code>String absUrl = linkEl.absUrl("href");</code>
+     * <p/>
+     * If the attribute value is already absolute (i.e. it starts with a protocol, like
+     * <code>http://</code> or <code>https://</code> etc), and it successfully parses as a URL, the attribute is
+     * returned directly. Otherwise, it is treated as a URL relative to the element's {@link #baseUri}, and made
+     * absolute using that.
+     * <p/>
+     * As an alternate, you can use the {@link #attr} method with the <code>abs:</code> prefix, e.g.:
+     * <code>String absUrl = linkEl.attr("abs:href");</code>
+     *
+     * @param attributeKey The attribute key
+     * @return An absolute URL if one could be made, or an empty string (not null) if the attribute was missing or
+     * could not be made successfully into a URL.
+     * @see #attr
+     * @see java.net.URL#URL(java.net.URL, String)
+     */
+    public String absUrl(String attributeKey) {
+        Validate.notEmpty(attributeKey);
+
+        String relUrl = attr(attributeKey);
+        if (!hasAttr(attributeKey)) {
+            return ""; // nothing to make absolute with
+        } else {
+            URL base;
+            try {
+                try {
+                    base = new URL(baseUri);
+                } catch (MalformedURLException e) {
+                    // the base is unsuitable, but the attribute may be abs on its own, so try that
+                    URL abs = new URL(relUrl);
+                    return abs.toExternalForm();
+                }
+                // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
+                if (relUrl.startsWith("?"))
+                    relUrl = base.getPath() + relUrl;
+                URL abs = new URL(base, relUrl);
+                return abs.toExternalForm();
+            } catch (MalformedURLException e) {
+                return "";
+            }
+        }
+    }
+
+    /**
+     Get a child node by index. The index is passed straight to the child list, so an index outside
+     {@code 0..size-1} fails with the list's own out-of-bounds exception.
+     @param index index of child node
+     @return the child node at this index.
+     */
+    public Node childNode(int index) {
+        return childNodes.get(index);
+    }
+
+    /**
+     Get this node's children. Presented as an unmodifiable list: new children can not be added, but the child nodes
+     themselves can be manipulated. The list is a read-only view over the node's internal child list, not a
+     snapshot, so later changes to this node's children are visible through it.
+     @return list of children. If no children, returns an empty list.
+     */
+    public List<Node> childNodes() {
+        return Collections.unmodifiableList(childNodes);
+    }
+    
+    /**
+     Copies the current children into a new array, in order.
+     @return array of this node's children; empty if there are none
+     */
+    protected Node[] childNodesAsArray() {
+        Node[] snapshot = new Node[childNodes.size()];
+        return childNodes.toArray(snapshot);
+    }
+
+    /**
+     Gets this node's parent node.
+     @return parent node; or null if no parent.
+     */
+    public Node parent() {
+        return parentNode;
+    }
+    
+    /**
+     * Gets the Document associated with this Node. 
+     * @return the Document associated with this Node, or null if there is no such Document.
+     */
+    public Document ownerDocument() {
+        if (this instanceof Document)
+            return (Document) this;
+        else if (parentNode == null)
+            return null;
+        else
+            return parentNode.ownerDocument();
+    }
+    
+    /**
+     * Remove (delete) this node from the DOM tree. If this node has children, they are also removed.
+     * The node must currently have a parent; that precondition is checked before the removal.
+     */
+    public void remove() {
+        Validate.notNull(parentNode);
+        parentNode.removeChild(this);
+    }
+
+    /**
+     * Insert the specified HTML into the DOM before this node (i.e. as a preceding sibling).
+     * @param html HTML to add before this node
+     * @return this node, for chaining
+     * @see #after(String)
+     */
+    public Node before(String html) {
+        addSiblingHtml(siblingIndex(), html);
+        return this;
+    }
+
+    /**
+     * Insert the specified node into the DOM before this node (i.e. as a preceding sibling).
+     * This node must have a parent. The supplied node is inserted into that parent at this node's current
+     * sibling index.
+     * @param node to add before this node; must not be null
+     * @return this node, for chaining
+     * @see #after(Node)
+     */
+    public Node before(Node node) {
+        Validate.notNull(node);
+        Validate.notNull(parentNode);
+
+        parentNode.addChildren(siblingIndex(), node);
+        return this;
+    }
+
+    /**
+     * Insert the specified HTML into the DOM after this node (i.e. as a following sibling).
+     * @param html HTML to add after this node
+     * @return this node, for chaining
+     * @see #before(String)
+     */
+    public Node after(String html) {
+        addSiblingHtml(siblingIndex()+1, html);
+        return this;
+    }
+
+    /**
+     * Insert the specified node into the DOM after this node (i.e. as a following sibling).
+     * This node must have a parent. The supplied node is inserted into that parent one position past this
+     * node's current sibling index.
+     * @param node to add after this node; must not be null
+     * @return this node, for chaining
+     * @see #before(Node)
+     */
+    public Node after(Node node) {
+        Validate.notNull(node);
+        Validate.notNull(parentNode);
+
+        parentNode.addChildren(siblingIndex()+1, node);
+        return this;
+    }
+
+    // Parses html as a fragment (using the parent as context when it is an Element) and inserts the resulting
+    // nodes into the parent at the given child index.
+    private void addSiblingHtml(int index, String html) {
+        Validate.notNull(html);
+        Validate.notNull(parentNode);
+
+        Element context = null;
+        if (parent() instanceof Element)
+            context = (Element) parent();
+
+        List<Node> parsed = Parser.parseFragment(html, context, baseUri());
+        Node[] toInsert = parsed.toArray(new Node[parsed.size()]);
+        parentNode.addChildren(index, toInsert);
+    }
+
+    /**
+     Wrap the supplied HTML around this node.
+     @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep.
+     @return this node, for chaining.
+     */
+    public Node wrap(String html) {
+        Validate.notEmpty(html);
+
+        Element context = parent() instanceof Element ? (Element) parent() : null;
+        List<Node> wrapChildren = Parser.parseFragment(html, context, baseUri());
+        Node wrapNode = wrapChildren.get(0);
+        if (wrapNode == null || !(wrapNode instanceof Element)) // nothing to wrap with; noop
+            return null;
+
+        Element wrap = (Element) wrapNode;
+        Element deepest = getDeepChild(wrap);
+        parentNode.replaceChild(this, wrap);
+        deepest.addChildren(this);
+
+        // remainder (unbalanced wrap, like <div></div><p></p> -- The <p> is remainder
+        if (wrapChildren.size() > 0) {
+            for (int i = 0; i < wrapChildren.size(); i++) {
+                Node remainder = wrapChildren.get(i);
+                remainder.parentNode.removeChild(remainder);
+                wrap.appendChild(remainder);
+            }
+        }
+        return this;
+    }
+
+    /**
+     * Removes this node from the DOM, and moves its children up into the node's parent. This has the effect of dropping
+     * the node but keeping its children.
+     * <p>
+     * For example, with the input html:<br>
+     * {@code <div>One <span>Two <b>Three</b></span></div>}<br>
+     * Calling {@code element.unwrap()} on the {@code span} element will result in the html:<br>
+     * {@code <div>One Two <b>Three</b></div>}<br>
+     * and the {@code "Two "} {@link TextNode} being returned.
+     * <p>
+     * This node must have a parent.
+     * @return the first child of this node, after the node has been unwrapped. Null if the node had no children.
+     * @see #remove()
+     * @see #wrap(String)
+     */
+    public Node unwrap() {
+        Validate.notNull(parentNode);
+
+        // capture position and first child up front: both change once the children are moved
+        int index = siblingIndex;
+        Node firstChild = childNodes.size() > 0 ? childNodes.get(0) : null;
+        // move the children into the parent at this node's position, then drop the now-empty node
+        parentNode.addChildren(index, this.childNodesAsArray());
+        this.remove();
+
+        return firstChild;
+    }
+
+    // Follows the chain of first child elements down from el and returns the last element reached
+    // (el itself when it has no child elements).
+    private Element getDeepChild(Element el) {
+        Element deepest = el;
+        List<Element> kids = deepest.children();
+        while (kids.size() > 0) {
+            deepest = kids.get(0);
+            kids = deepest.children();
+        }
+        return deepest;
+    }
+    
+    /**
+     * Replace this node in the DOM with the supplied node. This node must have a parent; the swap is carried
+     * out by that parent.
+     * @param in the node that will replace the existing node; must not be null
+     */
+    public void replaceWith(Node in) {
+        Validate.notNull(in);
+        Validate.notNull(parentNode);
+        parentNode.replaceChild(this, in);
+    }
+
+    /**
+     * Sets this node's parent reference. If the node currently has a parent, it is first removed from that
+     * parent's children. The node is not added to the new parent's child list here.
+     * @param parentNode the new parent
+     */
+    protected void setParentNode(Node parentNode) {
+        if (this.parentNode != null)
+            this.parentNode.removeChild(this);
+        this.parentNode = parentNode;
+    }
+
+    /**
+     * Puts {@code in} into the child slot currently occupied by {@code out}. {@code in} is detached from any
+     * parent it already has, and {@code out} is left without a parent.
+     * @param out existing child of this node to replace
+     * @param in node to put in its place; must not be null
+     */
+    protected void replaceChild(Node out, Node in) {
+        Validate.isTrue(out.parentNode == this);
+        Validate.notNull(in);
+
+        Node previousOwner = in.parentNode;
+        if (previousOwner != null)
+            previousOwner.removeChild(in);
+
+        // read the slot only after the detach above, which may have reindexed this node's children
+        int slot = out.siblingIndex();
+        childNodes.set(slot, in);
+        in.parentNode = this;
+        in.setSiblingIndex(slot);
+        out.parentNode = null;
+    }
+
+    /**
+     * Removes a child from this node: deletes it from the child list by its sibling index, renumbers the
+     * remaining children, and clears the removed node's parent reference.
+     * @param out the child to remove; must currently have this node as its parent
+     */
+    protected void removeChild(Node out) {
+        Validate.isTrue(out.parentNode == this);
+        int index = out.siblingIndex();
+        childNodes.remove(index);
+        reindexChildren();
+        out.parentNode = null;
+    }
+
+    /**
+     * Appends the given nodes to the end of this node's children, in order. Each node is first detached from
+     * any parent it already has.
+     * @param children nodes to append
+     */
+    protected void addChildren(Node... children) {
+        //most used. short circuit addChildren(int), which hits reindex children and array copy
+        for (Node child: children) {
+            reparentChild(child);
+            childNodes.add(child);
+            // appended at the end, so only the new child needs its index set
+            child.setSiblingIndex(childNodes.size()-1);
+        }
+    }
+
+    /**
+     * Inserts the given nodes into this node's children starting at {@code index}, keeping their relative
+     * order. Each node is first detached from any parent it already has; all children are renumbered afterwards.
+     * @param index child position at which the first inserted node ends up
+     * @param children nodes to insert; must not contain null
+     */
+    protected void addChildren(int index, Node... children) {
+        Validate.noNullElements(children);
+        // walk backwards: repeatedly inserting at the same index leaves the nodes in their original order
+        for (int i = children.length - 1; i >= 0; i--) {
+            Node in = children[i];
+            reparentChild(in);
+            childNodes.add(index, in);
+        }
+        reindexChildren();
+    }
+
+    // Detaches child from its current parent (if any) and points its parent reference at this node.
+    // The caller is responsible for adding child to this node's child list.
+    private void reparentChild(Node child) {
+        if (child.parentNode != null)
+            child.parentNode.removeChild(child);
+        child.setParentNode(this);
+    }
+    
+    /** Renumber every child so that its sibling index matches its position in the child list. */
+    private void reindexChildren() {
+        int position = 0;
+        while (position < childNodes.size()) {
+            childNodes.get(position).setSiblingIndex(position);
+            position++;
+        }
+    }
+    
+    /**
+     Retrieves this node's sibling nodes. Similar to {@link #childNodes()  node.parent.childNodes()}, but leaves out
+     this node itself (a node is not its own sibling).
+     @return a new list holding the node's siblings. If the node has no parent, returns an empty list.
+     */
+    public List<Node> siblingNodes() {
+        if (parentNode == null)
+            return Collections.emptyList();
+
+        final List<Node> all = parentNode.childNodes;
+        final List<Node> siblings = new ArrayList<Node>(all.size() - 1);
+        for (int i = 0; i < all.size(); i++) {
+            final Node candidate = all.get(i);
+            if (candidate == this)
+                continue; // identity check: skip only this exact node
+            siblings.add(candidate);
+        }
+        return siblings;
+    }
+
+    /**
+     Get this node's next sibling.
+     @return next sibling, or null if this is the last sibling or the node has no parent
+     */
+    public Node nextSibling() {
+        if (parentNode == null)
+            return null; // root
+
+        final List<Node> siblings = parentNode.childNodes;
+        final int nextIndex = siblingIndex() + 1;
+        return nextIndex < siblings.size() ? siblings.get(nextIndex) : null;
+    }
+
+    /**
+     Get this node's previous sibling.
+     @return the previous sibling, or null if this is the first sibling or the node has no parent
+     */
+    public Node previousSibling() {
+        if (parentNode == null)
+            return null; // root
+
+        final int index = siblingIndex();
+        return index > 0 ? parentNode.childNodes.get(index - 1) : null;
+    }
+
+    /**
+     * Get the position of this node within its list of node siblings; the first node sibling is at position 0.
+     * @return position in node sibling list
+     * @see org.jsoup.nodes.Element#elementSiblingIndex()
+     */
+    public int siblingIndex() {
+        return this.siblingIndex;
+    }
+    
+    /**
+     * Record this node's position in its node sibling list. This only stores the value; it does not move the node.
+     * @param siblingIndex position in node sibling list
+     */
+    protected void setSiblingIndex(int siblingIndex) {
+        this.siblingIndex = siblingIndex;
+    }
+
+    /**
+     * Walk this node and its descendants depth-first, invoking the supplied visitor's callbacks on each node.
+     * @param nodeVisitor the visitor callbacks to perform on each node; must not be null
+     * @return this node, for chaining
+     */
+    public Node traverse(NodeVisitor nodeVisitor) {
+        Validate.notNull(nodeVisitor);
+        new NodeTraversor(nodeVisitor).traverse(this);
+        return this;
+    }
+
+    /**
+     Render this node to its outer HTML.
+     @return HTML
+     */
+    public String outerHtml() {
+        final StringBuilder html = new StringBuilder(128);
+        outerHtml(html);
+        return html.toString();
+    }
+
+    /**
+     * Append the outer HTML of this node to the supplied accumulator, by traversing from this node with a visitor
+     * that writes each visited node's head and tail.
+     * @param accum accumulator to place HTML into
+     */
+    protected void outerHtml(StringBuilder accum) {
+        final NodeVisitor htmlWriter = new OuterHtmlVisitor(accum, getOutputSettings());
+        final NodeTraversor traversor = new NodeTraversor(htmlWriter);
+        traversor.traverse(this);
+    }
+
+    // Output settings of the owning document; if this node has no document (or parent), the settings of a fresh,
+    // empty Document are used as the defaults.
+    private Document.OutputSettings getOutputSettings() {
+        final Document owner = ownerDocument(); // look the owner up once rather than once per branch
+        return owner != null ? owner.outputSettings() : (new Document("")).outputSettings();
+    }
+
+    /**
+     Write the leading part of this node's outer HTML. Invoked from the outer-HTML visitor's head callback.
+     @param accum accumulator to place HTML into
+     @param depth depth value passed through from the traversal
+     @param out output settings to render with
+     */
+    abstract void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out);
+
+    /**
+     Write the trailing part of this node's outer HTML. Invoked from the outer-HTML visitor's tail callback, which
+     skips nodes whose node name is "#text".
+     @param accum accumulator to place HTML into
+     @param depth depth value passed through from the traversal
+     @param out output settings to render with
+     */
+    abstract void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out);
+
+    /**
+     * The string form of a node is its outer HTML.
+     * @return outer HTML of this node
+     */
+    @Override
+    public String toString() {
+        return outerHtml();
+    }
+
+    /**
+     * Start a new line in the accumulator and pad it with (depth * indent amount) characters of padding.
+     * @param accum accumulator to append to
+     * @param depth nesting depth used to scale the padding
+     * @param out output settings supplying the per-level indent amount
+     */
+    protected void indent(StringBuilder accum, int depth, Document.OutputSettings out) {
+        accum.append("\n");
+        accum.append(StringUtil.padding(depth * out.indentAmount()));
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        // todo: have nodes hold a child index, compare against that and parent (not children)
+        return this == o; // identity comparison only, for now
+    }
+
+    @Override
+    public int hashCode() {
+        // children are deliberately left out: each child's hash includes its parent's, so including them here
+        // would recurse back and forth until the stack blows
+        final int parentHash = (parentNode == null) ? 0 : parentNode.hashCode();
+        final int attributesHash = (attributes == null) ? 0 : attributes.hashCode();
+        return 31 * parentHash + attributesHash;
+    }
+
+    /**
+     * Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings or
+     * parent node. As a stand-alone object, any changes made to the clone or any of its children will not impact the
+     * original node.
+     * <p>
+     * The cloned node may be adopted into another Document or node structure using {@link Element#appendChild(Node)}.
+     * @return stand-alone cloned node
+     */
+    @Override
+    public Node clone() {
+        return doClone(null); // null parent: the top-level clone is an orphan, split off from the original tree
+    }
+
+    /**
+     * Deep-copy this node and its descendants, setting the supplied node as the copy's parent. The copy gets its own
+     * attribute set and its own child list, filled with recursively cloned children.
+     * <p>
+     * The copy is not added to the supplied parent's child list here; when recursing, the caller adds it.
+     * @param parent parent to set on the cloned node; null creates a parentless (orphan) clone
+     * @return the cloned node
+     */
+    protected Node doClone(Node parent) {
+        Node clone;
+        try {
+            clone = (Node) super.clone();
+        } catch (CloneNotSupportedException e) {
+            throw new RuntimeException(e);
+        }
+
+        clone.parentNode = parent; // can be null, to create an orphan split
+        clone.siblingIndex = parent == null ? 0 : siblingIndex; // an orphan has no siblings, so it sits at index 0
+        clone.attributes = attributes != null ? attributes.clone() : null;
+        clone.baseUri = baseUri;
+        clone.childNodes = new ArrayList<Node>(childNodes.size()); // fresh list, so the clone does not share children
+        for (Node child: childNodes)
+            clone.childNodes.add(child.doClone(clone)); // clone() creates orphans, doClone() keeps parent
+
+        return clone;
+    }
+
+    /** Visitor that writes each visited node's outer HTML head and tail into one shared accumulator. */
+    private static class OuterHtmlVisitor implements NodeVisitor {
+        private final StringBuilder buffer;
+        private final Document.OutputSettings settings;
+
+        OuterHtmlVisitor(StringBuilder accum, Document.OutputSettings out) {
+            buffer = accum;
+            settings = out;
+        }
+
+        public void head(Node node, int depth) {
+            node.outerHtmlHead(buffer, depth, settings);
+        }
+
+        public void tail(Node node, int depth) {
+            final boolean isTextNode = node.nodeName().equals("#text");
+            if (isTextNode)
+                return; // saves a call that would do nothing
+            node.outerHtmlTail(buffer, depth, settings);
+        }
+    }
+}
diff --git a/src/org/jsoup/nodes/TextNode.java b/src/org/jsoup/nodes/TextNode.java

new file mode 100644 (file)

index 0000000..9fd0fea
--- /dev/null
+++ b/src/org/jsoup/nodes/TextNode.java
@@ -0,0 +1,175 @@
+package org.jsoup.nodes;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+
+/**
+ A text node.
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public class TextNode extends Node {
+    /*
+    TextNode is a node, and so by default comes with attributes and children. The attributes are seldom used, but use
+    memory, and the child nodes are never used. So we don't have them, and override accessors to attributes to create
+    them as needed on the fly.
+     */
+    private static final String TEXT_KEY = "text";
+    // The text while no attribute map exists. Once the attribute map has been created, getWholeText() reads the
+    // TEXT_KEY attribute instead, so code in this class must go through getWholeText() rather than this field.
+    String text;
+
+    /**
+     Create a new TextNode representing the supplied (unencoded) text.
+
+     @param text raw text
+     @param baseUri base uri
+     @see #createFromEncoded(String, String)
+     */
+    public TextNode(String text, String baseUri) {
+        this.baseUri = baseUri;
+        this.text = text;
+    }
+
+    @Override
+    public String nodeName() {
+        return "#text";
+    }
+
+    /**
+     * Get the text content of this text node.
+     * @return Unencoded, normalised text.
+     * @see TextNode#getWholeText()
+     */
+    public String text() {
+        return normaliseWhitespace(getWholeText());
+    }
+
+    /**
+     * Set the text content of this text node.
+     * @param text unencoded text
+     * @return this, for chaining
+     */
+    public TextNode text(String text) {
+        this.text = text;
+        if (attributes != null)
+            attributes.put(TEXT_KEY, text); // keep the attribute copy in step once it exists
+        return this;
+    }
+
+    /**
+     Get the (unencoded) text of this text node, including any newlines and spaces present in the original.
+     @return text
+     */
+    public String getWholeText() {
+        return attributes == null ? text : attributes.get(TEXT_KEY);
+    }
+
+    /**
+     Test if this text node is blank -- that is, empty or only whitespace (including newlines).
+     @return true if this text node is empty or only whitespace, false if it contains any text content.
+     */
+    public boolean isBlank() {
+        return StringUtil.isBlank(getWholeText());
+    }
+
+    /**
+     * Split this text node into two nodes at the specified string offset. After splitting, this node will contain the
+     * original text up to the offset, and will have a new text node sibling containing the text after the offset.
+     * If this node has no parent, the new node is returned without being attached anywhere.
+     * @param offset string offset point to split node at; must be at least 0 and less than the current text length.
+     * @return the newly created text node containing the text after the offset.
+     */
+    public TextNode splitText(int offset) {
+        // validate against the same value that is split below: the text field can be stale once the attribute map
+        // exists and the text attribute has been changed through the attribute accessors
+        final String wholeText = getWholeText();
+        Validate.isTrue(offset >= 0, "Split offset must be not be negative");
+        Validate.isTrue(offset < wholeText.length(), "Split offset must not be greater than current text length");
+
+        String head = wholeText.substring(0, offset);
+        String tail = wholeText.substring(offset);
+        text(head);
+        TextNode tailNode = new TextNode(tail, this.baseUri());
+        if (parent() != null)
+            parent().addChildren(siblingIndex()+1, tailNode);
+
+        return tailNode;
+    }
+
+    @Override
+    void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
+        String html = Entities.escape(getWholeText(), out);
+        if (out.prettyPrint() && parent() instanceof Element && !((Element) parent()).preserveWhitespace()) {
+            html = normaliseWhitespace(html);
+        }
+
+        // when pretty-printing, non-blank text that is the first child of a block-formatted element starts on its
+        // own indented line
+        if (out.prettyPrint() && siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank())
+            indent(accum, depth, out);
+        accum.append(html);
+    }
+
+    @Override
+    void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}
+
+    @Override
+    public String toString() {
+        return outerHtml();
+    }
+
+    /**
+     * Create a new TextNode from HTML encoded (aka escaped) data.
+     * @param encodedText Text containing encoded HTML (e.g. &amp;lt;)
+     * @param baseUri base uri
+     * @return TextNode containing unencoded data (e.g. &lt;)
+     */
+    public static TextNode createFromEncoded(String encodedText, String baseUri) {
+        String text = Entities.unescape(encodedText);
+        return new TextNode(text, baseUri);
+    }
+
+    static String normaliseWhitespace(String text) {
+        return StringUtil.normaliseWhitespace(text);
+    }
+
+    static String stripLeadingWhitespace(String text) {
+        return text.replaceFirst("^\\s+", "");
+    }
+
+    // note: only a plain space character counts here, not other whitespace
+    static boolean lastCharIsWhitespace(StringBuilder sb) {
+        return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
+    }
+
+    // attribute fiddling. create on first access.
+    private void ensureAttributes() {
+        if (attributes == null) {
+            attributes = new Attributes();
+            attributes.put(TEXT_KEY, text); // seed the map with the current text so getWholeText() keeps working
+        }
+    }
+
+    @Override
+    public String attr(String attributeKey) {
+        ensureAttributes();
+        return super.attr(attributeKey);
+    }
+
+    @Override
+    public Attributes attributes() {
+        ensureAttributes();
+        return super.attributes();
+    }
+
+    @Override
+    public Node attr(String attributeKey, String attributeValue) {
+        ensureAttributes();
+        return super.attr(attributeKey, attributeValue);
+    }
+
+    @Override
+    public boolean hasAttr(String attributeKey) {
+        ensureAttributes();
+        return super.hasAttr(attributeKey);
+    }
+
+    @Override
+    public Node removeAttr(String attributeKey) {
+        ensureAttributes();
+        return super.removeAttr(attributeKey);
+    }
+
+    @Override
+    public String absUrl(String attributeKey) {
+        ensureAttributes();
+        return super.absUrl(attributeKey);
+    }
+}
diff --git a/src/org/jsoup/nodes/XmlDeclaration.java b/src/org/jsoup/nodes/XmlDeclaration.java

new file mode 100644 (file)

index 0000000..80d4a01
--- /dev/null
+++ b/src/org/jsoup/nodes/XmlDeclaration.java
@@ -0,0 +1,48 @@
+package org.jsoup.nodes;
+
+/**
+ An XML Declaration.
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public class XmlDeclaration extends Node {
+    private static final String DECL_KEY = "declaration";
+    // Selects the opening marker written by outerHtmlHead: "<!" if true, "<?" if false (a declaration, whose last
+    // data char should then be ?)
+    private final boolean isProcessingInstruction;
+
+    /**
+     Create a new XML declaration
+     @param data data
+     @param baseUri base uri
+     @param isProcessingInstruction is processing instruction
+     */
+    public XmlDeclaration(String data, String baseUri, boolean isProcessingInstruction) {
+        super(baseUri);
+        attributes.put(DECL_KEY, data);
+        this.isProcessingInstruction = isProcessingInstruction;
+    }
+
+    @Override
+    public String nodeName() {
+        return "#declaration";
+    }
+
+    /**
+     Get the unencoded XML declaration.
+     @return XML declaration
+     */
+    public String getWholeDeclaration() {
+        return attributes.get(DECL_KEY);
+    }
+
+    @Override
+    void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
+        final String marker = isProcessingInstruction ? "!" : "?";
+        accum.append("<");
+        accum.append(marker);
+        accum.append(getWholeDeclaration());
+        accum.append(">");
+    }
+
+    // the whole declaration is written by outerHtmlHead; there is nothing to close
+    @Override
+    void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {}
+
+    @Override
+    public String toString() {
+        return outerHtml();
+    }
+}
diff --git a/src/org/jsoup/nodes/entities-base.properties b/src/org/jsoup/nodes/entities-base.properties

new file mode 100644 (file)

index 0000000..3d1d11e
--- /dev/null
+++ b/src/org/jsoup/nodes/entities-base.properties
@@ -0,0 +1,106 @@
+AElig=000C6
+AMP=00026
+Aacute=000C1
+Acirc=000C2
+Agrave=000C0
+Aring=000C5
+Atilde=000C3
+Auml=000C4
+COPY=000A9
+Ccedil=000C7
+ETH=000D0
+Eacute=000C9
+Ecirc=000CA
+Egrave=000C8
+Euml=000CB
+GT=0003E
+Iacute=000CD
+Icirc=000CE
+Igrave=000CC
+Iuml=000CF
+LT=0003C
+Ntilde=000D1
+Oacute=000D3
+Ocirc=000D4
+Ograve=000D2
+Oslash=000D8
+Otilde=000D5
+Ouml=000D6
+QUOT=00022
+REG=000AE
+THORN=000DE
+Uacute=000DA
+Ucirc=000DB
+Ugrave=000D9
+Uuml=000DC
+Yacute=000DD
+aacute=000E1
+acirc=000E2
+acute=000B4
+aelig=000E6
+agrave=000E0
+amp=00026
+aring=000E5
+atilde=000E3
+auml=000E4
+brvbar=000A6
+ccedil=000E7
+cedil=000B8
+cent=000A2
+copy=000A9
+curren=000A4
+deg=000B0
+divide=000F7
+eacute=000E9
+ecirc=000EA
+egrave=000E8
+eth=000F0
+euml=000EB
+frac12=000BD
+frac14=000BC
+frac34=000BE
+gt=0003E
+iacute=000ED
+icirc=000EE
+iexcl=000A1
+igrave=000EC
+iquest=000BF
+iuml=000EF
+laquo=000AB
+lt=0003C
+macr=000AF
+micro=000B5
+middot=000B7
+nbsp=000A0
+not=000AC
+ntilde=000F1
+oacute=000F3
+ocirc=000F4
+ograve=000F2
+ordf=000AA
+ordm=000BA
+oslash=000F8
+otilde=000F5
+ouml=000F6
+para=000B6
+plusmn=000B1
+pound=000A3
+quot=00022
+raquo=000BB
+reg=000AE
+sect=000A7
+shy=000AD
+sup1=000B9
+sup2=000B2
+sup3=000B3
+szlig=000DF
+thorn=000FE
+times=000D7
+uacute=000FA
+ucirc=000FB
+ugrave=000F9
+uml=000A8
+uuml=000FC
+yacute=000FD
+yen=000A5
+yuml=000FF
diff --git a/src/org/jsoup/nodes/entities-full.properties b/src/org/jsoup/nodes/entities-full.properties

new file mode 100644 (file)

index 0000000..92f124f
--- /dev/null
+++ b/src/org/jsoup/nodes/entities-full.properties
@@ -0,0 +1,2032 @@
+AElig=000C6
+AMP=00026
+Aacute=000C1
+Abreve=00102
+Acirc=000C2
+Acy=00410
+Afr=1D504
+Agrave=000C0
+Alpha=00391
+Amacr=00100
+And=02A53
+Aogon=00104
+Aopf=1D538
+ApplyFunction=02061
+Aring=000C5
+Ascr=1D49C
+Assign=02254
+Atilde=000C3
+Auml=000C4
+Backslash=02216
+Barv=02AE7
+Barwed=02306
+Bcy=00411
+Because=02235
+Bernoullis=0212C
+Beta=00392
+Bfr=1D505
+Bopf=1D539
+Breve=002D8
+Bscr=0212C
+Bumpeq=0224E
+CHcy=00427
+COPY=000A9
+Cacute=00106
+Cap=022D2
+CapitalDifferentialD=02145
+Cayleys=0212D
+Ccaron=0010C
+Ccedil=000C7
+Ccirc=00108
+Cconint=02230
+Cdot=0010A
+Cedilla=000B8
+CenterDot=000B7
+Cfr=0212D
+Chi=003A7
+CircleDot=02299
+CircleMinus=02296
+CirclePlus=02295
+CircleTimes=02297
+ClockwiseContourIntegral=02232
+CloseCurlyDoubleQuote=0201D
+CloseCurlyQuote=02019
+Colon=02237
+Colone=02A74
+Congruent=02261
+Conint=0222F
+ContourIntegral=0222E
+Copf=02102
+Coproduct=02210
+CounterClockwiseContourIntegral=02233
+Cross=02A2F
+Cscr=1D49E
+Cup=022D3
+CupCap=0224D
+DD=02145
+DDotrahd=02911
+DJcy=00402
+DScy=00405
+DZcy=0040F
+Dagger=02021
+Darr=021A1
+Dashv=02AE4
+Dcaron=0010E
+Dcy=00414
+Del=02207
+Delta=00394
+Dfr=1D507
+DiacriticalAcute=000B4
+DiacriticalDot=002D9
+DiacriticalDoubleAcute=002DD
+DiacriticalGrave=00060
+DiacriticalTilde=002DC
+Diamond=022C4
+DifferentialD=02146
+Dopf=1D53B
+Dot=000A8
+DotDot=020DC
+DotEqual=02250
+DoubleContourIntegral=0222F
+DoubleDot=000A8
+DoubleDownArrow=021D3
+DoubleLeftArrow=021D0
+DoubleLeftRightArrow=021D4
+DoubleLeftTee=02AE4
+DoubleLongLeftArrow=027F8
+DoubleLongLeftRightArrow=027FA
+DoubleLongRightArrow=027F9
+DoubleRightArrow=021D2
+DoubleRightTee=022A8
+DoubleUpArrow=021D1
+DoubleUpDownArrow=021D5
+DoubleVerticalBar=02225
+DownArrow=02193
+DownArrowBar=02913
+DownArrowUpArrow=021F5
+DownBreve=00311
+DownLeftRightVector=02950
+DownLeftTeeVector=0295E
+DownLeftVector=021BD
+DownLeftVectorBar=02956
+DownRightTeeVector=0295F
+DownRightVector=021C1
+DownRightVectorBar=02957
+DownTee=022A4
+DownTeeArrow=021A7
+Downarrow=021D3
+Dscr=1D49F
+Dstrok=00110
+ENG=0014A
+ETH=000D0
+Eacute=000C9
+Ecaron=0011A
+Ecirc=000CA
+Ecy=0042D
+Edot=00116
+Efr=1D508
+Egrave=000C8
+Element=02208
+Emacr=00112
+EmptySmallSquare=025FB
+EmptyVerySmallSquare=025AB
+Eogon=00118
+Eopf=1D53C
+Epsilon=00395
+Equal=02A75
+EqualTilde=02242
+Equilibrium=021CC
+Escr=02130
+Esim=02A73
+Eta=00397
+Euml=000CB
+Exists=02203
+ExponentialE=02147
+Fcy=00424
+Ffr=1D509
+FilledSmallSquare=025FC
+FilledVerySmallSquare=025AA
+Fopf=1D53D
+ForAll=02200
+Fouriertrf=02131
+Fscr=02131
+GJcy=00403
+GT=0003E
+Gamma=00393
+Gammad=003DC
+Gbreve=0011E
+Gcedil=00122
+Gcirc=0011C
+Gcy=00413
+Gdot=00120
+Gfr=1D50A
+Gg=022D9
+Gopf=1D53E
+GreaterEqual=02265
+GreaterEqualLess=022DB
+GreaterFullEqual=02267
+GreaterGreater=02AA2
+GreaterLess=02277
+GreaterSlantEqual=02A7E
+GreaterTilde=02273
+Gscr=1D4A2
+Gt=0226B
+HARDcy=0042A
+Hacek=002C7
+Hat=0005E
+Hcirc=00124
+Hfr=0210C
+HilbertSpace=0210B
+Hopf=0210D
+HorizontalLine=02500
+Hscr=0210B
+Hstrok=00126
+HumpDownHump=0224E
+HumpEqual=0224F
+IEcy=00415
+IJlig=00132
+IOcy=00401
+Iacute=000CD
+Icirc=000CE
+Icy=00418
+Idot=00130
+Ifr=02111
+Igrave=000CC
+Im=02111
+Imacr=0012A
+ImaginaryI=02148
+Implies=021D2
+Int=0222C
+Integral=0222B
+Intersection=022C2
+InvisibleComma=02063
+InvisibleTimes=02062
+Iogon=0012E
+Iopf=1D540
+Iota=00399
+Iscr=02110
+Itilde=00128
+Iukcy=00406
+Iuml=000CF
+Jcirc=00134
+Jcy=00419
+Jfr=1D50D
+Jopf=1D541
+Jscr=1D4A5
+Jsercy=00408
+Jukcy=00404
+KHcy=00425
+KJcy=0040C
+Kappa=0039A
+Kcedil=00136
+Kcy=0041A
+Kfr=1D50E
+Kopf=1D542
+Kscr=1D4A6
+LJcy=00409
+LT=0003C
+Lacute=00139
+Lambda=0039B
+Lang=027EA
+Laplacetrf=02112
+Larr=0219E
+Lcaron=0013D
+Lcedil=0013B
+Lcy=0041B
+LeftAngleBracket=027E8
+LeftArrow=02190
+LeftArrowBar=021E4
+LeftArrowRightArrow=021C6
+LeftCeiling=02308
+LeftDoubleBracket=027E6
+LeftDownTeeVector=02961
+LeftDownVector=021C3
+LeftDownVectorBar=02959
+LeftFloor=0230A
+LeftRightArrow=02194
+LeftRightVector=0294E
+LeftTee=022A3
+LeftTeeArrow=021A4
+LeftTeeVector=0295A
+LeftTriangle=022B2
+LeftTriangleBar=029CF
+LeftTriangleEqual=022B4
+LeftUpDownVector=02951
+LeftUpTeeVector=02960
+LeftUpVector=021BF
+LeftUpVectorBar=02958
+LeftVector=021BC
+LeftVectorBar=02952
+Leftarrow=021D0
+Leftrightarrow=021D4
+LessEqualGreater=022DA
+LessFullEqual=02266
+LessGreater=02276
+LessLess=02AA1
+LessSlantEqual=02A7D
+LessTilde=02272
+Lfr=1D50F
+Ll=022D8
+Lleftarrow=021DA
+Lmidot=0013F
+LongLeftArrow=027F5
+LongLeftRightArrow=027F7
+LongRightArrow=027F6
+Longleftarrow=027F8
+Longleftrightarrow=027FA
+Longrightarrow=027F9
+Lopf=1D543
+LowerLeftArrow=02199
+LowerRightArrow=02198
+Lscr=02112
+Lsh=021B0
+Lstrok=00141
+Lt=0226A
+Map=02905
+Mcy=0041C
+MediumSpace=0205F
+Mellintrf=02133
+Mfr=1D510
+MinusPlus=02213
+Mopf=1D544
+Mscr=02133
+Mu=0039C
+NJcy=0040A
+Nacute=00143
+Ncaron=00147
+Ncedil=00145
+Ncy=0041D
+NegativeMediumSpace=0200B
+NegativeThickSpace=0200B
+NegativeThinSpace=0200B
+NegativeVeryThinSpace=0200B
+NestedGreaterGreater=0226B
+NestedLessLess=0226A
+NewLine=0000A
+Nfr=1D511
+NoBreak=02060
+NonBreakingSpace=000A0
+Nopf=02115
+Not=02AEC
+NotCongruent=02262
+NotCupCap=0226D
+NotDoubleVerticalBar=02226
+NotElement=02209
+NotEqual=02260
+NotExists=02204
+NotGreater=0226F
+NotGreaterEqual=02271
+NotGreaterLess=02279
+NotGreaterTilde=02275
+NotLeftTriangle=022EA
+NotLeftTriangleEqual=022EC
+NotLess=0226E
+NotLessEqual=02270
+NotLessGreater=02278
+NotLessTilde=02274
+NotPrecedes=02280
+NotPrecedesSlantEqual=022E0
+NotReverseElement=0220C
+NotRightTriangle=022EB
+NotRightTriangleEqual=022ED
+NotSquareSubsetEqual=022E2
+NotSquareSupersetEqual=022E3
+NotSubsetEqual=02288
+NotSucceeds=02281
+NotSucceedsSlantEqual=022E1
+NotSupersetEqual=02289
+NotTilde=02241
+NotTildeEqual=02244
+NotTildeFullEqual=02247
+NotTildeTilde=02249
+NotVerticalBar=02224
+Nscr=1D4A9
+Ntilde=000D1
+Nu=0039D
+OElig=00152
+Oacute=000D3
+Ocirc=000D4
+Ocy=0041E
+Odblac=00150
+Ofr=1D512
+Ograve=000D2
+Omacr=0014C
+Omega=003A9
+Omicron=0039F
+Oopf=1D546
+OpenCurlyDoubleQuote=0201C
+OpenCurlyQuote=02018
+Or=02A54
+Oscr=1D4AA
+Oslash=000D8
+Otilde=000D5
+Otimes=02A37
+Ouml=000D6
+OverBar=0203E
+OverBrace=023DE
+OverBracket=023B4
+OverParenthesis=023DC
+PartialD=02202
+Pcy=0041F
+Pfr=1D513
+Phi=003A6
+Pi=003A0
+PlusMinus=000B1
+Poincareplane=0210C
+Popf=02119
+Pr=02ABB
+Precedes=0227A
+PrecedesEqual=02AAF
+PrecedesSlantEqual=0227C
+PrecedesTilde=0227E
+Prime=02033
+Product=0220F
+Proportion=02237
+Proportional=0221D
+Pscr=1D4AB
+Psi=003A8
+QUOT=00022
+Qfr=1D514
+Qopf=0211A
+Qscr=1D4AC
+RBarr=02910
+REG=000AE
+Racute=00154
+Rang=027EB
+Rarr=021A0
+Rarrtl=02916
+Rcaron=00158
+Rcedil=00156
+Rcy=00420
+Re=0211C
+ReverseElement=0220B
+ReverseEquilibrium=021CB
+ReverseUpEquilibrium=0296F
+Rfr=0211C
+Rho=003A1
+RightAngleBracket=027E9
+RightArrow=02192
+RightArrowBar=021E5
+RightArrowLeftArrow=021C4
+RightCeiling=02309
+RightDoubleBracket=027E7
+RightDownTeeVector=0295D
+RightDownVector=021C2
+RightDownVectorBar=02955
+RightFloor=0230B
+RightTee=022A2
+RightTeeArrow=021A6
+RightTeeVector=0295B
+RightTriangle=022B3
+RightTriangleBar=029D0
+RightTriangleEqual=022B5
+RightUpDownVector=0294F
+RightUpTeeVector=0295C
+RightUpVector=021BE
+RightUpVectorBar=02954
+RightVector=021C0
+RightVectorBar=02953
+Rightarrow=021D2
+Ropf=0211D
+RoundImplies=02970
+Rrightarrow=021DB
+Rscr=0211B
+Rsh=021B1
+RuleDelayed=029F4
+SHCHcy=00429
+SHcy=00428
+SOFTcy=0042C
+Sacute=0015A
+Sc=02ABC
+Scaron=00160
+Scedil=0015E
+Scirc=0015C
+Scy=00421
+Sfr=1D516
+ShortDownArrow=02193
+ShortLeftArrow=02190
+ShortRightArrow=02192
+ShortUpArrow=02191
+Sigma=003A3
+SmallCircle=02218
+Sopf=1D54A
+Sqrt=0221A
+Square=025A1
+SquareIntersection=02293
+SquareSubset=0228F
+SquareSubsetEqual=02291
+SquareSuperset=02290
+SquareSupersetEqual=02292
+SquareUnion=02294
+Sscr=1D4AE
+Star=022C6
+Sub=022D0
+Subset=022D0
+SubsetEqual=02286
+Succeeds=0227B
+SucceedsEqual=02AB0
+SucceedsSlantEqual=0227D
+SucceedsTilde=0227F
+SuchThat=0220B
+Sum=02211
+Sup=022D1
+Superset=02283
+SupersetEqual=02287
+Supset=022D1
+THORN=000DE
+TRADE=02122
+TSHcy=0040B
+TScy=00426
+Tab=00009
+Tau=003A4
+Tcaron=00164
+Tcedil=00162
+Tcy=00422
+Tfr=1D517
+Therefore=02234
+Theta=00398
+ThinSpace=02009
+Tilde=0223C
+TildeEqual=02243
+TildeFullEqual=02245
+TildeTilde=02248
+Topf=1D54B
+TripleDot=020DB
+Tscr=1D4AF
+Tstrok=00166
+Uacute=000DA
+Uarr=0219F
+Uarrocir=02949
+Ubrcy=0040E
+Ubreve=0016C
+Ucirc=000DB
+Ucy=00423
+Udblac=00170
+Ufr=1D518
+Ugrave=000D9
+Umacr=0016A
+UnderBar=0005F
+UnderBrace=023DF
+UnderBracket=023B5
+UnderParenthesis=023DD
+Union=022C3
+UnionPlus=0228E
+Uogon=00172
+Uopf=1D54C
+UpArrow=02191
+UpArrowBar=02912
+UpArrowDownArrow=021C5
+UpDownArrow=02195
+UpEquilibrium=0296E
+UpTee=022A5
+UpTeeArrow=021A5
+Uparrow=021D1
+Updownarrow=021D5
+UpperLeftArrow=02196
+UpperRightArrow=02197
+Upsi=003D2
+Upsilon=003A5
+Uring=0016E
+Uscr=1D4B0
+Utilde=00168
+Uuml=000DC
+VDash=022AB
+Vbar=02AEB
+Vcy=00412
+Vdash=022A9
+Vdashl=02AE6
+Vee=022C1
+Verbar=02016
+Vert=02016
+VerticalBar=02223
+VerticalLine=0007C
+VerticalSeparator=02758
+VerticalTilde=02240
+VeryThinSpace=0200A
+Vfr=1D519
+Vopf=1D54D
+Vscr=1D4B1
+Vvdash=022AA
+Wcirc=00174
+Wedge=022C0
+Wfr=1D51A
+Wopf=1D54E
+Wscr=1D4B2
+Xfr=1D51B
+Xi=0039E
+Xopf=1D54F
+Xscr=1D4B3
+YAcy=0042F
+YIcy=00407
+YUcy=0042E
+Yacute=000DD
+Ycirc=00176
+Ycy=0042B
+Yfr=1D51C
+Yopf=1D550
+Yscr=1D4B4
+Yuml=00178
+ZHcy=00416
+Zacute=00179
+Zcaron=0017D
+Zcy=00417
+Zdot=0017B
+ZeroWidthSpace=0200B
+Zeta=00396
+Zfr=02128
+Zopf=02124
+Zscr=1D4B5
+aacute=000E1
+abreve=00103
+ac=0223E
+acd=0223F
+acirc=000E2
+acute=000B4
+acy=00430
+aelig=000E6
+af=02061
+afr=1D51E
+agrave=000E0
+alefsym=02135
+aleph=02135
+alpha=003B1
+amacr=00101
+amalg=02A3F
+amp=00026
+and=02227
+andand=02A55
+andd=02A5C
+andslope=02A58
+andv=02A5A
+ang=02220
+ange=029A4
+angle=02220
+angmsd=02221
+angmsdaa=029A8
+angmsdab=029A9
+angmsdac=029AA
+angmsdad=029AB
+angmsdae=029AC
+angmsdaf=029AD
+angmsdag=029AE
+angmsdah=029AF
+angrt=0221F
+angrtvb=022BE
+angrtvbd=0299D
+angsph=02222
+angst=000C5
+angzarr=0237C
+aogon=00105
+aopf=1D552
+ap=02248
+apE=02A70
+apacir=02A6F
+ape=0224A
+apid=0224B
+apos=00027
+approx=02248
+approxeq=0224A
+aring=000E5
+ascr=1D4B6
+ast=0002A
+asymp=02248
+asympeq=0224D
+atilde=000E3
+auml=000E4
+awconint=02233
+awint=02A11
+bNot=02AED
+backcong=0224C
+backepsilon=003F6
+backprime=02035
+backsim=0223D
+backsimeq=022CD
+barvee=022BD
+barwed=02305
+barwedge=02305
+bbrk=023B5
+bbrktbrk=023B6
+bcong=0224C
+bcy=00431
+bdquo=0201E
+becaus=02235
+because=02235
+bemptyv=029B0
+bepsi=003F6
+bernou=0212C
+beta=003B2
+beth=02136
+between=0226C
+bfr=1D51F
+bigcap=022C2
+bigcirc=025EF
+bigcup=022C3
+bigodot=02A00
+bigoplus=02A01
+bigotimes=02A02
+bigsqcup=02A06
+bigstar=02605
+bigtriangledown=025BD
+bigtriangleup=025B3
+biguplus=02A04
+bigvee=022C1
+bigwedge=022C0
+bkarow=0290D
+blacklozenge=029EB
+blacksquare=025AA
+blacktriangle=025B4
+blacktriangledown=025BE
+blacktriangleleft=025C2
+blacktriangleright=025B8
+blank=02423
+blk12=02592
+blk14=02591
+blk34=02593
+block=02588
+bnot=02310
+bopf=1D553
+bot=022A5
+bottom=022A5
+bowtie=022C8
+boxDL=02557
+boxDR=02554
+boxDl=02556
+boxDr=02553
+boxH=02550
+boxHD=02566
+boxHU=02569
+boxHd=02564
+boxHu=02567
+boxUL=0255D
+boxUR=0255A
+boxUl=0255C
+boxUr=02559
+boxV=02551
+boxVH=0256C
+boxVL=02563
+boxVR=02560
+boxVh=0256B
+boxVl=02562
+boxVr=0255F
+boxbox=029C9
+boxdL=02555
+boxdR=02552
+boxdl=02510
+boxdr=0250C
+boxh=02500
+boxhD=02565
+boxhU=02568
+boxhd=0252C
+boxhu=02534
+boxminus=0229F
+boxplus=0229E
+boxtimes=022A0
+boxuL=0255B
+boxuR=02558
+boxul=02518
+boxur=02514
+boxv=02502
+boxvH=0256A
+boxvL=02561
+boxvR=0255E
+boxvh=0253C
+boxvl=02524
+boxvr=0251C
+bprime=02035
+breve=002D8
+brvbar=000A6
+bscr=1D4B7
+bsemi=0204F
+bsim=0223D
+bsime=022CD
+bsol=0005C
+bsolb=029C5
+bsolhsub=027C8
+bull=02022
+bullet=02022
+bump=0224E
+bumpE=02AAE
+bumpe=0224F
+bumpeq=0224F
+cacute=00107
+cap=02229
+capand=02A44
+capbrcup=02A49
+capcap=02A4B
+capcup=02A47
+capdot=02A40
+caret=02041
+caron=002C7
+ccaps=02A4D
+ccaron=0010D
+ccedil=000E7
+ccirc=00109
+ccups=02A4C
+ccupssm=02A50
+cdot=0010B
+cedil=000B8
+cemptyv=029B2
+cent=000A2
+centerdot=000B7
+cfr=1D520
+chcy=00447
+check=02713
+checkmark=02713
+chi=003C7
+cir=025CB
+cirE=029C3
+circ=002C6
+circeq=02257
+circlearrowleft=021BA
+circlearrowright=021BB
+circledR=000AE
+circledS=024C8
+circledast=0229B
+circledcirc=0229A
+circleddash=0229D
+cire=02257
+cirfnint=02A10
+cirmid=02AEF
+cirscir=029C2
+clubs=02663
+clubsuit=02663
+colon=0003A
+colone=02254
+coloneq=02254
+comma=0002C
+commat=00040
+comp=02201
+compfn=02218
+complement=02201
+complexes=02102
+cong=02245
+congdot=02A6D
+conint=0222E
+copf=1D554
+coprod=02210
+copy=000A9
+copysr=02117
+crarr=021B5
+cross=02717
+cscr=1D4B8
+csub=02ACF
+csube=02AD1
+csup=02AD0
+csupe=02AD2
+ctdot=022EF
+cudarrl=02938
+cudarrr=02935
+cuepr=022DE
+cuesc=022DF
+cularr=021B6
+cularrp=0293D
+cup=0222A
+cupbrcap=02A48
+cupcap=02A46
+cupcup=02A4A
+cupdot=0228D
+cupor=02A45
+curarr=021B7
+curarrm=0293C
+curlyeqprec=022DE
+curlyeqsucc=022DF
+curlyvee=022CE
+curlywedge=022CF
+curren=000A4
+curvearrowleft=021B6
+curvearrowright=021B7
+cuvee=022CE
+cuwed=022CF
+cwconint=02232
+cwint=02231
+cylcty=0232D
+dArr=021D3
+dHar=02965
+dagger=02020
+daleth=02138
+darr=02193
+dash=02010
+dashv=022A3
+dbkarow=0290F
+dblac=002DD
+dcaron=0010F
+dcy=00434
+dd=02146
+ddagger=02021
+ddarr=021CA
+ddotseq=02A77
+deg=000B0
+delta=003B4
+demptyv=029B1
+dfisht=0297F
+dfr=1D521
+dharl=021C3
+dharr=021C2
+diam=022C4
+diamond=022C4
+diamondsuit=02666
+diams=02666
+die=000A8
+digamma=003DD
+disin=022F2
+div=000F7
+divide=000F7
+divideontimes=022C7
+divonx=022C7
+djcy=00452
+dlcorn=0231E
+dlcrop=0230D
+dollar=00024
+dopf=1D555
+dot=002D9
+doteq=02250
+doteqdot=02251
+dotminus=02238
+dotplus=02214
+dotsquare=022A1
+doublebarwedge=02306
+downarrow=02193
+downdownarrows=021CA
+downharpoonleft=021C3
+downharpoonright=021C2
+drbkarow=02910
+drcorn=0231F
+drcrop=0230C
+dscr=1D4B9
+dscy=00455
+dsol=029F6
+dstrok=00111
+dtdot=022F1
+dtri=025BF
+dtrif=025BE
+duarr=021F5
+duhar=0296F
+dwangle=029A6
+dzcy=0045F
+dzigrarr=027FF
+eDDot=02A77
+eDot=02251
+eacute=000E9
+easter=02A6E
+ecaron=0011B
+ecir=02256
+ecirc=000EA
+ecolon=02255
+ecy=0044D
+edot=00117
+ee=02147
+efDot=02252
+efr=1D522
+eg=02A9A
+egrave=000E8
+egs=02A96
+egsdot=02A98
+el=02A99
+elinters=023E7
+ell=02113
+els=02A95
+elsdot=02A97
+emacr=00113
+empty=02205
+emptyset=02205
+emptyv=02205
+emsp13=02004
+emsp14=02005
+emsp=02003
+eng=0014B
+ensp=02002
+eogon=00119
+eopf=1D556
+epar=022D5
+eparsl=029E3
+eplus=02A71
+epsi=003B5
+epsilon=003B5
+epsiv=003F5
+eqcirc=02256
+eqcolon=02255
+eqsim=02242
+eqslantgtr=02A96
+eqslantless=02A95
+equals=0003D
+equest=0225F
+equiv=02261
+equivDD=02A78
+eqvparsl=029E5
+erDot=02253
+erarr=02971
+escr=0212F
+esdot=02250
+esim=02242
+eta=003B7
+eth=000F0
+euml=000EB
+euro=020AC
+excl=00021
+exist=02203
+expectation=02130
+exponentiale=02147
+fallingdotseq=02252
+fcy=00444
+female=02640
+ffilig=0FB03
+fflig=0FB00
+ffllig=0FB04
+ffr=1D523
+filig=0FB01
+flat=0266D
+fllig=0FB02
+fltns=025B1
+fnof=00192
+fopf=1D557
+forall=02200
+fork=022D4
+forkv=02AD9
+fpartint=02A0D
+frac12=000BD
+frac13=02153
+frac14=000BC
+frac15=02155
+frac16=02159
+frac18=0215B
+frac23=02154
+frac25=02156
+frac34=000BE
+frac35=02157
+frac38=0215C
+frac45=02158
+frac56=0215A
+frac58=0215D
+frac78=0215E
+frasl=02044
+frown=02322
+fscr=1D4BB
+gE=02267
+gEl=02A8C
+gacute=001F5
+gamma=003B3
+gammad=003DD
+gap=02A86
+gbreve=0011F
+gcirc=0011D
+gcy=00433
+gdot=00121
+ge=02265
+gel=022DB
+geq=02265
+geqq=02267
+geqslant=02A7E
+ges=02A7E
+gescc=02AA9
+gesdot=02A80
+gesdoto=02A82
+gesdotol=02A84
+gesles=02A94
+gfr=1D524
+gg=0226B
+ggg=022D9
+gimel=02137
+gjcy=00453
+gl=02277
+glE=02A92
+gla=02AA5
+glj=02AA4
+gnE=02269
+gnap=02A8A
+gnapprox=02A8A
+gne=02A88
+gneq=02A88
+gneqq=02269
+gnsim=022E7
+gopf=1D558
+grave=00060
+gscr=0210A
+gsim=02273
+gsime=02A8E
+gsiml=02A90
+gt=0003E
+gtcc=02AA7
+gtcir=02A7A
+gtdot=022D7
+gtlPar=02995
+gtquest=02A7C
+gtrapprox=02A86
+gtrarr=02978
+gtrdot=022D7
+gtreqless=022DB
+gtreqqless=02A8C
+gtrless=02277
+gtrsim=02273
+hArr=021D4
+hairsp=0200A
+half=000BD
+hamilt=0210B
+hardcy=0044A
+harr=02194
+harrcir=02948
+harrw=021AD
+hbar=0210F
+hcirc=00125
+hearts=02665
+heartsuit=02665
+hellip=02026
+hercon=022B9
+hfr=1D525
+hksearow=02925
+hkswarow=02926
+hoarr=021FF
+homtht=0223B
+hookleftarrow=021A9
+hookrightarrow=021AA
+hopf=1D559
+horbar=02015
+hscr=1D4BD
+hslash=0210F
+hstrok=00127
+hybull=02043
+hyphen=02010
+iacute=000ED
+ic=02063
+icirc=000EE
+icy=00438
+iecy=00435
+iexcl=000A1
+iff=021D4
+ifr=1D526
+igrave=000EC
+ii=02148
+iiiint=02A0C
+iiint=0222D
+iinfin=029DC
+iiota=02129
+ijlig=00133
+imacr=0012B
+image=02111
+imagline=02110
+imagpart=02111
+imath=00131
+imof=022B7
+imped=001B5
+in=02208
+incare=02105
+infin=0221E
+infintie=029DD
+inodot=00131
+int=0222B
+intcal=022BA
+integers=02124
+intercal=022BA
+intlarhk=02A17
+intprod=02A3C
+iocy=00451
+iogon=0012F
+iopf=1D55A
+iota=003B9
+iprod=02A3C
+iquest=000BF
+iscr=1D4BE
+isin=02208
+isinE=022F9
+isindot=022F5
+isins=022F4
+isinsv=022F3
+isinv=02208
+it=02062
+itilde=00129
+iukcy=00456
+iuml=000EF
+jcirc=00135
+jcy=00439
+jfr=1D527
+jmath=00237
+jopf=1D55B
+jscr=1D4BF
+jsercy=00458
+jukcy=00454
+kappa=003BA
+kappav=003F0
+kcedil=00137
+kcy=0043A
+kfr=1D528
+kgreen=00138
+khcy=00445
+kjcy=0045C
+kopf=1D55C
+kscr=1D4C0
+lAarr=021DA
+lArr=021D0
+lAtail=0291B
+lBarr=0290E
+lE=02266
+lEg=02A8B
+lHar=02962
+lacute=0013A
+laemptyv=029B4
+lagran=02112
+lambda=003BB
+lang=027E8
+langd=02991
+langle=027E8
+lap=02A85
+laquo=000AB
+larr=02190
+larrb=021E4
+larrbfs=0291F
+larrfs=0291D
+larrhk=021A9
+larrlp=021AB
+larrpl=02939
+larrsim=02973
+larrtl=021A2
+lat=02AAB
+latail=02919
+late=02AAD
+lbarr=0290C
+lbbrk=02772
+lbrace=0007B
+lbrack=0005B
+lbrke=0298B
+lbrksld=0298F
+lbrkslu=0298D
+lcaron=0013E
+lcedil=0013C
+lceil=02308
+lcub=0007B
+lcy=0043B
+ldca=02936
+ldquo=0201C
+ldquor=0201E
+ldrdhar=02967
+ldrushar=0294B
+ldsh=021B2
+le=02264
+leftarrow=02190
+leftarrowtail=021A2
+leftharpoondown=021BD
+leftharpoonup=021BC
+leftleftarrows=021C7
+leftrightarrow=02194
+leftrightarrows=021C6
+leftrightharpoons=021CB
+leftrightsquigarrow=021AD
+leftthreetimes=022CB
+leg=022DA
+leq=02264
+leqq=02266
+leqslant=02A7D
+les=02A7D
+lescc=02AA8
+lesdot=02A7F
+lesdoto=02A81
+lesdotor=02A83
+lesges=02A93
+lessapprox=02A85
+lessdot=022D6
+lesseqgtr=022DA
+lesseqqgtr=02A8B
+lessgtr=02276
+lesssim=02272
+lfisht=0297C
+lfloor=0230A
+lfr=1D529
+lg=02276
+lgE=02A91
+lhard=021BD
+lharu=021BC
+lharul=0296A
+lhblk=02584
+ljcy=00459
+ll=0226A
+llarr=021C7
+llcorner=0231E
+llhard=0296B
+lltri=025FA
+lmidot=00140
+lmoust=023B0
+lmoustache=023B0
+lnE=02268
+lnap=02A89
+lnapprox=02A89
+lne=02A87
+lneq=02A87
+lneqq=02268
+lnsim=022E6
+loang=027EC
+loarr=021FD
+lobrk=027E6
+longleftarrow=027F5
+longleftrightarrow=027F7
+longmapsto=027FC
+longrightarrow=027F6
+looparrowleft=021AB
+looparrowright=021AC
+lopar=02985
+lopf=1D55D
+loplus=02A2D
+lotimes=02A34
+lowast=02217
+lowbar=0005F
+loz=025CA
+lozenge=025CA
+lozf=029EB
+lpar=00028
+lparlt=02993
+lrarr=021C6
+lrcorner=0231F
+lrhar=021CB
+lrhard=0296D
+lrm=0200E
+lrtri=022BF
+lsaquo=02039
+lscr=1D4C1
+lsh=021B0
+lsim=02272
+lsime=02A8D
+lsimg=02A8F
+lsqb=0005B
+lsquo=02018
+lsquor=0201A
+lstrok=00142
+lt=0003C
+ltcc=02AA6
+ltcir=02A79
+ltdot=022D6
+lthree=022CB
+ltimes=022C9
+ltlarr=02976
+ltquest=02A7B
+ltrPar=02996
+ltri=025C3
+ltrie=022B4
+ltrif=025C2
+lurdshar=0294A
+luruhar=02966
+mDDot=0223A
+macr=000AF
+male=02642
+malt=02720
+maltese=02720
+map=021A6
+mapsto=021A6
+mapstodown=021A7
+mapstoleft=021A4
+mapstoup=021A5
+marker=025AE
+mcomma=02A29
+mcy=0043C
+mdash=02014
+measuredangle=02221
+mfr=1D52A
+mho=02127
+micro=000B5
+mid=02223
+midast=0002A
+midcir=02AF0
+middot=000B7
+minus=02212
+minusb=0229F
+minusd=02238
+minusdu=02A2A
+mlcp=02ADB
+mldr=02026
+mnplus=02213
+models=022A7
+mopf=1D55E
+mp=02213
+mscr=1D4C2
+mstpos=0223E
+mu=003BC
+multimap=022B8
+mumap=022B8
+nLeftarrow=021CD
+nLeftrightarrow=021CE
+nRightarrow=021CF
+nVDash=022AF
+nVdash=022AE
+nabla=02207
+nacute=00144
+nap=02249
+napos=00149
+napprox=02249
+natur=0266E
+natural=0266E
+naturals=02115
+nbsp=000A0
+ncap=02A43
+ncaron=00148
+ncedil=00146
+ncong=02247
+ncup=02A42
+ncy=0043D
+ndash=02013
+ne=02260
+neArr=021D7
+nearhk=02924
+nearr=02197
+nearrow=02197
+nequiv=02262
+nesear=02928
+nexist=02204
+nexists=02204
+nfr=1D52B
+nge=02271
+ngeq=02271
+ngsim=02275
+ngt=0226F
+ngtr=0226F
+nhArr=021CE
+nharr=021AE
+nhpar=02AF2
+ni=0220B
+nis=022FC
+nisd=022FA
+niv=0220B
+njcy=0045A
+nlArr=021CD
+nlarr=0219A
+nldr=02025
+nle=02270
+nleftarrow=0219A
+nleftrightarrow=021AE
+nleq=02270
+nless=0226E
+nlsim=02274
+nlt=0226E
+nltri=022EA
+nltrie=022EC
+nmid=02224
+nopf=1D55F
+not=000AC
+notin=02209
+notinva=02209
+notinvb=022F7
+notinvc=022F6
+notni=0220C
+notniva=0220C
+notnivb=022FE
+notnivc=022FD
+npar=02226
+nparallel=02226
+npolint=02A14
+npr=02280
+nprcue=022E0
+nprec=02280
+nrArr=021CF
+nrarr=0219B
+nrightarrow=0219B
+nrtri=022EB
+nrtrie=022ED
+nsc=02281
+nsccue=022E1
+nscr=1D4C3
+nshortmid=02224
+nshortparallel=02226
+nsim=02241
+nsime=02244
+nsimeq=02244
+nsmid=02224
+nspar=02226
+nsqsube=022E2
+nsqsupe=022E3
+nsub=02284
+nsube=02288
+nsubseteq=02288
+nsucc=02281
+nsup=02285
+nsupe=02289
+nsupseteq=02289
+ntgl=02279
+ntilde=000F1
+ntlg=02278
+ntriangleleft=022EA
+ntrianglelefteq=022EC
+ntriangleright=022EB
+ntrianglerighteq=022ED
+nu=003BD
+num=00023
+numero=02116
+numsp=02007
+nvDash=022AD
+nvHarr=02904
+nvdash=022AC
+nvinfin=029DE
+nvlArr=02902
+nvrArr=02903
+nwArr=021D6
+nwarhk=02923
+nwarr=02196
+nwarrow=02196
+nwnear=02927
+oS=024C8
+oacute=000F3
+oast=0229B
+ocir=0229A
+ocirc=000F4
+ocy=0043E
+odash=0229D
+odblac=00151
+odiv=02A38
+odot=02299
+odsold=029BC
+oelig=00153
+ofcir=029BF
+ofr=1D52C
+ogon=002DB
+ograve=000F2
+ogt=029C1
+ohbar=029B5
+ohm=003A9
+oint=0222E
+olarr=021BA
+olcir=029BE
+olcross=029BB
+oline=0203E
+olt=029C0
+omacr=0014D
+omega=003C9
+omicron=003BF
+omid=029B6
+ominus=02296
+oopf=1D560
+opar=029B7
+operp=029B9
+oplus=02295
+or=02228
+orarr=021BB
+ord=02A5D
+order=02134
+orderof=02134
+ordf=000AA
+ordm=000BA
+origof=022B6
+oror=02A56
+orslope=02A57
+orv=02A5B
+oscr=02134
+oslash=000F8
+osol=02298
+otilde=000F5
+otimes=02297
+otimesas=02A36
+ouml=000F6
+ovbar=0233D
+par=02225
+para=000B6
+parallel=02225
+parsim=02AF3
+parsl=02AFD
+part=02202
+pcy=0043F
+percnt=00025
+period=0002E
+permil=02030
+perp=022A5
+pertenk=02031
+pfr=1D52D
+phi=003C6
+phiv=003D5
+phmmat=02133
+phone=0260E
+pi=003C0
+pitchfork=022D4
+piv=003D6
+planck=0210F
+planckh=0210E
+plankv=0210F
+plus=0002B
+plusacir=02A23
+plusb=0229E
+pluscir=02A22
+plusdo=02214
+plusdu=02A25
+pluse=02A72
+plusmn=000B1
+plussim=02A26
+plustwo=02A27
+pm=000B1
+pointint=02A15
+popf=1D561
+pound=000A3
+pr=0227A
+prE=02AB3
+prap=02AB7
+prcue=0227C
+pre=02AAF
+prec=0227A
+precapprox=02AB7
+preccurlyeq=0227C
+preceq=02AAF
+precnapprox=02AB9
+precneqq=02AB5
+precnsim=022E8
+precsim=0227E
+prime=02032
+primes=02119
+prnE=02AB5
+prnap=02AB9
+prnsim=022E8
+prod=0220F
+profalar=0232E
+profline=02312
+profsurf=02313
+prop=0221D
+propto=0221D
+prsim=0227E
+prurel=022B0
+pscr=1D4C5
+psi=003C8
+puncsp=02008
+qfr=1D52E
+qint=02A0C
+qopf=1D562
+qprime=02057
+qscr=1D4C6
+quaternions=0210D
+quatint=02A16
+quest=0003F
+questeq=0225F
+quot=00022
+rAarr=021DB
+rArr=021D2
+rAtail=0291C
+rBarr=0290F
+rHar=02964
+racute=00155
+radic=0221A
+raemptyv=029B3
+rang=027E9
+rangd=02992
+range=029A5
+rangle=027E9
+raquo=000BB
+rarr=02192
+rarrap=02975
+rarrb=021E5
+rarrbfs=02920
+rarrc=02933
+rarrfs=0291E
+rarrhk=021AA
+rarrlp=021AC
+rarrpl=02945
+rarrsim=02974
+rarrtl=021A3
+rarrw=0219D
+ratail=0291A
+ratio=02236
+rationals=0211A
+rbarr=0290D
+rbbrk=02773
+rbrace=0007D
+rbrack=0005D
+rbrke=0298C
+rbrksld=0298E
+rbrkslu=02990
+rcaron=00159
+rcedil=00157
+rceil=02309
+rcub=0007D
+rcy=00440
+rdca=02937
+rdldhar=02969
+rdquo=0201D
+rdquor=0201D
+rdsh=021B3
+real=0211C
+realine=0211B
+realpart=0211C
+reals=0211D
+rect=025AD
+reg=000AE
+rfisht=0297D
+rfloor=0230B
+rfr=1D52F
+rhard=021C1
+rharu=021C0
+rharul=0296C
+rho=003C1
+rhov=003F1
+rightarrow=02192
+rightarrowtail=021A3
+rightharpoondown=021C1
+rightharpoonup=021C0
+rightleftarrows=021C4
+rightleftharpoons=021CC
+rightrightarrows=021C9
+rightsquigarrow=0219D
+rightthreetimes=022CC
+ring=002DA
+risingdotseq=02253
+rlarr=021C4
+rlhar=021CC
+rlm=0200F
+rmoust=023B1
+rmoustache=023B1
+rnmid=02AEE
+roang=027ED
+roarr=021FE
+robrk=027E7
+ropar=02986
+ropf=1D563
+roplus=02A2E
+rotimes=02A35
+rpar=00029
+rpargt=02994
+rppolint=02A12
+rrarr=021C9
+rsaquo=0203A
+rscr=1D4C7
+rsh=021B1
+rsqb=0005D
+rsquo=02019
+rsquor=02019
+rthree=022CC
+rtimes=022CA
+rtri=025B9
+rtrie=022B5
+rtrif=025B8
+rtriltri=029CE
+ruluhar=02968
+rx=0211E
+sacute=0015B
+sbquo=0201A
+sc=0227B
+scE=02AB4
+scap=02AB8
+scaron=00161
+sccue=0227D
+sce=02AB0
+scedil=0015F
+scirc=0015D
+scnE=02AB6
+scnap=02ABA
+scnsim=022E9
+scpolint=02A13
+scsim=0227F
+scy=00441
+sdot=022C5
+sdotb=022A1
+sdote=02A66
+seArr=021D8
+searhk=02925
+searr=02198
+searrow=02198
+sect=000A7
+semi=0003B
+seswar=02929
+setminus=02216
+setmn=02216
+sext=02736
+sfr=1D530
+sfrown=02322
+sharp=0266F
+shchcy=00449
+shcy=00448
+shortmid=02223
+shortparallel=02225
+shy=000AD
+sigma=003C3
+sigmaf=003C2
+sigmav=003C2
+sim=0223C
+simdot=02A6A
+sime=02243
+simeq=02243
+simg=02A9E
+simgE=02AA0
+siml=02A9D
+simlE=02A9F
+simne=02246
+simplus=02A24
+simrarr=02972
+slarr=02190
+smallsetminus=02216
+smashp=02A33
+smeparsl=029E4
+smid=02223
+smile=02323
+smt=02AAA
+smte=02AAC
+softcy=0044C
+sol=0002F
+solb=029C4
+solbar=0233F
+sopf=1D564
+spades=02660
+spadesuit=02660
+spar=02225
+sqcap=02293
+sqcup=02294
+sqsub=0228F
+sqsube=02291
+sqsubset=0228F
+sqsubseteq=02291
+sqsup=02290
+sqsupe=02292
+sqsupset=02290
+sqsupseteq=02292
+squ=025A1
+square=025A1
+squarf=025AA
+squf=025AA
+srarr=02192
+sscr=1D4C8
+ssetmn=02216
+ssmile=02323
+sstarf=022C6
+star=02606
+starf=02605
+straightepsilon=003F5
+straightphi=003D5
+strns=000AF
+sub=02282
+subE=02AC5
+subdot=02ABD
+sube=02286
+subedot=02AC3
+submult=02AC1
+subnE=02ACB
+subne=0228A
+subplus=02ABF
+subrarr=02979
+subset=02282
+subseteq=02286
+subseteqq=02AC5
+subsetneq=0228A
+subsetneqq=02ACB
+subsim=02AC7
+subsub=02AD5
+subsup=02AD3
+succ=0227B
+succapprox=02AB8
+succcurlyeq=0227D
+succeq=02AB0
+succnapprox=02ABA
+succneqq=02AB6
+succnsim=022E9
+succsim=0227F
+sum=02211
+sung=0266A
+sup1=000B9
+sup2=000B2
+sup3=000B3
+sup=02283
+supE=02AC6
+supdot=02ABE
+supdsub=02AD8
+supe=02287
+supedot=02AC4
+suphsol=027C9
+suphsub=02AD7
+suplarr=0297B
+supmult=02AC2
+supnE=02ACC
+supne=0228B
+supplus=02AC0
+supset=02283
+supseteq=02287
+supseteqq=02AC6
+supsetneq=0228B
+supsetneqq=02ACC
+supsim=02AC8
+supsub=02AD4
+supsup=02AD6
+swArr=021D9
+swarhk=02926
+swarr=02199
+swarrow=02199
+swnwar=0292A
+szlig=000DF
+target=02316
+tau=003C4
+tbrk=023B4
+tcaron=00165
+tcedil=00163
+tcy=00442
+tdot=020DB
+telrec=02315
+tfr=1D531
+there4=02234
+therefore=02234
+theta=003B8
+thetasym=003D1
+thetav=003D1
+thickapprox=02248
+thicksim=0223C
+thinsp=02009
+thkap=02248
+thksim=0223C
+thorn=000FE
+tilde=002DC
+times=000D7
+timesb=022A0
+timesbar=02A31
+timesd=02A30
+tint=0222D
+toea=02928
+top=022A4
+topbot=02336
+topcir=02AF1
+topf=1D565
+topfork=02ADA
+tosa=02929
+tprime=02034
+trade=02122
+triangle=025B5
+triangledown=025BF
+triangleleft=025C3
+trianglelefteq=022B4
+triangleq=0225C
+triangleright=025B9
+trianglerighteq=022B5
+tridot=025EC
+trie=0225C
+triminus=02A3A
+triplus=02A39
+trisb=029CD
+tritime=02A3B
+trpezium=023E2
+tscr=1D4C9
+tscy=00446
+tshcy=0045B
+tstrok=00167
+twixt=0226C
+twoheadleftarrow=0219E
+twoheadrightarrow=021A0
+uArr=021D1
+uHar=02963
+uacute=000FA
+uarr=02191
+ubrcy=0045E
+ubreve=0016D
+ucirc=000FB
+ucy=00443
+udarr=021C5
+udblac=00171
+udhar=0296E
+ufisht=0297E
+ufr=1D532
+ugrave=000F9
+uharl=021BF
+uharr=021BE
+uhblk=02580
+ulcorn=0231C
+ulcorner=0231C
+ulcrop=0230F
+ultri=025F8
+umacr=0016B
+uml=000A8
+uogon=00173
+uopf=1D566
+uparrow=02191
+updownarrow=02195
+upharpoonleft=021BF
+upharpoonright=021BE
+uplus=0228E
+upsi=003C5
+upsih=003D2
+upsilon=003C5
+upuparrows=021C8
+urcorn=0231D
+urcorner=0231D
+urcrop=0230E
+uring=0016F
+urtri=025F9
+uscr=1D4CA
+utdot=022F0
+utilde=00169
+utri=025B5
+utrif=025B4
+uuarr=021C8
+uuml=000FC
+uwangle=029A7
+vArr=021D5
+vBar=02AE8
+vBarv=02AE9
+vDash=022A8
+vangrt=0299C
+varepsilon=003F5
+varkappa=003F0
+varnothing=02205
+varphi=003D5
+varpi=003D6
+varpropto=0221D
+varr=02195
+varrho=003F1
+varsigma=003C2
+vartheta=003D1
+vartriangleleft=022B2
+vartriangleright=022B3
+vcy=00432
+vdash=022A2
+vee=02228
+veebar=022BB
+veeeq=0225A
+vellip=022EE
+verbar=0007C
+vert=0007C
+vfr=1D533
+vltri=022B2
+vopf=1D567
+vprop=0221D
+vrtri=022B3
+vscr=1D4CB
+vzigzag=0299A
+wcirc=00175
+wedbar=02A5F
+wedge=02227
+wedgeq=02259
+weierp=02118
+wfr=1D534
+wopf=1D568
+wp=02118
+wr=02240
+wreath=02240
+wscr=1D4CC
+xcap=022C2
+xcirc=025EF
+xcup=022C3
+xdtri=025BD
+xfr=1D535
+xhArr=027FA
+xharr=027F7
+xi=003BE
+xlArr=027F8
+xlarr=027F5
+xmap=027FC
+xnis=022FB
+xodot=02A00
+xopf=1D569
+xoplus=02A01
+xotime=02A02
+xrArr=027F9
+xrarr=027F6
+xscr=1D4CD
+xsqcup=02A06
+xuplus=02A04
+xutri=025B3
+xvee=022C1
+xwedge=022C0
+yacute=000FD
+yacy=0044F
+ycirc=00177
+ycy=0044B
+yen=000A5
+yfr=1D536
+yicy=00457
+yopf=1D56A
+yscr=1D4CE
+yucy=0044E
+yuml=000FF
+zacute=0017A
+zcaron=0017E
+zcy=00437
+zdot=0017C
+zeetrf=02128
+zeta=003B6
+zfr=1D537
+zhcy=00436
+zigrarr=021DD
+zopf=1D56B
+zscr=1D4CF
+zwj=0200D
+zwnj=0200C
diff --git a/src/org/jsoup/nodes/package-info.java b/src/org/jsoup/nodes/package-info.java

new file mode 100644 (file)

index 0000000..24b1280
--- /dev/null
+++ b/src/org/jsoup/nodes/package-info.java
@@ -0,0 +1,4 @@
+/**
+ HTML document structure nodes.
+ */
+package org.jsoup.nodes;
+\ No newline at end of file
diff --git a/src/org/jsoup/package-info.java b/src/org/jsoup/package-info.java

new file mode 100644 (file)

index 0000000..4952611
--- /dev/null
+++ b/src/org/jsoup/package-info.java
@@ -0,0 +1,4 @@
+/**
+ Contains the main {@link org.jsoup.Jsoup} class, which provides convenient static access to the jsoup functionality. 
+ */
+package org.jsoup;
+\ No newline at end of file
diff --git a/src/org/jsoup/parser/CharacterReader.java b/src/org/jsoup/parser/CharacterReader.java

new file mode 100644 (file)

index 0000000..b549a57
--- /dev/null
+++ b/src/org/jsoup/parser/CharacterReader.java
@@ -0,0 +1,230 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.Validate;
+
+/**
+ CharacterReader consumes tokens off a string. To replace the old TokenQueue.
+ */
+class CharacterReader {
+    static final char EOF = (char) -1;
+
+    private final String input;
+    private final int length;
+    private int pos = 0;
+    private int mark = 0;
+
+    CharacterReader(String input) {
+        Validate.notNull(input);
+        input = input.replaceAll("\r\n?", "\n"); // normalise carriage returns to newlines
+
+        this.input = input;
+        this.length = input.length();
+    }
+
+    int pos() {
+        return pos;
+    }
+
+    boolean isEmpty() {
+        return pos >= length;
+    }
+
+    char current() {
+        return isEmpty() ? EOF : input.charAt(pos);
+    }
+
+    char consume() {
+        char val = isEmpty() ? EOF : input.charAt(pos);
+        pos++;
+        return val;
+    }
+
+    void unconsume() {
+        pos--;
+    }
+
+    void advance() {
+        pos++;
+    }
+
+    void mark() {
+        mark = pos;
+    }
+
+    void rewindToMark() {
+        pos = mark;
+    }
+
+    String consumeAsString() {
+        return input.substring(pos, pos++);
+    }
+
+    String consumeTo(char c) {
+        int offset = input.indexOf(c, pos);
+        if (offset != -1) {
+            String consumed = input.substring(pos, offset);
+            pos += consumed.length();
+            return consumed;
+        } else {
+            return consumeToEnd();
+        }
+    }
+
+    String consumeTo(String seq) {
+        int offset = input.indexOf(seq, pos);
+        if (offset != -1) {
+            String consumed = input.substring(pos, offset);
+            pos += consumed.length();
+            return consumed;
+        } else {
+            return consumeToEnd();
+        }
+    }
+
+    String consumeToAny(char... seq) {
+        int start = pos;
+
+        OUTER: while (!isEmpty()) {
+            char c = input.charAt(pos);
+            for (char seek : seq) {
+                if (seek == c)
+                    break OUTER;
+            }
+            pos++;
+        }
+
+        return pos > start ? input.substring(start, pos) : "";
+    }
+
+    String consumeToEnd() {
+        String data = input.substring(pos, input.length());
+        pos = input.length();
+        return data;
+    }
+
+    String consumeLetterSequence() {
+        int start = pos;
+        while (!isEmpty()) {
+            char c = input.charAt(pos);
+            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+                pos++;
+            else
+                break;
+        }
+
+        return input.substring(start, pos);
+    }
+
+    String consumeLetterThenDigitSequence() {
+        int start = pos;
+        while (!isEmpty()) {
+            char c = input.charAt(pos);
+            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+                pos++;
+            else
+                break;
+        }
+        while (!isEmpty()) {
+            char c = input.charAt(pos);
+            if (c >= '0' && c <= '9')
+                pos++;
+            else
+                break;
+        }
+
+        return input.substring(start, pos);
+    }
+
+    String consumeHexSequence() {
+        int start = pos;
+        while (!isEmpty()) {
+            char c = input.charAt(pos);
+            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
+                pos++;
+            else
+                break;
+        }
+        return input.substring(start, pos);
+    }
+
+    String consumeDigitSequence() {
+        int start = pos;
+        while (!isEmpty()) {
+            char c = input.charAt(pos);
+            if (c >= '0' && c <= '9')
+                pos++;
+            else
+                break;
+        }
+        return input.substring(start, pos);
+    }
+
+    boolean matches(char c) {
+        return !isEmpty() && input.charAt(pos) == c;
+
+    }
+
+    boolean matches(String seq) {
+        return input.startsWith(seq, pos);
+    }
+
+    boolean matchesIgnoreCase(String seq) {
+        return input.regionMatches(true, pos, seq, 0, seq.length());
+    }
+
+    boolean matchesAny(char... seq) {
+        if (isEmpty())
+            return false;
+
+        char c = input.charAt(pos);
+        for (char seek : seq) {
+            if (seek == c)
+                return true;
+        }
+        return false;
+    }
+
+    boolean matchesLetter() {
+        if (isEmpty())
+            return false;
+        char c = input.charAt(pos);
+        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+    }
+
+    boolean matchesDigit() {
+        if (isEmpty())
+            return false;
+        char c = input.charAt(pos);
+        return (c >= '0' && c <= '9');
+    }
+
+    boolean matchConsume(String seq) {
+        if (matches(seq)) {
+            pos += seq.length();
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    boolean matchConsumeIgnoreCase(String seq) {
+        if (matchesIgnoreCase(seq)) {
+            pos += seq.length();
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    boolean containsIgnoreCase(String seq) {
+        // used to check presence of </title>, </style>. only finds consistent case.
+        String loScan = seq.toLowerCase();
+        String hiScan = seq.toUpperCase();
+        return (input.indexOf(loScan, pos) > -1) || (input.indexOf(hiScan, pos) > -1);
+    }
+
+    @Override
+    public String toString() {
+        return input.substring(pos);
+    }
+}
diff --git a/src/org/jsoup/parser/HtmlTreeBuilder.java b/src/org/jsoup/parser/HtmlTreeBuilder.java

new file mode 100644 (file)

index 0000000..457a4c3
--- /dev/null
+++ b/src/org/jsoup/parser/HtmlTreeBuilder.java
@@ -0,0 +1,672 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.DescendableLinkedList;
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * HTML Tree Builder; creates a DOM from Tokens.
+ */
+class HtmlTreeBuilder extends TreeBuilder {
+
+    private HtmlTreeBuilderState state; // the current state
+    private HtmlTreeBuilderState originalState; // original / marked state
+
+    private boolean baseUriSetFromDoc = false;
+    private Element headElement; // the current head element
+    private Element formElement; // the current form element
+    private Element contextElement; // fragment parse context -- could be null even if fragment parsing
+    private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active (open) formatting elements
+    private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars in table to be shifted out
+
+    private boolean framesetOk = true; // if ok to go into frameset
+    private boolean fosterInserts = false; // if next inserts should be fostered
+    private boolean fragmentParsing = false; // if parsing a fragment of html
+
+    HtmlTreeBuilder() {}
+
+    /**
+     * Parses a complete document: puts the builder into the {@code Initial} state and then hands the
+     * work to the superclass implementation.
+     */
+    @Override
+    Document parse(String input, String baseUri, ParseErrorList errors) {
+        this.state = HtmlTreeBuilderState.Initial;
+        Document parsed = super.parse(input, baseUri, errors);
+        return parsed;
+    }
+
+    /**
+     * Parses an HTML fragment, optionally in the context of an existing element.
+     *
+     * @param inputFragment the fragment markup
+     * @param context the element the fragment is parsed within; may be null
+     * @param baseUri base URI used for created nodes
+     * @param errors collector for parse errors
+     * @return the child nodes of the synthetic {@code html} root when a context is supplied,
+     *         otherwise the child nodes of the document
+     */
+    List<Node> parseFragment(String inputFragment, Element context, String baseUri, ParseErrorList errors) {
+        state = HtmlTreeBuilderState.Initial;
+        initialiseParse(inputFragment, baseUri, errors);
+        contextElement = context;
+        fragmentParsing = true;
+
+        // no context: parse straight into the document
+        if (context == null) {
+            runParser();
+            return doc.childNodes();
+        }
+
+        // quirks setup: inherit from the context's owner document, if it has one
+        if (context.ownerDocument() != null)
+            doc.quirksMode(context.ownerDocument().quirksMode());
+
+        // initialise the tokeniser state from the context tag
+        String contextTag = context.tagName();
+        if (StringUtil.in(contextTag, "title", "textarea")) {
+            tokeniser.transition(TokeniserState.Rcdata);
+        } else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp")) {
+            tokeniser.transition(TokeniserState.Rawtext);
+        } else if (contextTag.equals("script")) {
+            tokeniser.transition(TokeniserState.ScriptData);
+        } else {
+            // covers "noscript" (would be rawtext if scripting were enabled), "plaintext", and the default
+            tokeniser.transition(TokeniserState.Data);
+        }
+
+        Element root = new Element(Tag.valueOf("html"), baseUri);
+        doc.appendChild(root);
+        stack.push(root);
+        resetInsertionMode();
+        // todo: setup form element to nearest form on context (up ancestor chain)
+
+        runParser();
+        return root.childNodes();
+    }
+
+    /** Records the token as current and dispatches it to the builder's present state. */
+    @Override
+    protected boolean process(Token token) {
+        this.currentToken = token;
+        HtmlTreeBuilderState active = this.state;
+        return active.process(token, this);
+    }
+
+    /**
+     * Records the token as current and dispatches it to the given state rather than the builder's
+     * own; the builder's state field is not changed by this call itself.
+     */
+    boolean process(Token token, HtmlTreeBuilderState state) {
+        this.currentToken = token;
+        boolean handled = state.process(token, this);
+        return handled;
+    }
+
+    /** Switches the builder to the given state. */
+    void transition(HtmlTreeBuilderState state) {
+        this.state = state;
+    }
+
+    /** @return the state the builder is currently in */
+    HtmlTreeBuilderState state() {
+        return this.state;
+    }
+
+    /** Saves the current state so it can be read back through {@code originalState()}. */
+    void markInsertionMode() {
+        this.originalState = this.state;
+    }
+
+    /** @return the state saved by the last {@code markInsertionMode()} call (null if never marked) */
+    HtmlTreeBuilderState originalState() {
+        return this.originalState;
+    }
+
+    /** Sets the flag recording whether it is still ok to go into a frameset. */
+    void framesetOk(boolean framesetOk) {
+        this.framesetOk = framesetOk;
+    }
+
+    /** @return whether it is still ok to go into a frameset */
+    boolean framesetOk() {
+        return this.framesetOk;
+    }
+
+    /** @return the document being built */
+    Document getDocument() {
+        return this.doc;
+    }
+
+    /** @return the base URI currently in effect for this parse */
+    String getBaseUri() {
+        return this.baseUri;
+    }
+
+    /**
+     * Adopts the absolute {@code href} of a {@code <base>} element as the base URI, once per parse.
+     * Later base elements, and base elements without a resolvable href, are ignored.
+     *
+     * @param base the base element encountered
+     */
+    void maybeSetBaseUri(Element base) {
+        if (!baseUriSetFromDoc) { // only listen to the first <base href> in parse
+            String href = base.absUrl("href");
+            boolean hasHref = href.length() != 0; // ignore <base target> etc
+            if (hasHref) {
+                baseUri = href;
+                baseUriSetFromDoc = true;
+                // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
+                doc.setBaseUri(href);
+            }
+        }
+    }
+
+    /** @return true once {@code parseFragment} has been invoked on this builder */
+    boolean isFragmentParsing() {
+        return this.fragmentParsing;
+    }
+
+    /**
+     * Records an "unexpected token" parse error at the reader's current position, provided the
+     * error list still accepts entries.
+     *
+     * @param state the state in which the unexpected token was seen
+     */
+    void error(HtmlTreeBuilderState state) {
+        if (!errors.canAddError())
+            return;
+        ParseError problem = new ParseError(reader.pos(), "Unexpected token [%s] when in state [%s]", currentToken.tokenType(), state);
+        errors.add(problem);
+    }
+
+    /**
+     * Creates an element for the start tag, inserts it and pushes it onto the stack. A self-closing
+     * tag with an unknown name is instead inserted as an empty element and followed by a synthetic
+     * end tag.
+     *
+     * @param startTag the start tag token
+     * @return the inserted element
+     */
+    Element insert(Token.StartTag startTag) {
+        // handle empty unknown tags
+        // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate fake end tag.
+        boolean unknownSelfClosing = startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name());
+        if (!unknownSelfClosing) {
+            Element created = new Element(Tag.valueOf(startTag.name()), baseUri, startTag.attributes);
+            insert(created);
+            return created;
+        }
+
+        Element empty = insertEmpty(startTag);
+        process(new Token.EndTag(empty.tagName())); // ensure we get out of whatever state we are in
+        return empty;
+    }
+
+    /**
+     * Creates an attribute-less element with the given tag name, inserts it and pushes it onto the
+     * stack.
+     *
+     * @return the inserted element
+     */
+    Element insert(String startTagName) {
+        Tag tag = Tag.valueOf(startTagName);
+        Element created = new Element(tag, baseUri);
+        insert(created);
+        return created;
+    }
+
+    /** Places the element in the tree via {@code insertNode} and then appends it to the stack. */
+    void insert(Element el) {
+        this.insertNode(el);
+        this.stack.add(el);
+    }
+
+    /**
+     * Inserts an element for the start tag without pushing it onto the stack. If the tag was written
+     * self-closing, the tokeniser's self-closing flag is acknowledged, and an unknown tag is marked as
+     * self-closing so that it is remembered for output.
+     *
+     * @param startTag the start tag token
+     * @return the inserted element
+     */
+    Element insertEmpty(Token.StartTag startTag) {
+        Tag tag = Tag.valueOf(startTag.name());
+        Element created = new Element(tag, baseUri, startTag.attributes);
+        insertNode(created);
+
+        if (!startTag.isSelfClosing())
+            return created;
+
+        tokeniser.acknowledgeSelfClosingFlag();
+        boolean known = tag.isKnownTag();
+        if (!known) // unknown tag, remember this is self closing for output
+            tag.setSelfClosing();
+        return created;
+    }
+
+    /** Builds a comment node from the token's data and inserts it via {@code insertNode}. */
+    void insert(Token.Comment commentToken) {
+        String data = commentToken.getData();
+        insertNode(new Comment(data, baseUri));
+    }
+
+    /**
+     * Appends the token's characters to the current element: as a data node when the current element
+     * is {@code script} or {@code style}, otherwise as a text node.
+     */
+    void insert(Token.Character characterToken) {
+        // characters in script and style go in as datanodes, not text nodes
+        boolean rawData = StringUtil.in(currentElement().tagName(), "script", "style");
+        Node node;
+        if (rawData) {
+            node = new DataNode(characterToken.getData(), baseUri);
+        } else {
+            node = new TextNode(characterToken.getData(), baseUri);
+        }
+        // doesn't use insertNode, because we don't foster these; and will always have a stack.
+        currentElement().appendChild(node);
+    }
+
+    /**
+     * Attaches a node to the tree: to the document while the stack is empty, to the foster parent
+     * when foster inserts are active, and to the current element otherwise.
+     */
+    private void insertNode(Node node) {
+        // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
+        if (stack.size() == 0) {
+            doc.appendChild(node);
+            return;
+        }
+        if (isFosterInserts()) {
+            insertInFosterParent(node);
+            return;
+        }
+        currentElement().appendChild(node);
+    }
+
+    /**
+     * Removes and returns the last element of the stack. Two development-time sanity checks run
+     * first: a {@code td} may only be popped while in the {@code InCell} state, and {@code html} is
+     * never expected to be popped.
+     */
+    Element pop() {
+        // todo - dev, remove validation check
+        boolean poppingTd = stack.peekLast().nodeName().equals("td");
+        if (poppingTd && !state.name().equals("InCell")) {
+            Validate.isFalse(true, "pop td not in cell");
+        }
+        boolean poppingHtml = stack.peekLast().nodeName().equals("html");
+        if (poppingHtml) {
+            Validate.isFalse(true, "popping html!");
+        }
+        return stack.pollLast();
+    }
+
+    /** Appends the element to the end of the stack without touching the tree. */
+    void push(Element element) {
+        this.stack.add(element);
+    }
+
+    /** @return the live stack of open elements (not a copy) */
+    DescendableLinkedList<Element> getStack() {
+        return this.stack;
+    }
+
+    /** @return true if this exact element instance is on the stack */
+    boolean onStack(Element el) {
+        boolean present = isElementInQueue(stack, el);
+        return present;
+    }
+
+    /**
+     * Looks for the element in the queue by reference identity ({@code ==}), walking the queue's
+     * descending iterator.
+     */
+    private boolean isElementInQueue(DescendableLinkedList<Element> queue, Element element) {
+        for (Iterator<Element> walker = queue.descendingIterator(); walker.hasNext(); ) {
+            if (walker.next() == element)
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Finds the first element with the given node name, walking the stack's descending iterator.
+     *
+     * @return the matching element, or null if none is on the stack
+     */
+    Element getFromStack(String elName) {
+        for (Iterator<Element> walker = stack.descendingIterator(); walker.hasNext(); ) {
+            Element candidate = walker.next();
+            if (candidate.nodeName().equals(elName))
+                return candidate;
+        }
+        return null;
+    }
+
+    /**
+     * Removes this exact element instance from the stack, leaving all other entries in place.
+     *
+     * @return true if the element was found and removed
+     */
+    boolean removeFromStack(Element el) {
+        for (Iterator<Element> walker = stack.descendingIterator(); walker.hasNext(); ) {
+            if (walker.next() != el)
+                continue;
+            walker.remove();
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Removes entries along the stack's descending iterator up to and including the first one named
+     * {@code elName}. If no entry has that name, every entry is removed.
+     */
+    void popStackToClose(String elName) {
+        Iterator<Element> walker = stack.descendingIterator();
+        boolean closed = false;
+        while (!closed && walker.hasNext()) {
+            Element candidate = walker.next();
+            closed = candidate.nodeName().equals(elName);
+            walker.remove(); // the matching element is removed as well
+        }
+    }
+
+    /**
+     * Removes entries along the stack's descending iterator up to and including the first one whose
+     * name is any of {@code elNames}. If no entry matches, every entry is removed.
+     */
+    void popStackToClose(String... elNames) {
+        Iterator<Element> walker = stack.descendingIterator();
+        boolean closed = false;
+        while (!closed && walker.hasNext()) {
+            Element candidate = walker.next();
+            closed = StringUtil.in(candidate.nodeName(), elNames);
+            walker.remove(); // the matching element is removed as well
+        }
+    }
+
+    /**
+     * Removes entries along the stack's descending iterator until one named {@code elName} is
+     * reached; that entry is kept. If no entry has that name, every entry is removed.
+     */
+    void popStackToBefore(String elName) {
+        for (Iterator<Element> walker = stack.descendingIterator(); walker.hasNext(); ) {
+            Element candidate = walker.next();
+            if (candidate.nodeName().equals(elName))
+                return;
+            walker.remove();
+        }
+    }
+
+    /** Clears the stack back to the nearest {@code table} (or {@code html}) entry, which is kept. */
+    void clearStackToTableContext() {
+        this.clearStackToContext("table");
+    }
+
+    /**
+     * Clears the stack back to the nearest {@code tbody}, {@code tfoot} or {@code thead} (or
+     * {@code html}) entry, which is kept.
+     */
+    void clearStackToTableBodyContext() {
+        this.clearStackToContext("tbody", "tfoot", "thead");
+    }
+
+    /** Clears the stack back to the nearest {@code tr} (or {@code html}) entry, which is kept. */
+    void clearStackToTableRowContext() {
+        this.clearStackToContext("tr");
+    }
+
+    /**
+     * Removes entries along the stack's descending iterator until one is reached whose name is in
+     * {@code nodeNames} or is {@code html}; that entry is kept.
+     */
+    private void clearStackToContext(String... nodeNames) {
+        for (Iterator<Element> walker = stack.descendingIterator(); walker.hasNext(); ) {
+            Element candidate = walker.next();
+            boolean boundary = StringUtil.in(candidate.nodeName(), nodeNames) || candidate.nodeName().equals("html");
+            if (boundary)
+                return;
+            walker.remove();
+        }
+    }
+
+    /**
+     * Returns the entry that follows {@code el} in the stack's descending iteration order.
+     *
+     * @param el an element expected to be on the stack (checked by assertion only)
+     * @return the following entry, or null if {@code el} is not found
+     */
+    Element aboveOnStack(Element el) {
+        assert onStack(el);
+        for (Iterator<Element> walker = stack.descendingIterator(); walker.hasNext(); ) {
+            if (walker.next() == el)
+                return walker.next();
+        }
+        return null;
+    }
+
+    /**
+     * Inserts {@code in} into the stack at the index directly after the last occurrence of
+     * {@code after}, which must be on the stack.
+     */
+    void insertOnStackAfter(Element after, Element in) {
+        int anchor = stack.lastIndexOf(after);
+        Validate.isTrue(anchor != -1);
+        int target = anchor + 1;
+        stack.add(target, in);
+    }
+
+    /** Swaps {@code out} for {@code in} on the stack, keeping the same index. */
+    void replaceOnStack(Element out, Element in) {
+        this.replaceInQueue(this.stack, out, in);
+    }
+
+    /**
+     * Replaces the last occurrence of {@code out} in the queue with {@code in} at the same index.
+     * {@code out} must be present.
+     */
+    private void replaceInQueue(LinkedList<Element> queue, Element out, Element in) {
+        int slot = queue.lastIndexOf(out);
+        Validate.isTrue(slot != -1);
+        // remove then re-insert at the same index, so the replacement takes the old position
+        queue.remove(slot);
+        queue.add(slot, in);
+    }
+
+    /**
+     * Picks the builder state that matches the current stack of open elements, following the HTML
+     * "reset the insertion mode appropriately" steps.
+     * <p>
+     * The stack is walked via its descending iterator. On reaching the final entry the walk is marked
+     * as {@code last} and, when a fragment context element is set, that context element is examined in
+     * place of the stack entry. Without a context element (normal document parse) the stack entry
+     * itself is examined, which avoids dereferencing a null context.
+     * <p>
+     * Both {@code td} and {@code th} select {@code InCell}, and only when the node is not the last
+     * one; a cell used as the fragment context falls through to the {@code last} fallback
+     * ({@code InBody}).
+     */
+    void resetInsertionMode() {
+        boolean last = false;
+        Iterator<Element> it = stack.descendingIterator();
+        while (it.hasNext()) {
+            Element node = it.next();
+            if (!it.hasNext()) {
+                last = true;
+                if (contextElement != null) // only substitute the context in the fragment case
+                    node = contextElement;
+            }
+            String name = node.nodeName();
+            if ("select".equals(name)) {
+                transition(HtmlTreeBuilderState.InSelect);
+                break; // frag
+            } else if (("td".equals(name) || "th".equals(name)) && !last) {
+                transition(HtmlTreeBuilderState.InCell);
+                break;
+            } else if ("tr".equals(name)) {
+                transition(HtmlTreeBuilderState.InRow);
+                break;
+            } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) {
+                transition(HtmlTreeBuilderState.InTableBody);
+                break;
+            } else if ("caption".equals(name)) {
+                transition(HtmlTreeBuilderState.InCaption);
+                break;
+            } else if ("colgroup".equals(name)) {
+                transition(HtmlTreeBuilderState.InColumnGroup);
+                break; // frag
+            } else if ("table".equals(name)) {
+                transition(HtmlTreeBuilderState.InTable);
+                break;
+            } else if ("head".equals(name)) {
+                transition(HtmlTreeBuilderState.InBody);
+                break; // frag
+            } else if ("body".equals(name)) {
+                transition(HtmlTreeBuilderState.InBody);
+                break;
+            } else if ("frameset".equals(name)) {
+                transition(HtmlTreeBuilderState.InFrameset);
+                break; // frag
+            } else if ("html".equals(name)) {
+                transition(HtmlTreeBuilderState.BeforeHead);
+                break; // frag
+            } else if (last) {
+                transition(HtmlTreeBuilderState.InBody);
+                break; // frag
+            }
+        }
+    }
+
+    // todo: tidy up in specific scope methods
+    private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) {
+        return inSpecificScope(new String[]{targetName}, baseTypes, extraTypes);
+    }
+
+    private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) {
+        Iterator<Element> it = stack.descendingIterator();
+        while (it.hasNext()) {
+            Element el = it.next();
+            String elName = el.nodeName();
+            if (StringUtil.in(elName, targetNames))
+                return true;
+            if (StringUtil.in(elName, baseTypes))
+                return false;
+            if (extraTypes != null && StringUtil.in(elName, extraTypes))
+                return false;
+        }
+        Validate.fail("Should not be reachable");
+        return false;
+    }
+
+    boolean inScope(String[] targetNames) {
+        return inSpecificScope(targetNames, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}, null);
+    }
+
+    boolean inScope(String targetName) {
+        return inScope(targetName, null);
+    }
+
+    boolean inScope(String targetName, String[] extras) {
+        return inSpecificScope(targetName, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}, extras);
+        // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml
+        // todo: in svg namespace: forignOjbect, desc, title
+    }
+
+    boolean inListItemScope(String targetName) {
+        return inScope(targetName, new String[]{"ol", "ul"});
+    }
+
+    boolean inButtonScope(String targetName) {
+        return inScope(targetName, new String[]{"button"});
+    }
+
+    boolean inTableScope(String targetName) {
+        return inSpecificScope(targetName, new String[]{"html", "table"}, null);
+    }
+
+    boolean inSelectScope(String targetName) {
+        Iterator<Element> it = stack.descendingIterator();
+        while (it.hasNext()) {
+            Element el = it.next();
+            String elName = el.nodeName();
+            if (elName.equals(targetName))
+                return true;
+            if (!StringUtil.in(elName, "optgroup", "option")) // all elements except
+                return false;
+        }
+        Validate.fail("Should not be reachable");
+        return false;
+    }
+
+    void setHeadElement(Element headElement) {
+        this.headElement = headElement;
+    }
+
+    /**
+     * @return the head element pointer as last set via {@link #setHeadElement(Element)}
+     */
+    Element getHeadElement() {
+        return this.headElement;
+    }
+
+    /**
+     * @return the current value of the foster-inserts flag
+     */
+    boolean isFosterInserts() {
+        return this.fosterInserts;
+    }
+
+    void setFosterInserts(boolean fosterInserts) {
+        this.fosterInserts = fosterInserts;
+    }
+
+    /**
+     * @return the form element pointer as last set via {@link #setFormElement(Element)}
+     */
+    Element getFormElement() {
+        return this.formElement;
+    }
+
+    void setFormElement(Element formElement) {
+        this.formElement = formElement;
+    }
+
+    void newPendingTableCharacters() {
+        pendingTableCharacters = new ArrayList<Token.Character>();
+    }
+
+    /**
+     * @return the current list of pending table character tokens (the live list, not a copy)
+     */
+    List<Token.Character> getPendingTableCharacters() {
+        return this.pendingTableCharacters;
+    }
+
+    void setPendingTableCharacters(List<Token.Character> pendingTableCharacters) {
+        this.pendingTableCharacters = pendingTableCharacters;
+    }
+
+    /**
+     11.2.5.2 Closing elements that have implied end tags<p/>
+     When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a
+     dt element, an li element, an option element, an optgroup element, a p element, an rp element, or an rt element,
+     the UA must pop the current node off the stack of open elements.
+
+     @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the
+     process, then the UA must perform the above steps as if that element was not in the above list. May be
+     {@code null}, in which case no element is excluded.
+     */
+    void generateImpliedEndTags(String excludeTag) {
+        // A null excludeTag means "exclude nothing". It must not short-circuit the loop, otherwise the no-argument
+        // overload would never pop any element.
+        while ((excludeTag == null || !currentElement().nodeName().equals(excludeTag)) &&
+                StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
+            pop();
+    }
+
+    void generateImpliedEndTags() {
+        generateImpliedEndTags(null);
+    }
+
+    boolean isSpecial(Element el) {
+        // todo: mathml's mi, mo, mn
+        // todo: svg's foreigObject, desc, title
+        String name = el.nodeName();
+        return StringUtil.in(name, "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound",
+                "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd",
+                "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form",
+                "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
+                "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav",
+                "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
+                "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
+                "title", "tr", "ul", "wbr", "xmp");
+    }
+
+    // active formatting elements
+    void pushActiveFormattingElements(Element in) {
+        int numSeen = 0;
+        Iterator<Element> iter = formattingElements.descendingIterator();
+        while (iter.hasNext()) {
+            Element el =  iter.next();
+            if (el == null) // marker
+                break;
+
+            if (isSameFormattingElement(in, el))
+                numSeen++;
+
+            if (numSeen == 3) {
+                iter.remove();
+                break;
+            }
+        }
+        formattingElements.add(in);
+    }
+
+    /**
+     * Compares two formatting elements by node name and attributes. Namespaces are not compared yet.
+     *
+     * @param a first element
+     * @param b second element
+     * @return {@code true} if both the node names and the attributes are equal
+     */
+    private boolean isSameFormattingElement(Element a, Element b) {
+        // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children
+        if (!a.nodeName().equals(b.nodeName()))
+            return false;
+        // todo: namespaces -- a.namespace().equals(b.namespace())
+        return a.attributes().equals(b.attributes());
+    }
+
+    void reconstructFormattingElements() {
+        int size = formattingElements.size();
+        if (size == 0 || formattingElements.getLast() == null || onStack(formattingElements.getLast()))
+            return;
+
+        Element entry = formattingElements.getLast();
+        int pos = size - 1;
+        boolean skip = false;
+        while (true) {
+            if (pos == 0) { // step 4. if none before, skip to 8
+                skip = true;
+                break;
+            }
+            entry = formattingElements.get(--pos); // step 5. one earlier than entry
+            if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack
+                break; // jump to 8, else continue back to 4
+        }
+        while(true) {
+            if (!skip) // step 7: on later than entry
+                entry = formattingElements.get(++pos);
+            Validate.notNull(entry); // should not occur, as we break at last element
+
+            // 8. create new element from element, 9 insert into current node, onto stack
+            skip = false; // can only skip increment from 4.
+            Element newEl = insert(entry.nodeName()); // todo: avoid fostering here?
+            // newEl.namespace(entry.namespace()); // todo: namespaces
+            newEl.attributes().addAll(entry.attributes());
+
+            // 10. replace entry with new entry
+            formattingElements.add(pos, newEl);
+            formattingElements.remove(pos + 1);
+
+            // 11
+            if (pos == size-1) // if not last entry in list, jump to 7
+                break;
+        }
+    }
+
+    void clearFormattingElementsToLastMarker() {
+        while (!formattingElements.isEmpty()) {
+            Element el = formattingElements.peekLast();
+            formattingElements.removeLast();
+            if (el == null)
+                break;
+        }
+    }
+
+    /**
+     * Removes the last occurrence of the given element (compared by identity) from the active formatting elements
+     * list. Does nothing if the element is not in the list.
+     *
+     * @param el element to remove
+     */
+    void removeFromActiveFormattingElements(Element el) {
+        for (Iterator<Element> entries = formattingElements.descendingIterator(); entries.hasNext(); ) {
+            if (entries.next() != el)
+                continue;
+            entries.remove();
+            return;
+        }
+    }
+
+    boolean isInActiveFormattingElements(Element el) {
+        return isElementInQueue(formattingElements, el);
+    }
+
+    Element getActiveFormattingElement(String nodeName) {
+        Iterator<Element> it = formattingElements.descendingIterator();
+        while (it.hasNext()) {
+            Element next = it.next();
+            if (next == null) // scope marker
+                break;
+            else if (next.nodeName().equals(nodeName))
+                return next;
+        }
+        return null;
+    }
+
+    void replaceActiveFormattingElement(Element out, Element in) {
+        replaceInQueue(formattingElements, out, in);
+    }
+
+    void insertMarkerToFormattingElements() {
+        formattingElements.add(null);
+    }
+
+    void insertInFosterParent(Node in) {
+        Element fosterParent = null;
+        Element lastTable = getFromStack("table");
+        boolean isLastTableParent = false;
+        if (lastTable != null) {
+            if (lastTable.parent() != null) {
+                fosterParent = lastTable.parent();
+                isLastTableParent = true;
+            } else
+                fosterParent = aboveOnStack(lastTable);
+        } else { // no table == frag
+            fosterParent = stack.get(0);
+        }
+
+        if (isLastTableParent) {
+            Validate.notNull(lastTable); // last table cannot be null by this point.
+            lastTable.before(in);
+        }
+        else
+            fosterParent.appendChild(in);
+    }
+
+    @Override
+    public String toString() {
+        return "TreeBuilder{" +
+                "currentToken=" + currentToken +
+                ", state=" + state +
+                ", currentElement=" + currentElement() +
+                '}';
+    }
+}
diff --git a/src/org/jsoup/parser/HtmlTreeBuilderState.java b/src/org/jsoup/parser/HtmlTreeBuilderState.java

new file mode 100644 (file)

index 0000000..ceab9fa
--- /dev/null
+++ b/src/org/jsoup/parser/HtmlTreeBuilderState.java
@@ -0,0 +1,1482 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.DescendableLinkedList;
+import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.*;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+
+/**
+ * The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states.
+ */
+enum HtmlTreeBuilderState {
+    /**
+     * Initial state: whitespace is ignored, comments are inserted, a doctype is appended to the document (switching
+     * it to quirks mode when the token forces quirks), and any other token is re-processed in BeforeHtml.
+     */
+    Initial {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (isWhitespace(t))
+                return true; // ignore whitespace
+
+            if (t.isComment()) {
+                tb.insert(t.asComment());
+                return true;
+            }
+
+            if (!t.isDoctype()) {
+                // todo: check not iframe srcdoc
+                tb.transition(BeforeHtml);
+                return tb.process(t); // re-process token
+            }
+
+            // todo: parse error check on expected doctypes
+            // todo: quirk state check on doctype ids
+            Token.Doctype d = t.asDoctype();
+            DocumentType doctype = new DocumentType(
+                    d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri());
+            tb.getDocument().appendChild(doctype);
+            if (d.isForceQuirks())
+                tb.getDocument().quirksMode(Document.QuirksMode.quirks);
+            tb.transition(BeforeHtml);
+            return true;
+        }
+    },
+    /**
+     * Before the html element: doctypes and most end tags are errors, comments are inserted, whitespace is
+     * ignored, an html start tag is inserted, and everything else implies an html element.
+     */
+    BeforeHtml {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (t.isDoctype()) {
+                tb.error(this);
+                return false;
+            }
+            if (t.isComment()) {
+                tb.insert(t.asComment());
+                return true;
+            }
+            if (isWhitespace(t))
+                return true; // ignore whitespace
+
+            if (t.isStartTag() && t.asStartTag().name().equals("html")) {
+                tb.insert(t.asStartTag());
+                tb.transition(BeforeHead);
+                return true;
+            }
+
+            // only these end tags are allowed through; any other end tag is a parse error
+            if (t.isEndTag() && !StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br")) {
+                tb.error(this);
+                return false;
+            }
+            return anythingElse(t, tb);
+        }
+
+        // Inserts an implied html element, moves on to BeforeHead and re-processes the token there.
+        private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
+            tb.insert("html");
+            tb.transition(BeforeHead);
+            return tb.process(t);
+        }
+    },
+    /**
+     * Before the head element: whitespace is ignored, comments are inserted, an html start tag is handled by
+     * InBody, a head start tag is inserted and recorded, and anything else (apart from disallowed end tags) implies
+     * a head start tag before being re-processed.
+     */
+    BeforeHead {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (isWhitespace(t))
+                return true;
+
+            if (t.isComment()) {
+                tb.insert(t.asComment());
+                return true;
+            }
+            if (t.isDoctype()) {
+                tb.error(this);
+                return false;
+            }
+            if (t.isStartTag() && t.asStartTag().name().equals("html"))
+                return InBody.process(t, tb); // does not transition
+
+            if (t.isStartTag() && t.asStartTag().name().equals("head")) {
+                Element head = tb.insert(t.asStartTag());
+                tb.setHeadElement(head);
+                tb.transition(InHead);
+                return true;
+            }
+
+            // only these end tags are allowed through; any other end tag is a parse error
+            if (t.isEndTag() && !StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br")) {
+                tb.error(this);
+                return false;
+            }
+
+            // act as if a head start tag had been seen, then re-process the token
+            tb.process(new Token.StartTag("head"));
+            return tb.process(t);
+        }
+    },
+    /**
+     * Inside the head element: inserts whitespace, comments and the head-level elements; hands html start tags to
+     * InBody; switches the tokeniser for script content; and closes the head for anything that does not belong in
+     * it before re-processing the token.
+     */
+    InHead {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (isWhitespace(t)) {
+                tb.insert(t.asCharacter());
+                return true;
+            }
+            switch (t.type) {
+                case Comment:
+                    tb.insert(t.asComment());
+                    break;
+                case Doctype:
+                    tb.error(this);
+                    return false;
+                case StartTag:
+                    Token.StartTag start = t.asStartTag();
+                    String name = start.name();
+                    if (name.equals("html")) {
+                        return InBody.process(t, tb);
+                    } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) {
+                        Element el = tb.insertEmpty(start);
+                        // jsoup special: update base the first time it is seen
+                        if (name.equals("base") && el.hasAttr("href"))
+                            tb.maybeSetBaseUri(el);
+                    } else if (name.equals("meta")) {
+                        tb.insertEmpty(start);
+                        // todo: charset switches
+                    } else if (name.equals("title")) {
+                        handleRcData(start, tb);
+                    } else if (StringUtil.in(name, "noframes", "style")) {
+                        handleRawtext(start, tb);
+                    } else if (name.equals("noscript")) {
+                        // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, so handle as noscript)
+                        tb.insert(start);
+                        tb.transition(InHeadNoscript);
+                    } else if (name.equals("script")) {
+                        // skips some script rules as won't execute them
+                        tb.insert(start);
+                        tb.tokeniser.transition(TokeniserState.ScriptData);
+                        tb.markInsertionMode();
+                        tb.transition(Text);
+                    } else if (name.equals("head")) {
+                        tb.error(this);
+                        return false;
+                    } else {
+                        return anythingElse(t, tb);
+                    }
+                    break;
+                case EndTag:
+                    Token.EndTag end = t.asEndTag();
+                    name = end.name(); // reuses the local declared in the StartTag case (same switch scope)
+                    if (name.equals("head")) {
+                        tb.pop();
+                        tb.transition(AfterHead);
+                    } else if (StringUtil.in(name, "body", "html", "br")) {
+                        return anythingElse(t, tb);
+                    } else {
+                        tb.error(this);
+                        return false;
+                    }
+                    break;
+                default:
+                    return anythingElse(t, tb);
+            }
+            return true;
+        }
+
+        // Acts as if a head end tag had been seen, then re-processes the token.
+        private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
+            tb.process(new Token.EndTag("head"));
+            return tb.process(t);
+        }
+    },
+    /**
+     * Inside a noscript element within head: a noscript end tag returns to InHead, head-level content is handled
+     * by InHead, and most other tokens are errors that close the noscript before being re-processed.
+     */
+    InHeadNoscript {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (t.isDoctype()) {
+                tb.error(this);
+                return true;
+            }
+            if (t.isStartTag() && t.asStartTag().name().equals("html"))
+                return tb.process(t, InBody);
+
+            if (t.isEndTag() && t.asEndTag().name().equals("noscript")) {
+                tb.pop();
+                tb.transition(InHead);
+                return true;
+            }
+
+            if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().name(),
+                    "basefont", "bgsound", "link", "meta", "noframes", "style")))
+                return tb.process(t, InHead);
+
+            if (t.isEndTag() && t.asEndTag().name().equals("br"))
+                return anythingElse(t, tb);
+
+            // nested head / noscript start tags and every remaining end tag are ignored with an error
+            if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), "head", "noscript")) || t.isEndTag()) {
+                tb.error(this);
+                return false;
+            }
+            return anythingElse(t, tb);
+        }
+
+        // Reports an error, acts as if a noscript end tag had been seen, then re-processes the token.
+        private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
+            tb.error(this);
+            tb.process(new Token.EndTag("noscript"));
+            return tb.process(t);
+        }
+    },
+    /**
+     * After the head element: inserts whitespace and comments, starts the body or frameset, routes stray
+     * head-level start tags back through InHead (with the head element temporarily pushed onto the stack), and for
+     * anything else implies a body start tag before re-processing the token.
+     */
+    AfterHead {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (isWhitespace(t)) {
+                tb.insert(t.asCharacter());
+            } else if (t.isComment()) {
+                tb.insert(t.asComment());
+            } else if (t.isDoctype()) {
+                tb.error(this);
+            } else if (t.isStartTag()) {
+                Token.StartTag startTag = t.asStartTag();
+                String name = startTag.name();
+                if (name.equals("html")) {
+                    return tb.process(t, InBody);
+                } else if (name.equals("body")) {
+                    tb.insert(startTag);
+                    tb.framesetOk(false);
+                    tb.transition(InBody);
+                } else if (name.equals("frameset")) {
+                    tb.insert(startTag);
+                    tb.transition(InFrameset);
+                } else if (StringUtil.in(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) {
+                    tb.error(this);
+                    // process the tag as head content: push head back on the stack for the duration of the call
+                    Element head = tb.getHeadElement();
+                    tb.push(head);
+                    tb.process(t, InHead);
+                    tb.removeFromStack(head);
+                } else if (name.equals("head")) {
+                    tb.error(this);
+                    return false;
+                } else {
+                    anythingElse(t, tb);
+                }
+            } else if (t.isEndTag()) {
+                // "br" is let through like "body" and "html", matching the other pre-body states in this enum
+                if (StringUtil.in(t.asEndTag().name(), "body", "html", "br")) {
+                    anythingElse(t, tb);
+                } else {
+                    tb.error(this);
+                    return false;
+                }
+            } else {
+                anythingElse(t, tb);
+            }
+            return true;
+        }
+
+        // Acts as if a body start tag had been seen, sets the frameset-ok flag, then re-processes the token.
+        private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
+            tb.process(new Token.StartTag("body"));
+            tb.framesetOk(true);
+            return tb.process(t);
+        }
+    },
+    InBody {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            switch (t.type) {
+                case Character: {
+                    Token.Character c = t.asCharacter();
+                    if (c.getData().equals(nullString)) {
+                        // todo confirm that check
+                        tb.error(this);
+                        return false;
+                    } else if (isWhitespace(c)) {
+                        tb.reconstructFormattingElements();
+                        tb.insert(c);
+                    } else {
+                        tb.reconstructFormattingElements();
+                        tb.insert(c);
+                        tb.framesetOk(false);
+                    }
+                    break;
+                }
+                case Comment: {
+                    tb.insert(t.asComment());
+                    break;
+                }
+                case Doctype: {
+                    tb.error(this);
+                    return false;
+                }
+                case StartTag:
+                    Token.StartTag startTag = t.asStartTag();
+                    String name = startTag.name();
+                    if (name.equals("html")) {
+                        tb.error(this);
+                        // merge attributes onto real html
+                        Element html = tb.getStack().getFirst();
+                        for (Attribute attribute : startTag.getAttributes()) {
+                            if (!html.hasAttr(attribute.getKey()))
+                                html.attributes().put(attribute);
+                        }
+                    } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title")) {
+                        return tb.process(t, InHead);
+                    } else if (name.equals("body")) {
+                        tb.error(this);
+                        LinkedList<Element> stack = tb.getStack();
+                        if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) {
+                            // only in fragment case
+                            return false; // ignore
+                        } else {
+                            tb.framesetOk(false);
+                            Element body = stack.get(1);
+                            for (Attribute attribute : startTag.getAttributes()) {
+                                if (!body.hasAttr(attribute.getKey()))
+                                    body.attributes().put(attribute);
+                            }
+                        }
+                    } else if (name.equals("frameset")) {
+                        tb.error(this);
+                        LinkedList<Element> stack = tb.getStack();
+                        if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) {
+                            // only in fragment case
+                            return false; // ignore
+                        } else if (!tb.framesetOk()) {
+                            return false; // ignore frameset
+                        } else {
+                            Element second = stack.get(1);
+                            if (second.parent() != null)
+                                second.remove();
+                            // pop up to html element
+                            while (stack.size() > 1)
+                                stack.removeLast();
+                            tb.insert(startTag);
+                            tb.transition(InFrameset);
+                        }
+                    } else if (StringUtil.in(name,
+                            "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl",
+                            "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol",
+                            "p", "section", "summary", "ul")) {
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        tb.insert(startTag);
+                    } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) {
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        if (StringUtil.in(tb.currentElement().nodeName(), "h1", "h2", "h3", "h4", "h5", "h6")) {
+                            tb.error(this);
+                            tb.pop();
+                        }
+                        tb.insert(startTag);
+                    } else if (StringUtil.in(name, "pre", "listing")) {
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        tb.insert(startTag);
+                        // todo: ignore LF if next token
+                        tb.framesetOk(false);
+                    } else if (name.equals("form")) {
+                        if (tb.getFormElement() != null) {
+                            tb.error(this);
+                            return false;
+                        }
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        Element form = tb.insert(startTag);
+                        tb.setFormElement(form);
+                    } else if (name.equals("li")) {
+                        tb.framesetOk(false);
+                        LinkedList<Element> stack = tb.getStack();
+                        for (int i = stack.size() - 1; i > 0; i--) {
+                            Element el = stack.get(i);
+                            if (el.nodeName().equals("li")) {
+                                tb.process(new Token.EndTag("li"));
+                                break;
+                            }
+                            if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p"))
+                                break;
+                        }
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        tb.insert(startTag);
+                    } else if (StringUtil.in(name, "dd", "dt")) {
+                        tb.framesetOk(false);
+                        LinkedList<Element> stack = tb.getStack();
+                        for (int i = stack.size() - 1; i > 0; i--) {
+                            Element el = stack.get(i);
+                            if (StringUtil.in(el.nodeName(), "dd", "dt")) {
+                                tb.process(new Token.EndTag(el.nodeName()));
+                                break;
+                            }
+                            if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p"))
+                                break;
+                        }
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        tb.insert(startTag);
+                    } else if (name.equals("plaintext")) {
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        tb.insert(startTag);
+                        tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once in, never gets out
+                    } else if (name.equals("button")) {
+                        if (tb.inButtonScope("button")) {
+                            // close and reprocess
+                            tb.error(this);
+                            tb.process(new Token.EndTag("button"));
+                            tb.process(startTag);
+                        } else {
+                            tb.reconstructFormattingElements();
+                            tb.insert(startTag);
+                            tb.framesetOk(false);
+                        }
+                    } else if (name.equals("a")) {
+                        if (tb.getActiveFormattingElement("a") != null) {
+                            tb.error(this);
+                            tb.process(new Token.EndTag("a"));
+
+                            // still on stack?
+                            Element remainingA = tb.getFromStack("a");
+                            if (remainingA != null) {
+                                tb.removeFromActiveFormattingElements(remainingA);
+                                tb.removeFromStack(remainingA);
+                            }
+                        }
+                        tb.reconstructFormattingElements();
+                        Element a = tb.insert(startTag);
+                        tb.pushActiveFormattingElements(a);
+                    } else if (StringUtil.in(name,
+                            "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u")) {
+                        tb.reconstructFormattingElements();
+                        Element el = tb.insert(startTag);
+                        tb.pushActiveFormattingElements(el);
+                    } else if (name.equals("nobr")) {
+                        tb.reconstructFormattingElements();
+                        if (tb.inScope("nobr")) {
+                            tb.error(this);
+                            tb.process(new Token.EndTag("nobr"));
+                            tb.reconstructFormattingElements();
+                        }
+                        Element el = tb.insert(startTag);
+                        tb.pushActiveFormattingElements(el);
+                    } else if (StringUtil.in(name, "applet", "marquee", "object")) {
+                        tb.reconstructFormattingElements();
+                        tb.insert(startTag);
+                        tb.insertMarkerToFormattingElements();
+                        tb.framesetOk(false);
+                    } else if (name.equals("table")) {
+                        if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        tb.insert(startTag);
+                        tb.framesetOk(false);
+                        tb.transition(InTable);
+                    } else if (StringUtil.in(name, "area", "br", "embed", "img", "keygen", "wbr")) {
+                        tb.reconstructFormattingElements();
+                        tb.insertEmpty(startTag);
+                        tb.framesetOk(false);
+                    } else if (name.equals("input")) {
+                        tb.reconstructFormattingElements();
+                        Element el = tb.insertEmpty(startTag);
+                        if (!el.attr("type").equalsIgnoreCase("hidden"))
+                            tb.framesetOk(false);
+                    } else if (StringUtil.in(name, "param", "source", "track")) {
+                        tb.insertEmpty(startTag);
+                    } else if (name.equals("hr")) {
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        tb.insertEmpty(startTag);
+                        tb.framesetOk(false);
+                    } else if (name.equals("image")) {
+                        // we're not supposed to ask.
+                        startTag.name("img");
+                        return tb.process(startTag);
+                    } else if (name.equals("isindex")) {
+                        // how much do we care about the early 90s?
+                        tb.error(this);
+                        if (tb.getFormElement() != null)
+                            return false;
+
+                        tb.tokeniser.acknowledgeSelfClosingFlag();
+                        tb.process(new Token.StartTag("form"));
+                        if (startTag.attributes.hasKey("action")) {
+                            Element form = tb.getFormElement();
+                            form.attr("action", startTag.attributes.get("action"));
+                        }
+                        tb.process(new Token.StartTag("hr"));
+                        tb.process(new Token.StartTag("label"));
+                        // hope you like english.
+                        String prompt = startTag.attributes.hasKey("prompt") ?
+                                startTag.attributes.get("prompt") :
+                                "This is a searchable index. Enter search keywords: ";
+
+                        tb.process(new Token.Character(prompt));
+
+                        // input
+                        Attributes inputAttribs = new Attributes();
+                        for (Attribute attr : startTag.attributes) {
+                            if (!StringUtil.in(attr.getKey(), "name", "action", "prompt"))
+                                inputAttribs.put(attr);
+                        }
+                        inputAttribs.put("name", "isindex");
+                        tb.process(new Token.StartTag("input", inputAttribs));
+                        tb.process(new Token.EndTag("label"));
+                        tb.process(new Token.StartTag("hr"));
+                        tb.process(new Token.EndTag("form"));
+                    } else if (name.equals("textarea")) {
+                        tb.insert(startTag);
+                        // todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.)
+                        tb.tokeniser.transition(TokeniserState.Rcdata);
+                        tb.markInsertionMode();
+                        tb.framesetOk(false);
+                        tb.transition(Text);
+                    } else if (name.equals("xmp")) {
+                        if (tb.inButtonScope("p")) {
+                            tb.process(new Token.EndTag("p"));
+                        }
+                        tb.reconstructFormattingElements();
+                        tb.framesetOk(false);
+                        handleRawtext(startTag, tb);
+                    } else if (name.equals("iframe")) {
+                        tb.framesetOk(false);
+                        handleRawtext(startTag, tb);
+                    } else if (name.equals("noembed")) {
+                        // also handle noscript if script enabled
+                        handleRawtext(startTag, tb);
+                    } else if (name.equals("select")) {
+                        tb.reconstructFormattingElements();
+                        tb.insert(startTag);
+                        tb.framesetOk(false);
+
+                        HtmlTreeBuilderState state = tb.state();
+                        if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell))
+                            tb.transition(InSelectInTable);
+                        else
+                            tb.transition(InSelect);
+                    } else if (StringUtil.in("optgroup", "option")) {
+                        if (tb.currentElement().nodeName().equals("option"))
+                            tb.process(new Token.EndTag("option"));
+                        tb.reconstructFormattingElements();
+                        tb.insert(startTag);
+                    } else if (StringUtil.in("rp", "rt")) {
+                        if (tb.inScope("ruby")) {
+                            tb.generateImpliedEndTags();
+                            if (!tb.currentElement().nodeName().equals("ruby")) {
+                                tb.error(this);
+                                tb.popStackToBefore("ruby"); // i.e. close up to but not include name
+                            }
+                            tb.insert(startTag);
+                        }
+                    } else if (name.equals("math")) {
+                        tb.reconstructFormattingElements();
+                        // todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml)
+                        tb.insert(startTag);
+                        tb.tokeniser.acknowledgeSelfClosingFlag();
+                    } else if (name.equals("svg")) {
+                        tb.reconstructFormattingElements();
+                        // todo: handle A start tag whose tag name is "svg" (xlink, svg)
+                        tb.insert(startTag);
+                        tb.tokeniser.acknowledgeSelfClosingFlag();
+                    } else if (StringUtil.in(name,
+                            "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr")) {
+                        tb.error(this);
+                        return false;
+                    } else {
+                        tb.reconstructFormattingElements();
+                        tb.insert(startTag);
+                    }
+                    break;
+
+                case EndTag:
+                    Token.EndTag endTag = t.asEndTag();
+                    name = endTag.name();
+                    if (name.equals("body")) {
+                        if (!tb.inScope("body")) {
+                            tb.error(this);
+                            return false;
+                        } else {
+                            // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html
+                            tb.transition(AfterBody);
+                        }
+                    } else if (name.equals("html")) {
+                        boolean notIgnored = tb.process(new Token.EndTag("body"));
+                        if (notIgnored)
+                            return tb.process(endTag);
+                    } else if (StringUtil.in(name,
+                            "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div",
+                            "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu",
+                            "nav", "ol", "pre", "section", "summary", "ul")) {
+                        // todo: refactor these lookups
+                        if (!tb.inScope(name)) {
+                            // nothing to close
+                            tb.error(this);
+                            return false;
+                        } else {
+                            tb.generateImpliedEndTags();
+                            if (!tb.currentElement().nodeName().equals(name))
+                                tb.error(this);
+                            tb.popStackToClose(name);
+                        }
+                    } else if (name.equals("form")) {
+                        Element currentForm = tb.getFormElement();
+                        tb.setFormElement(null);
+                        if (currentForm == null || !tb.inScope(name)) {
+                            tb.error(this);
+                            return false;
+                        } else {
+                            tb.generateImpliedEndTags();
+                            if (!tb.currentElement().nodeName().equals(name))
+                                tb.error(this);
+                            // remove currentForm from stack. will shift anything under up.
+                            tb.removeFromStack(currentForm);
+                        }
+                    } else if (name.equals("p")) {
+                        if (!tb.inButtonScope(name)) {
+                            tb.error(this);
+                            tb.process(new Token.StartTag(name)); // if no p to close, creates an empty <p></p>
+                            return tb.process(endTag);
+                        } else {
+                            tb.generateImpliedEndTags(name);
+                            if (!tb.currentElement().nodeName().equals(name))
+                                tb.error(this);
+                            tb.popStackToClose(name);
+                        }
+                    } else if (name.equals("li")) {
+                        if (!tb.inListItemScope(name)) {
+                            tb.error(this);
+                            return false;
+                        } else {
+                            tb.generateImpliedEndTags(name);
+                            if (!tb.currentElement().nodeName().equals(name))
+                                tb.error(this);
+                            tb.popStackToClose(name);
+                        }
+                    } else if (StringUtil.in(name, "dd", "dt")) {
+                        if (!tb.inScope(name)) {
+                            tb.error(this);
+                            return false;
+                        } else {
+                            tb.generateImpliedEndTags(name);
+                            if (!tb.currentElement().nodeName().equals(name))
+                                tb.error(this);
+                            tb.popStackToClose(name);
+                        }
+                    } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) {
+                        if (!tb.inScope(new String[]{"h1", "h2", "h3", "h4", "h5", "h6"})) {
+                            tb.error(this);
+                            return false;
+                        } else {
+                            tb.generateImpliedEndTags(name);
+                            if (!tb.currentElement().nodeName().equals(name))
+                                tb.error(this);
+                            tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6");
+                        }
+                    } else if (name.equals("sarcasm")) {
+                        // *sigh*
+                        return anyOtherEndTag(t, tb);
+                    } else if (StringUtil.in(name,
+                            "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u")) {
+                        // Adoption Agency Algorithm.
+                        OUTER:
+                        for (int i = 0; i < 8; i++) {
+                            Element formatEl = tb.getActiveFormattingElement(name);
+                            if (formatEl == null)
+                                return anyOtherEndTag(t, tb);
+                            else if (!tb.onStack(formatEl)) {
+                                tb.error(this);
+                                tb.removeFromActiveFormattingElements(formatEl);
+                                return true;
+                            } else if (!tb.inScope(formatEl.nodeName())) {
+                                tb.error(this);
+                                return false;
+                            } else if (tb.currentElement() != formatEl)
+                                tb.error(this);
+
+                            Element furthestBlock = null;
+                            Element commonAncestor = null;
+                            boolean seenFormattingElement = false;
+                            LinkedList<Element> stack = tb.getStack();
+                            for (int si = 0; si < stack.size(); si++) {
+                                Element el = stack.get(si);
+                                if (el == formatEl) {
+                                    commonAncestor = stack.get(si - 1);
+                                    seenFormattingElement = true;
+                                } else if (seenFormattingElement && tb.isSpecial(el)) {
+                                    furthestBlock = el;
+                                    break;
+                                }
+                            }
+                            if (furthestBlock == null) {
+                                tb.popStackToClose(formatEl.nodeName());
+                                tb.removeFromActiveFormattingElements(formatEl);
+                                return true;
+                            }
+
+                            // todo: Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
+                            // does that mean: int pos of format el in list?
+                            Element node = furthestBlock;
+                            Element lastNode = furthestBlock;
+                            INNER:
+                            for (int j = 0; j < 3; j++) {
+                                if (tb.onStack(node))
+                                    node = tb.aboveOnStack(node);
+                                if (!tb.isInActiveFormattingElements(node)) { // note no bookmark check
+                                    tb.removeFromStack(node);
+                                    continue INNER;
+                                } else if (node == formatEl)
+                                    break INNER;
+
+                                Element replacement = new Element(Tag.valueOf(node.nodeName()), tb.getBaseUri());
+                                tb.replaceActiveFormattingElement(node, replacement);
+                                tb.replaceOnStack(node, replacement);
+                                node = replacement;
+
+                                if (lastNode == furthestBlock) {
+                                    // todo: move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements.
+                                    // not getting how this bookmark both straddles the element above, but is inbetween here...
+                                }
+                                if (lastNode.parent() != null)
+                                    lastNode.remove();
+                                node.appendChild(lastNode);
+
+                                lastNode = node;
+                            }
+
+                            if (StringUtil.in(commonAncestor.nodeName(), "table", "tbody", "tfoot", "thead", "tr")) {
+                                if (lastNode.parent() != null)
+                                    lastNode.remove();
+                                tb.insertInFosterParent(lastNode);
+                            } else {
+                                if (lastNode.parent() != null)
+                                    lastNode.remove();
+                                commonAncestor.appendChild(lastNode);
+                            }
+
+                            Element adopter = new Element(Tag.valueOf(name), tb.getBaseUri());
+                            Node[] childNodes = furthestBlock.childNodes().toArray(new Node[furthestBlock.childNodes().size()]);
+                            for (Node childNode : childNodes) {
+                                adopter.appendChild(childNode); // append will reparent. thus the clone to avoid concurrent mod.
+                            }
+                            furthestBlock.appendChild(adopter);
+                            tb.removeFromActiveFormattingElements(formatEl);
+                            // todo: insert the new element into the list of active formatting elements at the position of the aforementioned bookmark.
+                            tb.removeFromStack(formatEl);
+                            tb.insertOnStackAfter(furthestBlock, adopter);
+                        }
+                    } else if (StringUtil.in(name, "applet", "marquee", "object")) {
+                        if (!tb.inScope("name")) {
+                            if (!tb.inScope(name)) {
+                                tb.error(this);
+                                return false;
+                            }
+                            tb.generateImpliedEndTags();
+                            if (!tb.currentElement().nodeName().equals(name))
+                                tb.error(this);
+                            tb.popStackToClose(name);
+                            tb.clearFormattingElementsToLastMarker();
+                        }
+                    } else if (name.equals("br")) {
+                        tb.error(this);
+                        tb.process(new Token.StartTag("br"));
+                        return false;
+                    } else {
+                        return anyOtherEndTag(t, tb);
+                    }
+
+                    break;
+                case EOF:
+                    // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html
+                    // stop parsing
+                    break;
+            }
+            return true;
+        }
+
+        boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) {
+            String name = t.asEndTag().name();
+            DescendableLinkedList<Element> stack = tb.getStack();
+            Iterator<Element> it = stack.descendingIterator();
+            while (it.hasNext()) {
+                Element node = it.next();
+                if (node.nodeName().equals(name)) {
+                    tb.generateImpliedEndTags(name);
+                    if (!name.equals(tb.currentElement().nodeName()))
+                        tb.error(this);
+                    tb.popStackToClose(name);
+                    break;
+                } else {
+                    if (tb.isSpecial(node)) {
+                        tb.error(this);
+                        return false;
+                    }
+                }
+            }
+            return true;
+        }
+    },
+    Text {
+        // in script, style etc. normally treated as data tags
+        /**
+         * Processes a token while in the Text state.
+         * Character tokens are inserted; an end tag pops the current element and
+         * returns to the saved original state; EOF does the same but is reported
+         * as an error and the EOF token is then reprocessed in the restored state.
+         * Any other token is accepted without action.
+         *
+         * @return the result of reprocessing for EOF, otherwise {@code true}
+         */
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (t.isCharacter()) {
+                tb.insert(t.asCharacter());
+                return true;
+            }
+            if (t.isEOF()) {
+                // input ended inside the text element
+                tb.error(this);
+                // if current node is script: already started
+                tb.pop();
+                tb.transition(tb.originalState());
+                return tb.process(t);
+            }
+            if (t.isEndTag()) {
+                // if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts
+                tb.pop();
+                tb.transition(tb.originalState());
+            }
+            return true;
+        }
+    },
+    InTable {
+        /**
+         * Processes a token while in the InTable state.
+         * <p>
+         * Character tokens are deferred to InTableText; comments are inserted;
+         * doctypes are rejected. Table-structure start tags (caption, colgroup,
+         * col, tbody/tfoot/thead, td/th/tr, table) open or imply the matching
+         * container and switch state. style/script are delegated to InHead.
+         * Everything without a dedicated rule goes to {@link #anythingElse}.
+         * <p>
+         * A start or end tag that is fully handled by one of the dedicated
+         * branches returns {@code true} straight away. Without that return the
+         * token would fall out of the if-chain into the trailing
+         * {@code anythingElse} call and be processed a second time under InBody
+         * rules (e.g. a hidden input would be inserted twice, and a spurious
+         * error would be recorded for every caption/tbody/etc.).
+         *
+         * @param t  the token to process
+         * @param tb the tree builder being driven
+         * @return {@code false} if the token was ignored, otherwise {@code true}
+         *         or the result of the delegated processing
+         */
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (t.isCharacter()) {
+                // collect characters in InTableText, then reprocess this token there
+                tb.newPendingTableCharacters();
+                tb.markInsertionMode();
+                tb.transition(InTableText);
+                return tb.process(t);
+            } else if (t.isComment()) {
+                tb.insert(t.asComment());
+                return true;
+            } else if (t.isDoctype()) {
+                tb.error(this);
+                return false;
+            } else if (t.isStartTag()) {
+                Token.StartTag startTag = t.asStartTag();
+                String name = startTag.name();
+                if (name.equals("caption")) {
+                    tb.clearStackToTableContext();
+                    tb.insertMarkerToFormattingElements();
+                    tb.insert(startTag);
+                    tb.transition(InCaption);
+                } else if (name.equals("colgroup")) {
+                    tb.clearStackToTableContext();
+                    tb.insert(startTag);
+                    tb.transition(InColumnGroup);
+                } else if (name.equals("col")) {
+                    // imply the missing <colgroup>, then retry the <col>
+                    tb.process(new Token.StartTag("colgroup"));
+                    return tb.process(t);
+                } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) {
+                    tb.clearStackToTableContext();
+                    tb.insert(startTag);
+                    tb.transition(InTableBody);
+                } else if (StringUtil.in(name, "td", "th", "tr")) {
+                    // imply the missing <tbody>, then retry the row/cell
+                    tb.process(new Token.StartTag("tbody"));
+                    return tb.process(t);
+                } else if (name.equals("table")) {
+                    // nested <table> start: close the current table first
+                    tb.error(this);
+                    boolean processed = tb.process(new Token.EndTag("table"));
+                    if (processed) // only ignored if in fragment
+                        return tb.process(t);
+                } else if (StringUtil.in(name, "style", "script")) {
+                    return tb.process(t, InHead);
+                } else if (name.equals("input")) {
+                    if (!startTag.attributes.get("type").equalsIgnoreCase("hidden")) {
+                        return anythingElse(t, tb);
+                    } else {
+                        tb.insertEmpty(startTag);
+                    }
+                } else if (name.equals("form")) {
+                    tb.error(this);
+                    if (tb.getFormElement() != null)
+                        return false;
+                    else {
+                        Element form = tb.insertEmpty(startTag);
+                        tb.setFormElement(form);
+                    }
+                } else {
+                    return anythingElse(t, tb);
+                }
+                // handled by a dedicated branch above: do not reprocess via anythingElse
+                return true;
+            } else if (t.isEndTag()) {
+                Token.EndTag endTag = t.asEndTag();
+                String name = endTag.name();
+
+                if (name.equals("table")) {
+                    if (!tb.inTableScope(name)) {
+                        tb.error(this);
+                        return false;
+                    } else {
+                        tb.popStackToClose("table");
+                    }
+                    tb.resetInsertionMode();
+                } else if (StringUtil.in(name,
+                        "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) {
+                    tb.error(this);
+                    return false;
+                } else {
+                    return anythingElse(t, tb);
+                }
+                // </table> was handled: do not reprocess via anythingElse
+                return true;
+            } else if (t.isEOF()) {
+                // NOTE(review): the error is raised when the current node IS html;
+                // behaviour kept as-is, confirm against the spec's EOF rule.
+                if (tb.currentElement().nodeName().equals("html"))
+                    tb.error(this);
+                return true; // stops parsing
+            }
+            return anythingElse(t, tb);
+        }
+
+        boolean anythingElse(Token t, HtmlTreeBuilder tb) {
+            tb.error(this);
+            boolean processed = true;
+            if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) {
+                tb.setFosterInserts(true);
+                processed = tb.process(t, InBody);
+                tb.setFosterInserts(false);
+            } else {
+                processed = tb.process(t, InBody);
+            }
+            return processed;
+        }
+    },
+    InTableText {
+        /**
+         * Processes a token while in the InTableText state.
+         * <p>
+         * Character tokens are queued on the builder's pending-table-characters
+         * list (a token whose data equals {@code nullString} is rejected with an
+         * error). Any other token flushes the queue: whitespace entries are
+         * inserted directly, the rest are reported as errors and run through
+         * InBody (with foster inserts enabled when the current element is table,
+         * tbody, tfoot, thead or tr). The builder then returns to its original
+         * state and reprocesses the triggering token.
+         */
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            switch (t.type) {
+                case Character:
+                    Token.Character chars = t.asCharacter();
+                    if (!chars.getData().equals(nullString)) {
+                        tb.getPendingTableCharacters().add(chars);
+                        return true;
+                    }
+                    tb.error(this);
+                    return false;
+                default:
+                    if (tb.getPendingTableCharacters().size() > 0) {
+                        for (Token.Character pending : tb.getPendingTableCharacters()) {
+                            if (isWhitespace(pending)) {
+                                tb.insert(pending);
+                                continue;
+                            }
+                            // InTable anything else section:
+                            tb.error(this);
+                            boolean foster = StringUtil.in(tb.currentElement().nodeName(),
+                                    "table", "tbody", "tfoot", "thead", "tr");
+                            if (foster) {
+                                tb.setFosterInserts(true);
+                                tb.process(pending, InBody);
+                                tb.setFosterInserts(false);
+                            } else {
+                                tb.process(pending, InBody);
+                            }
+                        }
+                        // start a fresh queue now that everything has been flushed
+                        tb.newPendingTableCharacters();
+                    }
+                    tb.transition(tb.originalState());
+                    return tb.process(t);
+            }
+        }
+    },
+    InCaption {
+        /**
+         * Processes a token while in the InCaption state.
+         * <ul>
+         * <li>{@code </caption>}: closes the caption if one is in table scope and
+         *     returns to InTable; otherwise the token is ignored with an error.</li>
+         * <li>A table-structure start tag or {@code </table>}: reported as an error,
+         *     a {@code </caption>} is synthesised, and if that was processed the
+         *     token itself is reprocessed.</li>
+         * <li>Other listed structural end tags: ignored with an error.</li>
+         * <li>Anything else: handled with the InBody rules.</li>
+         * </ul>
+         */
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (t.isEndTag() && t.asEndTag().name().equals("caption")) {
+                String closing = t.asEndTag().name();
+                if (!tb.inTableScope(closing)) {
+                    tb.error(this);
+                    return false;
+                }
+                tb.generateImpliedEndTags();
+                if (!tb.currentElement().nodeName().equals("caption"))
+                    tb.error(this);
+                tb.popStackToClose("caption");
+                tb.clearFormattingElementsToLastMarker();
+                tb.transition(InTable);
+                return true;
+            }
+
+            boolean impliesCaptionEnd =
+                    (t.isStartTag() && StringUtil.in(t.asStartTag().name(),
+                            "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"))
+                    || (t.isEndTag() && t.asEndTag().name().equals("table"));
+            if (impliesCaptionEnd) {
+                tb.error(this);
+                if (tb.process(new Token.EndTag("caption")))
+                    return tb.process(t);
+                return true;
+            }
+
+            if (t.isEndTag() && StringUtil.in(t.asEndTag().name(),
+                    "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) {
+                tb.error(this);
+                return false;
+            }
+
+            return tb.process(t, InBody);
+        }
+    },
+    InColumnGroup {
+        /**
+         * Processes a token while in the InColumnGroup state.
+         * Whitespace and comments are inserted, a doctype is an error, {@code <html>}
+         * is delegated to InBody, {@code <col>} is inserted as an empty element and
+         * {@code </colgroup>} pops back to InTable (ignored with an error when the
+         * current element is html). Everything else goes through
+         * {@code anythingElse}, except EOF while the current element is html, which
+         * simply returns {@code true}.
+         */
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (isWhitespace(t)) {
+                tb.insert(t.asCharacter());
+                return true;
+            }
+            switch (t.type) {
+                case Comment:
+                    tb.insert(t.asComment());
+                    return true;
+                case Doctype:
+                    tb.error(this);
+                    return true;
+                case StartTag: {
+                    Token.StartTag opening = t.asStartTag();
+                    String openName = opening.name();
+                    if (openName.equals("html"))
+                        return tb.process(t, InBody);
+                    if (!openName.equals("col"))
+                        return anythingElse(t, tb);
+                    tb.insertEmpty(opening);
+                    return true;
+                }
+                case EndTag: {
+                    String closeName = t.asEndTag().name();
+                    if (!closeName.equals("colgroup"))
+                        return anythingElse(t, tb);
+                    if (tb.currentElement().nodeName().equals("html")) { // frag case
+                        tb.error(this);
+                        return false;
+                    }
+                    tb.pop();
+                    tb.transition(InTable);
+                    return true;
+                }
+                case EOF:
+                    if (tb.currentElement().nodeName().equals("html"))
+                        return true; // stop parsing; frag case
+                    return anythingElse(t, tb);
+                default:
+                    return anythingElse(t, tb);
+            }
+        }
+
+        /**
+         * Fallback: synthesises a {@code </colgroup>} and, if that end tag was
+         * processed, reprocesses the original token.
+         *
+         * @return {@code true} when the synthetic end tag was ignored, otherwise the
+         *         result of reprocessing {@code t}
+         */
+        private boolean anythingElse(Token t, TreeBuilder tb) {
+            if (!tb.process(new Token.EndTag("colgroup")))
+                return true; // only ignored in frag case
+            return tb.process(t);
+        }
+    },
+    InTableBody {
+        /**
+         * Processes a token while in the InTableBody state.
+         * {@code <tr>} opens a row and moves to InRow; a bare {@code <td>/<th>} is an
+         * error that first implies a {@code <tr>}. Start tags that begin another
+         * table section, and {@code </table>}, leave the body via
+         * {@code exitTableBody}. {@code </tbody>}, {@code </tfoot>} and
+         * {@code </thead>} close the section when one is in table scope. A fixed set
+         * of structural end tags is ignored with an error; everything else is
+         * handled by {@code anythingElse}.
+         */
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            switch (t.type) {
+                case StartTag: {
+                    Token.StartTag opening = t.asStartTag();
+                    String openName = opening.name();
+                    if (openName.equals("tr")) {
+                        tb.clearStackToTableBodyContext();
+                        tb.insert(opening);
+                        tb.transition(InRow);
+                        return true;
+                    }
+                    if (StringUtil.in(openName, "th", "td")) {
+                        // cell without a row: imply the <tr>, then retry the cell
+                        tb.error(this);
+                        tb.process(new Token.StartTag("tr"));
+                        return tb.process(opening);
+                    }
+                    if (StringUtil.in(openName, "caption", "col", "colgroup", "tbody", "tfoot", "thead"))
+                        return exitTableBody(t, tb);
+                    return anythingElse(t, tb);
+                }
+                case EndTag: {
+                    String closeName = t.asEndTag().name();
+                    if (StringUtil.in(closeName, "tbody", "tfoot", "thead")) {
+                        if (!tb.inTableScope(closeName)) {
+                            tb.error(this);
+                            return false;
+                        }
+                        tb.clearStackToTableBodyContext();
+                        tb.pop();
+                        tb.transition(InTable);
+                        return true;
+                    }
+                    if (closeName.equals("table"))
+                        return exitTableBody(t, tb);
+                    if (StringUtil.in(closeName, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) {
+                        tb.error(this);
+                        return false;
+                    }
+                    return anythingElse(t, tb);
+                }
+                default:
+                    return anythingElse(t, tb);
+            }
+        }
+
+        private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { // closes the open table section, then reprocesses t; false if no section is open
+            if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb.inTableScope("tfoot"))) {
+                // frag case: none of the three section elements is in table scope (tfoot now uses the same scope test as its siblings)
+                tb.error(this);
+                return false;
+            }
+            tb.clearStackToTableBodyContext();
+            tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, tfoot, thead
+            return tb.process(t); // the original token is handled again once the section has been closed
+        }
+
+        private boolean anythingElse(Token t, HtmlTreeBuilder tb) { // fallback for tokens InTableBody has no rule for
+            return tb.process(t, InTable); // handled with the InTable rules; its result is returned unchanged
+        }
+    },
+    InRow {
+        boolean process(Token t, HtmlTreeBuilder tb) { // handles one token while the builder is in the InRow state
+            if (t.isStartTag()) {
+                Token.StartTag startTag = t.asStartTag();
+                String name = startTag.name();
+
+                if (StringUtil.in(name, "th", "td")) { // a cell starts: insert it, transition to InCell and add a formatting marker
+                    tb.clearStackToTableRowContext();
+                    tb.insert(startTag);
+                    tb.transition(InCell);
+                    tb.insertMarkerToFormattingElements();
+                } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) {
+                    return handleMissingTr(t, tb); // feed a synthetic </tr> first, then reprocess this token
+                } else {
+                    return anythingElse(t, tb);
+                }
+            } else if (t.isEndTag()) {
+                Token.EndTag endTag = t.asEndTag();
+                String name = endTag.name();
+
+                if (name.equals("tr")) {
+                    if (!tb.inTableScope(name)) {
+                        tb.error(this); // frag
+                        return false;
+                    }
+                    tb.clearStackToTableRowContext();
+                    tb.pop(); // tr
+                    tb.transition(InTableBody);
+                } else if (name.equals("table")) {
+                    return handleMissingTr(t, tb);
+                } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) {
+                    if (!tb.inTableScope(name)) { // named section is not in table scope: report and reject the token
+                        tb.error(this);
+                        return false;
+                    }
+                    tb.process(new Token.EndTag("tr")); // close the row (result ignored), then let the section end tag be handled again
+                    return tb.process(t);
+                } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th")) {
+                    tb.error(this); // these end tags are reported and rejected in this state
+                    return false;
+                } else {
+                    return anythingElse(t, tb);
+                }
+            } else {
+                return anythingElse(t, tb); // non-tag tokens are delegated to the InTable rules
+            }
+            return true;
+        }
+
+        private boolean anythingElse(Token t, HtmlTreeBuilder tb) { // fallback for tokens InRow has no rule for
+            return tb.process(t, InTable); // handled with the InTable rules; its result is returned unchanged
+        }
+
+        private boolean handleMissingTr(Token t, TreeBuilder tb) {
+            // Feed a synthetic </tr>; the original token is only re-dispatched when that end tag was handled.
+            if (!tb.process(new Token.EndTag("tr"))) {
+                return false;
+            }
+            return tb.process(t);
+        }
+    },
+    InCell {
+        boolean process(Token t, HtmlTreeBuilder tb) { // handles one token while the builder is in the InCell state
+            if (t.isEndTag()) {
+                Token.EndTag endTag = t.asEndTag();
+                String name = endTag.name();
+
+                if (StringUtil.in(name, "td", "th")) {
+                    if (!tb.inTableScope(name)) {
+                        tb.error(this);
+                        tb.transition(InRow); // might not be in scope if empty: <td /> and processing fake end tag
+                        return false;
+                    }
+                    tb.generateImpliedEndTags();
+                    if (!tb.currentElement().nodeName().equals(name))
+                        tb.error(this); // current node is not the cell being closed: reported, but closing continues
+                    tb.popStackToClose(name);
+                    tb.clearFormattingElementsToLastMarker();
+                    tb.transition(InRow);
+                } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html")) {
+                    tb.error(this); // these end tags are reported and rejected in this state
+                    return false;
+                } else if (StringUtil.in(name, "table", "tbody", "tfoot", "thead", "tr")) {
+                    if (!tb.inTableScope(name)) {
+                        tb.error(this);
+                        return false;
+                    }
+                    closeCell(tb); // close the cell first, then let the table-structure end tag be handled again
+                    return tb.process(t);
+                } else {
+                    return anythingElse(t, tb);
+                }
+            } else if (t.isStartTag() &&
+                    StringUtil.in(t.asStartTag().name(),
+                            "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) {
+                if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { // neither td nor th in table scope: report and reject
+                    tb.error(this);
+                    return false;
+                }
+                closeCell(tb); // a table-structure start tag closes the current cell, then is reprocessed
+                return tb.process(t);
+            } else {
+                return anythingElse(t, tb); // everything else is handled with the InBody rules
+            }
+            return true;
+        }
+
+        private boolean anythingElse(Token t, HtmlTreeBuilder tb) { // fallback for tokens InCell has no rule for
+            return tb.process(t, InBody); // handled with the InBody rules; its result is returned unchanged
+        }
+
+        private void closeCell(HtmlTreeBuilder tb) {
+            // Feeds a synthetic end tag for the cell: td when one is in table scope, th otherwise.
+            String cellName = tb.inTableScope("td") ? "td" : "th"; // only here if th or td in scope
+            Token.EndTag closer = new Token.EndTag(cellName);
+            tb.process(closer);
+        }
+    },
+    InSelect {
+        boolean process(Token t, HtmlTreeBuilder tb) { // handles one token while the builder is in the InSelect state
+            switch (t.type) {
+                case Character:
+                    Token.Character c = t.asCharacter();
+                    if (c.getData().equals(nullString)) { // data that is exactly one NUL character is reported and rejected
+                        tb.error(this);
+                        return false;
+                    }
+                    tb.insert(c);
+                    break;
+                case Comment:
+                    tb.insert(t.asComment());
+                    break;
+                case Doctype:
+                    tb.error(this);
+                    return false;
+                case StartTag:
+                    Token.StartTag start = t.asStartTag();
+                    String name = start.name();
+                    if (name.equals("html"))
+                        return tb.process(start, InBody);
+                    else if (name.equals("option")) {
+                        if (tb.currentElement().nodeName().equals("option")) // guard: with no open option the synthetic </option> only logged a spurious error
+                            tb.process(new Token.EndTag("option"));
+                        tb.insert(start);
+                    } else if (name.equals("optgroup")) {
+                        if (tb.currentElement().nodeName().equals("option"))
+                            tb.process(new Token.EndTag("option"));
+                        else if (tb.currentElement().nodeName().equals("optgroup"))
+                            tb.process(new Token.EndTag("optgroup"));
+                        tb.insert(start);
+                    } else if (name.equals("select")) { // nested <select>: reported, then treated as </select>
+                        tb.error(this);
+                        return tb.process(new Token.EndTag("select"));
+                    } else if (StringUtil.in(name, "input", "keygen", "textarea")) { // reported; closes the select and is then reprocessed
+                        tb.error(this);
+                        if (!tb.inSelectScope("select"))
+                            return false; // frag
+                        tb.process(new Token.EndTag("select"));
+                        return tb.process(start);
+                    } else if (name.equals("script")) {
+                        return tb.process(t, InHead);
+                    } else {
+                        return anythingElse(t, tb);
+                    }
+                    break;
+                case EndTag:
+                    Token.EndTag end = t.asEndTag();
+                    name = end.name(); // reuses the local declared in the StartTag case (same switch block)
+                    if (name.equals("optgroup")) {
+                        if (tb.currentElement().nodeName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nodeName().equals("optgroup"))
+                            tb.process(new Token.EndTag("option")); // an option directly inside the optgroup is closed first
+                        if (tb.currentElement().nodeName().equals("optgroup"))
+                            tb.pop();
+                        else
+                            tb.error(this);
+                    } else if (name.equals("option")) {
+                        if (tb.currentElement().nodeName().equals("option"))
+                            tb.pop();
+                        else
+                            tb.error(this);
+                    } else if (name.equals("select")) {
+                        if (!tb.inSelectScope(name)) {
+                            tb.error(this);
+                            return false;
+                        } else {
+                            tb.popStackToClose(name);
+                            tb.resetInsertionMode();
+                        }
+                    } else
+                        return anythingElse(t, tb);
+                    break;
+                case EOF:
+                    if (!tb.currentElement().nodeName().equals("html"))
+                        tb.error(this); // EOF with elements still open is reported, but the token still counts as handled
+                    break;
+                default:
+                    return anythingElse(t, tb);
+            }
+            return true;
+        }
+
+        private boolean anythingElse(Token t, HtmlTreeBuilder tb) { // fallback for tokens InSelect has no rule for; t itself is not used
+            tb.error(this); // the token is reported as a parse error...
+            return false; // ...and rejected
+        }
+    },
+    InSelectInTable {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            // Table-structure tags are reported and close the select; every other token is handled with the InSelect rules.
+            if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) {
+                tb.error(this);
+                tb.process(new Token.EndTag("select"));
+                return tb.process(t);
+            }
+            if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) {
+                tb.error(this);
+                if (!tb.inTableScope(t.asEndTag().name()))
+                    return false; // the named element is not in table scope, so the end tag is rejected
+                tb.process(new Token.EndTag("select"));
+                return tb.process(t);
+            }
+            return tb.process(t, InSelect);
+        }
+    },
+    AfterBody {
+        boolean process(Token t, HtmlTreeBuilder tb) { // handles one token while the builder is in the AfterBody state
+            if (isWhitespace(t)) {
+                return tb.process(t, InBody);
+            } else if (t.isComment()) {
+                tb.insert(t.asComment()); // into html node
+            } else if (t.isDoctype()) {
+                tb.error(this);
+                return false;
+            } else if (t.isStartTag() && t.asStartTag().name().equals("html")) {
+                return tb.process(t, InBody);
+            } else if (t.isEndTag() && t.asEndTag().name().equals("html")) {
+                if (tb.isFragmentParsing()) { // </html> is reported and rejected while parsing a fragment
+                    tb.error(this);
+                    return false;
+                } else {
+                    tb.transition(AfterAfterBody);
+                }
+            } else if (t.isEOF()) {
+                // chillax! we're done
+            } else {
+                tb.error(this); // anything else: report it, switch back to InBody and reprocess the token there
+                tb.transition(InBody);
+                return tb.process(t);
+            }
+            return true;
+        }
+    },
+    InFrameset {
+        boolean process(Token t, HtmlTreeBuilder tb) { // handles one token while the builder is in the InFrameset state
+            if (isWhitespace(t)) {
+                tb.insert(t.asCharacter());
+            } else if (t.isComment()) {
+                tb.insert(t.asComment());
+            } else if (t.isDoctype()) {
+                tb.error(this);
+                return false;
+            } else if (t.isStartTag()) {
+                Token.StartTag start = t.asStartTag();
+                String name = start.name();
+                if (name.equals("html")) {
+                    return tb.process(start, InBody);
+                } else if (name.equals("frameset")) {
+                    tb.insert(start);
+                } else if (name.equals("frame")) {
+                    tb.insertEmpty(start);
+                } else if (name.equals("noframes")) {
+                    return tb.process(start, InHead);
+                } else {
+                    tb.error(this); // any other start tag is reported and rejected
+                    return false;
+                }
+            } else if (t.isEndTag() && t.asEndTag().name().equals("frameset")) {
+                if (tb.currentElement().nodeName().equals("html")) { // frag
+                    tb.error(this);
+                    return false;
+                } else {
+                    tb.pop();
+                    if (!tb.isFragmentParsing() && !tb.currentElement().nodeName().equals("frameset")) { // the popped frameset was not nested in another one
+                        tb.transition(AfterFrameset);
+                    }
+                }
+            } else if (t.isEOF()) {
+                if (!tb.currentElement().nodeName().equals("html")) { // EOF with elements still open is reported, yet still returns true
+                    tb.error(this);
+                    return true;
+                }
+            } else {
+                tb.error(this); // non-whitespace characters and other end tags are reported and rejected
+                return false;
+            }
+            return true;
+        }
+    },
+    AfterFrameset {
+        boolean process(Token t, HtmlTreeBuilder tb) { // handles one token while the builder is in the AfterFrameset state
+            if (isWhitespace(t)) {
+                tb.insert(t.asCharacter());
+            } else if (t.isComment()) {
+                tb.insert(t.asComment());
+            } else if (t.isDoctype()) {
+                tb.error(this);
+                return false;
+            } else if (t.isStartTag() && t.asStartTag().name().equals("html")) {
+                return tb.process(t, InBody);
+            } else if (t.isEndTag() && t.asEndTag().name().equals("html")) {
+                tb.transition(AfterAfterFrameset);
+            } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) {
+                return tb.process(t, InHead);
+            } else if (t.isEOF()) {
+                // cool your heels, we're complete
+            } else {
+                tb.error(this); // every other token is reported and rejected
+                return false;
+            }
+            return true;
+        }
+    },
+    AfterAfterBody {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (t.isComment()) {
+                tb.insert(t.asComment());
+            } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) {
+                return tb.process(t, InBody);
+            } else if (t.isEOF()) {
+                // nice work chuck
+            } else {
+                tb.error(this);
+                tb.transition(InBody);
+                return tb.process(t);
+            }
+            return true;
+        }
+    },
+    AfterAfterFrameset {
+        boolean process(Token t, HtmlTreeBuilder tb) {
+            if (t.isComment()) {
+                tb.insert(t.asComment());
+            } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) {
+                return tb.process(t, InBody);
+            } else if (t.isEOF()) {
+                // nice work chuck
+            } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) {
+                return tb.process(t, InHead);
+            } else {
+                tb.error(this);
+                return false;
+            }
+            return true;
+        }
+    },
+    ForeignContent {
+        boolean process(Token t, HtmlTreeBuilder tb) { // placeholder: accepts every token without touching the builder
+            return true;
+            // todo: implement. Also; how do we get here?
+        }
+    };
+
+    private static String nullString = String.valueOf('\u0000'); // one-character string holding U+0000; InSelect compares character data against it
+
+    abstract boolean process(Token t, HtmlTreeBuilder tb); // per-state token handler, implemented by each state constant above
+
+    private static boolean isWhitespace(Token t) {
+        if (t.isCharacter()) {
+            String data = t.asCharacter().getData();
+            // todo: this checks more than spec - "\t", "\n", "\f", "\r", " "
+            for (int i = 0; i < data.length(); i++) {
+                char c = data.charAt(i);
+                if (!StringUtil.isWhitespace(c))
+                    return false;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) { // inserts the element, then moves the tokeniser to Rcdata and the builder to Text
+        tb.insert(startTag);
+        tb.tokeniser.transition(TokeniserState.Rcdata);
+        tb.markInsertionMode(); // NOTE(review): presumably remembers the current state so Text can return to it - confirm in HtmlTreeBuilder
+        tb.transition(Text);
+    }
+
+    private static void handleRawtext(Token.StartTag startTag, HtmlTreeBuilder tb) { // inserts the element, then moves the tokeniser to Rawtext and the builder to Text
+        tb.insert(startTag);
+        tb.tokeniser.transition(TokeniserState.Rawtext);
+        tb.markInsertionMode(); // NOTE(review): presumably remembers the current state so Text can return to it - confirm in HtmlTreeBuilder
+        tb.transition(Text);
+    }
+}
diff --git a/src/org/jsoup/parser/ParseError.java b/src/org/jsoup/parser/ParseError.java

new file mode 100644 (file)

index 0000000..dfa0900
--- /dev/null
+++ b/src/org/jsoup/parser/ParseError.java
@@ -0,0 +1,40 @@
+package org.jsoup.parser;
+
+/**
+ * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase.
+ */
+public class ParseError {
+    private int pos;
+    private String errorMsg;
+
+    ParseError(int pos, String errorMsg) {
+        this.pos = pos;
+        this.errorMsg = errorMsg;
+    }
+
+    ParseError(int pos, String errorFormat, Object... args) {
+        this.errorMsg = String.format(errorFormat, args);
+        this.pos = pos;
+    }
+
+    /**
+     * Retrieve the error message.
+     * @return the error message.
+     */
+    public String getErrorMessage() {
+        return errorMsg;
+    }
+
+    /**
+     * Retrieves the offset of the error.
+     * @return error offset within input
+     */
+    public int getPosition() {
+        return pos;
+    }
+
+    @Override
+    public String toString() {
+        return pos + ": " + errorMsg;
+    }
+}
diff --git a/src/org/jsoup/parser/ParseErrorList.java b/src/org/jsoup/parser/ParseErrorList.java

new file mode 100644 (file)

index 0000000..3824ffb
--- /dev/null
+++ b/src/org/jsoup/parser/ParseErrorList.java
@@ -0,0 +1,34 @@
+package org.jsoup.parser;
+
+import java.util.ArrayList;
+
+/**
+ * A container for ParseErrors.
+ * 
+ * @author Jonathan Hedley
+ */
+class ParseErrorList extends ArrayList<ParseError>{
+    private static final int INITIAL_CAPACITY = 16;
+    private final int maxSize;
+    
+    ParseErrorList(int initialCapacity, int maxSize) {
+        super(initialCapacity);
+        this.maxSize = maxSize;
+    }
+    
+    boolean canAddError() {
+        return size() < maxSize;
+    }
+
+    int getMaxSize() {
+        return maxSize;
+    }
+
+    static ParseErrorList noTracking() {
+        return new ParseErrorList(0, 0);
+    }
+    
+    static ParseErrorList tracking(int maxSize) {
+        return new ParseErrorList(INITIAL_CAPACITY, maxSize);
+    }
+}
diff --git a/src/org/jsoup/parser/Parser.java b/src/org/jsoup/parser/Parser.java

new file mode 100644 (file)

index 0000000..2236219
--- /dev/null
+++ b/src/org/jsoup/parser/Parser.java
@@ -0,0 +1,157 @@
+package org.jsoup.parser;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+
+import java.util.List;
+
+/**
+ * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the  more convenient parse methods
+ * in {@link org.jsoup.Jsoup}.
+ */
+public class Parser {
+    private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled.
+    
+    private TreeBuilder treeBuilder;
+    private int maxErrors = DEFAULT_MAX_ERRORS;
+    private ParseErrorList errors;
+
+    /**
+     * Create a new Parser, using the specified TreeBuilder
+     * @param treeBuilder TreeBuilder to use to parse input into Documents.
+     */
+    public Parser(TreeBuilder treeBuilder) {
+        this.treeBuilder = treeBuilder;
+    }
+    
+    public Document parseInput(String html, String baseUri) {
+        errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
+        Document doc = treeBuilder.parse(html, baseUri, errors);
+        return doc;
+    }
+
+    // gets & sets
+    /**
+     * Get the TreeBuilder currently in use.
+     * @return current TreeBuilder.
+     */
+    public TreeBuilder getTreeBuilder() {
+        return treeBuilder;
+    }
+
+    /**
+     * Update the TreeBuilder used when parsing content.
+     * @param treeBuilder current TreeBuilder
+     * @return this, for chaining
+     */
+    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
+        this.treeBuilder = treeBuilder;
+        return this;
+    }
+
+    /**
+     * Check if parse error tracking is enabled.
+     * @return current track error state.
+     */
+    public boolean isTrackErrors() {
+        return maxErrors > 0;
+    }
+
+    /**
+     * Enable or disable parse error tracking for the next parse.
+     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
+     * @return this, for chaining
+     */
+    public Parser setTrackErrors(int maxErrors) {
+        this.maxErrors = maxErrors;
+        return this;
+    }
+
+    /**
+     * Retrieve the parse errors, if any, from the last parse.
+     * @return list of parse errors, up to the size of the maximum errors tracked.
+     */
+    public List<ParseError> getErrors() {
+        return errors;
+    }
+
+    // static parse functions below
+    /**
+     * Parse HTML into a Document.
+     *
+     * @param html HTML to parse
+     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
+     *
+     * @return parsed Document
+     */
+    public static Document parse(String html, String baseUri) {
+        TreeBuilder treeBuilder = new HtmlTreeBuilder();
+        return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking());
+    }
+
+    /**
+     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
+     *
+     * @param fragmentHtml the fragment of HTML to parse
+     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
+     * provides stack context (for implicit element creation).
+     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
+     *
+     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
+     */
+    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
+        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
+        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking());
+    }
+
+    /**
+     * Parse a fragment of HTML into the {@code body} of a Document.
+     *
+     * @param bodyHtml fragment of HTML
+     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
+     *
+     * @return Document, with empty head, and HTML parsed into body
+     */
+    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
+        Document doc = Document.createShell(baseUri);
+        Element body = doc.body();
+        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
+        Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented
+        for (Node node : nodes) {
+            body.appendChild(node);
+        }
+        return doc;
+    }
+
+    /**
+     * @param bodyHtml HTML to parse
+     * @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
+     *
+     * @return parsed Document
+     * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead.
+     */
+    public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
+        return parse(bodyHtml, baseUri);
+    }
+    
+    // builders
+
+    /**
+     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
+     * based on a knowledge of the semantics of the incoming tags.
+     * @return a new HTML parser.
+     */
+    public static Parser htmlParser() {
+        return new Parser(new HtmlTreeBuilder());
+    }
+
+    /**
+     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
+     * rather creates a simple tree directly from the input.
+     * @return a new simple XML parser.
+     */
+    public static Parser xmlParser() {
+        return new Parser(new XmlTreeBuilder());
+    }
+}
diff --git a/src/org/jsoup/parser/Tag.java b/src/org/jsoup/parser/Tag.java

new file mode 100644 (file)

index 0000000..40b7557
--- /dev/null
+++ b/src/org/jsoup/parser/Tag.java
@@ -0,0 +1,262 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.Validate;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * HTML Tag capabilities.
+ *
+ * @author Jonathan Hedley, jonathan@hedley.net
+ */
+public class Tag {
+    private static final Map<String, Tag> tags = new HashMap<String, Tag>(); // map of known tags
+
+    private String tagName;
+    private boolean isBlock = true; // block or inline
+    private boolean formatAsBlock = true; // should be formatted as a block
+    private boolean canContainBlock = true; // Can this tag hold block level tags?
+    private boolean canContainInline = true; // only pcdata if not
+    private boolean empty = false; // can hold nothing; e.g. img
+    private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty.
+    private boolean preserveWhitespace = false; // for pre, textarea, script etc
+
+    private Tag(String tagName) {
+        this.tagName = tagName.toLowerCase();
+    }
+
+    /**
+     * Get this tag's name.
+     *
+     * @return the tag's name
+     */
+    public String getName() {
+        return tagName;
+    }
+
+    /**
+     * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
+     * <p/>
+     * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals().
+     *
+     * @param tagName Name of tag, e.g. "p". Case insensitive.
+     * @return The tag, either defined or new generic.
+     */
+    public static Tag valueOf(String tagName) {
+        Validate.notNull(tagName);
+        tagName = tagName.trim().toLowerCase();
+        Validate.notEmpty(tagName);
+
+        synchronized (tags) {
+            Tag tag = tags.get(tagName);
+            if (tag == null) {
+                // not defined: create default; go anywhere, do anything! (incl be inside a <p>)
+                tag = new Tag(tagName);
+                tag.isBlock = false;
+                tag.canContainBlock = true;
+            }
+            return tag;
+        }
+    }
+
+    /**
+     * Gets if this is a block tag.
+     *
+     * @return if block tag
+     */
+    public boolean isBlock() {
+        return isBlock;
+    }
+
+    /**
+     * Gets if this tag should be formatted as a block (or as inline)
+     *
+     * @return if should be formatted as block or inline
+     */
+    public boolean formatAsBlock() {
+        return formatAsBlock;
+    }
+
+    /**
+     * Gets if this tag can contain block tags.
+     *
+     * @return if tag can contain block tags
+     */
+    public boolean canContainBlock() {
+        return canContainBlock;
+    }
+
+    /**
+     * Gets if this tag is an inline tag.
+     *
+     * @return if this tag is an inline tag.
+     */
+    public boolean isInline() {
+        return !isBlock;
+    }
+
+    /**
+     * Gets if this tag is a data only tag.
+     *
+     * @return if this tag is a data only tag
+     */
+    public boolean isData() {
+        return !canContainInline && !isEmpty();
+    }
+
+    /**
+     * Get if this is an empty tag
+     *
+     * @return if this is an empty tag
+     */
+    public boolean isEmpty() {
+        return empty;
+    }
+
+    /**
+     * Get if this tag is self closing.
+     *
+     * @return if this tag should be output as self closing.
+     */
+    public boolean isSelfClosing() {
+        return empty || selfClosing;
+    }
+
+    /**
+     * Get if this is a pre-defined tag, or was auto created on parsing.
+     *
+     * @return if a known tag
+     */
+    public boolean isKnownTag() {
+        return tags.containsKey(tagName);
+    }
+
+    /**
+     * Check if this tagname is a known tag.
+     *
+     * @param tagName name of tag
+     * @return if known HTML tag
+     */
+    public static boolean isKnownTag(String tagName) {
+        return tags.containsKey(tagName);
+    }
+
+    /**
+     * Get if this tag should preserve whitespace within child text nodes.
+     *
+     * @return if preserve whitepace
+     */
+    public boolean preserveWhitespace() {
+        return preserveWhitespace;
+    }
+
+    Tag setSelfClosing() {
+        selfClosing = true;
+        return this;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof Tag)) return false;
+
+        Tag tag = (Tag) o;
+
+        if (canContainBlock != tag.canContainBlock) return false;
+        if (canContainInline != tag.canContainInline) return false;
+        if (empty != tag.empty) return false;
+        if (formatAsBlock != tag.formatAsBlock) return false;
+        if (isBlock != tag.isBlock) return false;
+        if (preserveWhitespace != tag.preserveWhitespace) return false;
+        if (selfClosing != tag.selfClosing) return false;
+        if (!tagName.equals(tag.tagName)) return false;
+
+        return true;
+    }
+
+    @Override
+    public int hashCode() {
+        int result = tagName.hashCode();
+        result = 31 * result + (isBlock ? 1 : 0);
+        result = 31 * result + (formatAsBlock ? 1 : 0);
+        result = 31 * result + (canContainBlock ? 1 : 0);
+        result = 31 * result + (canContainInline ? 1 : 0);
+        result = 31 * result + (empty ? 1 : 0);
+        result = 31 * result + (selfClosing ? 1 : 0);
+        result = 31 * result + (preserveWhitespace ? 1 : 0);
+        return result;
+    }
+
+    public String toString() {
+        return tagName;
+    }
+
+    // internal static initialisers:
+    // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources
+    private static final String[] blockTags = {
+            "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
+            "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6",
+            "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
+            "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
+            "td", "video", "audio", "canvas", "details", "menu", "plaintext"
+    };
+    private static final String[] inlineTags = {
+            "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
+            "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q",
+            "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup",
+            "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
+            "summary", "command", "device"
+    };
+    private static final String[] emptyTags = {
+            "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
+            "device"
+    };
+    private static final String[] formatAsInlineTags = {
+            "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style"
+    };
+    private static final String[] preserveWhitespaceTags = {"pre", "plaintext", "title"};
+
+    static {
+        // creates
+        for (String tagName : blockTags) {
+            Tag tag = new Tag(tagName);
+            register(tag);
+        }
+        for (String tagName : inlineTags) {
+            Tag tag = new Tag(tagName);
+            tag.isBlock = false;
+            tag.canContainBlock = false;
+            tag.formatAsBlock = false;
+            register(tag);
+        }
+
+        // mods:
+        for (String tagName : emptyTags) {
+            Tag tag = tags.get(tagName);
+            Validate.notNull(tag);
+            tag.canContainBlock = false;
+            tag.canContainInline = false;
+            tag.empty = true;
+        }
+
+        for (String tagName : formatAsInlineTags) {
+            Tag tag = tags.get(tagName);
+            Validate.notNull(tag);
+            tag.formatAsBlock = false;
+        }
+
+        for (String tagName : preserveWhitespaceTags) {
+            Tag tag = tags.get(tagName);
+            Validate.notNull(tag);
+            tag.preserveWhitespace = true;
+        }
+    }
+
+    private static Tag register(Tag tag) {
+        synchronized (tags) {
+            tags.put(tag.tagName, tag);
+        }
+        return tag;
+    }
+}
diff --git a/src/org/jsoup/parser/Token.java b/src/org/jsoup/parser/Token.java

new file mode 100644 (file)

index 0000000..9f4f9e2
--- /dev/null
+++ b/src/org/jsoup/parser/Token.java
@@ -0,0 +1,252 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.nodes.Attributes;
+
+/**
+ * Parse tokens for the Tokeniser.
+ */
+abstract class Token {
+    TokenType type;
+
+    private Token() {
+    }
+    
+    String tokenType() {
+        return this.getClass().getSimpleName();
+    }
+
+    static class Doctype extends Token {
+        final StringBuilder name = new StringBuilder();
+        final StringBuilder publicIdentifier = new StringBuilder();
+        final StringBuilder systemIdentifier = new StringBuilder();
+        boolean forceQuirks = false;
+
+        Doctype() {
+            type = TokenType.Doctype;
+        }
+
+        String getName() {
+            return name.toString();
+        }
+
+        String getPublicIdentifier() {
+            return publicIdentifier.toString();
+        }
+
+        public String getSystemIdentifier() {
+            return systemIdentifier.toString();
+        }
+
+        public boolean isForceQuirks() {
+            return forceQuirks;
+        }
+    }
+
+    static abstract class Tag extends Token {
+        protected String tagName;
+        private String pendingAttributeName;
+        private String pendingAttributeValue;
+
+        boolean selfClosing = false;
+        Attributes attributes = new Attributes(); // todo: allow nodes to not have attributes
+
+        void newAttribute() {
+            if (pendingAttributeName != null) {
+                if (pendingAttributeValue == null)
+                    pendingAttributeValue = "";
+                Attribute attribute = new Attribute(pendingAttributeName, pendingAttributeValue);
+                attributes.put(attribute);
+            }
+            pendingAttributeName = null;
+            pendingAttributeValue = null;
+        }
+
+        void finaliseTag() {
+            // finalises for emit
+            if (pendingAttributeName != null) {
+                // todo: check if attribute name exists; if so, drop and error
+                newAttribute();
+            }
+        }
+
+        String name() {
+            Validate.isFalse(tagName.length() == 0);
+            return tagName;
+        }
+
+        Tag name(String name) {
+            tagName = name;
+            return this;
+        }
+
+        boolean isSelfClosing() {
+            return selfClosing;
+        }
+
+        @SuppressWarnings({"TypeMayBeWeakened"})
+        Attributes getAttributes() {
+            return attributes;
+        }
+
+        // these appenders are rarely hit in not null state-- caused by null chars.
+        void appendTagName(String append) {
+            tagName = tagName == null ? append : tagName.concat(append);
+        }
+
+        void appendTagName(char append) {
+            appendTagName(String.valueOf(append));
+        }
+
+        void appendAttributeName(String append) {
+            pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append);
+        }
+
+        void appendAttributeName(char append) {
+            appendAttributeName(String.valueOf(append));
+        }
+
+        void appendAttributeValue(String append) {
+            pendingAttributeValue = pendingAttributeValue == null ? append : pendingAttributeValue.concat(append);
+        }
+
+        void appendAttributeValue(char append) {
+            appendAttributeValue(String.valueOf(append));
+        }
+    }
+
+    static class StartTag extends Tag {
+        StartTag() {
+            super();
+            type = TokenType.StartTag;
+        }
+
+        StartTag(String name) {
+            this();
+            this.tagName = name;
+        }
+
+        StartTag(String name, Attributes attributes) {
+            this();
+            this.tagName = name;
+            this.attributes = attributes;
+        }
+
+        @Override
+        public String toString() {
+            return "<" + name() + " " + attributes.toString() + ">";
+        }
+    }
+
+    static class EndTag extends Tag{
+        EndTag() {
+            super();
+            type = TokenType.EndTag;
+        }
+
+        EndTag(String name) {
+            this();
+            this.tagName = name;
+        }
+
+        @Override
+        public String toString() {
+            return "</" + name() + " " + attributes.toString() + ">";
+        }
+    }
+
+    static class Comment extends Token {
+        final StringBuilder data = new StringBuilder();
+
+        Comment() {
+            type = TokenType.Comment;
+        }
+
+        String getData() {
+            return data.toString();
+        }
+
+        @Override
+        public String toString() {
+            return "<!--" + getData() + "-->";
+        }
+    }
+
+    static class Character extends Token {
+        private final String data;
+
+        Character(String data) {
+            type = TokenType.Character;
+            this.data = data;
+        }
+
+        String getData() {
+            return data;
+        }
+
+        @Override
+        public String toString() {
+            return getData();
+        }
+    }
+
+    static class EOF extends Token {
+        EOF() {
+            type = Token.TokenType.EOF;
+        }
+    }
+
+    boolean isDoctype() {
+        return type == TokenType.Doctype;
+    }
+
+    Doctype asDoctype() {
+        return (Doctype) this;
+    }
+
+    boolean isStartTag() {
+        return type == TokenType.StartTag;
+    }
+
+    StartTag asStartTag() {
+        return (StartTag) this;
+    }
+
+    boolean isEndTag() {
+        return type == TokenType.EndTag;
+    }
+
+    EndTag asEndTag() {
+        return (EndTag) this;
+    }
+
+    boolean isComment() {
+        return type == TokenType.Comment;
+    }
+
+    Comment asComment() {
+        return (Comment) this;
+    }
+
+    boolean isCharacter() {
+        return type == TokenType.Character;
+    }
+
+    Character asCharacter() {
+        return (Character) this;
+    }
+
+    boolean isEOF() {
+        return type == TokenType.EOF;
+    }
+
+    enum TokenType {
+        Doctype,
+        StartTag,
+        EndTag,
+        Comment,
+        Character,
+        EOF
+    }
+}
diff --git a/src/org/jsoup/parser/TokenQueue.java b/src/org/jsoup/parser/TokenQueue.java

new file mode 100644 (file)

index 0000000..a2fdfe6
--- /dev/null
+++ b/src/org/jsoup/parser/TokenQueue.java
@@ -0,0 +1,393 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+
+/**
+ * A character queue with parsing helpers.
+ *
+ * @author Jonathan Hedley
+ */
+public class TokenQueue {
+    private String queue;
+    private int pos = 0;
+    
+    private static final char ESC = '\\'; // escape char for chomp balanced.
+
+    /**
+     Create a new TokenQueue.
+     @param data string of data to back queue.
+     */
+    public TokenQueue(String data) {
+        Validate.notNull(data);
+        queue = data;
+    }
+
+    /**
+     * Is the queue empty?
+     * @return true if no data left in queue.
+     */
+    public boolean isEmpty() {
+        return remainingLength() == 0;
+    }
+    
+    private int remainingLength() {
+        return queue.length() - pos;
+    }
+
+    /**
+     * Retrieves but does not remove the first character from the queue.
+     * @return First character, or 0 if empty.
+     */
+    public char peek() {
+        return isEmpty() ? 0 : queue.charAt(pos);
+    }
+
+    /**
+     Add a character to the start of the queue (will be the next character retrieved).
+     @param c character to add
+     */
+    public void addFirst(Character c) {
+        addFirst(c.toString());
+    }
+
+    /**
+     Add a string to the start of the queue.
+     @param seq string to add.
+     */
+    public void addFirst(String seq) {
+        // not very performant, but an edge case
+        queue = seq + queue.substring(pos);
+        pos = 0;
+    }
+
+    /**
+     * Tests if the next characters on the queue match the sequence. Case insensitive.
+     * @param seq String to check queue for.
+     * @return true if the next characters match.
+     */
+    public boolean matches(String seq) {
+        return queue.regionMatches(true, pos, seq, 0, seq.length());
+    }
+
+    /**
+     * Case sensitive match test.
+     * @param seq string to case sensitively check for
+     * @return true if matched, false if not
+     */
+    public boolean matchesCS(String seq) {
+        return queue.startsWith(seq, pos);
+    }
+    
+
+    /**
+     Tests if the next characters match any of the sequences. Case insensitive.
+     @param seq list of strings to case insensitively check for
+     @return true of any matched, false if none did
+     */
+    public boolean matchesAny(String... seq) {
+        for (String s : seq) {
+            if (matches(s))
+                return true;
+        }
+        return false;
+    }
+
+    public boolean matchesAny(char... seq) {
+        if (isEmpty())
+            return false;
+
+        for (char c: seq) {
+            if (queue.charAt(pos) == c)
+                return true;
+        }
+        return false;
+    }
+
+    public boolean matchesStartTag() {
+        // micro opt for matching "<x"
+        return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1)));
+    }
+
+    /**
+     * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the
+     * queue.
+     * @param seq String to search for, and if found, remove from queue.
+     * @return true if found and removed, false if not found.
+     */
+    public boolean matchChomp(String seq) {
+        if (matches(seq)) {
+            pos += seq.length();
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     Tests if queue starts with a whitespace character.
+     @return if starts with whitespace
+     */
+    public boolean matchesWhitespace() {
+        return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos));
+    }
+
+    /**
+     Test if the queue matches a word character (letter or digit).
+     @return if matches a word character
+     */
+    public boolean matchesWord() {
+        return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos));
+    }
+
+    /**
+     * Drops the next character off the queue.
+     */
+    public void advance() {
+        if (!isEmpty()) pos++;
+    }
+
+    /**
+     * Consume one character off queue.
+     * @return first character on queue.
+     */
+    public char consume() {
+        return queue.charAt(pos++);
+    }
+
+    /**
+     * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will
+     * throw an illegal state exception -- but you should be running match() against that condition.
+     <p>
+     Case insensitive.
+     * @param seq sequence to remove from head of queue.
+     */
+    public void consume(String seq) {
+        if (!matches(seq))
+            throw new IllegalStateException("Queue did not match expected sequence");
+        int len = seq.length();
+        if (len > remainingLength())
+            throw new IllegalStateException("Queue not long enough to consume sequence");
+        
+        pos += len;
+    }
+
+    /**
+     * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
+     * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b>
+     * @return The matched data consumed from queue.
+     */
+    public String consumeTo(String seq) {
+        int offset = queue.indexOf(seq, pos);
+        if (offset != -1) {
+            String consumed = queue.substring(pos, offset);
+            pos += consumed.length();
+            return consumed;
+        } else {
+            return remainder();
+        }
+    }
+    
+    public String consumeToIgnoreCase(String seq) {
+        int start = pos;
+        String first = seq.substring(0, 1);
+        boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of
+        while (!isEmpty()) {
+            if (matches(seq))
+                break;
+            
+            if (canScan) {
+                int skip = queue.indexOf(first, pos) - pos;
+                if (skip == 0) // this char is the skip char, but not match, so force advance of pos
+                    pos++;
+                else if (skip < 0) // no chance of finding, grab to end
+                    pos = queue.length();
+                else
+                    pos += skip;
+            }
+            else
+                pos++;
+        }
+
+        String data = queue.substring(start, pos); 
+        return data; 
+    }
+
+    /**
+     Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
+     @param seq any number of terminators to consume to. <b>Case insensitive.</b>
+     @return consumed string   
+     */
+    // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this
+    // is is a case sensitive time...
+    public String consumeToAny(String... seq) {
+        int start = pos;
+        while (!isEmpty() && !matchesAny(seq)) {
+            pos++;
+        }
+
+        String data = queue.substring(start, pos); 
+        return data; 
+    }
+
+    /**
+     * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it).
+     * <p>
+     * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go
+     * isEmpty() == true).
+     * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b>
+     * @return Data matched from queue.
+     */
+    public String chompTo(String seq) {
+        String data = consumeTo(seq);
+        matchChomp(seq);
+        return data;
+    }
+    
+    public String chompToIgnoreCase(String seq) {
+        String data = consumeToIgnoreCase(seq); // case insensitive scan
+        matchChomp(seq);
+        return data;
+    }
+
+    /**
+     * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
+     * and leave " four" on the queue. Unbalanced openers and closers can be escaped (with \). Those escapes will be left
+     * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for
+     * contains text strings; use unescape for that.
+     * @param open opener
+     * @param close closer
+     * @return data matched from the queue
+     */
+    public String chompBalanced(char open, char close) {
+        StringBuilder accum = new StringBuilder();
+        int depth = 0;
+        char last = 0;
+
+        do {
+            if (isEmpty()) break;
+            Character c = consume();
+            if (last == 0 || last != ESC) {
+                if (c.equals(open))
+                    depth++;
+                else if (c.equals(close))
+                    depth--;
+            }
+
+            if (depth > 0 && last != 0)
+                accum.append(c); // don't include the outer match pair in the return
+            last = c;
+        } while (depth > 0);
+        return accum.toString();
+    }
+    
+    /**
+     * Unescaped a \ escaped string.
+     * @param in backslash escaped string
+     * @return unescaped string
+     */
+    public static String unescape(String in) {
+        StringBuilder out = new StringBuilder();
+        char last = 0;
+        for (char c : in.toCharArray()) {
+            if (c == ESC) {
+                if (last != 0 && last == ESC)
+                    out.append(c);
+            }
+            else 
+                out.append(c);
+            last = c;
+        }
+        return out.toString();
+    }
+
+    /**
+     * Pulls the next run of whitespace characters of the queue.
+     */
+    public boolean consumeWhitespace() {
+        boolean seen = false;
+        while (matchesWhitespace()) {
+            pos++;
+            seen = true;
+        }
+        return seen;
+    }
+
+    /**
+     * Retrieves the next run of word type (letter or digit) off the queue.
+     * @return String of word characters from queue, or empty string if none.
+     */
+    public String consumeWord() {
+        int start = pos;
+        while (matchesWord())
+            pos++;
+        return queue.substring(start, pos);
+    }
+    
+    /**
+     * Consume an tag name off the queue (word or :, _, -)
+     * 
+     * @return tag name
+     */
+    public String consumeTagName() {
+        int start = pos;
+        while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-')))
+            pos++;
+        
+        return queue.substring(start, pos);
+    }
+    
+    /**
+     * Consume a CSS element selector (tag name, but | instead of : for namespaces, to not conflict with :pseudo selects).
+     * 
+     * @return tag name
+     */
+    public String consumeElementSelector() {
+        int start = pos;
+        while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-')))
+            pos++;
+        
+        return queue.substring(start, pos);
+    }
+
+    /**
+     Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
+     http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
+     @return identifier
+     */
+    public String consumeCssIdentifier() {
+        int start = pos;
+        while (!isEmpty() && (matchesWord() || matchesAny('-', '_')))
+            pos++;
+
+        return queue.substring(start, pos);
+    }
+
+    /**
+     Consume an attribute key off the queue (letter, digit, -, _, :")
+     @return attribute key
+     */
+    public String consumeAttributeKey() {
+        int start = pos;
+        while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':')))
+            pos++;
+        
+        return queue.substring(start, pos);
+    }
+
+    /**
+     Consume and return whatever is left on the queue.
+     @return remained of queue.
+     */
+    public String remainder() {
+        StringBuilder accum = new StringBuilder();
+        while (!isEmpty()) {
+            accum.append(consume());
+        }
+        return accum.toString();
+    }
+    
+    public String toString() {
+        return queue.substring(pos);
+    }
+}
diff --git a/src/org/jsoup/parser/Tokeniser.java b/src/org/jsoup/parser/Tokeniser.java

new file mode 100644 (file)

index 0000000..ce6ee69
--- /dev/null
+++ b/src/org/jsoup/parser/Tokeniser.java
@@ -0,0 +1,230 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Entities;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Readers the input stream into tokens.
+ */
+class Tokeniser {
+    static final char replacementChar = '\uFFFD'; // replaces null character
+
+    private CharacterReader reader; // html input
+    private ParseErrorList errors; // errors found while tokenising
+
+    private TokeniserState state = TokeniserState.Data; // current tokenisation state
+    private Token emitPending; // the token we are about to emit on next read
+    private boolean isEmitPending = false;
+    private StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token
+    StringBuilder dataBuffer; // buffers data looking for </script>
+
+    Token.Tag tagPending; // tag we are building up
+    Token.Doctype doctypePending; // doctype building up
+    Token.Comment commentPending; // comment building up
+    private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag
+    private boolean selfClosingFlagAcknowledged = true;
+
+    Tokeniser(CharacterReader reader, ParseErrorList errors) {
+        this.reader = reader;
+        this.errors = errors;
+    }
+
+    Token read() {
+        if (!selfClosingFlagAcknowledged) {
+            error("Self closing flag not acknowledged");
+            selfClosingFlagAcknowledged = true;
+        }
+
+        while (!isEmitPending)
+            state.read(this, reader);
+
+        // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read:
+        if (charBuffer.length() > 0) {
+            String str = charBuffer.toString();
+            charBuffer.delete(0, charBuffer.length());
+            return new Token.Character(str);
+        } else {
+            isEmitPending = false;
+            return emitPending;
+        }
+    }
+
+    void emit(Token token) {
+        Validate.isFalse(isEmitPending, "There is an unread token pending!");
+
+        emitPending = token;
+        isEmitPending = true;
+
+        if (token.type == Token.TokenType.StartTag) {
+            Token.StartTag startTag = (Token.StartTag) token;
+            lastStartTag = startTag;
+            if (startTag.selfClosing)
+                selfClosingFlagAcknowledged = false;
+        } else if (token.type == Token.TokenType.EndTag) {
+            Token.EndTag endTag = (Token.EndTag) token;
+            if (endTag.attributes.size() > 0)
+                error("Attributes incorrectly present on end tag");
+        }
+    }
+
+    void emit(String str) {
+        // buffer strings up until last string token found, to emit only one token for a run of character refs etc.
+        // does not set isEmitPending; read checks that
+        charBuffer.append(str);
+    }
+
+    void emit(char c) {
+        charBuffer.append(c);
+    }
+
+    TokeniserState getState() {
+        return state;
+    }
+
+    void transition(TokeniserState state) {
+        this.state = state;
+    }
+
+    void advanceTransition(TokeniserState state) {
+        reader.advance();
+        this.state = state;
+    }
+
+    void acknowledgeSelfClosingFlag() {
+        selfClosingFlagAcknowledged = true;
+    }
+
+    Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
+        if (reader.isEmpty())
+            return null;
+        if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
+            return null;
+        if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&'))
+            return null;
+
+        reader.mark();
+        if (reader.matchConsume("#")) { // numbered
+            boolean isHexMode = reader.matchConsumeIgnoreCase("X");
+            String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
+            if (numRef.length() == 0) { // didn't match anything
+                characterReferenceError("numeric reference with no numerals");
+                reader.rewindToMark();
+                return null;
+            }
+            if (!reader.matchConsume(";"))
+                characterReferenceError("missing semicolon"); // missing semi
+            int charval = -1;
+            try {
+                int base = isHexMode ? 16 : 10;
+                charval = Integer.valueOf(numRef, base);
+            } catch (NumberFormatException e) {
+            } // skip
+            if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
+                characterReferenceError("character outside of valid range");
+                return replacementChar;
+            } else {
+                // todo: implement number replacement table
+                // todo: check for extra illegal unicode points as parse errors
+                return (char) charval;
+            }
+        } else { // named
+            // get as many letters as possible, and look for matching entities. unconsume backwards till a match is found
+            String nameRef = reader.consumeLetterThenDigitSequence();
+            String origNameRef = new String(nameRef); // for error reporting. nameRef gets chomped looking for matches
+            boolean looksLegit = reader.matches(';');
+            boolean found = false;
+            while (nameRef.length() > 0 && !found) {
+                if (Entities.isNamedEntity(nameRef))
+                    found = true;
+                else {
+                    nameRef = nameRef.substring(0, nameRef.length()-1);
+                    reader.unconsume();
+                }
+            }
+            if (!found) {
+                if (looksLegit) // named with semicolon
+                    characterReferenceError(String.format("invalid named referenece '%s'", origNameRef));
+                reader.rewindToMark();
+                return null;
+            }
+            if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
+                // don't want that to match
+                reader.rewindToMark();
+                return null;
+            }
+            if (!reader.matchConsume(";"))
+                characterReferenceError("missing semicolon"); // missing semi
+            return Entities.getCharacterByName(nameRef);
+        }
+    }
+
+    Token.Tag createTagPending(boolean start) {
+        tagPending = start ? new Token.StartTag() : new Token.EndTag();
+        return tagPending;
+    }
+
+    void emitTagPending() {
+        tagPending.finaliseTag();
+        emit(tagPending);
+    }
+
+    void createCommentPending() {
+        commentPending = new Token.Comment();
+    }
+
+    void emitCommentPending() {
+        emit(commentPending);
+    }
+
+    void createDoctypePending() {
+        doctypePending = new Token.Doctype();
+    }
+
+    void emitDoctypePending() {
+        emit(doctypePending);
+    }
+
+    void createTempBuffer() {
+        dataBuffer = new StringBuilder();
+    }
+
+    boolean isAppropriateEndTagToken() {
+        if (lastStartTag == null)
+            return false;
+        return tagPending.tagName.equals(lastStartTag.tagName);
+    }
+
+    String appropriateEndTagName() {
+        return lastStartTag.tagName;
+    }
+
+    void error(TokeniserState state) {
+        if (errors.canAddError())
+            errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state));
+    }
+
+    void eofError(TokeniserState state) {
+        if (errors.canAddError())
+            errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state));
+    }
+
+    private void characterReferenceError(String message) {
+        if (errors.canAddError())
+            errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message));
+    }
+
+    private void error(String errorMsg) {
+        if (errors.canAddError())
+            errors.add(new ParseError(reader.pos(), errorMsg));
+    }
+
+    boolean currentNodeInHtmlNS() {
+        // todo: implement namespaces correctly
+        return true;
+        // Element currentNode = currentNode();
+        // return currentNode != null && currentNode.namespace().equals("HTML");
+    }
+}
diff --git a/src/org/jsoup/parser/TokeniserState.java b/src/org/jsoup/parser/TokeniserState.java

new file mode 100644 (file)

index 0000000..e3013c7
--- /dev/null
+++ b/src/org/jsoup/parser/TokeniserState.java
@@ -0,0 +1,1778 @@
+package org.jsoup.parser;
+
+/**
+ * States and transition activations for the Tokeniser.
+ */
+enum TokeniserState {
+    Data {
+        // in data state, gather characters until a character reference or tag is found
+        // Dispatches on the current character without consuming it; each branch decides how far to move on.
+        void read(Tokeniser t, CharacterReader r) {
+            switch (r.current()) {
+                case '&':
+                    // possible character reference
+                    t.advanceTransition(CharacterReferenceInData);
+                    break;
+                case '<':
+                    // possible start of a tag
+                    t.advanceTransition(TagOpen);
+                    break;
+                case nullChar:
+                    t.error(this); // NOT replacement character (oddly?)
+                    // the null char itself is consumed and emitted as-is
+                    t.emit(r.consume());
+                    break;
+                case eof:
+                    t.emit(new Token.EOF());
+                    break;
+                default:
+                    // plain text: take everything up to the next char that needs special handling
+                    String data = r.consumeToAny('&', '<', nullChar);
+                    t.emit(data);
+                    break;
+            }
+        }
+    },
+    CharacterReferenceInData {
+        // reached from Data on '&': resolve the reference, or fall back to a literal ampersand, then resume Data
+        void read(Tokeniser t, CharacterReader r) {
+            Character resolved = t.consumeCharacterReference(null, false);
+            if (resolved != null) {
+                t.emit(resolved);
+            } else {
+                // not a valid reference: the '&' was just text
+                t.emit('&');
+            }
+            t.transition(Data);
+        }
+    },
+    Rcdata {
+        // handles data in title, textarea etc
+        // Like Data, but '<' leads to RcdataLessthanSign and a null char is emitted as replacementChar.
+        void read(Tokeniser t, CharacterReader r) {
+            switch (r.current()) {
+                case '&':
+                    t.advanceTransition(CharacterReferenceInRcdata);
+                    break;
+                case '<':
+                    t.advanceTransition(RcdataLessthanSign);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    // skip the null char and emit the replacement in its place
+                    r.advance();
+                    t.emit(replacementChar);
+                    break;
+                case eof:
+                    t.emit(new Token.EOF());
+                    break;
+                default:
+                    // plain text up to the next char that needs special handling
+                    String data = r.consumeToAny('&', '<', nullChar);
+                    t.emit(data);
+                    break;
+            }
+        }
+    },
+    CharacterReferenceInRcdata {
+        // reached from Rcdata on '&': resolve the reference, or fall back to a literal ampersand, then resume Rcdata
+        void read(Tokeniser t, CharacterReader r) {
+            Character resolved = t.consumeCharacterReference(null, false);
+            if (resolved != null) {
+                t.emit(resolved);
+            } else {
+                // not a valid reference: the '&' was just text
+                t.emit('&');
+            }
+            t.transition(Rcdata);
+        }
+    },
+    Rawtext {
+        // Raw text content: only '<', the null char and eof are special; '&' gets no special treatment here.
+        void read(Tokeniser t, CharacterReader r) {
+            switch (r.current()) {
+                case '<':
+                    t.advanceTransition(RawtextLessthanSign);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    // skip the null char and emit the replacement in its place
+                    r.advance();
+                    t.emit(replacementChar);
+                    break;
+                case eof:
+                    t.emit(new Token.EOF());
+                    break;
+                default:
+                    // plain text up to the next '<' or null char
+                    String data = r.consumeToAny('<', nullChar);
+                    t.emit(data);
+                    break;
+            }
+        }
+    },
+    ScriptData {
+        // Script content: only '<', the null char and eof are special.
+        void read(Tokeniser t, CharacterReader r) {
+            switch (r.current()) {
+                case '<':
+                    t.advanceTransition(ScriptDataLessthanSign);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    // skip the null char and emit the replacement in its place
+                    r.advance();
+                    t.emit(replacementChar);
+                    break;
+                case eof:
+                    t.emit(new Token.EOF());
+                    break;
+                default:
+                    // script text up to the next '<' or null char
+                    String data = r.consumeToAny('<', nullChar);
+                    t.emit(data);
+                    break;
+            }
+        }
+    },
+    PLAINTEXT {
+        // Plain text: no transitions out of this state; only the null char and eof are special.
+        void read(Tokeniser t, CharacterReader r) {
+            switch (r.current()) {
+                case nullChar:
+                    t.error(this);
+                    // skip the null char and emit the replacement in its place
+                    r.advance();
+                    t.emit(replacementChar);
+                    break;
+                case eof:
+                    t.emit(new Token.EOF());
+                    break;
+                default:
+                    // everything up to the next null char is text
+                    String data = r.consumeTo(nullChar);
+                    t.emit(data);
+                    break;
+            }
+        }
+    },
+    TagOpen {
+        // from < in data
+        // Decides what kind of markup the '<' introduced by looking at (not consuming) the next character.
+        void read(Tokeniser t, CharacterReader r) {
+            switch (r.current()) {
+                case '!':
+                    t.advanceTransition(MarkupDeclarationOpen);
+                    break;
+                case '/':
+                    t.advanceTransition(EndTagOpen);
+                    break;
+                case '?':
+                    t.advanceTransition(BogusComment);
+                    break;
+                default:
+                    if (r.matchesLetter()) {
+                        // start tag: the letter is left in place for TagName to consume
+                        t.createTagPending(true);
+                        t.transition(TagName);
+                    } else {
+                        // not markup after all: report it and treat the '<' as text
+                        t.error(this);
+                        t.emit('<'); // char that got us here
+                        t.transition(Data);
+                    }
+                    break;
+            }
+        }
+    },
+    EndTagOpen {
+        // reached from TagOpen on '/', i.e. after "</"
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.isEmpty()) {
+                // input ended right after "</": report it and emit the two chars as text
+                t.eofError(this);
+                t.emit("</");
+                t.transition(Data);
+            } else if (r.matchesLetter()) {
+                // end tag: the letter is left in place for TagName to consume
+                t.createTagPending(false);
+                t.transition(TagName);
+            } else if (r.matches('>')) {
+                // "</>": reported as an error, nothing is emitted
+                t.error(this);
+                t.advanceTransition(Data);
+            } else {
+                // anything else after "</" is handled as a bogus comment
+                t.error(this);
+                t.advanceTransition(BogusComment);
+            }
+        }
+    },
+    TagName {
+        // from < or </ in data, will have start or end tag pending
+        void read(Tokeniser t, CharacterReader r) {
+            // previous TagOpen state did NOT consume, will have a letter char in current
+            // take the run of name characters in one go; the name is stored lowercased
+            String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', nullChar).toLowerCase();
+            t.tagPending.appendTagName(tagName);
+
+            // the char that stopped the run decides what happens next
+            switch (r.consume()) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(BeforeAttributeName);
+                    break;
+                case '/':
+                    t.transition(SelfClosingStartTag);
+                    break;
+                case '>':
+                    t.emitTagPending();
+                    t.transition(Data);
+                    break;
+                case nullChar: // replacement
+                    // no transition: the state is read again to continue the name
+                    t.tagPending.appendTagName(replacementStr);
+                    break;
+                case eof: // should emit pending tag?
+                    // the pending tag is not emitted here
+                    t.eofError(this);
+                    t.transition(Data);
+                // no default, as covered with above consumeToAny
+            }
+        }
+    },
+    RcdataLessthanSign {
+        // from < in rcdata
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matches('/')) {
+                // possible end tag: start collecting its name in a fresh temp buffer
+                t.createTempBuffer();
+                t.advanceTransition(RCDATAEndTagOpen);
+            } else if (r.matchesLetter() && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
+                // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than
+                // consuming to EOF; break out here
+                // an end tag for the last start tag is emitted, then the '<' is re-read in Data
+                t.tagPending = new Token.EndTag(t.appropriateEndTagName());
+                t.emitTagPending();
+                r.unconsume(); // undo "<"
+                t.transition(Data);
+            } else {
+                // the '<' was just text
+                t.emit("<");
+                t.transition(Rcdata);
+            }
+        }
+    },
+    RCDATAEndTagOpen {
+        // reached from RcdataLessthanSign on '/', i.e. after "</" inside rcdata
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // candidate end tag: begin its name with the current letter
+                t.createTagPending(false);
+                char first = r.current();
+                // tag names are compared in lowercase ...
+                t.tagPending.appendTagName(Character.toLowerCase(first));
+                // ... but the temp buffer must keep the input's own case: RCDATAEndTagName re-emits it as
+                // text when the tag turns out not to be the appropriate end tag
+                t.dataBuffer.append(first);
+                t.advanceTransition(RCDATAEndTagName);
+            } else {
+                // "</" not followed by a letter is just text
+                t.emit("</");
+                t.transition(Rcdata);
+            }
+        }
+    },
+    RCDATAEndTagName {
+        // Collects the rest of a candidate end tag name inside rcdata. The tag only ends the rcdata section
+        // when it is the appropriate end tag; otherwise what was collected is emitted as text.
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // lowercased into the tag name, original case into the temp buffer (used if re-emitted as text)
+                String name = r.consumeLetterSequence();
+                t.tagPending.appendTagName(name.toLowerCase());
+                t.dataBuffer.append(name);
+                return;
+            }
+
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    if (t.isAppropriateEndTagToken())
+                        t.transition(BeforeAttributeName);
+                    else
+                        anythingElse(t, r);
+                    break;
+                case '/':
+                    if (t.isAppropriateEndTagToken())
+                        t.transition(SelfClosingStartTag);
+                    else
+                        anythingElse(t, r);
+                    break;
+                case '>':
+                    if (t.isAppropriateEndTagToken()) {
+                        t.emitTagPending();
+                        t.transition(Data);
+                    }
+                    else
+                        anythingElse(t, r);
+                    break;
+                default:
+                    anythingElse(t, r);
+            }
+        }
+
+        // Not an end tag after all: emit "</" plus the collected name as text and resume Rcdata.
+        // Every caller has just consumed the terminating character, so it is pushed back here for Rcdata to
+        // process; without the unconsume that character would be silently dropped from the output.
+        private void anythingElse(Tokeniser t, CharacterReader r) {
+            t.emit("</" + t.dataBuffer.toString());
+            r.unconsume();
+            t.transition(Rcdata);
+        }
+    },
+    RawtextLessthanSign {
+        // reached from Rawtext on '<': either the start of an end tag, or a literal '<'
+        void read(Tokeniser t, CharacterReader r) {
+            if (!r.matches('/')) {
+                // literal '<': emit it and carry on with raw text
+                t.emit('<');
+                t.transition(Rawtext);
+                return;
+            }
+            // "</": start collecting a candidate end tag name in a fresh temp buffer
+            t.createTempBuffer();
+            t.advanceTransition(RawtextEndTagOpen);
+        }
+    },
+    RawtextEndTagOpen {
+        // reached from RawtextLessthanSign on '/', i.e. after "</" inside raw text
+        void read(Tokeniser t, CharacterReader r) {
+            if (!r.matchesLetter()) {
+                // "</" not followed by a letter is just text
+                t.emit("</");
+                t.transition(Rawtext);
+                return;
+            }
+            // candidate end tag; the letter is left in place for RawtextEndTagName to consume
+            t.createTagPending(false);
+            t.transition(RawtextEndTagName);
+        }
+    },
+    RawtextEndTagName {
+        // Collects a candidate end tag name inside raw text. The tag only ends the raw text when it is the
+        // appropriate end tag; otherwise what was collected is emitted as text.
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // lowercased into the tag name, original case into the temp buffer (used if re-emitted as text)
+                String name = r.consumeLetterSequence();
+                t.tagPending.appendTagName(name.toLowerCase());
+                t.dataBuffer.append(name);
+                return;
+            }
+
+            // only consume the terminating char when the name matches and there is input left
+            if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
+                char c = r.consume();
+                switch (c) {
+                    case '\t':
+                    case '\n':
+                    case '\f':
+                    case ' ':
+                        t.transition(BeforeAttributeName);
+                        break;
+                    case '/':
+                        t.transition(SelfClosingStartTag);
+                        break;
+                    case '>':
+                        t.emitTagPending();
+                        t.transition(Data);
+                        break;
+                    default:
+                        // keep the consumed char by adding it to the text that anythingElse emits
+                        t.dataBuffer.append(c);
+                        anythingElse(t, r);
+                }
+            } else
+                anythingElse(t, r);
+        }
+
+        // Not an end tag after all: emit "</" plus the buffered characters as text and resume Rawtext.
+        private void anythingElse(Tokeniser t, CharacterReader r) {
+            t.emit("</" + t.dataBuffer.toString());
+            t.transition(Rawtext);
+        }
+    },
+    ScriptDataLessthanSign {
+        // reached from ScriptData on '<'
+        void read(Tokeniser t, CharacterReader r) {
+            switch (r.consume()) {
+                case '/':
+                    // "</": start collecting a candidate end tag name in a fresh temp buffer
+                    t.createTempBuffer();
+                    t.transition(ScriptDataEndTagOpen);
+                    break;
+                case '!':
+                    // "<!": emitted as text; may be the start of an escaped section
+                    t.emit("<!");
+                    t.transition(ScriptDataEscapeStart);
+                    break;
+                default:
+                    // literal '<': emit it and put the consumed char back for ScriptData to read
+                    t.emit("<");
+                    r.unconsume();
+                    t.transition(ScriptData);
+            }
+        }
+    },
+    ScriptDataEndTagOpen {
+        // reached from ScriptDataLessthanSign on '/', i.e. after "</" inside script data
+        void read(Tokeniser t, CharacterReader r) {
+            if (!r.matchesLetter()) {
+                // "</" not followed by a letter is just script text
+                t.emit("</");
+                t.transition(ScriptData);
+                return;
+            }
+            // candidate end tag; the letter is left in place for ScriptDataEndTagName to consume
+            t.createTagPending(false);
+            t.transition(ScriptDataEndTagName);
+        }
+    },
+    ScriptDataEndTagName {
+        // Collects a candidate end tag name inside script data. The tag only ends the script when it is the
+        // appropriate end tag; otherwise what was collected is emitted as text.
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // lowercased into the tag name, original case into the temp buffer (used if re-emitted as text)
+                String name = r.consumeLetterSequence();
+                t.tagPending.appendTagName(name.toLowerCase());
+                t.dataBuffer.append(name);
+                return;
+            }
+
+            // only consume the terminating char when the name matches and there is input left
+            if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
+                char c = r.consume();
+                switch (c) {
+                    case '\t':
+                    case '\n':
+                    case '\f':
+                    case ' ':
+                        t.transition(BeforeAttributeName);
+                        break;
+                    case '/':
+                        t.transition(SelfClosingStartTag);
+                        break;
+                    case '>':
+                        t.emitTagPending();
+                        t.transition(Data);
+                        break;
+                    default:
+                        // keep the consumed char by adding it to the text that anythingElse emits
+                        t.dataBuffer.append(c);
+                        anythingElse(t, r);
+                }
+            } else {
+                anythingElse(t, r);
+            }
+        }
+
+        // Not an end tag after all: emit "</" plus the buffered characters as text and resume ScriptData.
+        private void anythingElse(Tokeniser t, CharacterReader r) {
+            t.emit("</" + t.dataBuffer.toString());
+            t.transition(ScriptData);
+        }
+    },
+    ScriptDataEscapeStart {
+        // reached from ScriptDataLessthanSign after "<!": a '-' continues towards an escaped section
+        void read(Tokeniser t, CharacterReader r) {
+            if (!r.matches('-')) {
+                // no dash: back to ordinary script data, leaving the current char unread
+                t.transition(ScriptData);
+                return;
+            }
+            t.emit('-');
+            t.advanceTransition(ScriptDataEscapeStartDash);
+        }
+    },
+    ScriptDataEscapeStartDash {
+        // reached from ScriptDataEscapeStart after a first '-': a second '-' enters the escaped section
+        void read(Tokeniser t, CharacterReader r) {
+            if (!r.matches('-')) {
+                // no second dash: back to ordinary script data, leaving the current char unread
+                t.transition(ScriptData);
+                return;
+            }
+            t.emit('-');
+            t.advanceTransition(ScriptDataEscapedDashDash);
+        }
+    },
+    ScriptDataEscaped {
+        // Script content inside an escaped section: '-', '<' and the null char are special.
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.isEmpty()) {
+                // input ended inside the escaped section
+                t.eofError(this);
+                t.transition(Data);
+                return;
+            }
+
+            switch (r.current()) {
+                case '-':
+                    // emitted as text; may be the first dash of the closing sequence
+                    t.emit('-');
+                    t.advanceTransition(ScriptDataEscapedDash);
+                    break;
+                case '<':
+                    t.advanceTransition(ScriptDataEscapedLessthanSign);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    // skip the null char and emit the replacement in its place
+                    r.advance();
+                    t.emit(replacementChar);
+                    break;
+                default:
+                    // script text up to the next special char
+                    String data = r.consumeToAny('-', '<', nullChar);
+                    t.emit(data);
+            }
+        }
+    },
+    ScriptDataEscapedDash {
+        // reached from ScriptDataEscaped after one '-'
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.isEmpty()) {
+                // input ended inside the escaped section
+                t.eofError(this);
+                t.transition(Data);
+                return;
+            }
+
+            char c = r.consume();
+            switch (c) {
+                case '-':
+                    // second dash in a row
+                    t.emit(c);
+                    t.transition(ScriptDataEscapedDashDash);
+                    break;
+                case '<':
+                    t.transition(ScriptDataEscapedLessthanSign);
+                    break;
+                case nullChar:
+                    // the null char is replaced, and the dash run is over
+                    t.error(this);
+                    t.emit(replacementChar);
+                    t.transition(ScriptDataEscaped);
+                    break;
+                default:
+                    // any other char is text, and the dash run is over
+                    t.emit(c);
+                    t.transition(ScriptDataEscaped);
+            }
+        }
+    },
+    ScriptDataEscapedDashDash {
+        // reached after two consecutive '-' in an escaped section (or right after the opening "<!--")
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.isEmpty()) {
+                // input ended inside the escaped section
+                t.eofError(this);
+                t.transition(Data);
+                return;
+            }
+
+            char c = r.consume();
+            switch (c) {
+                case '-':
+                    // further dashes are emitted and keep us in this state
+                    t.emit(c);
+                    break;
+                case '<':
+                    t.transition(ScriptDataEscapedLessthanSign);
+                    break;
+                case '>':
+                    // "-->" ends the escaped section
+                    t.emit(c);
+                    t.transition(ScriptData);
+                    break;
+                case nullChar:
+                    // the null char is replaced, and the dash run is over
+                    t.error(this);
+                    t.emit(replacementChar);
+                    t.transition(ScriptDataEscaped);
+                    break;
+                default:
+                    // any other char is text, and the dash run is over
+                    t.emit(c);
+                    t.transition(ScriptDataEscaped);
+            }
+        }
+    },
+    ScriptDataEscapedLessthanSign {
+        // reached on '<' inside an escaped script section
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // "<" + letter: emitted as text, and the lowercased letter starts the temp buffer that
+                // ScriptDataDoubleEscapeStart compares against "script"
+                t.createTempBuffer();
+                t.dataBuffer.append(Character.toLowerCase(r.current()));
+                t.emit("<" + r.current());
+                t.advanceTransition(ScriptDataDoubleEscapeStart);
+            } else if (r.matches('/')) {
+                // "</": start collecting a candidate end tag name in a fresh temp buffer
+                t.createTempBuffer();
+                t.advanceTransition(ScriptDataEscapedEndTagOpen);
+            } else {
+                // literal '<'
+                t.emit('<');
+                t.transition(ScriptDataEscaped);
+            }
+        }
+    },
+    ScriptDataEscapedEndTagOpen {
+        // reached from ScriptDataEscapedLessthanSign on '/', i.e. after "</" inside an escaped script section
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // candidate end tag: lowercased letter into the tag name, original letter into the temp buffer
+                t.createTagPending(false);
+                t.tagPending.appendTagName(Character.toLowerCase(r.current()));
+                t.dataBuffer.append(r.current());
+                t.advanceTransition(ScriptDataEscapedEndTagName);
+            } else {
+                // "</" not followed by a letter is just text
+                t.emit("</");
+                t.transition(ScriptDataEscaped);
+            }
+        }
+    },
+    ScriptDataEscapedEndTagName {
+        // Collects a candidate end tag name inside an escaped script section. The tag only ends the script
+        // when it is the appropriate end tag; otherwise what was collected is emitted as text.
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // lowercased into the tag name, original case into the temp buffer (used if re-emitted as text)
+                String name = r.consumeLetterSequence();
+                t.tagPending.appendTagName(name.toLowerCase());
+                t.dataBuffer.append(name);
+                return;
+            }
+
+            // only consume the terminating char when the name matches and there is input left
+            if (t.isAppropriateEndTagToken() && !r.isEmpty()) {
+                char c = r.consume();
+                switch (c) {
+                    case '\t':
+                    case '\n':
+                    case '\f':
+                    case ' ':
+                        t.transition(BeforeAttributeName);
+                        break;
+                    case '/':
+                        t.transition(SelfClosingStartTag);
+                        break;
+                    case '>':
+                        t.emitTagPending();
+                        t.transition(Data);
+                        break;
+                    default:
+                        // keep the consumed char by adding it to the text that anythingElse emits
+                        t.dataBuffer.append(c);
+                        anythingElse(t, r);
+                        break;
+                }
+            } else {
+                anythingElse(t, r);
+            }
+        }
+        
+        // Not an end tag after all: emit "</" plus the buffered characters as text and resume ScriptDataEscaped.
+        private void anythingElse(Tokeniser t, CharacterReader r) {
+            t.emit("</" + t.dataBuffer.toString());
+            t.transition(ScriptDataEscaped);
+        }
+    },
+    ScriptDataDoubleEscapeStart {
+        // reached from ScriptDataEscapedLessthanSign after "<" + letter: checks whether the name is "script"
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // letters are emitted as text as they come; a lowercased copy is buffered for the comparison
+                String name = r.consumeLetterSequence();
+                t.dataBuffer.append(name.toLowerCase());
+                t.emit(name);
+                return;
+            }
+
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                case '/':
+                case '>':
+                    // name terminated: "script" enters the double escaped section, anything else does not
+                    if (t.dataBuffer.toString().equals("script"))
+                        t.transition(ScriptDataDoubleEscaped);
+                    else
+                        t.transition(ScriptDataEscaped);
+                    t.emit(c);
+                    break;
+                default:
+                    // not a name terminator: put the char back and carry on in the escaped section
+                    r.unconsume();
+                    t.transition(ScriptDataEscaped);
+            }
+        }
+    },
+    ScriptDataDoubleEscaped {
+        // Script content inside a double escaped section: everything is emitted as text, including '-' and '<'.
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.current();
+            switch (c) {
+                case '-':
+                    t.emit(c);
+                    t.advanceTransition(ScriptDataDoubleEscapedDash);
+                    break;
+                case '<':
+                    t.emit(c);
+                    t.advanceTransition(ScriptDataDoubleEscapedLessthanSign);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    // skip the null char and emit the replacement in its place
+                    r.advance();
+                    t.emit(replacementChar);
+                    break;
+                case eof:
+                    // input ended inside the double escaped section
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                default:
+                    // script text up to the next special char
+                    String data = r.consumeToAny('-', '<', nullChar);
+                    t.emit(data);
+            }
+        }
+    },
+    ScriptDataDoubleEscapedDash {
+        // reached from ScriptDataDoubleEscaped after one '-'
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '-':
+                    // second dash in a row
+                    t.emit(c);
+                    t.transition(ScriptDataDoubleEscapedDashDash);
+                    break;
+                case '<':
+                    t.emit(c);
+                    t.transition(ScriptDataDoubleEscapedLessthanSign);
+                    break;
+                case nullChar:
+                    // the null char is replaced, and the dash run is over
+                    t.error(this);
+                    t.emit(replacementChar);
+                    t.transition(ScriptDataDoubleEscaped);
+                    break;
+                case eof:
+                    // input ended inside the double escaped section
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                default:
+                    // any other char is text, and the dash run is over
+                    t.emit(c);
+                    t.transition(ScriptDataDoubleEscaped);
+            }
+        }
+    },
+    ScriptDataDoubleEscapedDashDash {
+        // reached after two consecutive '-' in a double escaped section
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '-':
+                    // further dashes are emitted and keep us in this state
+                    t.emit(c);
+                    break;
+                case '<':
+                    t.emit(c);
+                    t.transition(ScriptDataDoubleEscapedLessthanSign);
+                    break;
+                case '>':
+                    // "-->" ends the escaping; back to ordinary script data
+                    t.emit(c);
+                    t.transition(ScriptData);
+                    break;
+                case nullChar:
+                    // the null char is replaced, and the dash run is over
+                    t.error(this);
+                    t.emit(replacementChar);
+                    t.transition(ScriptDataDoubleEscaped);
+                    break;
+                case eof:
+                    // input ended inside the double escaped section
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                default:
+                    // any other char is text, and the dash run is over
+                    t.emit(c);
+                    t.transition(ScriptDataDoubleEscaped);
+            }
+        }
+    },
+    ScriptDataDoubleEscapedLessthanSign {
+        // reached on '<' inside a double escaped section (the '<' itself was already emitted)
+        void read(Tokeniser t, CharacterReader r) {
+            if (!r.matches('/')) {
+                // not "</": carry on in the double escaped section, leaving the current char unread
+                t.transition(ScriptDataDoubleEscaped);
+                return;
+            }
+            // "</": emit the slash as text and start buffering the name that follows
+            t.emit('/');
+            t.createTempBuffer();
+            t.advanceTransition(ScriptDataDoubleEscapeEnd);
+        }
+    },
+    ScriptDataDoubleEscapeEnd {
+        // reached from ScriptDataDoubleEscapedLessthanSign after "</": checks whether the name is "script"
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // letters are emitted as text as they come; a lowercased copy is buffered for the comparison
+                String name = r.consumeLetterSequence();
+                t.dataBuffer.append(name.toLowerCase());
+                t.emit(name);
+                return;
+            }
+
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                case '/':
+                case '>':
+                    // name terminated: "script" drops back to the (single) escaped section, anything else stays
+                    if (t.dataBuffer.toString().equals("script"))
+                        t.transition(ScriptDataEscaped);
+                    else
+                        t.transition(ScriptDataDoubleEscaped);
+                    t.emit(c);
+                    break;
+                default:
+                    // not a name terminator: put the char back and stay in the double escaped section
+                    r.unconsume();
+                    t.transition(ScriptDataDoubleEscaped);
+            }
+        }
+    },
+    BeforeAttributeName {
+        // from tagname <xxx
+        // Skips whitespace, then either finishes the tag or starts a new attribute on the pending tag.
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    break; // ignore whitespace
+                case '/':
+                    t.transition(SelfClosingStartTag);
+                    break;
+                case '>':
+                    t.emitTagPending();
+                    t.transition(Data);
+                    break;
+                case nullChar:
+                    // start an attribute and put the null char back, so AttributeName deals with it
+                    t.error(this);
+                    t.tagPending.newAttribute();
+                    r.unconsume();
+                    t.transition(AttributeName);
+                    break;
+                case eof:
+                    // the pending tag is not emitted here
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                case '"':
+                case '\'':
+                case '<':
+                case '=':
+                    // reported as an error, but the char still becomes the first char of a new attribute name
+                    t.error(this);
+                    t.tagPending.newAttribute();
+                    t.tagPending.appendAttributeName(c);
+                    t.transition(AttributeName);
+                    break;
+                default: // A-Z, anything else
+                    // start an attribute and put the char back, so AttributeName consumes the whole name
+                    t.tagPending.newAttribute();
+                    r.unconsume();
+                    t.transition(AttributeName);
+            }
+        }
+    },
+    AttributeName {
+        // from before attribute name
+        void read(Tokeniser t, CharacterReader r) {
+            // take the run of name characters in one go; the name is stored lowercased
+            String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<');
+            t.tagPending.appendAttributeName(name.toLowerCase());
+
+            // the char that stopped the run decides what happens next
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(AfterAttributeName);
+                    break;
+                case '/':
+                    t.transition(SelfClosingStartTag);
+                    break;
+                case '=':
+                    t.transition(BeforeAttributeValue);
+                    break;
+                case '>':
+                    t.emitTagPending();
+                    t.transition(Data);
+                    break;
+                case nullChar:
+                    // replaced within the name; no transition, so the state is read again to continue the name
+                    t.error(this);
+                    t.tagPending.appendAttributeName(replacementChar);
+                    break;
+                case eof:
+                    // the pending tag is not emitted here
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                case '"':
+                case '\'':
+                case '<':
+                    // reported as an error, but the char is kept as part of the name; no transition
+                    t.error(this);
+                    t.tagPending.appendAttributeName(c);
+                // no default, as covered in consumeToAny
+            }
+        }
+    },
+    AfterAttributeName {
+        // reached from AttributeName on whitespace: the previous attribute name is complete, so the next
+        // significant character either gives it a value, ends the tag, or begins a new attribute
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    // ignore
+                    break;
+                case '/':
+                    t.transition(SelfClosingStartTag);
+                    break;
+                case '=':
+                    t.transition(BeforeAttributeValue);
+                    break;
+                case '>':
+                    t.emitTagPending();
+                    t.transition(Data);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    // the null char begins a NEW attribute whose name starts with the replacement char; without
+                    // newAttribute() it would be glued onto the previous attribute's name (the quote / '<' case
+                    // below starts a new attribute in the same way)
+                    t.tagPending.newAttribute();
+                    t.tagPending.appendAttributeName(replacementChar);
+                    t.transition(AttributeName);
+                    break;
+                case eof:
+                    // the pending tag is not emitted here
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                case '"':
+                case '\'':
+                case '<':
+                    // reported as an error, but the char still becomes the first char of a new attribute name
+                    t.error(this);
+                    t.tagPending.newAttribute();
+                    t.tagPending.appendAttributeName(c);
+                    t.transition(AttributeName);
+                    break;
+                default: // A-Z, anything else
+                    // start an attribute and put the char back, so AttributeName consumes the whole name
+                    t.tagPending.newAttribute();
+                    r.unconsume();
+                    t.transition(AttributeName);
+            }
+        }
+    },
+    BeforeAttributeValue {
+        // reached after '=' following an attribute name: picks the quoted or unquoted value state
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    // ignore
+                    break;
+                case '"':
+                    t.transition(AttributeValue_doubleQuoted);
+                    break;
+                case '&':
+                    // put the '&' back so the unquoted value state handles it
+                    r.unconsume();
+                    t.transition(AttributeValue_unquoted);
+                    break;
+                case '\'':
+                    t.transition(AttributeValue_singleQuoted);
+                    break;
+                case nullChar:
+                    // the null char is replaced and becomes the first char of an unquoted value
+                    t.error(this);
+                    t.tagPending.appendAttributeValue(replacementChar);
+                    t.transition(AttributeValue_unquoted);
+                    break;
+                case eof:
+                    // the pending tag is not emitted here
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                case '>':
+                    // "name=>" : reported as an error, the tag is emitted without appending a value
+                    t.error(this);
+                    t.emitTagPending();
+                    t.transition(Data);
+                    break;
+                case '<':
+                case '=':
+                case '`':
+                    // reported as an error, but the char still becomes the first char of an unquoted value
+                    t.error(this);
+                    t.tagPending.appendAttributeValue(c);
+                    t.transition(AttributeValue_unquoted);
+                    break;
+                default:
+                    // put the char back so the unquoted value state consumes the whole value
+                    r.unconsume();
+                    t.transition(AttributeValue_unquoted);
+            }
+        }
+    },
+    AttributeValue_doubleQuoted {
+        // inside a double-quoted attribute value
+        void read(Tokeniser t, CharacterReader r) {
+            // take the run of ordinary value characters in one go
+            String value = r.consumeToAny('"', '&', nullChar);
+            if (value.length() > 0)
+                t.tagPending.appendAttributeValue(value);
+
+            // the char that stopped the run decides what happens next
+            char c = r.consume();
+            switch (c) {
+                case '"':
+                    // closing quote
+                    t.transition(AfterAttributeValue_quoted);
+                    break;
+                case '&':
+                    // character reference inside the value; falls back to a literal '&' when it does not resolve
+                    Character ref = t.consumeCharacterReference('"', true);
+                    if (ref != null)
+                        t.tagPending.appendAttributeValue(ref);
+                    else
+                        t.tagPending.appendAttributeValue('&');
+                    break;
+                case nullChar:
+                    // replaced within the value; no transition, so the value continues
+                    t.error(this);
+                    t.tagPending.appendAttributeValue(replacementChar);
+                    break;
+                case eof:
+                    // the pending tag is not emitted here
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                // no default, handled in consume to any above
+            }
+        }
+    },
+    AttributeValue_singleQuoted {
+        void read(Tokeniser t, CharacterReader r) {
+            String value = r.consumeToAny('\'', '&', nullChar);
+            if (value.length() > 0)
+                t.tagPending.appendAttributeValue(value);
+
+            char c = r.consume();
+            switch (c) {
+                case '\'':
+                    t.transition(AfterAttributeValue_quoted);
+                    break;
+                case '&':
+                    Character ref = t.consumeCharacterReference('\'', true);
+                    if (ref != null)
+                        t.tagPending.appendAttributeValue(ref);
+                    else
+                        t.tagPending.appendAttributeValue('&');
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.tagPending.appendAttributeValue(replacementChar);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                // no default, handled in consume to any above
+            }
+        }
+    },
+    AttributeValue_unquoted {
+        void read(Tokeniser t, CharacterReader r) {
+            String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`');
+            if (value.length() > 0)
+                t.tagPending.appendAttributeValue(value);
+
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(BeforeAttributeName);
+                    break;
+                case '&':
+                    Character ref = t.consumeCharacterReference('>', true);
+                    if (ref != null)
+                        t.tagPending.appendAttributeValue(ref);
+                    else
+                        t.tagPending.appendAttributeValue('&');
+                    break;
+                case '>':
+                    t.emitTagPending();
+                    t.transition(Data);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.tagPending.appendAttributeValue(replacementChar);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                case '"':
+                case '\'':
+                case '<':
+                case '=':
+                case '`':
+                    t.error(this);
+                    t.tagPending.appendAttributeValue(c);
+                    break;
+                // no default, handled in consume to any above
+            }
+
+        }
+    },
+    // CharacterReferenceInAttributeValue state handled inline
+    AfterAttributeValue_quoted {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(BeforeAttributeName);
+                    break;
+                case '/':
+                    t.transition(SelfClosingStartTag);
+                    break;
+                case '>':
+                    t.emitTagPending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    r.unconsume();
+                    t.transition(BeforeAttributeName);
+            }
+
+        }
+    },
+    // Entered after a '/' inside a tag (see AfterAttributeValue_quoted). A following '>' marks the pending tag
+    // as self closing and emits it; any other character is an error and is handed back to BeforeAttributeName.
+    SelfClosingStartTag {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '>':
+                    t.tagPending.selfClosing = true;
+                    t.emitTagPending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    // reconsume: the character may start the next attribute name, so it must not be dropped
+                    // (previously "<a /href>" lost the 'h')
+                    r.unconsume();
+                    t.transition(BeforeAttributeName);
+            }
+        }
+    },
+    BogusComment {
+        void read(Tokeniser t, CharacterReader r) {
+            // todo: handle bogus comment starting from eof. when does that trigger?
+            // rewind to capture character that lead us here
+            r.unconsume();
+            Token.Comment comment = new Token.Comment();
+            comment.data.append(r.consumeTo('>'));
+            // todo: replace nullChar with replaceChar
+            t.emit(comment);
+            t.advanceTransition(Data);
+        }
+    },
+    // Chooses between a comment, a doctype, a CDATA section, or a bogus comment based on the upcoming text.
+    MarkupDeclarationOpen {
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchConsume("--")) {
+                t.createCommentPending();
+                t.transition(CommentStart);
+                return;
+            }
+            if (r.matchConsumeIgnoreCase("DOCTYPE")) {
+                t.transition(Doctype);
+                return;
+            }
+            // todo: should actually check current namespace, and only non-html allows cdata. until namespace
+            // is implemented properly, keep handling as cdata
+            // (intended condition: !t.currentNodeInHtmlNS() && r.matchConsume("[CDATA["))
+            if (r.matchConsume("[CDATA[")) {
+                t.transition(CdataSection);
+                return;
+            }
+            t.error(this);
+            t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind
+        }
+    },
+    CommentStart {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '-':
+                    t.transition(CommentStartDash);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.commentPending.data.append(replacementChar);
+                    t.transition(Comment);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.commentPending.data.append(c);
+                    t.transition(Comment);
+            }
+        }
+    },
+    // Entered from CommentStart after a '-' was consumed as the first character of a comment.
+    // That dash has not been added to the comment data yet, so every branch that continues the comment
+    // must append it, and a second '-' moves on to CommentEnd (it may begin the closing "-->").
+    CommentStartDash {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '-':
+                    // was: transition(CommentStartDash), which swallowed any run of dashes and
+                    // reported a spurious error on the following '>'
+                    t.transition(CommentEnd);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    // keep the dash consumed by CommentStart, then substitute the null character
+                    t.commentPending.data.append('-').append(replacementChar);
+                    t.transition(Comment);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                default:
+                    // keep the dash consumed by CommentStart; previously it was silently dropped
+                    t.commentPending.data.append('-').append(c);
+                    t.transition(Comment);
+            }
+        }
+    },
+    Comment {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.current();
+            switch (c) {
+                case '-':
+                    t.advanceTransition(CommentEndDash);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    r.advance();
+                    t.commentPending.data.append(replacementChar);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.commentPending.data.append(r.consumeToAny('-', nullChar));
+            }
+        }
+    },
+    CommentEndDash {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '-':
+                    t.transition(CommentEnd);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.commentPending.data.append('-').append(replacementChar);
+                    t.transition(Comment);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.commentPending.data.append('-').append(c);
+                    t.transition(Comment);
+            }
+        }
+    },
+    CommentEnd {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '>':
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.commentPending.data.append("--").append(replacementChar);
+                    t.transition(Comment);
+                    break;
+                case '!':
+                    t.error(this);
+                    t.transition(CommentEndBang);
+                    break;
+                case '-':
+                    t.error(this);
+                    t.commentPending.data.append('-');
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    t.commentPending.data.append("--").append(c);
+                    t.transition(Comment);
+            }
+        }
+    },
+    CommentEndBang {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '-':
+                    t.commentPending.data.append("--!");
+                    t.transition(CommentEndDash);
+                    break;
+                case '>':
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.commentPending.data.append("--!").append(replacementChar);
+                    t.transition(Comment);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.emitCommentPending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.commentPending.data.append("--!").append(c);
+                    t.transition(Comment);
+            }
+        }
+    },
+    // Entered from MarkupDeclarationOpen after the DOCTYPE keyword was consumed. Whitespace is expected next;
+    // anything else is an error but must still be processed as the start of the doctype name.
+    Doctype {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(BeforeDoctypeName);
+                    break;
+                case '>':
+                    // doctype closed with no name at all: emit an empty, force-quirks doctype
+                    // rather than treating '>' as part of a name
+                    t.error(this);
+                    t.createDoctypePending();
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.createDoctypePending();
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    // reconsume: this character is the first character of the name and was previously lost
+                    r.unconsume();
+                    t.transition(BeforeDoctypeName);
+            }
+        }
+    },
+    // Skips whitespace before the doctype name, then creates the pending doctype token and starts its name.
+    // Every branch that touches t.doctypePending creates the token first; this state is where it is created.
+    BeforeDoctypeName {
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                // leave the letters unconsumed; DoctypeName reads the whole letter sequence itself
+                t.createDoctypePending();
+                t.transition(DoctypeName);
+                return;
+            }
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    break; // ignore whitespace
+                case nullChar:
+                    t.error(this);
+                    // the token must exist before its name is written (previously missing here)
+                    t.createDoctypePending();
+                    t.doctypePending.name.append(replacementChar);
+                    t.transition(DoctypeName);
+                    break;
+                case '>':
+                    // doctype closed before any name: emit a nameless, force-quirks doctype
+                    // instead of starting a name with '>'
+                    t.error(this);
+                    t.createDoctypePending();
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.createDoctypePending();
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.createDoctypePending();
+                    t.doctypePending.name.append(c);
+                    t.transition(DoctypeName);
+            }
+        }
+    },
+    // Accumulates the doctype name: letter runs are lower-cased, other characters are appended as they are,
+    // until whitespace, '>' or end of input.
+    DoctypeName {
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.matchesLetter()) {
+                String name = r.consumeLetterSequence();
+                // use a fixed locale: the default-locale toLowerCase() maps 'I' to a dotless i under
+                // e.g. a Turkish default locale, which would corrupt the name
+                t.doctypePending.name.append(name.toLowerCase(java.util.Locale.ENGLISH));
+                return;
+            }
+            char c = r.consume();
+            switch (c) {
+                case '>':
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(AfterDoctypeName);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.doctypePending.name.append(replacementChar);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.doctypePending.name.append(c);
+            }
+        }
+    },
+    // After the doctype name: skips whitespace, then looks for '>', PUBLIC or SYSTEM (case-insensitive).
+    AfterDoctypeName {
+        void read(Tokeniser t, CharacterReader r) {
+            if (r.isEmpty()) {
+                t.eofError(this);
+                t.doctypePending.forceQuirks = true;
+                t.emitDoctypePending();
+                t.transition(Data);
+            } else if (r.matchesAny('\t', '\n', '\f', ' ')) {
+                r.advance(); // ignore whitespace
+            } else if (r.matches('>')) {
+                t.emitDoctypePending();
+                t.advanceTransition(Data);
+            } else if (r.matchConsumeIgnoreCase("PUBLIC")) {
+                t.transition(AfterDoctypePublicKeyword);
+            } else if (r.matchConsumeIgnoreCase("SYSTEM")) {
+                t.transition(AfterDoctypeSystemKeyword);
+            } else {
+                t.error(this);
+                t.doctypePending.forceQuirks = true;
+                t.advanceTransition(BogusDoctype);
+            }
+        }
+    },
+    AfterDoctypePublicKeyword {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(BeforeDoctypePublicIdentifier);
+                    break;
+                case '"':
+                    t.error(this);
+                    // set public id to empty string
+                    t.transition(DoctypePublicIdentifier_doubleQuoted);
+                    break;
+                case '\'':
+                    t.error(this);
+                    // set public id to empty string
+                    t.transition(DoctypePublicIdentifier_singleQuoted);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.transition(BogusDoctype);
+            }
+        }
+    },
+    BeforeDoctypePublicIdentifier {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    break;
+                case '"':
+                    // set public id to empty string
+                    t.transition(DoctypePublicIdentifier_doubleQuoted);
+                    break;
+                case '\'':
+                    // set public id to empty string
+                    t.transition(DoctypePublicIdentifier_singleQuoted);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.transition(BogusDoctype);
+            }
+        }
+    },
+    DoctypePublicIdentifier_doubleQuoted {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '"':
+                    t.transition(AfterDoctypePublicIdentifier);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.doctypePending.publicIdentifier.append(replacementChar);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.doctypePending.publicIdentifier.append(c);
+            }
+        }
+    },
+    DoctypePublicIdentifier_singleQuoted {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\'':
+                    t.transition(AfterDoctypePublicIdentifier);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.doctypePending.publicIdentifier.append(replacementChar);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.doctypePending.publicIdentifier.append(c);
+            }
+        }
+    },
+    AfterDoctypePublicIdentifier {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(BetweenDoctypePublicAndSystemIdentifiers);
+                    break;
+                case '>':
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case '"':
+                    t.error(this);
+                    // system id empty
+                    t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                    break;
+                case '\'':
+                    t.error(this);
+                    // system id empty
+                    t.transition(DoctypeSystemIdentifier_singleQuoted);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.transition(BogusDoctype);
+            }
+        }
+    },
+    // Entered from AfterDoctypePublicIdentifier once whitespace has followed the public identifier.
+    // Skips further whitespace; a quote starts the system identifier, '>' ends the doctype.
+    BetweenDoctypePublicAndSystemIdentifiers {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    break;
+                case '>':
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case '"':
+                    // not an error: whitespace followed by a quoted system identifier is the regular form
+                    // (previously every such doctype was reported as a parse error)
+                    // system id empty
+                    t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                    break;
+                case '\'':
+                    // not an error, see the double-quote case
+                    // system id empty
+                    t.transition(DoctypeSystemIdentifier_singleQuoted);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.transition(BogusDoctype);
+            }
+        }
+    },
+    AfterDoctypeSystemKeyword {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    t.transition(BeforeDoctypeSystemIdentifier);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case '"':
+                    t.error(this);
+                    // system id empty
+                    t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                    break;
+                case '\'':
+                    t.error(this);
+                    // system id empty
+                    t.transition(DoctypeSystemIdentifier_singleQuoted);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+            }
+        }
+    },
+    BeforeDoctypeSystemIdentifier {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    break;
+                case '"':
+                    // set system id to empty string
+                    t.transition(DoctypeSystemIdentifier_doubleQuoted);
+                    break;
+                case '\'':
+                    // set public id to empty string
+                    t.transition(DoctypeSystemIdentifier_singleQuoted);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.transition(BogusDoctype);
+            }
+        }
+    },
+    DoctypeSystemIdentifier_doubleQuoted {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '"':
+                    t.transition(AfterDoctypeSystemIdentifier);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.doctypePending.systemIdentifier.append(replacementChar);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.doctypePending.systemIdentifier.append(c);
+            }
+        }
+    },
+    DoctypeSystemIdentifier_singleQuoted {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\'':
+                    t.transition(AfterDoctypeSystemIdentifier);
+                    break;
+                case nullChar:
+                    t.error(this);
+                    t.doctypePending.systemIdentifier.append(replacementChar);
+                    break;
+                case '>':
+                    t.error(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.doctypePending.systemIdentifier.append(c);
+            }
+        }
+    },
+    AfterDoctypeSystemIdentifier {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '\t':
+                case '\n':
+                case '\f':
+                case ' ':
+                    break;
+                case '>':
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.eofError(this);
+                    t.doctypePending.forceQuirks = true;
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    t.error(this);
+                    t.transition(BogusDoctype);
+                    // NOT force quirks
+            }
+        }
+    },
+    BogusDoctype {
+        void read(Tokeniser t, CharacterReader r) {
+            char c = r.consume();
+            switch (c) {
+                case '>':
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                case eof:
+                    t.emitDoctypePending();
+                    t.transition(Data);
+                    break;
+                default:
+                    // ignore char
+                    break;
+            }
+        }
+    },
+    CdataSection {
+        void read(Tokeniser t, CharacterReader r) {
+            String data = r.consumeTo("]]>");
+            t.emit(data);
+            r.matchConsume("]]>");
+            t.transition(Data);
+        }
+    };
+
+
+    /**
+     * Performs one step of this state. Each state above implements this by reading from {@code r} and
+     * calling back into {@code t} to report errors, emit tokens and switch state.
+     *
+     * @param t the tokeniser driving the state machine
+     * @param r the character reader supplying the input
+     */
+    abstract void read(Tokeniser t, CharacterReader r);
+
+    // U+0000; the states above report it as an error and substitute replacementChar
+    private static final char nullChar = '\u0000';
+    // substitute character, shared with Tokeniser
+    private static final char replacementChar = Tokeniser.replacementChar;
+    // String form of Tokeniser.replacementChar
+    private static final String replacementStr = String.valueOf(Tokeniser.replacementChar);
+    // local alias for CharacterReader.EOF, used as the end-of-input case label in the states above
+    private static final char eof = CharacterReader.EOF;
+}
diff --git a/src/org/jsoup/parser/TreeBuilder.java b/src/org/jsoup/parser/TreeBuilder.java

new file mode 100644 (file)

index 0000000..e06caad
--- /dev/null
+++ b/src/org/jsoup/parser/TreeBuilder.java
@@ -0,0 +1,60 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.DescendableLinkedList;
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Shared driver for the tree builders: tokenises the input and feeds each token to {@link #process(Token)}.
+ * @author Jonathan Hedley
+ */
+abstract class TreeBuilder {
+    CharacterReader reader;
+    Tokeniser tokeniser;
+    protected Document doc; // document the parse is building into
+    protected DescendableLinkedList<Element> stack; // the stack of open elements, innermost last
+    protected String baseUri; // current base uri, for creating new elements
+    protected Token currentToken; // only used for error tracking
+    protected ParseErrorList errors; // null when not tracking errors
+
+    // Validates the arguments and resets all per-parse state (document, reader, tokeniser, stack).
+    protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
+        Validate.notNull(input, "String input must not be null");
+        Validate.notNull(baseUri, "BaseURI must not be null");
+        this.baseUri = baseUri;
+        this.errors = errors;
+        doc = new Document(baseUri);
+        reader = new CharacterReader(input);
+        tokeniser = new Tokeniser(reader, errors);
+        stack = new DescendableLinkedList<Element>();
+    }
+
+    Document parse(String input, String baseUri) {
+        return parse(input, baseUri, ParseErrorList.noTracking()); // convenience form: no error tracking
+    }
+
+    // Runs a full parse of the input and returns the resulting document.
+    Document parse(String input, String baseUri, ParseErrorList errors) {
+        initialiseParse(input, baseUri, errors);
+        runParser();
+        return doc;
+    }
+
+    // Pumps tokens into process(); the EOF token is processed as well, then the loop ends.
+    protected void runParser() {
+        Token token;
+        do {
+            token = tokeniser.read();
+            process(token);
+        } while (token.type != Token.TokenType.EOF);
+    }
+
+    protected abstract boolean process(Token token);
+
+    // The last entry on the stack of open elements.
+    protected Element currentElement() { return stack.getLast(); }
+}
diff --git a/src/org/jsoup/parser/XmlTreeBuilder.java b/src/org/jsoup/parser/XmlTreeBuilder.java

new file mode 100644 (file)

index 0000000..3f03ad2
--- /dev/null
+++ b/src/org/jsoup/parser/XmlTreeBuilder.java
@@ -0,0 +1,111 @@
+package org.jsoup.parser;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.*;
+
+import java.util.Iterator;
+
+/**
+ * @author Jonathan Hedley
+ */
+public class XmlTreeBuilder extends TreeBuilder {
+    @Override
+    protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
+        super.initialiseParse(input, baseUri, errors);
+        stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack)
+    }
+
+    @Override
+    protected boolean process(Token token) {
+        // dispatch on the token type: start tag, end tag, doctype, comment, character, eof
+        switch (token.type) {
+            case StartTag:
+                insert(token.asStartTag());
+                break;
+            case EndTag:
+                popStackToClose(token.asEndTag()); // an end tag with no matching open element is skipped
+                break;
+            case Comment:
+                insert(token.asComment());
+                break;
+            case Character:
+                insert(token.asCharacter());
+                break;
+            case Doctype:
+                insert(token.asDoctype());
+                break;
+            case EOF: // could put some normalisation here if desired
+                break;
+            default:
+                Validate.fail("Unexpected token type: " + token.type);
+        }
+        return true; // the return value is the same for every token type
+    }
+
+    private void insertNode(Node node) {
+        currentElement().appendChild(node);
+    }
+
+    Element insert(Token.StartTag startTag) {
+        // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html.
+        final Tag tag = Tag.valueOf(startTag.name());
+        final Element created = new Element(tag, baseUri, startTag.attributes);
+        insertNode(created);
+        if (!startTag.isSelfClosing()) {
+            stack.add(created); // left open: pushed so that it becomes the current element
+            return created;
+        }
+        tokeniser.acknowledgeSelfClosingFlag();
+        if (!tag.isKnownTag())
+            tag.setSelfClosing(); // unknown tag, remember this is self closing for output
+        return created;
+    }
+
+    void insert(Token.Comment commentToken) {
+        // the comment data is wrapped in a Comment node under the current element
+        insertNode(new Comment(commentToken.getData(), baseUri));
+    }
+
+    void insert(Token.Character characterToken) {
+        // character data becomes a TextNode under the current element
+        insertNode(new TextNode(characterToken.getData(), baseUri));
+    }
+
+    void insert(Token.Doctype d) {
+        // name, public identifier and system identifier are carried over from the token
+        insertNode(new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri));
+    }
+
+    /**
+     * Closes the most recently opened element whose node name equals the end tag's name, together with everything
+     * opened after it. If no open element has that name, the stack is left untouched.
+     * The stack is walked twice so that nothing is removed unless a match exists.
+     *
+     * @param endTag end tag token naming the element to close
+     */
+    private void popStackToClose(Token.EndTag endTag) {
+        final String wanted = endTag.name();
+
+        // pass 1: search from the top of the stack for the element to close
+        Element target = null;
+        for (Iterator<Element> search = stack.descendingIterator(); search.hasNext(); ) {
+            Element candidate = search.next();
+            if (candidate.nodeName().equals(wanted)) {
+                target = candidate;
+                break;
+            }
+        }
+
+        if (target == null)
+            return; // not found, skip
+
+        // pass 2: pop entries off the top of the stack until the target itself has been removed
+        for (Iterator<Element> pop = stack.descendingIterator(); pop.hasNext(); ) {
+            Element top = pop.next();
+            pop.remove();
+            if (top == target) {
+                break;
+            }
+        }
+    }
+}
diff --git a/src/org/jsoup/parser/package-info.java b/src/org/jsoup/parser/package-info.java

new file mode 100644 (file)

index 0000000..168fdf4
--- /dev/null
+++ b/src/org/jsoup/parser/package-info.java
@@ -0,0 +1,4 @@
+/**
+ Contains the HTML parser, tag specifications, and HTML tokeniser.
+ */
+package org.jsoup.parser;
diff --git a/src/org/jsoup/safety/Cleaner.java b/src/org/jsoup/safety/Cleaner.java

new file mode 100644 (file)

index 0000000..eda67df
--- /dev/null
+++ b/src/org/jsoup/safety/Cleaner.java
@@ -0,0 +1,129 @@
+package org.jsoup.safety;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.*;
+import org.jsoup.parser.Tag;
+
+import java.util.List;
+
+/**
+ The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
+ that you are expecting; no junk, and no cross-site scripting attacks!
+ <p/>
+ The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain
+ HTML that is allowed by the whitelist.
+ <p/>
+ It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
+ canned white-lists only allow body contained tags.
+ <p/>
+ Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
+ */
+public class Cleaner {
+    private Whitelist whitelist;
+
+    /**
+     Create a new cleaner, that sanitizes documents using the supplied whitelist.
+     @param whitelist white-list to clean with
+     */
+    public Cleaner(Whitelist whitelist) {
+        Validate.notNull(whitelist);
+        this.whitelist = whitelist;
+    }
+
+    /**
+     Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
+     The original document is not modified. Only elements from the dirty document's <code>body</code> are used.
+     @param dirtyDocument Untrusted base document to clean.
+     @return cleaned document.
+     */
+    public Document clean(Document dirtyDocument) {
+        Validate.notNull(dirtyDocument);
+
+        // safe content of the dirty body is copied into the body of a fresh shell document
+        Document cleaned = Document.createShell(dirtyDocument.baseUri());
+        copySafeNodes(dirtyDocument.body(), cleaned.body());
+        return cleaned;
+    }
+
+    /**
+     Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes
+     in the input HTML are allowed by the whitelist.
+     <p/>
+     This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully
+     using the {@link #clean(Document)} method. If using as a validator, it is recommended to still clean the document
+     to ensure enforced attributes are set correctly, and that the output is tidied.
+     @param dirtyDocument document to test
+     @return true if no tags or attributes need to be removed; false if they do
+     */
+    public boolean isValid(Document dirtyDocument) {
+        Validate.notNull(dirtyDocument);
+
+        // run the normal copy into a throwaway shell document; only the discard count matters here
+        Document scratch = Document.createShell(dirtyDocument.baseUri());
+        return copySafeNodes(dirtyDocument.body(), scratch.body()) == 0;
+    }
+
+    /**
+     Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
+     @param source source of HTML
+     @param dest destination element to copy into
+     @return number of discarded elements (that were considered unsafe)
+     */
+    private int copySafeNodes(Element source, Element dest) {
+        int discarded = 0;
+
+        for (Node child : source.childNodes()) {
+            if (child instanceof TextNode) {
+                // text is always kept: copy it with the source node's base uri
+                TextNode text = (TextNode) child;
+                dest.appendChild(new TextNode(text.getWholeText(), child.baseUri()));
+                continue;
+            }
+            if (!(child instanceof Element))
+                continue; // comments, xml proc instructions, etc: dropped without being counted
+
+            Element sourceEl = (Element) child;
+            if (!whitelist.isSafeTag(sourceEl.tagName())) {
+                // unsafe tag: count it, but its children (els or text) may be safe, so lift them into dest
+                discarded += 1 + copySafeNodes(sourceEl, dest);
+                continue;
+            }
+
+            // safe tag: copy it with its safe attributes, then recurse into the copy
+            ElementMeta meta = createSafeElement(sourceEl);
+            dest.appendChild(meta.el);
+            discarded += meta.numAttribsDiscarded + copySafeNodes(sourceEl, meta.el);
+        }
+        return discarded;
+    }
+
+    private ElementMeta createSafeElement(Element sourceEl) {
+        final String tagName = sourceEl.tagName();
+        final Attributes safeAttrs = new Attributes();
+        final Element safeEl = new Element(Tag.valueOf(tagName), sourceEl.baseUri(), safeAttrs);
+
+        int rejected = 0;
+        for (Attribute candidate : sourceEl.attributes()) {
+            if (!whitelist.isSafeAttribute(tagName, sourceEl, candidate)) {
+                rejected++;
+                continue;
+            }
+            safeAttrs.put(candidate);
+        }
+        // enforced attributes are added last (the Whitelist docs say they override values already present)
+        safeAttrs.addAll(whitelist.getEnforcedAttributes(tagName));
+
+        return new ElementMeta(safeEl, rejected);
+    }
+
+    private static class ElementMeta { // result pair returned by createSafeElement
+        Element el; // the newly created, whitelisted copy of the source element
+        int numAttribsDiscarded; // how many source attributes were rejected by the whitelist
+
+        ElementMeta(Element el, int numAttribsDiscarded) {
+            this.el = el;
+            this.numAttribsDiscarded = numAttribsDiscarded;
+        }
+    }
+
+}
diff --git a/src/org/jsoup/safety/Whitelist.java b/src/org/jsoup/safety/Whitelist.java

new file mode 100644 (file)

index 0000000..2c1150c
--- /dev/null
+++ b/src/org/jsoup/safety/Whitelist.java
@@ -0,0 +1,451 @@
+package org.jsoup.safety;
+
+/*
+    Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/rgrove/sanitize/, which inspired
+    this whitelist configuration, and the initial defaults.
+ */
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.nodes.Attributes;
+import org.jsoup.nodes.Element;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+
+/**
+ Whitelists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed.
+ <p/>
+ Start with one of the defaults:
+ <ul>
+ <li>{@link #none}
+ <li>{@link #simpleText}
+ <li>{@link #basic}
+ <li>{@link #basicWithImages}
+ <li>{@link #relaxed}
+ </ul>
+ <p/>
+ If you need to allow more through (please be careful!), tweak a base whitelist with:
+ <ul>
+ <li>{@link #addTags}
+ <li>{@link #addAttributes}
+ <li>{@link #addEnforcedAttribute}
+ <li>{@link #addProtocols}
+ </ul>
+ <p/>
+ The cleaner and these whitelists assume that you want to clean a <code>body</code> fragment of HTML (to add user
+ supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, either wrap the
+ document HTML around the cleaned body HTML, or create a whitelist that allows <code>html</code> and <code>head</code>
+ elements as appropriate.
+ <p/>
+ If you are going to extend a whitelist, please be very careful. Make sure you understand what attributes may lead to
+ XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See 
+ http://ha.ckers.org/xss.html for some XSS attack examples.
+
+ @author Jonathan Hedley
+ */
+public class Whitelist {
+    private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]
+    private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.
+    private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values
+    private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes
+    private boolean preserveRelativeLinks; // option to preserve relative links
+
+    /**
+     This whitelist allows only text nodes: all HTML will be stripped.
+
+     @return whitelist
+     */
+    public static Whitelist none() {
+        return new Whitelist();
+    }
+
+    /**
+     This whitelist allows only simple text formatting: <code>b, em, i, strong, u</code>. All other HTML (tags and
+     attributes) will be removed.
+
+     @return whitelist
+     */
+    public static Whitelist simpleText() {
+        return new Whitelist()
+                .addTags("b", "em", "i", "strong", "u")
+                ;
+    }
+
+    /**
+     This whitelist allows a fuller range of text nodes: <code>a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li,
+     ol, p, pre, q, small, strike, strong, sub, sup, u, ul</code>, and appropriate attributes.
+     <p/>
+     Links (<code>a</code> elements) can point to <code>http, https, ftp, mailto</code>, and have an enforced
+     <code>rel=nofollow</code> attribute.
+     <p/>
+     Does not allow images.
+
+     @return whitelist
+     */
+    public static Whitelist basic() {
+        return new Whitelist()
+                .addTags(
+                        "a", "b", "blockquote", "br", "cite", "code", "dd", "dl", "dt", "em",
+                        "i", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub",
+                        "sup", "u", "ul")
+
+                .addAttributes("a", "href")
+                .addAttributes("blockquote", "cite")
+                .addAttributes("q", "cite")
+
+                .addProtocols("a", "href", "ftp", "http", "https", "mailto")
+                .addProtocols("blockquote", "cite", "http", "https")
+                .addProtocols("q", "cite", "http", "https") // cite is allowed on q (see above), so restrict it there
+
+                .addEnforcedAttribute("a", "rel", "nofollow")
+                ;
+
+    }
+
+    /**
+     This whitelist allows the same text tags as {@link #basic}, and also allows <code>img</code> tags, with appropriate
+     attributes, with <code>src</code> pointing to <code>http</code> or <code>https</code>.
+
+     @return whitelist
+     */
+    public static Whitelist basicWithImages() {
+        return basic()
+                .addTags("img")
+                .addAttributes("img", "align", "alt", "height", "src", "title", "width")
+                .addProtocols("img", "src", "http", "https")
+                ;
+    }
+
+    /**
+     This whitelist allows a full range of text and structural body HTML: <code>a, b, blockquote, br, caption, cite,
+     code, col, colgroup, dd, div, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, strike, strong, sub,
+     sup, table, tbody, td, tfoot, th, thead, tr, u, ul</code>
+     <p/>
+     Links do not have an enforced <code>rel=nofollow</code> attribute, but you can add that if desired.
+
+     @return whitelist
+     */
+    public static Whitelist relaxed() {
+        return new Whitelist()
+                .addTags(
+                        "a", "b", "blockquote", "br", "caption", "cite", "code", "col",
+                        "colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6",
+                        "i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong",
+                        "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
+                        "ul")
+
+                .addAttributes("a", "href", "title")
+                .addAttributes("blockquote", "cite")
+                .addAttributes("col", "span", "width")
+                .addAttributes("colgroup", "span", "width")
+                .addAttributes("img", "align", "alt", "height", "src", "title", "width")
+                .addAttributes("ol", "start", "type")
+                .addAttributes("q", "cite")
+                .addAttributes("table", "summary", "width")
+                .addAttributes("td", "abbr", "axis", "colspan", "rowspan", "width")
+                .addAttributes(
+                        "th", "abbr", "axis", "colspan", "rowspan", "scope",
+                        "width")
+                .addAttributes("ul", "type")
+
+                .addProtocols("a", "href", "ftp", "http", "https", "mailto")
+                .addProtocols("blockquote", "cite", "http", "https")
+                .addProtocols("img", "src", "http", "https")
+                .addProtocols("q", "cite", "http", "https")
+                ;
+    }
+
+    /**
+     Create a new, empty whitelist. Generally it will be better to start with a default prepared whitelist instead.
+
+     @see #basic()
+     @see #basicWithImages()
+     @see #simpleText()
+     @see #relaxed()
+     */
+    public Whitelist() {
+        tagNames = new HashSet<TagName>();
+        attributes = new HashMap<TagName, Set<AttributeKey>>();
+        enforcedAttributes = new HashMap<TagName, Map<AttributeKey, AttributeValue>>();
+        protocols = new HashMap<TagName, Map<AttributeKey, Set<Protocol>>>();
+        preserveRelativeLinks = false;
+    }
+
+    /**
+     Add a list of allowed elements to a whitelist. (If a tag is not allowed, it will be removed from the HTML.)
+
+     @param tags tag names to allow
+     @return this (for chaining)
+     */
+    public Whitelist addTags(String... tags) {
+        Validate.notNull(tags);
+
+        for (String tagName : tags) {
+            Validate.notEmpty(tagName);
+            tagNames.add(TagName.valueOf(tagName));
+        }
+        return this;
+    }
+
+    /**
+     Add a list of allowed attributes to a tag. (If an attribute is not allowed on an element, it will be removed.)
+     <p/>
+     E.g.: <code>addAttributes("a", "href", "class")</code> allows <code>href</code> and <code>class</code> attributes
+     on <code>a</code> tags.
+     <p/>
+     To make an attribute valid for <b>all tags</b>, use the pseudo tag <code>:all</code>, e.g.
+     <code>addAttributes(":all", "class")</code>.
+
+     @param tag  The tag the attributes are for. The tag will be added to the allowed tag list if necessary.
+     @param keys List of valid attributes for the tag
+     @return this (for chaining)
+     */
+    public Whitelist addAttributes(String tag, String... keys) {
+        Validate.notEmpty(tag);
+        Validate.notNull(keys);
+        Validate.isTrue(keys.length > 0, "No attributes supplied.");
+
+        final TagName tagName = TagName.valueOf(tag);
+        tagNames.add(tagName); // set semantics: a no-op when the tag is already allowed
+
+        // validate and collect every key first, so a bad key leaves the stored attribute set untouched
+        final Set<AttributeKey> incoming = new HashSet<AttributeKey>();
+        for (String key : keys) {
+            Validate.notEmpty(key);
+            incoming.add(AttributeKey.valueOf(key));
+        }
+        final Set<AttributeKey> existing = attributes.get(tagName);
+        if (existing == null)
+            attributes.put(tagName, incoming);
+        else
+            existing.addAll(incoming);
+        return this;
+    }
+
+    /**
+     Add an enforced attribute to a tag. An enforced attribute will always be added to the element. If the element
+     already has the attribute set, it will be overridden.
+     <p/>
+     E.g.: <code>addEnforcedAttribute("a", "rel", "nofollow")</code> will make all <code>a</code> tags output as
+     <code>&lt;a href="..." rel="nofollow"></code>
+
+     @param tag   The tag the enforced attribute is for. The tag will be added to the allowed tag list if necessary.
+     @param key   The attribute key
+     @param value The enforced attribute value
+     @return this (for chaining)
+     */
+    public Whitelist addEnforcedAttribute(String tag, String key, String value) {
+        Validate.notEmpty(tag);
+        Validate.notEmpty(key);
+        Validate.notEmpty(value);
+
+        final TagName tagName = TagName.valueOf(tag);
+        tagNames.add(tagName); // set semantics: a no-op when the tag is already allowed
+
+        final AttributeKey attrKey = AttributeKey.valueOf(key);
+        final AttributeValue attrVal = AttributeValue.valueOf(value);
+
+        // fetch the per-tag map, creating and registering it on first use
+        Map<AttributeKey, AttributeValue> enforced = enforcedAttributes.get(tagName);
+        if (enforced == null) {
+            enforced = new HashMap<AttributeKey, AttributeValue>();
+            enforcedAttributes.put(tagName, enforced);
+        }
+        enforced.put(attrKey, attrVal);
+        return this;
+    }
+
+    /**
+     * Configure this Whitelist to preserve relative links in an element's URL attribute, or convert them to absolute
+     * links. By default, this is <b>false</b>: URLs will be made absolute (i.e. start with an allowed protocol, like
+     * {@code http://}).
+     * <p />
+     * Note that when handling relative links, the input document must have an appropriate {@code base URI} set when
+     * parsing, so that the link's protocol can be confirmed. Regardless of the setting of the {@code preserve relative
+     * links} option, the link must be resolvable against the base URI to an allowed protocol; otherwise the attribute
+     * will be removed.
+     *
+     * @param preserve {@code true} to allow relative links, {@code false} (default) to deny
+     * @return this Whitelist, for chaining.
+     * @see #addProtocols
+     */
+    public Whitelist preserveRelativeLinks(boolean preserve) {
+        preserveRelativeLinks = preserve;
+        return this;
+    }
+
+    /**
+     Add allowed URL protocols for an element's URL attribute. This restricts the possible values of the attribute to
+     URLs with the defined protocol.
+     <p/>
+     E.g.: <code>addProtocols("a", "href", "ftp", "http", "https")</code>
+
+     @param tag       Tag the URL protocol is for
+     @param key       Attribute key
+     @param protocols List of valid protocols
+     @return this, for chaining
+     */
+    public Whitelist addProtocols(String tag, String key, String... protocols) {
+        Validate.notEmpty(tag);
+        Validate.notEmpty(key);
+        Validate.notNull(protocols);
+
+        final TagName tagName = TagName.valueOf(tag);
+        final AttributeKey attrKey = AttributeKey.valueOf(key);
+
+        // tag -> (attribute -> protocols): create each missing level on first use
+        Map<AttributeKey, Set<Protocol>> byAttribute = this.protocols.get(tagName);
+        if (byAttribute == null) {
+            byAttribute = new HashMap<AttributeKey, Set<Protocol>>();
+            this.protocols.put(tagName, byAttribute);
+        }
+
+        Set<Protocol> allowed = byAttribute.get(attrKey);
+        if (allowed == null) {
+            allowed = new HashSet<Protocol>();
+            byAttribute.put(attrKey, allowed);
+        }
+
+        // note: unlike addAttributes, this does not add the tag to the allowed tag list.
+        // protocols are stored exactly as given; testValidProtocol matches them against the lower-cased URL
+        for (String protocol : protocols) {
+            Validate.notEmpty(protocol);
+            allowed.add(Protocol.valueOf(protocol));
+        }
+        return this;
+    }
+
+    boolean isSafeTag(String tag) {
+        return tagNames.contains(TagName.valueOf(tag));
+    }
+
+    boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
+        final TagName tag = TagName.valueOf(tagName);
+        final AttributeKey key = AttributeKey.valueOf(attr.getKey());
+
+        final Set<AttributeKey> allowedKeys = attributes.get(tag);
+        if (allowedKeys == null || !allowedKeys.contains(key)) {
+            // not allowed for this tag: fall back to the ":all" pseudo tag, once
+            return !tagName.equals(":all") && isSafeAttribute(":all", el, attr);
+        }
+
+        // attribute is allowed; it must still pass the protocol test if protocols are defined for it.
+        // note: on the ":all" fallback the protocol lookup is keyed by ":all" too, so protocols registered
+        // under the concrete tag are not consulted for attributes that are only allowed via ":all"
+        final Map<AttributeKey, Set<Protocol>> attrProts = protocols.get(tag);
+        if (attrProts == null || !attrProts.containsKey(key))
+            return true;
+        return testValidProtocol(el, attr, attrProts.get(key));
+    }
+
+    private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
+        // try to resolve relative urls to abs, and optionally update the attribute so output html has abs.
+        // rels without a baseuri get removed
+        String value = el.absUrl(attr.getKey());
+        if (value.length() == 0)
+            value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols
+        if (!preserveRelativeLinks)
+            attr.setValue(value);
+
+        // lower-case once, locale-independently (a Turkish default locale maps "I" to dotless i, breaking "MAILTO:")
+        final String lowerValue = value.toLowerCase(java.util.Locale.ENGLISH);
+        for (Protocol protocol : protocols) {
+            if (lowerValue.startsWith(protocol.toString() + ":"))
+                return true;
+        }
+        return false;
+    }
+
+    Attributes getEnforcedAttributes(String tagName) {
+        final Attributes attrs = new Attributes();
+        final Map<AttributeKey, AttributeValue> enforced = enforcedAttributes.get(TagName.valueOf(tagName));
+        if (enforced == null)
+            return attrs; // nothing enforced for this tag: an empty collection, never null
+
+        for (Map.Entry<AttributeKey, AttributeValue> entry : enforced.entrySet()) {
+            attrs.put(entry.getKey().toString(), entry.getValue().toString());
+        }
+        return attrs;
+    }
+    
+    // named types for config. All just hold strings, but here for my sanity.
+
+    static class TagName extends TypedValue {
+        TagName(String value) {
+            super(value);
+        }
+
+        static TagName valueOf(String value) {
+            return new TagName(value);
+        }
+    }
+
+    static class AttributeKey extends TypedValue {
+        AttributeKey(String value) {
+            super(value);
+        }
+
+        static AttributeKey valueOf(String value) {
+            return new AttributeKey(value);
+        }
+    }
+
+    static class AttributeValue extends TypedValue {
+        AttributeValue(String value) {
+            super(value);
+        }
+
+        static AttributeValue valueOf(String value) {
+            return new AttributeValue(value);
+        }
+    }
+
+    static class Protocol extends TypedValue {
+        Protocol(String value) {
+            super(value);
+        }
+
+        static Protocol valueOf(String value) {
+            return new Protocol(value);
+        }
+    }
+
+    abstract static class TypedValue {
+        private String value;
+
+        TypedValue(String value) {
+            Validate.notNull(value);
+            this.value = value;
+        }
+
+        // same result as the usual 31-based recipe applied to the single field
+        @Override
+        public int hashCode() {
+            final int valueHash = (value == null) ? 0 : value.hashCode();
+            return 31 + valueHash;
+        }
+
+        // equal only to an instance of the exact same subclass wrapping an equal string, so a TagName
+        // never equals an AttributeKey even when both hold the same text
+        @Override
+        public boolean equals(Object obj) {
+            if (this == obj)
+                return true;
+            if (obj == null || getClass() != obj.getClass())
+                return false;
+
+            final TypedValue other = (TypedValue) obj;
+            return (value == null) ? other.value == null : value.equals(other.value);
+        }
+
+        @Override
+        public String toString() {
+            return value;
+        }
+    }
+}
+
diff --git a/src/org/jsoup/safety/package-info.java b/src/org/jsoup/safety/package-info.java

new file mode 100644 (file)

index 0000000..ac890f0
--- /dev/null
+++ b/src/org/jsoup/safety/package-info.java
@@ -0,0 +1,4 @@
+/**
+ Contains the jsoup HTML cleaner, and whitelist definitions.
+ */
+package org.jsoup.safety;
diff --git a/src/org/jsoup/select/Collector.java b/src/org/jsoup/select/Collector.java

new file mode 100644 (file)

index 0000000..8f01045
--- /dev/null
+++ b/src/org/jsoup/select/Collector.java
@@ -0,0 +1,51 @@
+package org.jsoup.select;
+
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+
+/**
+ * Collects a list of elements that match the supplied criteria.
+ *
+ * @author Jonathan Hedley
+ */
+public class Collector {
+
+    private Collector() { // only static members: not meant to be instantiated
+    }
+
+    /**
+     Build a list of elements, by visiting root and every descendant of root, and testing it against the evaluator.
+     @param eval Evaluator to test elements against
+     @param root root of tree to descend
+     @return list of matches; empty if none
+     */
+    public static Elements collect (Evaluator eval, Element root) {
+        Elements elements = new Elements();
+        new NodeTraversor(new Accumulator(root, elements, eval)).traverse(root); // the visitor fills elements as it goes
+        return elements;
+    }
+
+    private static class Accumulator implements NodeVisitor { // visitor that appends matching elements to a shared list
+        private final Element root;
+        private final Elements elements;
+        private final Evaluator eval;
+
+        Accumulator(Element root, Elements elements, Evaluator eval) {
+            this.root = root;
+            this.elements = elements;
+            this.eval = eval;
+        }
+
+        public void head(Node node, int depth) {
+            if (node instanceof Element) { // non-element nodes are never collected
+                Element el = (Element) node;
+                if (eval.matches(root, el))
+                    elements.add(el);
+            }
+        }
+
+        public void tail(Node node, int depth) {
+            // nothing to do here: matching happens in head()
+        }
+    }
+}
diff --git a/src/org/jsoup/select/CombiningEvaluator.java b/src/org/jsoup/select/CombiningEvaluator.java

new file mode 100644 (file)

index 0000000..a31ed26
--- /dev/null
+++ b/src/org/jsoup/select/CombiningEvaluator.java
@@ -0,0 +1,94 @@
+package org.jsoup.select;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.nodes.Element;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Base combining (and, or) evaluator.
+ */
+abstract class CombiningEvaluator extends Evaluator {
+    // The combined clauses, in the order they were supplied/added.
+    final List<Evaluator> evaluators;
+
+    CombiningEvaluator() {
+        super();
+        evaluators = new ArrayList<Evaluator>();
+    }
+
+    CombiningEvaluator(Collection<Evaluator> evaluators) {
+        this();
+        this.evaluators.addAll(evaluators);
+    }
+
+    /**
+     * Get the last evaluator in the clause list.
+     * @return the last evaluator, or null when there are no clauses
+     */
+    Evaluator rightMostEvaluator() {
+        if (evaluators.isEmpty())
+            return null;
+        return evaluators.get(evaluators.size() - 1);
+    }
+
+    /**
+     * Swap the last evaluator in the clause list for another one.
+     * @param replacement evaluator to store in the last position
+     */
+    void replaceRightMostEvaluator(Evaluator replacement) {
+        int lastIndex = evaluators.size() - 1;
+        evaluators.set(lastIndex, replacement);
+    }
+
+    /**
+     * Matches only when every clause matches.
+     */
+    static final class And extends CombiningEvaluator {
+        And(Collection<Evaluator> evaluators) {
+            super(evaluators);
+        }
+
+        And(Evaluator... evaluators) {
+            this(Arrays.asList(evaluators));
+        }
+
+        @Override
+        public boolean matches(Element root, Element node) {
+            // stop at the first clause that rejects the node
+            for (Evaluator clause : evaluators) {
+                boolean accepted = clause.matches(root, node);
+                if (!accepted)
+                    return false;
+            }
+            return true;
+        }
+
+        @Override
+        public String toString() {
+            return StringUtil.join(evaluators, " ");
+        }
+    }
+
+    /**
+     * Matches as soon as any clause matches.
+     */
+    static final class Or extends CombiningEvaluator {
+        /**
+         * Create a new Or evaluator whose first clause is built from the supplied evaluators: two or more are wrapped
+         * into a single {@link And}; zero or one are added unchanged.
+         * @param evaluators initial OR clause
+         */
+        Or(Collection<Evaluator> evaluators) {
+            super();
+            boolean needsGrouping = evaluators.size() > 1;
+            if (needsGrouping) {
+                this.evaluators.add(new And(evaluators));
+            } else { // 0 or 1
+                this.evaluators.addAll(evaluators);
+            }
+        }
+
+        Or() {
+            super();
+        }
+
+        /**
+         * Append another alternative to this OR.
+         * @param e evaluator to add as a further clause
+         */
+        public void add(Evaluator e) {
+            evaluators.add(e);
+        }
+
+        @Override
+        public boolean matches(Element root, Element node) {
+            // stop at the first clause that accepts the node
+            for (Evaluator clause : evaluators) {
+                boolean accepted = clause.matches(root, node);
+                if (accepted)
+                    return true;
+            }
+            return false;
+        }
+
+        @Override
+        public String toString() {
+            return String.format(":or%s", evaluators);
+        }
+    }
+}
diff --git a/src/org/jsoup/select/Elements.java b/src/org/jsoup/select/Elements.java

new file mode 100644 (file)

index 0000000..8302da1
--- /dev/null
+++ b/src/org/jsoup/select/Elements.java
@@ -0,0 +1,536 @@
+package org.jsoup.select;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+
+import java.util.*;
+
+/**
+ A list of {@link Element Elements}, with methods that act on every element in the list.
+ <p/>
+ To get an Elements object, use the {@link Element#select(String)} method.
+
+ @author Jonathan Hedley, jonathan@hedley.net */
+public class Elements implements List<Element>, Cloneable {
+    // Backing list; every List<Element> method below delegates to it.
+    private List<Element> contents;
+
+    public Elements() {
+        contents = new ArrayList<Element>();
+    }
+
+    public Elements(int initialCapacity) {
+        contents = new ArrayList<Element>(initialCapacity);
+    }
+
+    public Elements(Collection<Element> elements) {
+        contents = new ArrayList<Element>(elements);
+    }
+
+    /**
+     * Wrap an existing list. The list is used directly as the backing store (it is not copied), so changes made
+     * through either object are visible through the other.
+     * @param elements list to back this Elements
+     */
+    public Elements(List<Element> elements) {
+        contents = elements;
+    }
+
+    /**
+     * Create an Elements holding the supplied elements.
+     * @param elements initial contents
+     */
+    public Elements(Element... elements) {
+        // Copy into an ArrayList: Arrays.asList alone is a fixed-size view, so add/remove/clear on the result
+        // (including lists returned by eq(int)) would throw UnsupportedOperationException.
+        contents = new ArrayList<Element>(Arrays.asList(elements));
+    }
+
+    /**
+     * Create a deep copy: each contained element is cloned into a new list.
+     * @return a new Elements holding clones of the contained elements
+     */
+    @Override
+    public Elements clone() {
+        List<Element> elements = new ArrayList<Element>();
+
+        for (Element e : contents)
+            elements.add(e.clone());
+
+        return new Elements(elements);
+    }
+
+    // attribute methods
+    /**
+     Get an attribute value from the first matched element that has the attribute.
+     @param attributeKey The attribute key.
+     @return The attribute value from the first matched element that has the attribute. If no elements were matched
+     (isEmpty() == true), or if none of the elements have the attribute, returns empty string.
+     @see #hasAttr(String)
+     */
+    public String attr(String attributeKey) {
+        for (Element element : contents) {
+            if (element.hasAttr(attributeKey))
+                return element.attr(attributeKey);
+        }
+        return "";
+    }
+
+    /**
+     Checks if any of the matched elements have this attribute set.
+     @param attributeKey attribute key
+     @return true if any of the elements have the attribute; false if none do.
+     */
+    public boolean hasAttr(String attributeKey) {
+        for (Element element : contents) {
+            if (element.hasAttr(attributeKey))
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Set an attribute on all matched elements.
+     * @param attributeKey attribute key
+     * @param attributeValue attribute value
+     * @return this
+     */
+    public Elements attr(String attributeKey, String attributeValue) {
+        for (Element element : contents) {
+            element.attr(attributeKey, attributeValue);
+        }
+        return this;
+    }
+
+    /**
+     * Remove an attribute from every matched element.
+     * @param attributeKey The attribute to remove.
+     * @return this (for chaining)
+     */
+    public Elements removeAttr(String attributeKey) {
+        for (Element element : contents) {
+            element.removeAttr(attributeKey);
+        }
+        return this;
+    }
+
+    /**
+     Add the class name to every matched element's {@code class} attribute.
+     @param className class name to add
+     @return this
+     */
+    public Elements addClass(String className) {
+        for (Element element : contents) {
+            element.addClass(className);
+        }
+        return this;
+    }
+
+    /**
+     Remove the class name from every matched element's {@code class} attribute, if present.
+     @param className class name to remove
+     @return this
+     */
+    public Elements removeClass(String className) {
+        for (Element element : contents) {
+            element.removeClass(className);
+        }
+        return this;
+    }
+
+    /**
+     Toggle the class name on every matched element's {@code class} attribute.
+     @param className class name to add if missing, or remove if present, from every element.
+     @return this
+     */
+    public Elements toggleClass(String className) {
+        for (Element element : contents) {
+            element.toggleClass(className);
+        }
+        return this;
+    }
+
+    /**
+     Determine if any of the matched elements have this class name set in their {@code class} attribute.
+     @param className class name to check for
+     @return true if any do, false if none do
+     */
+    public boolean hasClass(String className) {
+        for (Element element : contents) {
+            if (element.hasClass(className))
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Get the form element's value of the first matched element.
+     * @return The form element's value, or empty if not set.
+     * @see Element#val()
+     */
+    public String val() {
+        if (size() > 0)
+            return first().val();
+        else
+            return "";
+    }
+
+    /**
+     * Set the form element's value in each of the matched elements.
+     * @param value The value to set into each matched element
+     * @return this (for chaining)
+     */
+    public Elements val(String value) {
+        for (Element element : contents)
+            element.val(value);
+        return this;
+    }
+
+    /**
+     * Get the combined text of all the matched elements.
+     * <p>
+     * Note that it is possible to get repeats if the matched elements contain both parent elements and their own
+     * children, as the Element.text() method returns the combined text of a parent and all its children.
+     * @return string of all text: unescaped and no HTML.
+     * @see Element#text()
+     */
+    public String text() {
+        StringBuilder sb = new StringBuilder();
+        for (Element element : contents) {
+            if (sb.length() != 0)
+                sb.append(" ");
+            sb.append(element.text());
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Determine if any of the matched elements report having text.
+     * @return true if {@link Element#hasText()} is true for at least one element; false if for none
+     */
+    public boolean hasText() {
+        for (Element element: contents) {
+            if (element.hasText())
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Get the combined inner HTML of all matched elements.
+     * @return string of all element's inner HTML.
+     * @see #text()
+     * @see #outerHtml()
+     */
+    public String html() {
+        StringBuilder sb = new StringBuilder();
+        for (Element element : contents) {
+            if (sb.length() != 0)
+                sb.append("\n");
+            sb.append(element.html());
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Get the combined outer HTML of all matched elements.
+     * @return string of all element's outer HTML.
+     * @see #text()
+     * @see #html()
+     */
+    public String outerHtml() {
+        StringBuilder sb = new StringBuilder();
+        for (Element element : contents) {
+            if (sb.length() != 0)
+                sb.append("\n");
+            sb.append(element.outerHtml());
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Get the combined outer HTML of all matched elements. Alias of {@link #outerHtml()}.
+     * @return string of all element's outer HTML.
+     * @see #text()
+     * @see #html()
+     */
+    @Override
+    public String toString() {
+        return outerHtml();
+    }
+
+    /**
+     * Update the tag name of each matched element. For example, to change each {@code <i>} to a {@code <em>}, do
+     * {@code doc.select("i").tagName("em");}
+     * @param tagName the new tag name
+     * @return this, for chaining
+     * @see Element#tagName(String)
+     */
+    public Elements tagName(String tagName) {
+        for (Element element : contents) {
+            element.tagName(tagName);
+        }
+        return this;
+    }
+
+    /**
+     * Set the inner HTML of each matched element.
+     * @param html HTML to parse and set into each matched element.
+     * @return this, for chaining
+     * @see Element#html(String)
+     */
+    public Elements html(String html) {
+        for (Element element : contents) {
+            element.html(html);
+        }
+        return this;
+    }
+
+    /**
+     * Add the supplied HTML to the start of each matched element's inner HTML.
+     * @param html HTML to add inside each element, before the existing HTML
+     * @return this, for chaining
+     * @see Element#prepend(String)
+     */
+    public Elements prepend(String html) {
+        for (Element element : contents) {
+            element.prepend(html);
+        }
+        return this;
+    }
+
+    /**
+     * Add the supplied HTML to the end of each matched element's inner HTML.
+     * @param html HTML to add inside each element, after the existing HTML
+     * @return this, for chaining
+     * @see Element#append(String)
+     */
+    public Elements append(String html) {
+        for (Element element : contents) {
+            element.append(html);
+        }
+        return this;
+    }
+
+    /**
+     * Insert the supplied HTML before each matched element's outer HTML.
+     * @param html HTML to insert before each element
+     * @return this, for chaining
+     * @see Element#before(String)
+     */
+    public Elements before(String html) {
+        for (Element element : contents) {
+            element.before(html);
+        }
+        return this;
+    }
+
+    /**
+     * Insert the supplied HTML after each matched element's outer HTML.
+     * @param html HTML to insert after each element
+     * @return this, for chaining
+     * @see Element#after(String)
+     */
+    public Elements after(String html) {
+        for (Element element : contents) {
+            element.after(html);
+        }
+        return this;
+    }
+
+    /**
+     Wrap the supplied HTML around each matched elements. For example, with HTML
+     {@code <p><b>This</b> is <b>jsoup</b></p>},
+     <code>doc.select("b").wrap("&lt;i&gt;&lt;/i&gt;");</code>
+     becomes {@code <p><i><b>This</b></i> is <i><b>jsoup</b></i></p>}
+     @param html HTML to wrap around each element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep.
+     @return this (for chaining)
+     @see Element#wrap
+     */
+    public Elements wrap(String html) {
+        Validate.notEmpty(html);
+        for (Element element : contents) {
+            element.wrap(html);
+        }
+        return this;
+    }
+
+    /**
+     * Removes the matched elements from the DOM, and moves their children up into their parents. This has the effect of
+     * dropping the elements but keeping their children.
+     * <p/>
+     * This is useful for e.g removing unwanted formatting elements but keeping their contents.
+     * <p/>
+     * E.g. with HTML: {@code <div><font>One</font> <font><a href="/">Two</a></font></div>}<br/>
+     * {@code doc.select("font").unwrap();}<br/>
+     * HTML = {@code <div>One <a href="/">Two</a></div>}
+     *
+     * @return this (for chaining)
+     * @see Node#unwrap
+     */
+    public Elements unwrap() {
+        for (Element element : contents) {
+            element.unwrap();
+        }
+        return this;
+    }
+
+    /**
+     * Empty (remove all child nodes from) each matched element. This is similar to setting the inner HTML of each
+     * element to nothing.
+     * <p>
+     * E.g. HTML: {@code <div><p>Hello <b>there</b></p> <p>now</p></div>}<br>
+     * <code>doc.select("p").empty();</code><br>
+     * HTML = {@code <div><p></p> <p></p></div>}
+     * @return this, for chaining
+     * @see Element#empty()
+     * @see #remove()
+     */
+    public Elements empty() {
+        for (Element element : contents) {
+            element.empty();
+        }
+        return this;
+    }
+
+    /**
+     * Remove each matched element from the DOM. This is similar to setting the outer HTML of each element to nothing.
+     * <p>
+     * E.g. HTML: {@code <div><p>Hello</p> <p>there</p> <img /></div>}<br>
+     * <code>doc.select("p").remove();</code><br>
+     * HTML = {@code <div> <img /></div>}
+     * <p>
+     * Note that this method should not be used to clean user-submitted HTML; rather, use {@link org.jsoup.safety.Cleaner} to clean HTML.
+     * @return this, for chaining
+     * @see Element#empty()
+     * @see #empty()
+     */
+    public Elements remove() {
+        for (Element element : contents) {
+            element.remove();
+        }
+        return this;
+    }
+
+    // filters
+
+    /**
+     * Find matching elements within this element list.
+     * @param query A {@link Selector} query
+     * @return the filtered list of elements, or an empty list if none match.
+     */
+    public Elements select(String query) {
+        return Selector.select(query, this);
+    }
+
+    /**
+     * Remove elements from this list that match the {@link Selector} query.
+     * <p>
+     * E.g. HTML: {@code <div class=logo>One</div> <div>Two</div>}<br>
+     * <code>Elements divs = doc.select("div").not(".logo");</code><br>
+     * Result: {@code divs: [<div>Two</div>]}
+     * <p>
+     * @param query the selector query whose results should be removed from these elements
+     * @return a new elements list that contains only the filtered results
+     */
+    public Elements not(String query) {
+        Elements out = Selector.select(query, this);
+        return Selector.filterOut(this, out);
+    }
+
+    /**
+     * Get the <i>nth</i> matched element as an Elements object.
+     * <p>
+     * See also {@link #get(int)} to retrieve an Element.
+     * @param index the (zero-based) index of the element in the list to retain
+     * @return Elements containing only the specified element, or, if that element did not exist (index negative or
+     * past the end), an empty list.
+     */
+    public Elements eq(int index) {
+        // a negative index is out of range as well: answer with an empty list instead of letting get() throw
+        boolean inRange = index >= 0 && contents.size() > index;
+        return inRange ? new Elements(get(index)) : new Elements();
+    }
+
+    /**
+     * Test if any of the matched elements match the supplied query.
+     * @param query A selector
+     * @return true if at least one element in the list matches the query.
+     */
+    public boolean is(String query) {
+        Elements children = select(query);
+        return !children.isEmpty();
+    }
+
+    /**
+     * Get all of the parents and ancestor elements of the matched elements.
+     * @return all of the parents and ancestor elements of the matched elements
+     */
+    public Elements parents() {
+        // LinkedHashSet: drops ancestors shared by several matched elements while keeping first-seen order
+        Set<Element> combo = new LinkedHashSet<Element>();
+        for (Element e: contents) {
+            combo.addAll(e.parents());
+        }
+        return new Elements(combo);
+    }
+
+    // list-like methods
+    /**
+     Get the first matched element.
+     @return The first matched element, or <code>null</code> if contents is empty.
+     */
+    public Element first() {
+        return contents.isEmpty() ? null : contents.get(0);
+    }
+
+    /**
+     Get the last matched element.
+     @return The last matched element, or <code>null</code> if contents is empty.
+     */
+    public Element last() {
+        return contents.isEmpty() ? null : contents.get(contents.size() - 1);
+    }
+
+    /**
+     * Perform a depth-first traversal on each of the selected elements.
+     * @param nodeVisitor the visitor callbacks to perform on each node
+     * @return this, for chaining
+     */
+    public Elements traverse(NodeVisitor nodeVisitor) {
+        Validate.notNull(nodeVisitor);
+        NodeTraversor traversor = new NodeTraversor(nodeVisitor);
+        for (Element el: contents) {
+            traversor.traverse(el);
+        }
+        return this;
+    }
+
+    // implements List<Element> delegates:
+    public int size() {return contents.size();}
+
+    public boolean isEmpty() {return contents.isEmpty();}
+
+    public boolean contains(Object o) {return contents.contains(o);}
+
+    public Iterator<Element> iterator() {return contents.iterator();}
+
+    public Object[] toArray() {return contents.toArray();}
+
+    public <T> T[] toArray(T[] a) {return contents.toArray(a);}
+
+    public boolean add(Element element) {return contents.add(element);}
+
+    public boolean remove(Object o) {return contents.remove(o);}
+
+    public boolean containsAll(Collection<?> c) {return contents.containsAll(c);}
+
+    public boolean addAll(Collection<? extends Element> c) {return contents.addAll(c);}
+
+    public boolean addAll(int index, Collection<? extends Element> c) {return contents.addAll(index, c);}
+
+    public boolean removeAll(Collection<?> c) {return contents.removeAll(c);}
+
+    public boolean retainAll(Collection<?> c) {return contents.retainAll(c);}
+
+    public void clear() {contents.clear();}
+
+    public boolean equals(Object o) {return contents.equals(o);}
+
+    public int hashCode() {return contents.hashCode();}
+
+    public Element get(int index) {return contents.get(index);}
+
+    public Element set(int index, Element element) {return contents.set(index, element);}
+
+    public void add(int index, Element element) {contents.add(index, element);}
+
+    public Element remove(int index) {return contents.remove(index);}
+
+    public int indexOf(Object o) {return contents.indexOf(o);}
+
+    public int lastIndexOf(Object o) {return contents.lastIndexOf(o);}
+
+    public ListIterator<Element> listIterator() {return contents.listIterator();}
+
+    public ListIterator<Element> listIterator(int index) {return contents.listIterator(index);}
+
+    public List<Element> subList(int fromIndex, int toIndex) {return contents.subList(fromIndex, toIndex);}
+}
diff --git a/src/org/jsoup/select/Evaluator.java b/src/org/jsoup/select/Evaluator.java

new file mode 100644 (file)

index 0000000..16a083b
--- /dev/null
+++ b/src/org/jsoup/select/Evaluator.java
@@ -0,0 +1,454 @@
+package org.jsoup.select;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Element;
+
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+/**
+ * Evaluates that an element matches the selector.
+ */
+public abstract class Evaluator {
+    protected Evaluator() {
+    }
+
+    /**
+     * Test if the element meets the evaluator's requirements.
+     *
+     * @param root    Root of the matching subtree
+     * @param element tested element
+     * @return true if the element meets the requirements; false otherwise
+     */
+    public abstract boolean matches(Element root, Element element);
+
+    /**
+     * Evaluator for tag name
+     */
+    public static final class Tag extends Evaluator {
+        private String tagName;
+
+        public Tag(String tagName) {
+            this.tagName = tagName;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return (element.tagName().equals(tagName));
+        }
+
+        @Override
+        public String toString() {
+            return String.format("%s", tagName);
+        }
+    }
+
+    /**
+     * Evaluator for element id
+     */
+    public static final class Id extends Evaluator {
+        private String id;
+
+        public Id(String id) {
+            this.id = id;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return (id.equals(element.id()));
+        }
+
+        @Override
+        public String toString() {
+            return String.format("#%s", id);
+        }
+
+    }
+
+    /**
+     * Evaluator for element class
+     */
+    public static final class Class extends Evaluator {
+        private String className;
+
+        public Class(String className) {
+            this.className = className;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return (element.hasClass(className));
+        }
+
+        @Override
+        public String toString() {
+            return String.format(".%s", className);
+        }
+
+    }
+
+    /**
+     * Evaluator for attribute name matching
+     */
+    public static final class Attribute extends Evaluator {
+        private String key;
+
+        public Attribute(String key) {
+            this.key = key;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.hasAttr(key);
+        }
+
+        @Override
+        public String toString() {
+            return String.format("[%s]", key);
+        }
+
+    }
+
+    /**
+     * Evaluator for attribute name prefix matching
+     */
+    public static final class AttributeStarting extends Evaluator {
+        private String keyPrefix;
+
+        public AttributeStarting(String keyPrefix) {
+            this.keyPrefix = keyPrefix;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            List<org.jsoup.nodes.Attribute> values = element.attributes().asList();
+            for (org.jsoup.nodes.Attribute attribute : values) {
+                if (attribute.getKey().startsWith(keyPrefix))
+                    return true;
+            }
+            return false;
+        }
+
+        @Override
+        public String toString() {
+            return String.format("[^%s]", keyPrefix);
+        }
+
+    }
+
+    /**
+     * Evaluator for attribute name/value matching
+     */
+    public static final class AttributeWithValue extends AttributeKeyPair {
+        public AttributeWithValue(String key, String value) {
+            super(key, value);
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.hasAttr(key) && value.equalsIgnoreCase(element.attr(key));
+        }
+
+        @Override
+        public String toString() {
+            return String.format("[%s=%s]", key, value);
+        }
+
+    }
+
+    /**
+     * Evaluator for attribute name != value matching
+     */
+    public static final class AttributeWithValueNot extends AttributeKeyPair {
+        public AttributeWithValueNot(String key, String value) {
+            super(key, value);
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return !value.equalsIgnoreCase(element.attr(key));
+        }
+
+        @Override
+        public String toString() {
+            return String.format("[%s!=%s]", key, value);
+        }
+
+    }
+
+    /**
+     * Evaluator for attribute name/value matching (value prefix)
+     */
+    public static final class AttributeWithValueStarting extends AttributeKeyPair {
+        public AttributeWithValueStarting(String key, String value) {
+            super(key, value);
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.hasAttr(key) && element.attr(key).toLowerCase().startsWith(value); // value is lower case already
+        }
+
+        @Override
+        public String toString() {
+            return String.format("[%s^=%s]", key, value);
+        }
+
+    }
+
+    /**
+     * Evaluator for attribute name/value matching (value ending)
+     */
+    public static final class AttributeWithValueEnding extends AttributeKeyPair {
+        public AttributeWithValueEnding(String key, String value) {
+            super(key, value);
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.hasAttr(key) && element.attr(key).toLowerCase().endsWith(value); // value is lower case
+        }
+
+        @Override
+        public String toString() {
+            return String.format("[%s$=%s]", key, value);
+        }
+
+    }
+
+    /**
+     * Evaluator for attribute name/value matching (value containing)
+     */
+    public static final class AttributeWithValueContaining extends AttributeKeyPair {
+        public AttributeWithValueContaining(String key, String value) {
+            super(key, value);
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.hasAttr(key) && element.attr(key).toLowerCase().contains(value); // value is lower case
+        }
+
+        @Override
+        public String toString() {
+            return String.format("[%s*=%s]", key, value);
+        }
+
+    }
+
+    /**
+     * Evaluator for attribute name/value matching (value regex matching)
+     */
+    public static final class AttributeWithValueMatching extends Evaluator {
+        String key;
+        Pattern pattern;
+
+        public AttributeWithValueMatching(String key, Pattern pattern) {
+            this.key = key.trim().toLowerCase();
+            this.pattern = pattern;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.hasAttr(key) && pattern.matcher(element.attr(key)).find();
+        }
+
+        @Override
+        public String toString() {
+            return String.format("[%s~=%s]", key, pattern.toString());
+        }
+
+    }
+
+    /**
+     * Abstract evaluator for attribute name/value matching. Both key and value are required (non-empty) and are
+     * stored trimmed and lower-cased.
+     */
+    public abstract static class AttributeKeyPair extends Evaluator {
+        String key;
+        String value;
+
+        public AttributeKeyPair(String key, String value) {
+            Validate.notEmpty(key);
+            Validate.notEmpty(value);
+
+            this.key = key.trim().toLowerCase();
+            this.value = value.trim().toLowerCase();
+        }
+    }
+
+    /**
+     * Evaluator for any / all element matching
+     */
+    public static final class AllElements extends Evaluator {
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return true;
+        }
+
+        @Override
+        public String toString() {
+            return "*";
+        }
+    }
+
+    /**
+     * Evaluator for matching by sibling index number (e < idx)
+     */
+    public static final class IndexLessThan extends IndexEvaluator {
+        public IndexLessThan(int index) {
+            super(index);
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.elementSiblingIndex() < index;
+        }
+
+        @Override
+        public String toString() {
+            return String.format(":lt(%d)", index);
+        }
+
+    }
+
+    /**
+     * Evaluator for matching by sibling index number (e > idx)
+     */
+    public static final class IndexGreaterThan extends IndexEvaluator {
+        public IndexGreaterThan(int index) {
+            super(index);
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.elementSiblingIndex() > index;
+        }
+
+        @Override
+        public String toString() {
+            return String.format(":gt(%d)", index);
+        }
+
+    }
+
+    /**
+     * Evaluator for matching by sibling index number (e = idx)
+     */
+    public static final class IndexEquals extends IndexEvaluator {
+        public IndexEquals(int index) {
+            super(index);
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return element.elementSiblingIndex() == index;
+        }
+
+        @Override
+        public String toString() {
+            return String.format(":eq(%d)", index);
+        }
+
+    }
+
+    /**
+     * Abstract evaluator for sibling index matching
+     *
+     * @author ant
+     */
+    public abstract static class IndexEvaluator extends Evaluator {
+        int index;
+
+        public IndexEvaluator(int index) {
+            this.index = index;
+        }
+    }
+
+    /**
+     * Evaluator for matching Element (and its descendants) text
+     */
+    public static final class ContainsText extends Evaluator {
+        private String searchText;
+
+        public ContainsText(String searchText) {
+            this.searchText = searchText.toLowerCase();
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return (element.text().toLowerCase().contains(searchText));
+        }
+
+        @Override
+        public String toString() {
+            // closing parenthesis was missing, which produced an unbalanced selector string
+            return String.format(":contains(%s)", searchText);
+        }
+    }
+
+    /**
+     * Evaluator for matching Element's own text
+     */
+    public static final class ContainsOwnText extends Evaluator {
+        private String searchText;
+
+        public ContainsOwnText(String searchText) {
+            this.searchText = searchText.toLowerCase();
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            return (element.ownText().toLowerCase().contains(searchText));
+        }
+
+        @Override
+        public String toString() {
+            // closing parenthesis was missing, which produced an unbalanced selector string
+            return String.format(":containsOwn(%s)", searchText);
+        }
+    }
+
+    /**
+     * Evaluator for matching Element (and its descendants) text with regex
+     */
+    public static final class Matches extends Evaluator {
+        private Pattern pattern;
+
+        public Matches(Pattern pattern) {
+            this.pattern = pattern;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            Matcher m = pattern.matcher(element.text());
+            return m.find();
+        }
+
+        @Override
+        public String toString() {
+            // closing parenthesis was missing, which produced an unbalanced selector string
+            return String.format(":matches(%s)", pattern);
+        }
+    }
+
+    /**
+     * Evaluator for matching Element's own text with regex
+     */
+    public static final class MatchesOwn extends Evaluator {
+        private Pattern pattern;
+
+        public MatchesOwn(Pattern pattern) {
+            this.pattern = pattern;
+        }
+
+        @Override
+        public boolean matches(Element root, Element element) {
+            Matcher m = pattern.matcher(element.ownText());
+            return m.find();
+        }
+
+        @Override
+        public String toString() {
+            // closing parenthesis was missing, which produced an unbalanced selector string
+            return String.format(":matchesOwn(%s)", pattern);
+        }
+    }
+}
diff --git a/src/org/jsoup/select/NodeTraversor.java b/src/org/jsoup/select/NodeTraversor.java

new file mode 100644 (file)

index 0000000..9bb081e
--- /dev/null
+++ b/src/org/jsoup/select/NodeTraversor.java
@@ -0,0 +1,47 @@
+package org.jsoup.select;
+
+import org.jsoup.nodes.Node;
+
+/**
+ * Depth-first node traversor. Use to iterate through all nodes under and including the specified root node.
+ * <p/>
+ * This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
+ */
+public class NodeTraversor {
+    private NodeVisitor visitor;
+
+    /**
+     * Create a new traversor.
+     * @param visitor a class implementing the {@link NodeVisitor} interface, to be called when visiting each node.
+     */
+    public NodeTraversor(NodeVisitor visitor) {
+        this.visitor = visitor;
+    }
+
+    /**
+     * Start a depth-first traverse of the root and all of its descendants.
+     * @param root the root node point to traverse.
+     */
+    public void traverse(Node root) {
+        Node node = root;
+        int depth = 0;
+        
+        while (node != null) {
+            visitor.head(node, depth);
+            if (node.childNodes().size() > 0) {
+                node = node.childNode(0);
+                depth++;
+            } else {
+                while (node.nextSibling() == null && depth > 0) {
+                    visitor.tail(node, depth);
+                    node = node.parent();
+                    depth--;
+                }
+                visitor.tail(node, depth);
+                if (node == root)
+                    break;
+                node = node.nextSibling();
+            }
+        }
+    }
+}
diff --git a/src/org/jsoup/select/NodeVisitor.java b/src/org/jsoup/select/NodeVisitor.java

new file mode 100644 (file)

index 0000000..20112e8
--- /dev/null
+++ b/src/org/jsoup/select/NodeVisitor.java
@@ -0,0 +1,30 @@
+package org.jsoup.select;
+
+import org.jsoup.nodes.Node;
+
+/**
+ * Node visitor interface. Provide an implementing class to {@link NodeTraversor} to iterate through nodes.
+ * <p/>
+ * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
+ * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
+ * create a start tag for a node, and tail to create the end tag.
+ */
+public interface NodeVisitor {
+    /**
+     * Callback for when a node is first visited.
+     *
+     * @param node the node being visited.
+     * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
+     * of that will have depth 1.
+     */
+    public void head(Node node, int depth);
+
+    /**
+     * Callback for when a node is last visited, after all of its descendants have been visited.
+     *
+     * @param node the node being visited.
+     * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
+     * of that will have depth 1.
+     */
+    public void tail(Node node, int depth);
+}
diff --git a/src/org/jsoup/select/QueryParser.java b/src/org/jsoup/select/QueryParser.java

new file mode 100644 (file)

index 0000000..d3cc36f
--- /dev/null
+++ b/src/org/jsoup/select/QueryParser.java
@@ -0,0 +1,293 @@
+package org.jsoup.select;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.jsoup.helper.StringUtil;
+import org.jsoup.helper.Validate;
+import org.jsoup.parser.TokenQueue;
+
+/**
+ * Parses a CSS selector into an Evaluator tree.
+ */
+class QueryParser {
+    private final static String[] combinators = {",", ">", "+", "~", " "};
+
+    private TokenQueue tq;
+    private String query;
+    private List<Evaluator> evals = new ArrayList<Evaluator>();
+
+    /**
+     * Create a new QueryParser.
+     * @param query CSS query
+     */
+    private QueryParser(String query) {
+        this.query = query;
+        this.tq = new TokenQueue(query);
+    }
+
+    /**
+     * Parse a CSS query into an Evaluator.
+     * @param query CSS query
+     * @return Evaluator
+     */
+    public static Evaluator parse(String query) {
+        QueryParser p = new QueryParser(query);
+        return p.parse();
+    }
+
+    /**
+     * Parse the query
+     * @return Evaluator
+     */
+    Evaluator parse() {
+        tq.consumeWhitespace();
+
+        if (tq.matchesAny(combinators)) { // if starts with a combinator, use root as elements
+            evals.add(new StructuralEvaluator.Root());
+            combinator(tq.consume());
+        } else {
+            findElements();
+        }
+
+        while (!tq.isEmpty()) {
+            // hierarchy and extras
+            boolean seenWhite = tq.consumeWhitespace();
+
+            if (tq.matchesAny(combinators)) {
+                combinator(tq.consume());
+            } else if (seenWhite) {
+                combinator(' ');
+            } else { // E.class, E#id, E[attr] etc. AND
+                findElements(); // take next el, #. etc off queue
+            }
+        }
+
+        if (evals.size() == 1)
+            return evals.get(0);
+
+        return new CombiningEvaluator.And(evals);
+    }
+
+    private void combinator(char combinator) {
+        tq.consumeWhitespace();
+        String subQuery = consumeSubQuery(); // support multi > childs
+
+        Evaluator rootEval; // the new topmost evaluator
+        Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or.
+        Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator
+        boolean replaceRightMost = false;
+
+        if (evals.size() == 1) {
+            rootEval = currentEval = evals.get(0);
+            // make sure OR (,) has precedence:
+            if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') {
+                currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator();
+                replaceRightMost = true;
+            }
+        }
+        else {
+            rootEval = currentEval = new CombiningEvaluator.And(evals);
+        }
+        evals.clear();
+
+        // for most combinators: change the current eval into an AND of the current eval and the new eval
+        if (combinator == '>')
+            currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediateParent(currentEval));
+        else if (combinator == ' ')
+            currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.Parent(currentEval));
+        else if (combinator == '+')
+            currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediatePreviousSibling(currentEval));
+        else if (combinator == '~')
+            currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.PreviousSibling(currentEval));
+        else if (combinator == ',') { // group or.
+            CombiningEvaluator.Or or;
+            if (currentEval instanceof CombiningEvaluator.Or) {
+                or = (CombiningEvaluator.Or) currentEval;
+                or.add(newEval);
+            } else {
+                or = new CombiningEvaluator.Or();
+                or.add(currentEval);
+                or.add(newEval);
+            }
+            currentEval = or;
+        }
+        else
+            throw new Selector.SelectorParseException("Unknown combinator: " + combinator);
+
+        if (replaceRightMost)
+            ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval);
+        else rootEval = currentEval;
+        evals.add(rootEval);
+    }
+
+    private String consumeSubQuery() {
+        StringBuilder sq = new StringBuilder();
+        while (!tq.isEmpty()) {
+            if (tq.matches("("))
+                sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
+            else if (tq.matches("["))
+                sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
+            else if (tq.matchesAny(combinators))
+                break;
+            else
+                sq.append(tq.consume());
+        }
+        return sq.toString();
+    }
+
+    private void findElements() {
+        if (tq.matchChomp("#"))
+            byId();
+        else if (tq.matchChomp("."))
+            byClass();
+        else if (tq.matchesWord())
+            byTag();
+        else if (tq.matches("["))
+            byAttribute();
+        else if (tq.matchChomp("*"))
+            allElements();
+        else if (tq.matchChomp(":lt("))
+            indexLessThan();
+        else if (tq.matchChomp(":gt("))
+            indexGreaterThan();
+        else if (tq.matchChomp(":eq("))
+            indexEquals();
+        else if (tq.matches(":has("))
+            has();
+        else if (tq.matches(":contains("))
+            contains(false);
+        else if (tq.matches(":containsOwn("))
+            contains(true);
+        else if (tq.matches(":matches("))
+            matches(false);
+        else if (tq.matches(":matchesOwn("))
+            matches(true);
+        else if (tq.matches(":not("))
+            not();
+        else // unhandled
+            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
+
+    }
+
+    private void byId() {
+        String id = tq.consumeCssIdentifier();
+        Validate.notEmpty(id);
+        evals.add(new Evaluator.Id(id));
+    }
+
+    private void byClass() {
+        String className = tq.consumeCssIdentifier();
+        Validate.notEmpty(className);
+        evals.add(new Evaluator.Class(className.trim().toLowerCase()));
+    }
+
+    private void byTag() {
+        String tagName = tq.consumeElementSelector();
+        Validate.notEmpty(tagName);
+
+        // namespaces: if element name is "abc:def", selector must be "abc|def", so flip:
+        if (tagName.contains("|"))
+            tagName = tagName.replace("|", ":");
+
+        evals.add(new Evaluator.Tag(tagName.trim().toLowerCase()));
+    }
+
+    private void byAttribute() {
+        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
+        String key = cq.consumeToAny("=", "!=", "^=", "$=", "*=", "~="); // eq, not, start, end, contain, match, (no val)
+        Validate.notEmpty(key);
+        cq.consumeWhitespace();
+
+        if (cq.isEmpty()) {
+            if (key.startsWith("^"))
+                evals.add(new Evaluator.AttributeStarting(key.substring(1)));
+            else
+                evals.add(new Evaluator.Attribute(key));
+        } else {
+            if (cq.matchChomp("="))
+                evals.add(new Evaluator.AttributeWithValue(key, cq.remainder()));
+
+            else if (cq.matchChomp("!="))
+                evals.add(new Evaluator.AttributeWithValueNot(key, cq.remainder()));
+
+            else if (cq.matchChomp("^="))
+                evals.add(new Evaluator.AttributeWithValueStarting(key, cq.remainder()));
+
+            else if (cq.matchChomp("$="))
+                evals.add(new Evaluator.AttributeWithValueEnding(key, cq.remainder()));
+
+            else if (cq.matchChomp("*="))
+                evals.add(new Evaluator.AttributeWithValueContaining(key, cq.remainder()));
+
+            else if (cq.matchChomp("~="))
+                evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder())));
+            else
+                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
+        }
+    }
+
+    private void allElements() {
+        evals.add(new Evaluator.AllElements());
+    }
+
+    // pseudo selectors :lt, :gt, :eq
+    private void indexLessThan() {
+        evals.add(new Evaluator.IndexLessThan(consumeIndex()));
+    }
+
+    private void indexGreaterThan() {
+        evals.add(new Evaluator.IndexGreaterThan(consumeIndex()));
+    }
+
+    private void indexEquals() {
+        evals.add(new Evaluator.IndexEquals(consumeIndex()));
+    }
+
+    private int consumeIndex() {
+        String indexS = tq.chompTo(")").trim();
+        Validate.isTrue(StringUtil.isNumeric(indexS), "Index must be numeric");
+        return Integer.parseInt(indexS);
+    }
+
+    // pseudo selector :has(el)
+    private void has() {
+        tq.consume(":has");
+        String subQuery = tq.chompBalanced('(', ')');
+        Validate.notEmpty(subQuery, ":has(el) subselect must not be empty");
+        evals.add(new StructuralEvaluator.Has(parse(subQuery)));
+    }
+
+    // pseudo selector :contains(text), containsOwn(text)
+    private void contains(boolean own) {
+        tq.consume(own ? ":containsOwn" : ":contains");
+        String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')'));
+        Validate.notEmpty(searchText, ":contains(text) query must not be empty");
+        if (own)
+            evals.add(new Evaluator.ContainsOwnText(searchText));
+        else
+            evals.add(new Evaluator.ContainsText(searchText));
+    }
+
+    // :matches(regex), matchesOwn(regex)
+    private void matches(boolean own) {
+        tq.consume(own ? ":matchesOwn" : ":matches");
+        String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped
+        Validate.notEmpty(regex, ":matches(regex) query must not be empty");
+
+        if (own)
+            evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex)));
+        else
+            evals.add(new Evaluator.Matches(Pattern.compile(regex)));
+    }
+
+    // :not(selector)
+    private void not() {
+        tq.consume(":not");
+        String subQuery = tq.chompBalanced('(', ')');
+        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
+
+        evals.add(new StructuralEvaluator.Not(parse(subQuery)));
+    }
+}
diff --git a/src/org/jsoup/select/Selector.java b/src/org/jsoup/select/Selector.java

new file mode 100644 (file)

index 0000000..8fc6286
--- /dev/null
+++ b/src/org/jsoup/select/Selector.java
@@ -0,0 +1,126 @@
+package org.jsoup.select;
+
+import org.jsoup.helper.Validate;
+import org.jsoup.nodes.Element;
+
+import java.util.Collection;
+import java.util.LinkedHashSet;
+
+/**
+ * CSS-like element selector, that finds elements matching a query.
+ * <p/>
+ * <h2>Selector syntax</h2>
+ * A selector is a chain of simple selectors, separated by combinators. Selectors are case insensitive (including against
+ * elements, attributes, and attribute values).
+ * <p/>
+ * The universal selector (*) is implicit when no element selector is supplied (i.e. {@code *.header} and {@code .header}
+ * are equivalent).
+ * <p/>
+ * <table>
+ * <tr><th>Pattern</th><th>Matches</th><th>Example</th></tr>
+ * <tr><td><code>*</code></td><td>any element</td><td><code>*</code></td></tr>
+ * <tr><td><code>tag</code></td><td>elements with the given tag name</td><td><code>div</code></td></tr>
+ * <tr><td><code>ns|E</code></td><td>elements of type E in the namespace <i>ns</i></td><td><code>fb|name</code> finds <code>&lt;fb:name></code> elements</td></tr>
+ * <tr><td><code>#id</code></td><td>elements with attribute ID of "id"</td><td><code>div#wrap</code>, <code>#logo</code></td></tr>
+ * <tr><td><code>.class</code></td><td>elements with a class name of "class"</td><td><code>div.left</code>, <code>.result</code></td></tr>
+ * <tr><td><code>[attr]</code></td><td>elements with an attribute named "attr" (with any value)</td><td><code>a[href]</code>, <code>[title]</code></td></tr>
+ * <tr><td><code>[^attrPrefix]</code></td><td>elements with an attribute name starting with "attrPrefix". Use to find elements with HTML5 datasets</td><td><code>[^data-]</code>, <code>div[^data-]</code></td></tr>
+ * <tr><td><code>[attr=val]</code></td><td>elements with an attribute named "attr", and value equal to "val"</td><td><code>img[width=500]</code>, <code>a[rel=nofollow]</code></td></tr>
+ * <tr><td><code>[attr^=valPrefix]</code></td><td>elements with an attribute named "attr", and value starting with "valPrefix"</td><td><code>a[href^=http:]</code></td></tr>
+ * <tr><td><code>[attr$=valSuffix]</code></td><td>elements with an attribute named "attr", and value ending with "valSuffix"</td><td><code>img[src$=.png]</code></td></tr>
+ * <tr><td><code>[attr*=valContaining]</code></td><td>elements with an attribute named "attr", and value containing "valContaining"</td><td><code>a[href*=/search/]</code></td></tr>
+ * <tr><td><code>[attr~=<em>regex</em>]</code></td><td>elements with an attribute named "attr", and value matching the regular expression</td><td><code>img[src~=(?i)\\.(png|jpe?g)]</code></td></tr>
+ * <tr><td></td><td>The above may be combined in any order</td><td><code>div.header[title]</code></td></tr>
+ * <tr><td colspan="3"><h3>Combinators</h3></td></tr>
+ * <tr><td><code>E F</code></td><td>an F element descended from an E element</td><td><code>div a</code>, <code>.logo h1</code></td></tr>
+ * <tr><td><code>E > F</code></td><td>an F direct child of E</td><td><code>ol > li</code></td></tr>
+ * <tr><td><code>E + F</code></td><td>an F element immediately preceded by sibling E</td><td><code>li + li</code>, <code>div.head + div</code></td></tr>
+ * <tr><td><code>E ~ F</code></td><td>an F element preceded by sibling E</td><td><code>h1 ~ p</code></td></tr>
+ * <tr><td><code>E, F, G</code></td><td>all matching elements E, F, or G</td><td><code>a[href], div, h3</code></td></tr>
+ * <tr><td colspan="3"><h3>Pseudo selectors</h3></td></tr>
+ * <tr><td><code>:lt(<em>n</em>)</code></td><td>elements whose sibling index is less than <em>n</em></td><td><code>td:lt(3)</code> finds the first 2 cells of each row</td></tr>
+ * <tr><td><code>:gt(<em>n</em>)</code></td><td>elements whose sibling index is greater than <em>n</em></td><td><code>td:gt(1)</code> finds cells after skipping the first two</td></tr>
+ * <tr><td><code>:eq(<em>n</em>)</code></td><td>elements whose sibling index is equal to <em>n</em></td><td><code>td:eq(0)</code> finds the first cell of each row</td></tr>
+ * <tr><td><code>:has(<em>selector</em>)</code></td><td>elements that contain at least one element matching the <em>selector</em></td><td><code>div:has(p)</code> finds divs that contain p elements </td></tr>
+ * <tr><td><code>:not(<em>selector</em>)</code></td><td>elements that do not match the <em>selector</em>. See also {@link Elements#not(String)}</td><td><code>div:not(.logo)</code> finds all divs that do not have the "logo" class.<br /><code>div:not(:has(div))</code> finds divs that do not contain divs.</td></tr>
+ * <tr><td><code>:contains(<em>text</em>)</code></td><td>elements that contain the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants.</td><td><code>p:contains(jsoup)</code> finds p elements containing the text "jsoup".</td></tr>
+ * <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements whose text matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
+ * <tr><td><code>:containsOwn(<em>text</em>)</code></td><td>elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.</td><td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup".</td></tr>
+ * <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr>
+ * <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr>
+ * </table>
+ *
+ * @author Jonathan Hedley, jonathan@hedley.net
+ * @see Element#select(String)
+ */
+public class Selector {
+    private final Evaluator evaluator;
+    private final Element root;
+
+    private Selector(String query, Element root) {
+        Validate.notNull(query);
+        query = query.trim();
+        Validate.notEmpty(query);
+        Validate.notNull(root);
+
+        this.evaluator = QueryParser.parse(query);
+
+        this.root = root;
+    }
+
+    /**
+     * Find elements matching selector.
+     *
+     * @param query CSS selector
+     * @param root  root element to descend into
+     * @return matching elements, empty if not
+     */
+    public static Elements select(String query, Element root) {
+        return new Selector(query, root).select();
+    }
+
+    /**
+     * Find elements matching selector.
+     *
+     * @param query CSS selector
+     * @param roots root elements to descend into
+     * @return matching elements, empty if not
+     */
+    public static Elements select(String query, Iterable<Element> roots) {
+        Validate.notEmpty(query);
+        Validate.notNull(roots);
+        LinkedHashSet<Element> elements = new LinkedHashSet<Element>();
+
+        for (Element root : roots) {
+            elements.addAll(select(query, root));
+        }
+        return new Elements(elements);
+    }
+
+    private Elements select() {
+        return Collector.collect(evaluator, root);
+    }
+
+    // exclude set. package open so that Elements can implement .not() selector.
+    static Elements filterOut(Collection<Element> elements, Collection<Element> outs) {
+        Elements output = new Elements();
+        for (Element el : elements) {
+            boolean found = false;
+            for (Element out : outs) {
+                if (el.equals(out)) {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found)
+                output.add(el);
+        }
+        return output;
+    }
+
+    public static class SelectorParseException extends IllegalStateException {
+        public SelectorParseException(String msg, Object... params) {
+            super(String.format(msg, params));
+        }
+    }
+}
diff --git a/src/org/jsoup/select/StructuralEvaluator.java b/src/org/jsoup/select/StructuralEvaluator.java

new file mode 100644 (file)

index 0000000..69e8a62
--- /dev/null
+++ b/src/org/jsoup/select/StructuralEvaluator.java
@@ -0,0 +1,132 @@
+package org.jsoup.select;
+
+import org.jsoup.nodes.Element;
+
+/**
+ * Base structural evaluator.
+ */
+abstract class StructuralEvaluator extends Evaluator {
+    Evaluator evaluator;
+
+    static class Root extends Evaluator {
+        public boolean matches(Element root, Element element) {
+            return root == element;
+        }
+    }
+
+    static class Has extends StructuralEvaluator {
+        public Has(Evaluator evaluator) {
+            this.evaluator = evaluator;
+        }
+
+        public boolean matches(Element root, Element element) {
+            for (Element e : element.getAllElements()) {
+                if (e != element && evaluator.matches(root, e))
+                    return true;
+            }
+            return false;
+        }
+
+        public String toString() {
+            return String.format(":has(%s)", evaluator);
+        }
+    }
+
+    static class Not extends StructuralEvaluator {
+        public Not(Evaluator evaluator) {
+            this.evaluator = evaluator;
+        }
+
+        public boolean matches(Element root, Element node) {
+            return !evaluator.matches(root, node);
+        }
+
+        public String toString() {
+            return String.format(":not%s", evaluator);
+        }
+    }
+
+    static class Parent extends StructuralEvaluator {
+        public Parent(Evaluator evaluator) {
+            this.evaluator = evaluator;
+        }
+
+        public boolean matches(Element root, Element element) {
+            if (root == element)
+                return false;
+
+            Element parent = element.parent();
+            while (parent != root) {
+                if (evaluator.matches(root, parent))
+                    return true;
+                parent = parent.parent();
+            }
+            return false;
+        }
+
+        public String toString() {
+            return String.format(":parent%s", evaluator);
+        }
+    }
+
+    static class ImmediateParent extends StructuralEvaluator {
+        public ImmediateParent(Evaluator evaluator) {
+            this.evaluator = evaluator;
+        }
+
+        public boolean matches(Element root, Element element) {
+            if (root == element)
+                return false;
+
+            Element parent = element.parent();
+            return parent != null && evaluator.matches(root, parent);
+        }
+
+        public String toString() {
+            return String.format(":ImmediateParent%s", evaluator);
+        }
+    }
+
+    static class PreviousSibling extends StructuralEvaluator {
+        public PreviousSibling(Evaluator evaluator) {
+            this.evaluator = evaluator;
+        }
+
+        public boolean matches(Element root, Element element) {
+            if (root == element)
+                return false;
+
+            Element prev = element.previousElementSibling();
+
+            while (prev != null) {
+                if (evaluator.matches(root, prev))
+                    return true;
+
+                prev = prev.previousElementSibling();
+            }
+            return false;
+        }
+
+        public String toString() {
+            return String.format(":prev*%s", evaluator);
+        }
+    }
+
+    static class ImmediatePreviousSibling extends StructuralEvaluator {
+        public ImmediatePreviousSibling(Evaluator evaluator) {
+            this.evaluator = evaluator;
+        }
+
+        public boolean matches(Element root, Element element) {
+            if (root == element)
+                return false;
+
+            Element prev = element.previousElementSibling();
+            return prev != null && evaluator.matches(root, prev);
+        }
+
+        public String toString() {
+            return String.format(":prev%s", evaluator);
+        }
+    }
+}
diff --git a/src/org/jsoup/select/package-info.java b/src/org/jsoup/select/package-info.java

new file mode 100644 (file)

index 0000000..a6e6a2f
--- /dev/null
+++ b/src/org/jsoup/select/package-info.java
@@ -0,0 +1,4 @@
+/**
+ Packages to support the CSS-style element selector.
+ */
+package org.jsoup.select;
+\ No newline at end of file
author	Leif Åstrand <leif@vaadin.com>
	Thu, 9 Aug 2012 13:25:06 +0000 (16:25 +0300)
committer	Leif Åstrand <leif@vaadin.com>
	Thu, 9 Aug 2012 13:39:36 +0000 (16:39 +0300)
src/org/jsoup/Connection.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/Jsoup.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/examples/HtmlToPlainText.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/examples/ListLinks.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/examples/package-info.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/helper/DataUtil.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/helper/DescendableLinkedList.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/helper/HttpConnection.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/helper/StringUtil.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/helper/Validate.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/Attribute.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/Attributes.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/Comment.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/DataNode.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/Document.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/DocumentType.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/Element.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/Entities.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/Node.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/TextNode.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/XmlDeclaration.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/entities-base.properties	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/entities-full.properties	[new file with mode: 0644]	patch \| blob
src/org/jsoup/nodes/package-info.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/package-info.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/CharacterReader.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/HtmlTreeBuilder.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/HtmlTreeBuilderState.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/ParseError.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/ParseErrorList.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/Parser.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/Tag.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/Token.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/TokenQueue.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/Tokeniser.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/TokeniserState.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/TreeBuilder.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/XmlTreeBuilder.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/parser/package-info.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/safety/Cleaner.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/safety/Whitelist.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/safety/package-info.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/Collector.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/CombiningEvaluator.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/Elements.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/Evaluator.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/NodeTraversor.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/NodeVisitor.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/QueryParser.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/Selector.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/StructuralEvaluator.java	[new file with mode: 0644]	patch \| blob
src/org/jsoup/select/package-info.java	[new file with mode: 0644]	patch \| blob