diff options
52 files changed, 15233 insertions, 0 deletions
diff --git a/src/org/jsoup/Connection.java b/src/org/jsoup/Connection.java new file mode 100644 index 0000000000..564eeb89b7 --- /dev/null +++ b/src/org/jsoup/Connection.java @@ -0,0 +1,481 @@ +package org.jsoup; + +import org.jsoup.nodes.Document; +import org.jsoup.parser.Parser; + +import java.net.URL; +import java.util.Map; +import java.util.Collection; +import java.io.IOException; + +/** + * A Connection provides a convenient interface to fetch content from the web, and parse them into Documents. + * <p> + * To get a new Connection, use {@link org.jsoup.Jsoup#connect(String)}. Connections contain {@link Connection.Request} + * and {@link Connection.Response} objects. The request objects are reusable as prototype requests. + * <p> + * Request configuration can be made using either the shortcut methods in Connection (e.g. {@link #userAgent(String)}), + * or by methods in the Connection.Request object directly. All request configuration must be made before the request + * is executed. + * <p> + * The Connection interface is <b>currently in beta</b> and subject to change. Comments, suggestions, and bug reports are welcome. + */ +public interface Connection { + + /** + * GET and POST http methods. + */ + public enum Method { + GET, POST + } + + /** + * Set the request URL to fetch. The protocol must be HTTP or HTTPS. + * @param url URL to connect to + * @return this Connection, for chaining + */ + public Connection url(URL url); + + /** + * Set the request URL to fetch. The protocol must be HTTP or HTTPS. + * @param url URL to connect to + * @return this Connection, for chaining + */ + public Connection url(String url); + + /** + * Set the request user-agent header. + * @param userAgent user-agent to use + * @return this Connection, for chaining + */ + public Connection userAgent(String userAgent); + + /** + * Set the request timeouts (connect and read). If a timeout occurs, an IOException will be thrown. The default + * timeout is 3 seconds (3000 millis). 
A timeout of zero is treated as an infinite timeout. + * @param millis number of milliseconds (thousandths of a second) before timing out connects or reads. + * @return this Connection, for chaining + */ + public Connection timeout(int millis); + + /** + * Set the request referrer (aka "referer") header. + * @param referrer referrer to use + * @return this Connection, for chaining + */ + public Connection referrer(String referrer); + + /** + * Configures the connection to (not) follow server redirects. By default this is <b>true</b>. + * @param followRedirects true if server redirects should be followed. + * @return this Connection, for chaining + */ + public Connection followRedirects(boolean followRedirects); + + /** + * Set the request method to use, GET or POST. Default is GET. + * @param method HTTP request method + * @return this Connection, for chaining + */ + public Connection method(Method method); + + /** + * Configures the connection to not throw exceptions when a HTTP error occurs (4xx - 5xx, e.g. 404 or 500). By + * default this is <b>false</b>; an IOException is thrown if an error is encountered. If set to <b>true</b>, the + * response is populated with the error body, and the status message will reflect the error. + * @param ignoreHttpErrors false (default) if HTTP errors should cause an IOException to be thrown; true to ignore them. + * @return this Connection, for chaining + */ + public Connection ignoreHttpErrors(boolean ignoreHttpErrors); + + /** + * Ignore the document's Content-Type when parsing the response. By default this is <b>false</b>, an unrecognised + * content-type will cause an IOException to be thrown. (This is to prevent producing garbage by attempting to parse + * a JPEG binary image, for example.) Set to true to force a parse attempt regardless of content type. + * @param ignoreContentType set to true if you would like the content type ignored on parsing the response into a + * Document. 
+ * @return this Connection, for chaining + */ + public Connection ignoreContentType(boolean ignoreContentType); + + /** + * Add a request data parameter. Request parameters are sent in the request query string for GETs, and in the request + * body for POSTs. A request may have multiple values of the same name. + * @param key data key + * @param value data value + * @return this Connection, for chaining + */ + public Connection data(String key, String value); + + /** + * Adds all of the supplied data to the request data parameters + * @param data map of data parameters + * @return this Connection, for chaining + */ + public Connection data(Map<String, String> data); + + /** + * Add a number of request data parameters. Multiple parameters may be set at once, e.g.: + * <code>.data("name", "jsoup", "language", "Java", "language", "English");</code> creates a query string like: + * <code>?name=jsoup&language=Java&language=English</code> + * @param keyvals a set of key value pairs. + * @return this Connection, for chaining + */ + public Connection data(String... keyvals); + + /** + * Set a request header. + * @param name header name + * @param value header value + * @return this Connection, for chaining + * @see org.jsoup.Connection.Request#headers() + */ + public Connection header(String name, String value); + + /** + * Set a cookie to be sent in the request. + * @param name name of cookie + * @param value value of cookie + * @return this Connection, for chaining + */ + public Connection cookie(String name, String value); + + /** + * Adds each of the supplied cookies to the request. + * @param cookies map of cookie name -> value pairs + * @return this Connection, for chaining + */ + public Connection cookies(Map<String, String> cookies); + + /** + * Provide an alternate parser to use when parsing the response to a Document. 
+ * @param parser alternate parser + * @return this Connection, for chaining + */ + public Connection parser(Parser parser); + + /** + * Execute the request as a GET, and parse the result. + * @return parsed Document + * @throws IOException on error + */ + public Document get() throws IOException; + + /** + * Execute the request as a POST, and parse the result. + * @return parsed Document + * @throws IOException on error + */ + public Document post() throws IOException; + + /** + * Execute the request. + * @return a response object + * @throws IOException on error + */ + public Response execute() throws IOException; + + /** + * Get the request object associated with this connection + * @return request + */ + public Request request(); + + /** + * Set the connection's request + * @param request new request object + * @return this Connection, for chaining + */ + public Connection request(Request request); + + /** + * Get the response, once the request has been executed + * @return response + */ + public Response response(); + + /** + * Set the connection's response + * @param response new response + * @return this Connection, for chaining + */ + public Connection response(Response response); + + + /** + * Common methods for Requests and Responses + * @param <T> Type of Base, either Request or Response + */ + interface Base<T extends Base> { + + /** + * Get the URL + * @return URL + */ + public URL url(); + + /** + * Set the URL + * @param url new URL + * @return this, for chaining + */ + public T url(URL url); + + /** + * Get the request method + * @return method + */ + public Method method(); + + /** + * Set the request method + * @param method new method + * @return this, for chaining + */ + public T method(Method method); + + /** + * Get the value of a header. This is a simplified header model, where a header may only have one value. + * <p> + * Header names are case insensitive. 
+ * @param name name of header (case insensitive) + * @return value of header, or null if not set. + * @see #hasHeader(String) + * @see #cookie(String) + */ + public String header(String name); + + /** + * Set a header. This method will overwrite any existing header with the same case insensitive name. + * @param name Name of header + * @param value Value of header + * @return this, for chaining + */ + public T header(String name, String value); + + /** + * Check if a header is present + * @param name name of header (case insensitive) + * @return if the header is present in this request/response + */ + public boolean hasHeader(String name); + + /** + * Remove a header by name + * @param name name of header to remove (case insensitive) + * @return this, for chaining + */ + public T removeHeader(String name); + + /** + * Retrieve all of the request/response headers as a map + * @return headers + */ + public Map<String, String> headers(); + + /** + * Get a cookie value by name from this request/response. + * <p> + * Response objects have a simplified cookie model. Each cookie set in the response is added to the response + * object's cookie key=value map. The cookie's path, domain, and expiry date are ignored. + * @param name name of cookie to retrieve. + * @return value of cookie, or null if not set + */ + public String cookie(String name); + + /** + * Set a cookie in this request/response. 
+ * @param name name of cookie + * @param value value of cookie + * @return this, for chaining + */ + public T cookie(String name, String value); + + /** + * Check if a cookie is present + * @param name name of cookie + * @return if the cookie is present in this request/response + */ + public boolean hasCookie(String name); + + /** + * Remove a cookie by name + * @param name name of cookie to remove + * @return this, for chaining + */ + public T removeCookie(String name); + + /** + * Retrieve all of the request/response cookies as a map + * @return cookies + */ + public Map<String, String> cookies(); + + } + + /** + * Represents a HTTP request. + */ + public interface Request extends Base<Request> { + /** + * Get the request timeout, in milliseconds. + * @return the timeout in milliseconds. + */ + public int timeout(); + + /** + * Update the request timeout. + * @param millis timeout, in milliseconds + * @return this Request, for chaining + */ + public Request timeout(int millis); + + /** + * Get the current followRedirects configuration. + * @return true if followRedirects is enabled. + */ + public boolean followRedirects(); + + /** + * Configures the request to (not) follow server redirects. By default this is <b>true</b>. + * + * @param followRedirects true if server redirects should be followed. + * @return this Request, for chaining + */ + public Request followRedirects(boolean followRedirects); + + /** + * Get the current ignoreHttpErrors configuration. + * @return true if errors will be ignored; false (default) if HTTP errors will cause an IOException to be thrown. + */ + public boolean ignoreHttpErrors(); + + /** + * Configures the request to ignore HTTP errors in the response. + * @param ignoreHttpErrors set to true to ignore HTTP errors. + * @return this Request, for chaining + */ + public Request ignoreHttpErrors(boolean ignoreHttpErrors); + + /** + * Get the current ignoreContentType configuration. 
+ * @return true if invalid content-types will be ignored; false (default) if they will cause an IOException to be thrown. + */ + public boolean ignoreContentType(); + + /** + * Configures the request to ignore the Content-Type of the response. + * @param ignoreContentType set to true to ignore the content type. + * @return this Request, for chaining + */ + public Request ignoreContentType(boolean ignoreContentType); + + /** + * Add a data parameter to the request + * @param keyval data to add. + * @return this Request, for chaining + */ + public Request data(KeyVal keyval); + + /** + * Get all of the request's data parameters + * @return collection of keyvals + */ + public Collection<KeyVal> data(); + + /** + * Specify the parser to use when parsing the document. + * @param parser parser to use. + * @return this Request, for chaining + */ + public Request parser(Parser parser); + + /** + * Get the current parser to use when parsing the document. + * @return current Parser + */ + public Parser parser(); + } + + /** + * Represents a HTTP response. + */ + public interface Response extends Base<Response> { + + /** + * Get the status code of the response. + * @return status code + */ + public int statusCode(); + + /** + * Get the status message of the response. + * @return status message + */ + public String statusMessage(); + + /** + * Get the character set name of the response. + * @return character set name + */ + public String charset(); + + /** + * Get the response content type (e.g. "text/html"); + * @return the response content type + */ + public String contentType(); + + /** + * Parse the body of the response as a Document. + * @return a parsed Document + * @throws IOException on error + */ + public Document parse() throws IOException; + + /** + * Get the body of the response as a plain string. + * @return body + */ + public String body(); + + /** + * Get the body of the response as an array of bytes. 
+ * @return body bytes + */ + public byte[] bodyAsBytes(); + } + + /** + * A Key Value tuple. + */ + public interface KeyVal { + + /** + * Update the key of a keyval + * @param key new key + * @return this KeyVal, for chaining + */ + public KeyVal key(String key); + + /** + * Get the key of a keyval + * @return the key + */ + public String key(); + + /** + * Update the value of a keyval + * @param value the new value + * @return this KeyVal, for chaining + */ + public KeyVal value(String value); + + /** + * Get the value of a keyval + * @return the value + */ + public String value(); + } +} + diff --git a/src/org/jsoup/Jsoup.java b/src/org/jsoup/Jsoup.java new file mode 100644 index 0000000000..8c6afcee36 --- /dev/null +++ b/src/org/jsoup/Jsoup.java @@ -0,0 +1,229 @@ +package org.jsoup; + +import org.jsoup.nodes.Document; +import org.jsoup.parser.Parser; +import org.jsoup.safety.Cleaner; +import org.jsoup.safety.Whitelist; +import org.jsoup.helper.DataUtil; +import org.jsoup.helper.HttpConnection; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +/** + The core public access point to the jsoup functionality. + + @author Jonathan Hedley */ +public class Jsoup { + private Jsoup() {} + + /** + Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. + + @param html HTML to parse + @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur + before the HTML declares a {@code <base href>} tag. + @return sane HTML + */ + public static Document parse(String html, String baseUri) { + return Parser.parse(html, baseUri); + } + + /** + Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML + (non-HTML) parser. + + @param html HTML to parse + @param baseUri The URL where the HTML was retrieved from. 
Used to resolve relative URLs to absolute URLs, that occur + before the HTML declares a {@code <base href>} tag. + @param parser alternate {@link Parser#xmlParser() parser} to use. + @return sane HTML + */ + public static Document parse(String html, String baseUri, Parser parser) { + return parser.parseInput(html, baseUri); + } + + /** + Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a + {@code <base href>} tag. + + @param html HTML to parse + @return sane HTML + + @see #parse(String, String) + */ + public static Document parse(String html) { + return Parser.parse(html, ""); + } + + /** + * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page. + * <p> + * Use examples: + * <ul> + * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> + * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li> + * </ul> + * @param url URL to connect to. The protocol must be {@code http} or {@code https}. + * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. + */ + public static Connection connect(String url) { + return HttpConnection.connect(url); + } + + /** + Parse the contents of a file as HTML. + + @param in file to load HTML from + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 
+ */ + public static Document parse(File in, String charsetName, String baseUri) throws IOException { + return DataUtil.load(in, charsetName, baseUri); + } + + /** + Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. + + @param in file to load HTML from + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + @see #parse(File, String, String) + */ + public static Document parse(File in, String charsetName) throws IOException { + return DataUtil.load(in, charsetName, in.getAbsolutePath()); + } + + /** + Read an input stream, and parse it to a Document. + + @param in input stream to read. Make sure to close it after parsing. + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + */ + public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException { + return DataUtil.load(in, charsetName, baseUri); + } + + /** + Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML + (non-HTML) parser. + + @param in input stream to read. Make sure to close it after parsing. + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). 
+ @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @param parser alternate {@link Parser#xmlParser() parser} to use. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + */ + public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException { + return DataUtil.load(in, charsetName, baseUri, parser); + } + + /** + Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. + + @param bodyHtml body HTML fragment + @param baseUri URL to resolve relative URLs against. + @return sane HTML document + + @see Document#body() + */ + public static Document parseBodyFragment(String bodyHtml, String baseUri) { + return Parser.parseBodyFragment(bodyHtml, baseUri); + } + + /** + Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. + + @param bodyHtml body HTML fragment + @return sane HTML document + + @see Document#body() + */ + public static Document parseBodyFragment(String bodyHtml) { + return Parser.parseBodyFragment(bodyHtml, ""); + } + + /** + Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. + <p> + The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. + + @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. + @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. + @return The parsed HTML. + + @throws IOException If the final server response != 200 OK (redirects are followed), or if there's an error reading + the response stream. 
+ + @see #connect(String) + */ + public static Document parse(URL url, int timeoutMillis) throws IOException { + Connection con = HttpConnection.connect(url); + con.timeout(timeoutMillis); + return con.get(); + } + + /** + Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted + tags and attributes. + + @param bodyHtml input untrusted HTML + @param baseUri URL to resolve relative URLs against + @param whitelist white-list of permitted HTML elements + @return safe HTML + + @see Cleaner#clean(Document) + */ + public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) { + Document dirty = parseBodyFragment(bodyHtml, baseUri); + Cleaner cleaner = new Cleaner(whitelist); + Document clean = cleaner.clean(dirty); + return clean.body().html(); + } + + /** + Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted + tags and attributes. + + @param bodyHtml input untrusted HTML + @param whitelist white-list of permitted HTML elements + @return safe HTML + + @see Cleaner#clean(Document) + */ + public static String clean(String bodyHtml, Whitelist whitelist) { + return clean(bodyHtml, "", whitelist); + } + + /** + Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. The input HTML should + still be run through the cleaner to set up enforced attributes, and to tidy the output. 
+ @param bodyHtml HTML to test + @param whitelist whitelist to test against + @return true if no tags or attributes were removed; false otherwise + @see #clean(String, org.jsoup.safety.Whitelist) + */ + public static boolean isValid(String bodyHtml, Whitelist whitelist) { + Document dirty = parseBodyFragment(bodyHtml, ""); + Cleaner cleaner = new Cleaner(whitelist); + return cleaner.isValid(dirty); + } + +} diff --git a/src/org/jsoup/examples/HtmlToPlainText.java b/src/org/jsoup/examples/HtmlToPlainText.java new file mode 100644 index 0000000000..8f563e9608 --- /dev/null +++ b/src/org/jsoup/examples/HtmlToPlainText.java @@ -0,0 +1,109 @@ +package org.jsoup.examples; + +import org.jsoup.Jsoup; +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeTraversor; +import org.jsoup.select.NodeVisitor; + +import java.io.IOException; + +/** + * HTML to plain-text. This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted + * plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a + * scrape. + * <p/> + * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ +public class HtmlToPlainText { + public static void main(String... 
args) throws IOException { + Validate.isTrue(args.length == 1, "usage: supply url to fetch"); + String url = args[0]; + + // fetch the specified URL and parse to a HTML DOM + Document doc = Jsoup.connect(url).get(); + + HtmlToPlainText formatter = new HtmlToPlainText(); + String plainText = formatter.getPlainText(doc); + System.out.println(plainText); + } + + /** + * Format an Element to plain-text + * @param element the root element to format + * @return formatted text + */ + public String getPlainText(Element element) { + FormattingVisitor formatter = new FormattingVisitor(); + NodeTraversor traversor = new NodeTraversor(formatter); + traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node + + return formatter.toString(); + } + + // the formatting rules, implemented in a depth-first DOM traversal + private class FormattingVisitor implements NodeVisitor { + private static final int maxWidth = 80; + private int width = 0; + private StringBuilder accum = new StringBuilder(); // holds the accumulated text + + // hit when the node is first seen + public void head(Node node, int depth) { + String name = node.nodeName(); + if (node instanceof TextNode) + append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. + else if (name.equals("li")) + append("\n * "); + } + + // hit when all of the node's children (if any) have been visited + public void tail(Node node, int depth) { + String name = node.nodeName(); + if (name.equals("br")) + append("\n"); + else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5")) + append("\n\n"); + else if (name.equals("a")) + append(String.format(" <%s>", node.absUrl("href"))); + } + + // appends text to the string builder with a simple word wrap method + private void append(String text) { + if (text.startsWith("\n")) + width = 0; // reset counter if starts with a newline. 
only from formats above, not in natural text + if (text.equals(" ") && + (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n"))) + return; // don't accumulate long runs of empty spaces + + if (text.length() + width > maxWidth) { // won't fit, needs to wrap + String words[] = text.split("\\s+"); + for (int i = 0; i < words.length; i++) { + String word = words[i]; + boolean last = i == words.length - 1; + if (!last) // insert a space if not the last word + word = word + " "; + if (word.length() + width > maxWidth) { // wrap and reset counter + accum.append("\n").append(word); + width = word.length(); + } else { + accum.append(word); + width += word.length(); + } + } + } else { // fits as is, without need to wrap text + accum.append(text); + width += text.length(); + } + } + + public String toString() { + return accum.toString(); + } + } +} diff --git a/src/org/jsoup/examples/ListLinks.java b/src/org/jsoup/examples/ListLinks.java new file mode 100644 index 0000000000..64b29ba107 --- /dev/null +++ b/src/org/jsoup/examples/ListLinks.java @@ -0,0 +1,56 @@ +package org.jsoup.examples; + +import org.jsoup.Jsoup; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; + +/** + * Example program to list links from a URL. 
+ */ +public class ListLinks { + public static void main(String[] args) throws IOException { + Validate.isTrue(args.length == 1, "usage: supply url to fetch"); + String url = args[0]; + print("Fetching %s...", url); + + Document doc = Jsoup.connect(url).get(); + Elements links = doc.select("a[href]"); + Elements media = doc.select("[src]"); + Elements imports = doc.select("link[href]"); + + print("\nMedia: (%d)", media.size()); + for (Element src : media) { + if (src.tagName().equals("img")) + print(" * %s: <%s> %sx%s (%s)", + src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"), + trim(src.attr("alt"), 20)); + else + print(" * %s: <%s>", src.tagName(), src.attr("abs:src")); + } + + print("\nImports: (%d)", imports.size()); + for (Element link : imports) { + print(" * %s <%s> (%s)", link.tagName(),link.attr("abs:href"), link.attr("rel")); + } + + print("\nLinks: (%d)", links.size()); + for (Element link : links) { + print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); + } + } + + private static void print(String msg, Object... args) { + System.out.println(String.format(msg, args)); + } + + private static String trim(String s, int width) { + if (s.length() > width) + return s.substring(0, width-1) + "."; + else + return s; + } +} diff --git a/src/org/jsoup/examples/package-info.java b/src/org/jsoup/examples/package-info.java new file mode 100644 index 0000000000..c312f430d4 --- /dev/null +++ b/src/org/jsoup/examples/package-info.java @@ -0,0 +1,4 @@ +/** + Contains example programs and use of jsoup. See the <a href="http://jsoup.org/cookbook/">jsoup cookbook</a>. + */ +package org.jsoup.examples;
\ No newline at end of file diff --git a/src/org/jsoup/helper/DataUtil.java b/src/org/jsoup/helper/DataUtil.java new file mode 100644 index 0000000000..9adfe42153 --- /dev/null +++ b/src/org/jsoup/helper/DataUtil.java @@ -0,0 +1,135 @@ +package org.jsoup.helper; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.parser.Parser; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Internal static utilities for handling data. + * + */ +public class DataUtil { + private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)"); + static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset + private static final int bufferSize = 0x20000; // ~130K. + + private DataUtil() {} + + /** + * Loads a file to a Document. + * @param in file to load + * @param charsetName character set of input, or null to detect it from the document's meta tags (falling back to UTF-8) + * @param baseUri base URI of document, to resolve relative links against + * @return Document + * @throws IOException on IO error + */ + public static Document load(File in, String charsetName, String baseUri) throws IOException { + FileInputStream inStream = null; + try { + inStream = new FileInputStream(in); + ByteBuffer byteData = readToByteBuffer(inStream); + return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser()); + } finally { + if (inStream != null) + inStream.close(); + } + } + + /** + * Parses a Document from an input stream. + * @param in input stream to parse. You will need to close it. 
+ * @param charsetName character set of input + * @param baseUri base URI of document, to resolve relative links against + * @return Document + * @throws IOException on IO error + */ + public static Document load(InputStream in, String charsetName, String baseUri) throws IOException { + ByteBuffer byteData = readToByteBuffer(in); + return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser()); + } + + /** + * Parses a Document from an input steam, using the provided Parser. + * @param in input stream to parse. You will need to close it. + * @param charsetName character set of input + * @param baseUri base URI of document, to resolve relative links against + * @param parser alternate {@link Parser#xmlParser() parser} to use. + * @return Document + * @throws IOException on IO error + */ + public static Document load(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException { + ByteBuffer byteData = readToByteBuffer(in); + return parseByteData(byteData, charsetName, baseUri, parser); + } + + // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support + // switching the chartset midstream when a meta http-equiv tag defines the charset. + static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) { + String docData; + Document doc = null; + if (charsetName == null) { // determine from meta. safe parse as UTF-8 + // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> + docData = Charset.forName(defaultCharset).decode(byteData).toString(); + doc = parser.parseInput(docData, baseUri); + Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first(); + if (meta != null) { // if not found, will keep utf-8 as best attempt + String foundCharset = meta.hasAttr("http-equiv") ? 
getCharsetFromContentType(meta.attr("content")) : meta.attr("charset"); + if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) { // need to re-decode + charsetName = foundCharset; + byteData.rewind(); + docData = Charset.forName(foundCharset).decode(byteData).toString(); + doc = null; + } + } + } else { // specified by content type header (or by user on file load) + Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); + docData = Charset.forName(charsetName).decode(byteData).toString(); + } + if (doc == null) { + // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present + // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight + // into head mode + if (docData.charAt(0) == 65279) + docData = docData.substring(1); + + doc = parser.parseInput(docData, baseUri); + doc.outputSettings().charset(charsetName); + } + return doc; + } + + static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException { + byte[] buffer = new byte[bufferSize]; + ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize); + int read; + while(true) { + read = inStream.read(buffer); + if (read == -1) break; + outStream.write(buffer, 0, read); + } + ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray()); + return byteData; + } + + /** + * Parse out a charset from a content type header. + * @param contentType e.g. "text/html; charset=EUC-JP" + * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased. 
/**
 * Provides a descending iterator and other 1.6 methods to allow support on the 1.5 JRE.
 *
 * @param <E> element type held by the list
 */
public class DescendableLinkedList<E> extends LinkedList<E> {

    /**
     * Create a new DescendableLinkedList.
     */
    public DescendableLinkedList() {
        super();
    }

    /**
     * Add a new element to the start of the list.
     * @param e element to add
     */
    public void push(E e) {
        addFirst(e);
    }

    /**
     * Look at the last element, if there is one.
     * @return the last element, or null
     */
    public E peekLast() {
        return size() == 0 ? null : getLast();
    }

    /**
     * Remove and return the last element, if there is one.
     * @return the last element, or null
     */
    public E pollLast() {
        return size() == 0 ? null : removeLast();
    }

    /**
     * Get an iterator that starts at the end of the list and works towards the start.
     * @return an iterator that starts at the end of the list and works towards the start.
     */
    public Iterator<E> descendingIterator() {
        return new DescendingIterator(size());
    }

    // Inner (non-static) class on purpose: it uses the enclosing list's listIterator and the enclosing
    // type parameter E directly, so no second type parameter and no unchecked cast are needed.
    private class DescendingIterator implements Iterator<E> {
        private final ListIterator<E> iter;

        private DescendingIterator(int index) {
            iter = listIterator(index);
        }

        /**
         * Check if there is another element on the list.
         * @return if another element
         */
        public boolean hasNext() {
            return iter.hasPrevious();
        }

        /**
         * Get the next element.
         * @return the next element.
         */
        public E next() {
            return iter.previous();
        }

        /**
         * Remove the current element.
         */
        public void remove() {
            iter.remove();
        }
    }
}
+ * @see org.jsoup.Jsoup#connect(String) + */ +public class HttpConnection implements Connection { + public static Connection connect(String url) { + Connection con = new HttpConnection(); + con.url(url); + return con; + } + + public static Connection connect(URL url) { + Connection con = new HttpConnection(); + con.url(url); + return con; + } + + private Connection.Request req; + private Connection.Response res; + + private HttpConnection() { + req = new Request(); + res = new Response(); + } + + public Connection url(URL url) { + req.url(url); + return this; + } + + public Connection url(String url) { + Validate.notEmpty(url, "Must supply a valid URL"); + try { + req.url(new URL(url)); + } catch (MalformedURLException e) { + throw new IllegalArgumentException("Malformed URL: " + url, e); + } + return this; + } + + public Connection userAgent(String userAgent) { + Validate.notNull(userAgent, "User agent must not be null"); + req.header("User-Agent", userAgent); + return this; + } + + public Connection timeout(int millis) { + req.timeout(millis); + return this; + } + + public Connection followRedirects(boolean followRedirects) { + req.followRedirects(followRedirects); + return this; + } + + public Connection referrer(String referrer) { + Validate.notNull(referrer, "Referrer must not be null"); + req.header("Referer", referrer); + return this; + } + + public Connection method(Method method) { + req.method(method); + return this; + } + + public Connection ignoreHttpErrors(boolean ignoreHttpErrors) { + req.ignoreHttpErrors(ignoreHttpErrors); + return this; + } + + public Connection ignoreContentType(boolean ignoreContentType) { + req.ignoreContentType(ignoreContentType); + return this; + } + + public Connection data(String key, String value) { + req.data(KeyVal.create(key, value)); + return this; + } + + public Connection data(Map<String, String> data) { + Validate.notNull(data, "Data map must not be null"); + for (Map.Entry<String, String> entry : data.entrySet()) { 
+ req.data(KeyVal.create(entry.getKey(), entry.getValue())); + } + return this; + } + + public Connection data(String... keyvals) { + Validate.notNull(keyvals, "Data key value pairs must not be null"); + Validate.isTrue(keyvals.length %2 == 0, "Must supply an even number of key value pairs"); + for (int i = 0; i < keyvals.length; i += 2) { + String key = keyvals[i]; + String value = keyvals[i+1]; + Validate.notEmpty(key, "Data key must not be empty"); + Validate.notNull(value, "Data value must not be null"); + req.data(KeyVal.create(key, value)); + } + return this; + } + + public Connection header(String name, String value) { + req.header(name, value); + return this; + } + + public Connection cookie(String name, String value) { + req.cookie(name, value); + return this; + } + + public Connection cookies(Map<String, String> cookies) { + Validate.notNull(cookies, "Cookie map must not be null"); + for (Map.Entry<String, String> entry : cookies.entrySet()) { + req.cookie(entry.getKey(), entry.getValue()); + } + return this; + } + + public Connection parser(Parser parser) { + req.parser(parser); + return this; + } + + public Document get() throws IOException { + req.method(Method.GET); + execute(); + return res.parse(); + } + + public Document post() throws IOException { + req.method(Method.POST); + execute(); + return res.parse(); + } + + public Connection.Response execute() throws IOException { + res = Response.execute(req); + return res; + } + + public Connection.Request request() { + return req; + } + + public Connection request(Connection.Request request) { + req = request; + return this; + } + + public Connection.Response response() { + return res; + } + + public Connection response(Connection.Response response) { + res = response; + return this; + } + + @SuppressWarnings({"unchecked"}) + private static abstract class Base<T extends Connection.Base> implements Connection.Base<T> { + URL url; + Method method; + Map<String, String> headers; + Map<String, String> 
cookies; + + private Base() { + headers = new LinkedHashMap<String, String>(); + cookies = new LinkedHashMap<String, String>(); + } + + public URL url() { + return url; + } + + public T url(URL url) { + Validate.notNull(url, "URL must not be null"); + this.url = url; + return (T) this; + } + + public Method method() { + return method; + } + + public T method(Method method) { + Validate.notNull(method, "Method must not be null"); + this.method = method; + return (T) this; + } + + public String header(String name) { + Validate.notNull(name, "Header name must not be null"); + return getHeaderCaseInsensitive(name); + } + + public T header(String name, String value) { + Validate.notEmpty(name, "Header name must not be empty"); + Validate.notNull(value, "Header value must not be null"); + removeHeader(name); // ensures we don't get an "accept-encoding" and a "Accept-Encoding" + headers.put(name, value); + return (T) this; + } + + public boolean hasHeader(String name) { + Validate.notEmpty(name, "Header name must not be empty"); + return getHeaderCaseInsensitive(name) != null; + } + + public T removeHeader(String name) { + Validate.notEmpty(name, "Header name must not be empty"); + Map.Entry<String, String> entry = scanHeaders(name); // remove is case insensitive too + if (entry != null) + headers.remove(entry.getKey()); // ensures correct case + return (T) this; + } + + public Map<String, String> headers() { + return headers; + } + + private String getHeaderCaseInsensitive(String name) { + Validate.notNull(name, "Header name must not be null"); + // quick evals for common case of title case, lower case, then scan for mixed + String value = headers.get(name); + if (value == null) + value = headers.get(name.toLowerCase()); + if (value == null) { + Map.Entry<String, String> entry = scanHeaders(name); + if (entry != null) + value = entry.getValue(); + } + return value; + } + + private Map.Entry<String, String> scanHeaders(String name) { + String lc = name.toLowerCase(); + 
for (Map.Entry<String, String> entry : headers.entrySet()) { + if (entry.getKey().toLowerCase().equals(lc)) + return entry; + } + return null; + } + + public String cookie(String name) { + Validate.notNull(name, "Cookie name must not be null"); + return cookies.get(name); + } + + public T cookie(String name, String value) { + Validate.notEmpty(name, "Cookie name must not be empty"); + Validate.notNull(value, "Cookie value must not be null"); + cookies.put(name, value); + return (T) this; + } + + public boolean hasCookie(String name) { + Validate.notEmpty("Cookie name must not be empty"); + return cookies.containsKey(name); + } + + public T removeCookie(String name) { + Validate.notEmpty("Cookie name must not be empty"); + cookies.remove(name); + return (T) this; + } + + public Map<String, String> cookies() { + return cookies; + } + } + + public static class Request extends Base<Connection.Request> implements Connection.Request { + private int timeoutMilliseconds; + private boolean followRedirects; + private Collection<Connection.KeyVal> data; + private boolean ignoreHttpErrors = false; + private boolean ignoreContentType = false; + private Parser parser; + + private Request() { + timeoutMilliseconds = 3000; + followRedirects = true; + data = new ArrayList<Connection.KeyVal>(); + method = Connection.Method.GET; + headers.put("Accept-Encoding", "gzip"); + parser = Parser.htmlParser(); + } + + public int timeout() { + return timeoutMilliseconds; + } + + public Request timeout(int millis) { + Validate.isTrue(millis >= 0, "Timeout milliseconds must be 0 (infinite) or greater"); + timeoutMilliseconds = millis; + return this; + } + + public boolean followRedirects() { + return followRedirects; + } + + public Connection.Request followRedirects(boolean followRedirects) { + this.followRedirects = followRedirects; + return this; + } + + public boolean ignoreHttpErrors() { + return ignoreHttpErrors; + } + + public Connection.Request ignoreHttpErrors(boolean ignoreHttpErrors) { 
/**
 * Executes the request over an HttpURLConnection and reads the whole response body into memory.
 * <p>
 * For 301/302/303 responses with followRedirects enabled, the request is mutated (method set to GET, data
 * cleared, URL resolved against the Location header, response cookies copied onto the request) and this
 * method calls itself, passing the redirect response as previousResponse.
 *
 * @param req request to execute; must not be null, and its URL protocol must be http or https
 * @param previousResponse the response that redirected here, or null for the first request in a chain
 * @return the executed response, with headers, cookies and body bytes populated
 * @throws IOException on a non-200, non-redirect status when ignoreHttpErrors is off, when the Response
 *         constructor rejects a redirect chain of MAX_REDIRECTS or more, or on an underlying IO failure
 */
static Response execute(Connection.Request req, Response previousResponse) throws IOException {
    Validate.notNull(req, "Request must not be null");
    String protocol = req.url().getProtocol();
    Validate
        .isTrue(protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported");

    // set up the request for execution
    if (req.method() == Connection.Method.GET && req.data().size() > 0)
        serialiseRequestUrl(req); // appends query string
    HttpURLConnection conn = createConnection(req);
    conn.connect();
    if (req.method() == Connection.Method.POST)
        writePost(req.data(), conn.getOutputStream());

    int status = conn.getResponseCode();
    boolean needsRedirect = false;
    // NOTE(review): every status other than 200 that is not 301/302/303 is treated as an error here, which
    // includes other 2xx codes and 307 -- confirm that is intended.
    if (status != HttpURLConnection.HTTP_OK) {
        if (status == HttpURLConnection.HTTP_MOVED_TEMP || status == HttpURLConnection.HTTP_MOVED_PERM || status == HttpURLConnection.HTTP_SEE_OTHER)
            needsRedirect = true;
        else if (!req.ignoreHttpErrors())
            throw new IOException(status + " error loading URL " + req.url().toString());
    }
    // the constructor counts redirects and throws once MAX_REDIRECTS is reached
    Response res = new Response(previousResponse);
    res.setupFromConnection(conn, previousResponse);
    if (needsRedirect && req.followRedirects()) {
        req.method(Method.GET); // always redirect with a get. any data param from original req are dropped.
        req.data().clear();
        // NOTE(review): res.header("Location") is not checked for null before being resolved -- confirm
        // how a redirect without a Location header should be handled.
        req.url(new URL(req.url(), res.header("Location")));
        for (Map.Entry<String, String> cookie : res.cookies.entrySet()) { // add response cookies to request (for e.g. login posts)
            req.cookie(cookie.getKey(), cookie.getValue());
        }
        return execute(req, res);
    }
    res.req = req;

    InputStream bodyStream = null;
    InputStream dataStream = null;
    try {
        // prefer the error stream when the connection provides one, otherwise read the normal input stream
        dataStream = conn.getErrorStream() != null ? conn.getErrorStream() : conn.getInputStream();
        // transparently unzip gzip-encoded bodies
        bodyStream = res.hasHeader("Content-Encoding") && res.header("Content-Encoding").equalsIgnoreCase("gzip") ?
            new BufferedInputStream(new GZIPInputStream(dataStream)) :
            new BufferedInputStream(dataStream);

        res.byteData = DataUtil.readToByteBuffer(bodyStream);
        res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it
    } finally {
        if (bodyStream != null) bodyStream.close();
        if (dataStream != null) dataStream.close();
    }
    // NOTE(review): conn.disconnect() is never called in this method -- confirm whether that is deliberate.

    res.executed = true;
    return res;
}
parse may not have happened yet + String body; + if (charset == null) + body = Charset.forName(DataUtil.defaultCharset).decode(byteData).toString(); + else + body = Charset.forName(charset).decode(byteData).toString(); + byteData.rewind(); + return body; + } + + public byte[] bodyAsBytes() { + Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body"); + return byteData.array(); + } + + // set up connection defaults, and details from request + private static HttpURLConnection createConnection(Connection.Request req) throws IOException { + HttpURLConnection conn = (HttpURLConnection) req.url().openConnection(); + conn.setRequestMethod(req.method().name()); + conn.setInstanceFollowRedirects(false); // don't rely on native redirection support + conn.setConnectTimeout(req.timeout()); + conn.setReadTimeout(req.timeout()); + if (req.method() == Method.POST) + conn.setDoOutput(true); + if (req.cookies().size() > 0) + conn.addRequestProperty("Cookie", getRequestCookieString(req)); + for (Map.Entry<String, String> header : req.headers().entrySet()) { + conn.addRequestProperty(header.getKey(), header.getValue()); + } + return conn; + } + + // set up url, method, header, cookies + private void setupFromConnection(HttpURLConnection conn, Connection.Response previousResponse) throws IOException { + method = Connection.Method.valueOf(conn.getRequestMethod()); + url = conn.getURL(); + statusCode = conn.getResponseCode(); + statusMessage = conn.getResponseMessage(); + contentType = conn.getContentType(); + + Map<String, List<String>> resHeaders = conn.getHeaderFields(); + processResponseHeaders(resHeaders); + + // if from a redirect, map previous response cookies into this response + if (previousResponse != null) { + for (Map.Entry<String, String> prevCookie : previousResponse.cookies().entrySet()) { + if (!hasCookie(prevCookie.getKey())) + cookie(prevCookie.getKey(), prevCookie.getValue()); + } + } + } + + void 
processResponseHeaders(Map<String, List<String>> resHeaders) { + for (Map.Entry<String, List<String>> entry : resHeaders.entrySet()) { + String name = entry.getKey(); + if (name == null) + continue; // http/1.1 line + + List<String> values = entry.getValue(); + if (name.equalsIgnoreCase("Set-Cookie")) { + for (String value : values) { + if (value == null) + continue; + TokenQueue cd = new TokenQueue(value); + String cookieName = cd.chompTo("=").trim(); + String cookieVal = cd.consumeTo(";").trim(); + if (cookieVal == null) + cookieVal = ""; + // ignores path, date, domain, secure et al. req'd? + // name not blank, value not null + if (cookieName != null && cookieName.length() > 0) + cookie(cookieName, cookieVal); + } + } else { // only take the first instance of each header + if (!values.isEmpty()) + header(name, values.get(0)); + } + } + } + + private static void writePost(Collection<Connection.KeyVal> data, OutputStream outputStream) throws IOException { + OutputStreamWriter w = new OutputStreamWriter(outputStream, DataUtil.defaultCharset); + boolean first = true; + for (Connection.KeyVal keyVal : data) { + if (!first) + w.append('&'); + else + first = false; + + w.write(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)); + w.write('='); + w.write(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset)); + } + w.close(); + } + + private static String getRequestCookieString(Connection.Request req) { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Map.Entry<String, String> cookie : req.cookies().entrySet()) { + if (!first) + sb.append("; "); + else + first = false; + sb.append(cookie.getKey()).append('=').append(cookie.getValue()); + // todo: spec says only ascii, no escaping / encoding defined. validate on set? or escape somehow here? 
+ } + return sb.toString(); + } + + // for get url reqs, serialise the data map into the url + private static void serialiseRequestUrl(Connection.Request req) throws IOException { + URL in = req.url(); + StringBuilder url = new StringBuilder(); + boolean first = true; + // reconstitute the query, ready for appends + url + .append(in.getProtocol()) + .append("://") + .append(in.getAuthority()) // includes host, port + .append(in.getPath()) + .append("?"); + if (in.getQuery() != null) { + url.append(in.getQuery()); + first = false; + } + for (Connection.KeyVal keyVal : req.data()) { + if (!first) + url.append('&'); + else + first = false; + url + .append(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)) + .append('=') + .append(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset)); + } + req.url(new URL(url.toString())); + req.data().clear(); // moved into url as get params + } + } + + public static class KeyVal implements Connection.KeyVal { + private String key; + private String value; + + public static KeyVal create(String key, String value) { + Validate.notEmpty(key, "Data key must not be empty"); + Validate.notNull(value, "Data value must not be null"); + return new KeyVal(key, value); + } + + private KeyVal(String key, String value) { + this.key = key; + this.value = value; + } + + public KeyVal key(String key) { + Validate.notEmpty(key, "Data key must not be empty"); + this.key = key; + return this; + } + + public String key() { + return key; + } + + public KeyVal value(String value) { + Validate.notNull(value, "Data value must not be null"); + this.value = value; + return this; + } + + public String value() { + return value; + } + + @Override + public String toString() { + return key + "=" + value; + } + } +} diff --git a/src/org/jsoup/helper/StringUtil.java b/src/org/jsoup/helper/StringUtil.java new file mode 100644 index 0000000000..071a92c7a5 --- /dev/null +++ b/src/org/jsoup/helper/StringUtil.java @@ -0,0 +1,140 @@ +package org.jsoup.helper; 
/**
 * A minimal String utility class. Designed for internal jsoup use only.
 */
public final class StringUtil {
    // memoised padding up to 10: padding[n] is a string of n spaces
    private static final String[] padding = {
        "",
        " ",
        "  ",
        "   ",
        "    ",
        "     ",
        "      ",
        "       ",
        "        ",
        "         ",
        "          "
    };

    /**
     * Join a collection of strings by a separator
     * @param strings collection of string objects
     * @param sep string to place between strings
     * @return joined string
     */
    public static String join(Collection<?> strings, String sep) {
        return join(strings.iterator(), sep);
    }

    /**
     * Join a collection of strings by a separator
     * @param strings iterator of string objects
     * @param sep string to place between strings
     * @return joined string
     */
    public static String join(Iterator<?> strings, String sep) {
        if (!strings.hasNext())
            return "";

        String start = strings.next().toString();
        if (!strings.hasNext()) // only one, avoid builder
            return start;

        StringBuilder sb = new StringBuilder(64).append(start);
        while (strings.hasNext()) {
            sb.append(sep);
            sb.append(strings.next());
        }
        return sb.toString();
    }

    /**
     * Returns space padding
     * @param width amount of padding desired; must be 0 or greater
     * @return string of spaces * width
     * @throws IllegalArgumentException if width is negative
     */
    public static String padding(int width) {
        if (width < 0)
            throw new IllegalArgumentException("width must be >= 0");

        if (width < padding.length)
            return padding[width];

        char[] out = new char[width];
        for (int i = 0; i < width; i++)
            out[i] = ' ';
        return String.valueOf(out);
    }

    /**
     * Tests if a string is blank: null, empty, or only whitespace (" ", \r\n, \t, etc)
     * @param string string to test
     * @return if string is blank
     */
    public static boolean isBlank(String string) {
        if (string == null || string.length() == 0)
            return true;

        int l = string.length();
        // step by whole code points so a surrogate pair is examined once, not once per char
        for (int i = 0; i < l; ) {
            int c = string.codePointAt(i);
            if (!StringUtil.isWhitespace(c))
                return false;
            i += Character.charCount(c);
        }
        return true;
    }

    /**
     * Tests if a string is numeric, i.e. contains only digit characters
     * @param string string to test
     * @return true if only digit chars, false if empty or null or contains non-digit chrs
     */
    public static boolean isNumeric(String string) {
        if (string == null || string.length() == 0)
            return false;

        int l = string.length();
        // step by whole code points: a digit outside the BMP is two chars, and its low surrogate
        // on its own is not a digit
        for (int i = 0; i < l; ) {
            int c = string.codePointAt(i);
            if (!Character.isDigit(c))
                return false;
            i += Character.charCount(c);
        }
        return true;
    }

    /**
     * Tests if a code point is "whitespace" as defined in the HTML spec.
     * @param c code point to test
     * @return true if code point is whitespace, false otherwise
     */
    public static boolean isWhitespace(int c){
        return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
    }

    /**
     * Collapses each run of HTML whitespace into a single space character.
     * @param string string to normalise; must not be null
     * @return the normalised string, or the same instance if nothing needed to change
     */
    public static String normaliseWhitespace(String string) {
        StringBuilder sb = new StringBuilder(string.length());

        boolean lastWasWhite = false;
        boolean modified = false;

        int l = string.length();
        // advance by charCount so the low surrogate of a supplementary character is not appended twice
        for (int i = 0; i < l; ) {
            int c = string.codePointAt(i);
            i += Character.charCount(c);
            if (isWhitespace(c)) {
                if (lastWasWhite) {
                    modified = true;
                    continue;
                }
                if (c != ' ')
                    modified = true;
                sb.append(' ');
                lastWasWhite = true;
            }
            else {
                sb.appendCodePoint(c);
                lastWasWhite = false;
            }
        }
        return modified ? sb.toString() : string;
    }

    /**
     * Tests if the needle is equal to any of the haystack entries.
     * @param needle string to look for
     * @param haystack candidate strings; entries must not be null
     * @return true if any haystack entry equals the needle
     */
    public static boolean in(String needle, String... haystack) {
        for (String hay : haystack) {
            if (hay.equals(needle))
                return true;
        }
        return false;
    }
}
/**
 * Simple validation methods. Designed for jsoup internal use
 */
public final class Validate {

    private Validate() {}

    /**
     * Validates that the object is not null
     * @param obj object to test
     */
    public static void notNull(Object obj) {
        notNull(obj, "Object must not be null");
    }

    /**
     * Validates that the object is not null
     * @param obj object to test
     * @param msg message to output if validation fails
     */
    public static void notNull(Object obj, String msg) {
        if (obj != null)
            return;
        throw new IllegalArgumentException(msg);
    }

    /**
     * Validates that the value is true
     * @param val object to test
     */
    public static void isTrue(boolean val) {
        isTrue(val, "Must be true");
    }

    /**
     * Validates that the value is true
     * @param val object to test
     * @param msg message to output if validation fails
     */
    public static void isTrue(boolean val, String msg) {
        if (val)
            return;
        throw new IllegalArgumentException(msg);
    }

    /**
     * Validates that the value is false
     * @param val object to test
     */
    public static void isFalse(boolean val) {
        isFalse(val, "Must be false");
    }

    /**
     * Validates that the value is false
     * @param val object to test
     * @param msg message to output if validation fails
     */
    public static void isFalse(boolean val, String msg) {
        if (!val)
            return;
        throw new IllegalArgumentException(msg);
    }

    /**
     * Validates that the array contains no null elements
     * @param objects the array to test
     */
    public static void noNullElements(Object[] objects) {
        noNullElements(objects, "Array must not contain any null objects");
    }

    /**
     * Validates that the array contains no null elements
     * @param objects the array to test
     * @param msg message to output if validation fails
     */
    public static void noNullElements(Object[] objects, String msg) {
        for (int i = 0; i < objects.length; i++) {
            if (objects[i] == null)
                throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Validates that the string is not empty
     * @param string the string to test
     */
    public static void notEmpty(String string) {
        notEmpty(string, "String must not be empty");
    }

    /**
     * Validates that the string is not empty
     * @param string the string to test
     * @param msg message to output if validation fails
     */
    public static void notEmpty(String string, String msg) {
        boolean hasContent = string != null && string.length() != 0;
        if (!hasContent)
            throw new IllegalArgumentException(msg);
    }

    /**
     Cause a failure.
     @param msg message to output.
     */
    public static void fail(String msg) {
        throw new IllegalArgumentException(msg);
    }
}
+ @param key the new key; must not be null + */ + public void setKey(String key) { + Validate.notEmpty(key); + this.key = key.trim().toLowerCase(); + } + + /** + Get the attribute value. + @return the attribute value + */ + public String getValue() { + return value; + } + + /** + Set the attribute value. + @param value the new attribute value; must not be null + */ + public String setValue(String value) { + Validate.notNull(value); + String old = this.value; + this.value = value; + return old; + } + + /** + Get the HTML representation of this attribute; e.g. {@code href="index.html"}. + @return HTML + */ + public String html() { + return key + "=\"" + Entities.escape(value, (new Document("")).outputSettings()) + "\""; + } + + protected void html(StringBuilder accum, Document.OutputSettings out) { + accum + .append(key) + .append("=\"") + .append(Entities.escape(value, out)) + .append("\""); + } + + /** + Get the string representation of this attribute, implemented as {@link #html()}. + @return string + */ + public String toString() { + return html(); + } + + /** + * Create a new Attribute from an unencoded key and a HTML attribute encoded value. + * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars. + * @param encodedValue HTML attribute encoded value + * @return attribute + */ + public static Attribute createFromEncoded(String unencodedKey, String encodedValue) { + String value = Entities.unescape(encodedValue, true); + return new Attribute(unencodedKey, value); + } + + protected boolean isDataAttribute() { + return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Attribute)) return false; + + Attribute attribute = (Attribute) o; + + if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false; + if (value != null ? 
!value.equals(attribute.value) : attribute.value != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = key != null ? key.hashCode() : 0; + result = 31 * result + (value != null ? value.hashCode() : 0); + return result; + } + + @Override + public Attribute clone() { + try { + return (Attribute) super.clone(); // only fields are immutable strings key and value, so no more deep copy required + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } +} diff --git a/src/org/jsoup/nodes/Attributes.java b/src/org/jsoup/nodes/Attributes.java new file mode 100644 index 0000000000..9436750fc9 --- /dev/null +++ b/src/org/jsoup/nodes/Attributes.java @@ -0,0 +1,249 @@ +package org.jsoup.nodes; + +import org.jsoup.helper.Validate; + +import java.util.*; + +/** + * The attributes of an Element. + * <p/> + * Attributes are treated as a map: there can be only one value associated with an attribute key. + * <p/> + * Attribute key and value comparisons are done case insensitively, and keys are normalised to + * lower-case. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ +public class Attributes implements Iterable<Attribute>, Cloneable { + protected static final String dataPrefix = "data-"; + + private LinkedHashMap<String, Attribute> attributes = null; + // linked hash map to preserve insertion order. + // null be default as so many elements have no attributes -- saves a good chunk of memory + + /** + Get an attribute value by key. + @param key the attribute key + @return the attribute value if set; or empty string if not set. + @see #hasKey(String) + */ + public String get(String key) { + Validate.notEmpty(key); + + if (attributes == null) + return ""; + + Attribute attr = attributes.get(key.toLowerCase()); + return attr != null ? attr.getValue() : ""; + } + + /** + Set a new attribute, or replace an existing one by key. 
+ @param key attribute key + @param value attribute value + */ + public void put(String key, String value) { + Attribute attr = new Attribute(key, value); + put(attr); + } + + /** + Set a new attribute, or replace an existing one by key. + @param attribute attribute + */ + public void put(Attribute attribute) { + Validate.notNull(attribute); + if (attributes == null) + attributes = new LinkedHashMap<String, Attribute>(2); + attributes.put(attribute.getKey(), attribute); + } + + /** + Remove an attribute by key. + @param key attribute key to remove + */ + public void remove(String key) { + Validate.notEmpty(key); + if (attributes == null) + return; + attributes.remove(key.toLowerCase()); + } + + /** + Tests if these attributes contain an attribute with this key. + @param key key to check for + @return true if key exists, false otherwise + */ + public boolean hasKey(String key) { + return attributes != null && attributes.containsKey(key.toLowerCase()); + } + + /** + Get the number of attributes in this set. + @return size + */ + public int size() { + if (attributes == null) + return 0; + return attributes.size(); + } + + /** + Add all the attributes from the incoming set to this set. + @param incoming attributes to add to these attributes. + */ + public void addAll(Attributes incoming) { + if (incoming.size() == 0) + return; + if (attributes == null) + attributes = new LinkedHashMap<String, Attribute>(incoming.size()); + attributes.putAll(incoming.attributes); + } + + public Iterator<Attribute> iterator() { + return asList().iterator(); + } + + /** + Get the attributes as a List, for iteration. Do not modify the keys of the attributes via this view, as changes + to keys will not be recognised in the containing set. + @return an view of the attributes as a List. 
+ */ + public List<Attribute> asList() { + if (attributes == null) + return Collections.emptyList(); + + List<Attribute> list = new ArrayList<Attribute>(attributes.size()); + for (Map.Entry<String, Attribute> entry : attributes.entrySet()) { + list.add(entry.getValue()); + } + return Collections.unmodifiableList(list); + } + + /** + * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys + * starting with {@code data-}. + * @return map of custom data attributes. + */ + public Map<String, String> dataset() { + return new Dataset(); + } + + /** + Get the HTML representation of these attributes. + @return HTML + */ + public String html() { + StringBuilder accum = new StringBuilder(); + html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used + return accum.toString(); + } + + void html(StringBuilder accum, Document.OutputSettings out) { + if (attributes == null) + return; + + for (Map.Entry<String, Attribute> entry : attributes.entrySet()) { + Attribute attribute = entry.getValue(); + accum.append(" "); + attribute.html(accum, out); + } + } + + public String toString() { + return html(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Attributes)) return false; + + Attributes that = (Attributes) o; + + if (attributes != null ? !attributes.equals(that.attributes) : that.attributes != null) return false; + + return true; + } + + @Override + public int hashCode() { + return attributes != null ? 
attributes.hashCode() : 0; + } + + @Override + public Attributes clone() { + if (attributes == null) + return new Attributes(); + + Attributes clone; + try { + clone = (Attributes) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + clone.attributes = new LinkedHashMap<String, Attribute>(attributes.size()); + for (Attribute attribute: this) + clone.attributes.put(attribute.getKey(), attribute.clone()); + return clone; + } + + /** + Map view over the attributes whose keys start with {@code data-}; the map keys are the attribute keys with that prefix removed. + Writes and removals go straight to the enclosing attribute map. + */ + private class Dataset extends AbstractMap<String, String> { + + private Dataset() { + if (attributes == null) + attributes = new LinkedHashMap<String, Attribute>(2); + } + + public Set<Entry<String, String>> entrySet() { + return new EntrySet(); + } + + /** + Set a data attribute. + @param key data key, without the {@code data-} prefix + @param value new value + @return the value previously stored for this key, or null if there was none + */ + @Override + public String put(String key, String value) { + Attribute attr = new Attribute(dataKey(key), value); + /* file under the attribute's own key, the same way Attributes.put(Attribute) does, so the entry is found again by the key-based lookups */ + Attribute previous = attributes.put(attr.getKey(), attr); + return previous != null ? previous.getValue() : null; + } + + /** + Snapshot of the current data attributes, in attribute order. + @return a new list holding every attribute for which isDataAttribute() is true + */ + private List<Attribute> dataAttributes() { + List<Attribute> found = new ArrayList<Attribute>(attributes.size()); + for (Attribute candidate : attributes.values()) { + if (candidate.isDataAttribute()) + found.add(candidate); + } + return found; + } + + private class EntrySet extends AbstractSet<Map.Entry<String, String>> { + public Iterator<Map.Entry<String, String>> iterator() { + return new DatasetIterator(); + } + + public int size() { + return dataAttributes().size(); + } + } + + /** + Iterates a snapshot of the data attributes, so hasNext() has no side effects and remove() cannot invalidate the iteration in progress. + */ + private class DatasetIterator implements Iterator<Map.Entry<String, String>> { + private final Iterator<Attribute> attrIter = dataAttributes().iterator(); + private Attribute attr; /* last attribute handed out by next(); null before the first next() and after remove() */ + + public boolean hasNext() { + return attrIter.hasNext(); + } + + public Entry<String, String> next() { + attr = attrIter.next(); /* throws NoSuchElementException once exhausted */ + return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue()); + } + + public void remove() { + if (attr == null) + throw new IllegalStateException("next() must be called before remove()"); + attributes.remove(attr.getKey()); + attr = null; + } + } + } + + private static String dataKey(String key) { + return dataPrefix + key; + } +} diff --git a/src/org/jsoup/nodes/Comment.java
b/src/org/jsoup/nodes/Comment.java new file mode 100644 index 0000000000..37fd4368fa --- /dev/null +++ b/src/org/jsoup/nodes/Comment.java @@ -0,0 +1,46 @@ +package org.jsoup.nodes; + +/** + A comment node. + + @author Jonathan Hedley, jonathan@hedley.net */ +public class Comment extends Node { + private static final String COMMENT_KEY = "comment"; + + /** + Create a new comment node. + @param data The contents of the comment + @param baseUri base URI + */ + public Comment(String data, String baseUri) { + super(baseUri); + attributes.put(COMMENT_KEY, data); + } + + public String nodeName() { + return "#comment"; + } + + /** + Get the contents of the comment. + @return comment content + */ + public String getData() { + return attributes.get(COMMENT_KEY); + } + + void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { + if (out.prettyPrint()) + indent(accum, depth, out); + accum + .append("<!--") + .append(getData()) + .append("-->"); + } + + void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} + + public String toString() { + return outerHtml(); + } +} diff --git a/src/org/jsoup/nodes/DataNode.java b/src/org/jsoup/nodes/DataNode.java new file mode 100644 index 0000000000..a64f56f0a4 --- /dev/null +++ b/src/org/jsoup/nodes/DataNode.java @@ -0,0 +1,62 @@ +package org.jsoup.nodes; + +/** + A data node, for contents of style, script tags etc, where contents should not show in text(). + + @author Jonathan Hedley, jonathan@hedley.net */ +public class DataNode extends Node{ + private static final String DATA_KEY = "data"; + + /** + Create a new DataNode. + @param data data contents + @param baseUri base URI + */ + public DataNode(String data, String baseUri) { + super(baseUri); + attributes.put(DATA_KEY, data); + } + + public String nodeName() { + return "#data"; + } + + /** + Get the data contents of this node. Will be unescaped and with original new lines, space etc. 
+ @return data + */ + public String getWholeData() { + return attributes.get(DATA_KEY); + } + + /** + * Set the data contents of this node. + * @param data unencoded data + * @return this node, for chaining + */ + public DataNode setWholeData(String data) { + attributes.put(DATA_KEY, data); + return this; + } + + void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { + accum.append(getWholeData()); // data is not escaped in return from data nodes, so " in script, style is plain + } + + void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} + + public String toString() { + return outerHtml(); + } + + /** + Create a new DataNode from HTML encoded data. + @param encodedData encoded data + @param baseUri bass URI + @return new DataNode + */ + public static DataNode createFromEncoded(String encodedData, String baseUri) { + String data = Entities.unescape(encodedData); + return new DataNode(data, baseUri); + } +} diff --git a/src/org/jsoup/nodes/Document.java b/src/org/jsoup/nodes/Document.java new file mode 100644 index 0000000000..adb371ce14 --- /dev/null +++ b/src/org/jsoup/nodes/Document.java @@ -0,0 +1,350 @@ +package org.jsoup.nodes; + +import org.jsoup.helper.Validate; +import org.jsoup.parser.Tag; +import org.jsoup.select.Elements; + +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; +import java.util.ArrayList; +import java.util.List; + +/** + A HTML Document. + + @author Jonathan Hedley, jonathan@hedley.net */ +public class Document extends Element { + private OutputSettings outputSettings = new OutputSettings(); + private QuirksMode quirksMode = QuirksMode.noQuirks; + + /** + Create a new, empty Document. + @param baseUri base URI of document + @see org.jsoup.Jsoup#parse + @see #createShell + */ + public Document(String baseUri) { + super(Tag.valueOf("#root"), baseUri); + } + + /** + Create a valid, empty shell of a document, suitable for adding more elements to. 
+ @param baseUri baseUri of document + @return document with html, head, and body elements. + */ + static public Document createShell(String baseUri) { + Validate.notNull(baseUri); + + Document doc = new Document(baseUri); + Element html = doc.appendElement("html"); + html.appendElement("head"); + html.appendElement("body"); + + return doc; + } + + /** + Accessor to the document's {@code head} element. + @return {@code head} + */ + public Element head() { + return findFirstElementByTagName("head", this); + } + + /** + Accessor to the document's {@code body} element. + @return {@code body} + */ + public Element body() { + return findFirstElementByTagName("body", this); + } + + /** + Get the string contents of the document's {@code title} element. + @return Trimmed title, or empty string if none set. + */ + public String title() { + Element titleEl = getElementsByTag("title").first(); + return titleEl != null ? titleEl.text().trim() : ""; + } + + /** + Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if + not present + @param title string to set as title + */ + public void title(String title) { + Validate.notNull(title); + Element titleEl = getElementsByTag("title").first(); + if (titleEl == null) { // add to head + head().appendElement("title").text(title); + } else { + titleEl.text(title); + } + } + + /** + Create a new Element, with this document's base uri. Does not make the new element a child of this document. + @param tagName element tag name (e.g. {@code a}) + @return new element + */ + public Element createElement(String tagName) { + return new Element(Tag.valueOf(tagName), this.baseUri()); + } + + /** + Normalise the document. This happens after the parse phase so generally does not need to be called. + Moves any text content that is not in the body element into the body. 
+ @return this document after normalisation + */ + public Document normalise() { + Element htmlEl = findFirstElementByTagName("html", this); + if (htmlEl == null) + htmlEl = appendElement("html"); + if (head() == null) + htmlEl.prependElement("head"); + if (body() == null) + htmlEl.appendElement("body"); + + // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care + // of. do in inverse order to maintain text order. + normaliseTextNodes(head()); + normaliseTextNodes(htmlEl); + normaliseTextNodes(this); + + normaliseStructure("head", htmlEl); + normaliseStructure("body", htmlEl); + + return this; + } + + // does not recurse. + private void normaliseTextNodes(Element element) { + List<Node> toMove = new ArrayList<Node>(); + for (Node node: element.childNodes) { + if (node instanceof TextNode) { + TextNode tn = (TextNode) node; + if (!tn.isBlank()) + toMove.add(tn); + } + } + + for (int i = toMove.size()-1; i >= 0; i--) { + Node node = toMove.get(i); + element.removeChild(node); + body().prependChild(new TextNode(" ", "")); + body().prependChild(node); + } + } + + // merge multiple <head> or <body> contents into one, delete the remainder, and ensure they are owned by <html> + private void normaliseStructure(String tag, Element htmlEl) { + Elements elements = this.getElementsByTag(tag); + Element master = elements.first(); // will always be available as created above if not existent + if (elements.size() > 1) { // dupes, move contents to master + List<Node> toMove = new ArrayList<Node>(); + for (int i = 1; i < elements.size(); i++) { + Node dupe = elements.get(i); + for (Node node : dupe.childNodes) + toMove.add(node); + dupe.remove(); + } + + for (Node dupe : toMove) + master.appendChild(dupe); + } + // ensure parented by <html> + if (!master.parent().equals(htmlEl)) { + htmlEl.appendChild(master); // includes remove() + } + } + + // fast method to get first by tag name, used for html, head, body finders + private 
Element findFirstElementByTagName(String tag, Node node) { + if (node.nodeName().equals(tag)) + return (Element) node; + else { + for (Node child: node.childNodes) { + Element found = findFirstElementByTagName(tag, child); + if (found != null) + return found; + } + } + return null; + } + + @Override + public String outerHtml() { + return super.html(); // no outer wrapper tag + } + + /** + Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. + @param text unencoded text + @return this document + */ + @Override + public Element text(String text) { + body().text(text); // overridden to not nuke doc structure + return this; + } + + @Override + public String nodeName() { + return "#document"; + } + + @Override + public Document clone() { + Document clone = (Document) super.clone(); + clone.outputSettings = this.outputSettings.clone(); + return clone; + } + + /** + * A Document's output settings control the form of the text() and html() methods. + */ + public static class OutputSettings implements Cloneable { + private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; + private Charset charset = Charset.forName("UTF-8"); + private CharsetEncoder charsetEncoder = charset.newEncoder(); + private boolean prettyPrint = true; + private int indentAmount = 1; + + public OutputSettings() {} + + /** + * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML + * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>, + * which uses the complete set of HTML named entities. + * <p> + * The default escape mode is <code>base</code>. 
+ * @return the document's current escape mode + */ + public Entities.EscapeMode escapeMode() { + return escapeMode; + } + + /** + * Set the document's escape mode + * @param escapeMode the new escape mode to use + * @return the document's output settings, for chaining + */ + public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { + this.escapeMode = escapeMode; + return this; + } + + /** + * Get the document's current output charset, which is used to control which characters are escaped when + * generating HTML (via the <code>html()</code> methods), and which are kept intact. + * <p> + * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the + * input charset. Otherwise, it defaults to UTF-8. + * @return the document's current charset. + */ + public Charset charset() { + return charset; + } + + /** + * Update the document's output charset. + * @param charset the new charset to use. + * @return the document's output settings, for chaining + */ + public OutputSettings charset(Charset charset) { + // todo: this should probably update the doc's meta charset + this.charset = charset; + charsetEncoder = charset.newEncoder(); + return this; + } + + /** + * Update the document's output charset. + * @param charset the new charset (by name) to use. + * @return the document's output settings, for chaining + */ + public OutputSettings charset(String charset) { + charset(Charset.forName(charset)); + return this; + } + + CharsetEncoder encoder() { + return charsetEncoder; + } + + /** + * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format + * the output, and the output will generally look like the input. + * @return if pretty printing is enabled. + */ + public boolean prettyPrint() { + return prettyPrint; + } + + /** + * Enable or disable pretty printing. 
+ * @param pretty new pretty print setting + * @return this, for chaining + */ + public OutputSettings prettyPrint(boolean pretty) { + prettyPrint = pretty; + return this; + } + + /** + * Get the current tag indent amount, used when pretty printing. + * @return the current indent amount + */ + public int indentAmount() { + return indentAmount; + } + + /** + * Set the indent amount for pretty printing + * @param indentAmount number of spaces to use for indenting each level. Must be >= 0. + * @return this, for chaining + */ + public OutputSettings indentAmount(int indentAmount) { + Validate.isTrue(indentAmount >= 0); + this.indentAmount = indentAmount; + return this; + } + + @Override + public OutputSettings clone() { + OutputSettings clone; + try { + clone = (OutputSettings) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + clone.charset(charset.name()); // new charset and charset encoder + clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); + // indentAmount, prettyPrint are primitives so object.clone() will handle + return clone; + } + } + + /** + * Get the document's current output settings. + * @return the document's current output settings. + */ + public OutputSettings outputSettings() { + return outputSettings; + } + + public enum QuirksMode { + noQuirks, quirks, limitedQuirks; + } + + public QuirksMode quirksMode() { + return quirksMode; + } + + public Document quirksMode(QuirksMode quirksMode) { + this.quirksMode = quirksMode; + return this; + } +} + diff --git a/src/org/jsoup/nodes/DocumentType.java b/src/org/jsoup/nodes/DocumentType.java new file mode 100644 index 0000000000..f8c79f0d18 --- /dev/null +++ b/src/org/jsoup/nodes/DocumentType.java @@ -0,0 +1,46 @@ +package org.jsoup.nodes; + +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; + +/** + * A {@code <!DOCTYPE>} node.
+ */ +public class DocumentType extends Node { + // todo: quirk mode from publicId and systemId + + /** + * Create a new doctype element. + * @param name the doctype's name + * @param publicId the doctype's public ID + * @param systemId the doctype's system ID + * @param baseUri the doctype's base URI + */ + public DocumentType(String name, String publicId, String systemId, String baseUri) { + super(baseUri); + + Validate.notEmpty(name); + attr("name", name); + attr("publicId", publicId); + attr("systemId", systemId); + } + + @Override + public String nodeName() { + return "#doctype"; + } + + @Override + void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { + accum.append("<!DOCTYPE ").append(attr("name")); + if (!StringUtil.isBlank(attr("publicId"))) + accum.append(" PUBLIC \"").append(attr("publicId")).append("\""); + if (!StringUtil.isBlank(attr("systemId"))) + accum.append(" \"").append(attr("systemId")).append("\""); + accum.append('>'); + } + + @Override + void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) { + } +} diff --git a/src/org/jsoup/nodes/Element.java b/src/org/jsoup/nodes/Element.java new file mode 100644 index 0000000000..5c1894c934 --- /dev/null +++ b/src/org/jsoup/nodes/Element.java @@ -0,0 +1,1119 @@ +package org.jsoup.nodes; + +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; +import org.jsoup.parser.Parser; +import org.jsoup.parser.Tag; +import org.jsoup.select.Collector; +import org.jsoup.select.Elements; +import org.jsoup.select.Evaluator; +import org.jsoup.select.Selector; + +import java.util.*; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + * A HTML element consists of a tag name, attributes, and child nodes (including text nodes and + * other elements). + * + * From an Element, you can extract data, traverse the node graph, and manipulate the HTML. 
+ * + * @author Jonathan Hedley, jonathan@hedley.net + */ +public class Element extends Node { + private Tag tag; + private Set<String> classNames; + + /** + * Create a new, standalone Element. (Standalone in that it has no parent.) + * + * @param tag tag of this element + * @param baseUri the base URI + * @param attributes initial attributes + * @see #appendChild(Node) + * @see #appendElement(String) + */ + public Element(Tag tag, String baseUri, Attributes attributes) { + super(baseUri, attributes); + + Validate.notNull(tag); + this.tag = tag; + } + + /** + * Create a new Element from a tag and a base URI. + * + * @param tag element tag + * @param baseUri the base URI of this element. It is acceptable for the base URI to be an empty + * string, but not null. + * @see Tag#valueOf(String) + */ + public Element(Tag tag, String baseUri) { + this(tag, baseUri, new Attributes()); + } + + @Override + public String nodeName() { + return tag.getName(); + } + + /** + * Get the name of the tag for this element. E.g. {@code div} + * + * @return the tag name + */ + public String tagName() { + return tag.getName(); + } + + /** + * Change the tag of this element. For example, convert a {@code <span>} to a {@code <div>} with + * {@code el.tagName("div");}. + * + * @param tagName new tag name for this element + * @return this element, for chaining + */ + public Element tagName(String tagName) { + Validate.notEmpty(tagName, "Tag name must not be empty."); + tag = Tag.valueOf(tagName); + return this; + } + + /** + * Get the Tag for this element. + * + * @return the tag object + */ + public Tag tag() { + return tag; + } + + /** + * Test if this element is a block-level element. (E.g. {@code <div> == true} or an inline element + * {@code <span> == false}). + * + * @return true if block, false if not (and thus inline) + */ + public boolean isBlock() { + return tag.isBlock(); + } + + /** + * Get the {@code id} attribute of this element.
+ * + * @return The id attribute, if present, or an empty string if not. + */ + public String id() { + String id = attr("id"); + return id == null ? "" : id; + } + + /** + * Set an attribute value on this element. If this element already has an attribute with the + * key, its value is updated; otherwise, a new attribute is added. + * + * @return this element + */ + public Element attr(String attributeKey, String attributeValue) { + super.attr(attributeKey, attributeValue); + return this; + } + + /** + * Get this element's HTML5 custom data attributes. Each attribute in the element that has a key + * starting with "data-" is included the dataset. + * <p> + * E.g., the element {@code <div data-package="jsoup" data-language="Java" class="group">...} has the dataset + * {@code package=jsoup, language=java}. + * <p> + * This map is a filtered view of the element's attribute map. Changes to one map (add, remove, update) are reflected + * in the other map. + * <p> + * You can find elements that have data attributes using the {@code [^data-]} attribute key prefix selector. + * @return a map of {@code key=value} custom data attributes. + */ + public Map<String, String> dataset() { + return attributes.dataset(); + } + + @Override + public final Element parent() { + return (Element) parentNode; + } + + /** + * Get this element's parent and ancestors, up to the document root. + * @return this element's stack of parents, closest first. + */ + public Elements parents() { + Elements parents = new Elements(); + accumulateParents(this, parents); + return parents; + } + + private static void accumulateParents(Element el, Elements parents) { + Element parent = el.parent(); + if (parent != null && !parent.tagName().equals("#root")) { + parents.add(parent); + accumulateParents(parent, parents); + } + } + + /** + * Get a child element of this element, by its 0-based index number. + * <p/> + * Note that an element can have both mixed Nodes and Elements as children. 
This method inspects + * a filtered list of children that are elements, and the index is based on that filtered list. + * + * @param index the index number of the element to retrieve + * @return the child element, if it exists, or {@code null} if absent. + * @see #childNode(int) + */ + public Element child(int index) { + return children().get(index); + } + + /** + * Get this element's child elements. + * <p/> + * This is effectively a filter on {@link #childNodes()} to get Element nodes. + * @return child elements. If this element has no children, returns an + * empty list. + * @see #childNodes() + */ + public Elements children() { + // create on the fly rather than maintaining two lists. if gets slow, memoize, and mark dirty on change + List<Element> elements = new ArrayList<Element>(); + for (Node node : childNodes) { + if (node instanceof Element) + elements.add((Element) node); + } + return new Elements(elements); + } + + /** + * Get this element's child text nodes. The list is unmodifiable but the text nodes may be manipulated. + * <p/> + * This is effectively a filter on {@link #childNodes()} to get Text nodes. + * @return child text nodes. If this element has no text nodes, returns an + * empty list. 
+ * <p/> + * For example, with the input HTML: {@code <p>One <span>Two</span> Three <br> Four</p>} with the {@code p} element selected: + * <ul> + * <li>{@code p.text()} = {@code "One Two Three Four"}</li> + * <li>{@code p.ownText()} = {@code "One Three Four"}</li> + * <li>{@code p.children()} = {@code Elements[<span>, <br>]}</li> + * <li>{@code p.childNodes()} = {@code List<Node>["One ", <span>, " Three ", <br>, " Four"]}</li> + * <li>{@code p.textNodes()} = {@code List<TextNode>["One ", " Three ", " Four"]}</li> + * </ul> + */ + public List<TextNode> textNodes() { + List<TextNode> textNodes = new ArrayList<TextNode>(); + for (Node node : childNodes) { + if (node instanceof TextNode) + textNodes.add((TextNode) node); + } + return Collections.unmodifiableList(textNodes); + } + + /** + * Get this element's child data nodes. The list is unmodifiable but the data nodes may be manipulated. + * <p/> + * This is effectively a filter on {@link #childNodes()} to get Data nodes. + * @return child data nodes. If this element has no data nodes, returns an + * empty list. + * @see #data() + */ + public List<DataNode> dataNodes() { + List<DataNode> dataNodes = new ArrayList<DataNode>(); + for (Node node : childNodes) { + if (node instanceof DataNode) + dataNodes.add((DataNode) node); + } + return Collections.unmodifiableList(dataNodes); + } + + /** + * Find elements that match the {@link Selector} CSS query, with this element as the starting context. Matched elements + * may include this element, or any of its children. + * <p/> + * This method is generally more powerful to use than the DOM-type {@code getElementBy*} methods, because + * multiple filters can be combined, e.g.: + * <ul> + * <li>{@code el.select("a[href]")} - finds links ({@code a} tags with {@code href} attributes) + * <li>{@code el.select("a[href*=example.com]")} - finds links pointing to example.com (loosely) + * </ul> + * <p/> + * See the query syntax documentation in {@link org.jsoup.select.Selector}. 
+ * + * @param cssQuery a {@link Selector} CSS-like query + * @return elements that match the query (empty if none match) + * @see org.jsoup.select.Selector + */ + public Elements select(String cssQuery) { + return Selector.select(cssQuery, this); + } + + /** + * Add a node child node to this element. + * + * @param child node to add. Must not already have a parent. + * @return this element, so that you can add more child nodes or elements. + */ + public Element appendChild(Node child) { + Validate.notNull(child); + + addChildren(child); + return this; + } + + /** + * Add a node to the start of this element's children. + * + * @param child node to add. Must not already have a parent. + * @return this element, so that you can add more child nodes or elements. + */ + public Element prependChild(Node child) { + Validate.notNull(child); + + addChildren(0, child); + return this; + } + + /** + * Create a new element by tag name, and add it as the last child. + * + * @param tagName the name of the tag (e.g. {@code div}). + * @return the new element, to allow you to add content to it, e.g.: + * {@code parent.appendElement("h1").attr("id", "header").text("Welcome");} + */ + public Element appendElement(String tagName) { + Element child = new Element(Tag.valueOf(tagName), baseUri()); + appendChild(child); + return child; + } + + /** + * Create a new element by tag name, and add it as the first child. + * + * @param tagName the name of the tag (e.g. {@code div}). + * @return the new element, to allow you to add content to it, e.g.: + * {@code parent.prependElement("h1").attr("id", "header").text("Welcome");} + */ + public Element prependElement(String tagName) { + Element child = new Element(Tag.valueOf(tagName), baseUri()); + prependChild(child); + return child; + } + + /** + * Create and append a new TextNode to this element. 
+ * + * @param text the unencoded text to add + * @return this element + */ + public Element appendText(String text) { + TextNode node = new TextNode(text, baseUri()); + appendChild(node); + return this; + } + + /** + * Create and prepend a new TextNode to this element. + * + * @param text the unencoded text to add + * @return this element + */ + public Element prependText(String text) { + TextNode node = new TextNode(text, baseUri()); + prependChild(node); + return this; + } + + /** + * Add inner HTML to this element. The supplied HTML will be parsed, and each node appended to the end of the children. + * @param html HTML to add inside this element, after the existing HTML + * @return this element + * @see #html(String) + */ + public Element append(String html) { + Validate.notNull(html); + + List<Node> nodes = Parser.parseFragment(html, this, baseUri()); + addChildren(nodes.toArray(new Node[nodes.size()])); + return this; + } + + /** + * Add inner HTML into this element. The supplied HTML will be parsed, and each node prepended to the start of the element's children. + * @param html HTML to add inside this element, before the existing HTML + * @return this element + * @see #html(String) + */ + public Element prepend(String html) { + Validate.notNull(html); + + List<Node> nodes = Parser.parseFragment(html, this, baseUri()); + addChildren(0, nodes.toArray(new Node[nodes.size()])); + return this; + } + + /** + * Insert the specified HTML into the DOM before this element (i.e. as a preceding sibling). + * + * @param html HTML to add before this element + * @return this element, for chaining + * @see #after(String) + */ + @Override + public Element before(String html) { + return (Element) super.before(html); + } + + /** + * Insert the specified node into the DOM before this node (i.e. as a preceding sibling). 
+ * @param node to add before this element + * @return this Element, for chaining + * @see #after(Node) + */ + @Override + public Element before(Node node) { + return (Element) super.before(node); + } + + /** + * Insert the specified HTML into the DOM after this element (i.e. as a following sibling). + * + * @param html HTML to add after this element + * @return this element, for chaining + * @see #before(String) + */ + @Override + public Element after(String html) { + return (Element) super.after(html); + } + + /** + * Insert the specified node into the DOM after this node (i.e. as a following sibling). + * @param node to add after this element + * @return this element, for chaining + * @see #before(Node) + */ + @Override + public Element after(Node node) { + return (Element) super.after(node); + } + + /** + * Remove all of the element's child nodes. Any attributes are left as-is. + * @return this element + */ + public Element empty() { + childNodes.clear(); + return this; + } + + /** + * Wrap the supplied HTML around this element. + * + * @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. + * @return this element, for chaining. + */ + @Override + public Element wrap(String html) { + return (Element) super.wrap(html); + } + + /** + * Get sibling elements. If the element has no sibling elements, returns an empty list. An element is not a sibling + * of itself, so will not be included in the returned list. + * @return sibling elements + */ + public Elements siblingElements() { + if (parentNode == null) + return new Elements(0); + + List<Element> elements = parent().children(); + Elements siblings = new Elements(elements.size() - 1); + for (Element el: elements) + if (el != this) + siblings.add(el); + return siblings; + } + + /** + * Gets the next sibling element of this element. E.g., if a {@code div} contains two {@code p}s, + * the {@code nextElementSibling} of the first {@code p} is the second {@code p}. 
+ * <p/> + * This is similar to {@link #nextSibling()}, but specifically finds only Elements + * @return the next element, or null if there is no next element + * @see #previousElementSibling() + */ + public Element nextElementSibling() { + if (parentNode == null) return null; + List<Element> siblings = parent().children(); + Integer index = indexInList(this, siblings); + Validate.notNull(index); + if (siblings.size() > index+1) + return siblings.get(index+1); + else + return null; + } + + /** + * Gets the previous element sibling of this element. + * @return the previous element, or null if there is no previous element + * @see #nextElementSibling() + */ + public Element previousElementSibling() { + if (parentNode == null) return null; + List<Element> siblings = parent().children(); + Integer index = indexInList(this, siblings); + Validate.notNull(index); + if (index > 0) + return siblings.get(index-1); + else + return null; + } + + /** + * Gets the first element sibling of this element. + * @return the first sibling that is an element (aka the parent's first element child) + */ + public Element firstElementSibling() { + // todo: should firstSibling() exclude this? + List<Element> siblings = parent().children(); + return siblings.size() > 1 ? siblings.get(0) : null; + } + + /** + * Get the list index of this element in its element sibling list. I.e. if this is the first element + * sibling, returns 0. + * @return position in element sibling list + */ + public Integer elementSiblingIndex() { + if (parent() == null) return 0; + return indexInList(this, parent().children()); + } + + /** + * Gets the last element sibling of this element + * @return the last sibling that is an element (aka the parent's last element child) + */ + public Element lastElementSibling() { + List<Element> siblings = parent().children(); + return siblings.size() > 1 ? 
siblings.get(siblings.size() - 1) : null; + } + + private static <E extends Element> Integer indexInList(Element search, List<E> elements) { + Validate.notNull(search); + Validate.notNull(elements); + + for (int i = 0; i < elements.size(); i++) { + E element = elements.get(i); + if (element.equals(search)) + return i; + } + return null; + } + + // DOM type methods + + /** + * Finds elements, including and recursively under this element, with the specified tag name. + * @param tagName The tag name to search for (case insensitively). + * @return a matching unmodifiable list of elements. Will be empty if this element and none of its children match. + */ + public Elements getElementsByTag(String tagName) { + Validate.notEmpty(tagName); + tagName = tagName.toLowerCase().trim(); + + return Collector.collect(new Evaluator.Tag(tagName), this); + } + + /** + * Find an element by ID, including or under this element. + * <p> + * Note that this finds the first matching ID, starting with this element. If you search down from a different + * starting point, it is possible to find a different element by ID. For unique element by ID within a Document, + * use {@link Document#getElementById(String)} + * @param id The ID to search for. + * @return The first matching element by ID, starting with this element, or null if none found. + */ + public Element getElementById(String id) { + Validate.notEmpty(id); + + Elements elements = Collector.collect(new Evaluator.Id(id), this); + if (elements.size() > 0) + return elements.get(0); + else + return null; + } + + /** + * Find elements that have this class, including or under this element. Case insensitive. + * <p> + * Elements can have multiple classes (e.g. {@code <div class="header round first">}. This method + * checks each class, so you can find the above with {@code el.getElementsByClass("header");}. + * + * @param className the name of the class to search for. 
+ * @return elements with the supplied class name, empty if none + * @see #hasClass(String) + * @see #classNames() + */ + public Elements getElementsByClass(String className) { + Validate.notEmpty(className); + + return Collector.collect(new Evaluator.Class(className), this); + } + + /** + * Find elements that have a named attribute set. Case insensitive. + * + * @param key name of the attribute, e.g. {@code href} + * @return elements that have this attribute, empty if none + */ + public Elements getElementsByAttribute(String key) { + Validate.notEmpty(key); + key = key.trim().toLowerCase(); + + return Collector.collect(new Evaluator.Attribute(key), this); + } + + /** + * Find elements that have an attribute name starting with the supplied prefix. Use {@code data-} to find elements + * that have HTML5 datasets. + * @param keyPrefix name prefix of the attribute e.g. {@code data-} + * @return elements that have attribute names that start with with the prefix, empty if none. + */ + public Elements getElementsByAttributeStarting(String keyPrefix) { + Validate.notEmpty(keyPrefix); + keyPrefix = keyPrefix.trim().toLowerCase(); + + return Collector.collect(new Evaluator.AttributeStarting(keyPrefix), this); + } + + /** + * Find elements that have an attribute with the specific value. Case insensitive. + * + * @param key name of the attribute + * @param value value of the attribute + * @return elements that have this attribute with this value, empty if none + */ + public Elements getElementsByAttributeValue(String key, String value) { + return Collector.collect(new Evaluator.AttributeWithValue(key, value), this); + } + + /** + * Find elements that either do not have this attribute, or have it with a different value. Case insensitive. 
+ * + * @param key name of the attribute + * @param value value of the attribute + * @return elements that do not have a matching attribute + */ + public Elements getElementsByAttributeValueNot(String key, String value) { + return Collector.collect(new Evaluator.AttributeWithValueNot(key, value), this); + } + + /** + * Find elements that have attributes that start with the value prefix. Case insensitive. + * + * @param key name of the attribute + * @param valuePrefix start of attribute value + * @return elements that have attributes that start with the value prefix + */ + public Elements getElementsByAttributeValueStarting(String key, String valuePrefix) { + return Collector.collect(new Evaluator.AttributeWithValueStarting(key, valuePrefix), this); + } + + /** + * Find elements that have attributes that end with the value suffix. Case insensitive. + * + * @param key name of the attribute + * @param valueSuffix end of the attribute value + * @return elements that have attributes that end with the value suffix + */ + public Elements getElementsByAttributeValueEnding(String key, String valueSuffix) { + return Collector.collect(new Evaluator.AttributeWithValueEnding(key, valueSuffix), this); + } + + /** + * Find elements that have attributes whose value contains the match string. Case insensitive. + * + * @param key name of the attribute + * @param match substring of value to search for + * @return elements that have attributes containing this text + */ + public Elements getElementsByAttributeValueContaining(String key, String match) { + return Collector.collect(new Evaluator.AttributeWithValueContaining(key, match), this); + } + + /** + * Find elements that have attributes whose values match the supplied regular expression. 
+ * @param key name of the attribute + * @param pattern compiled regular expression to match against attribute values + * @return elements that have attributes matching this regular expression + */ + public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) { + return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this); + + } + + /** + * Find elements that have attributes whose values match the supplied regular expression. + * @param key name of the attribute + * @param regex regular expression to match against attribute values. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options. + * @return elements that have attributes matching this regular expression + */ + public Elements getElementsByAttributeValueMatching(String key, String regex) { + Pattern pattern; + try { + pattern = Pattern.compile(regex); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException("Pattern syntax error: " + regex, e); + } + return getElementsByAttributeValueMatching(key, pattern); + } + + /** + * Find elements whose sibling index is less than the supplied index. + * @param index 0-based index + * @return elements less than index + */ + public Elements getElementsByIndexLessThan(int index) { + return Collector.collect(new Evaluator.IndexLessThan(index), this); + } + + /** + * Find elements whose sibling index is greater than the supplied index. + * @param index 0-based index + * @return elements greater than index + */ + public Elements getElementsByIndexGreaterThan(int index) { + return Collector.collect(new Evaluator.IndexGreaterThan(index), this); + } + + /** + * Find elements whose sibling index is equal to the supplied index. 
+ * @param index 0-based index + * @return elements equal to index + */ + public Elements getElementsByIndexEquals(int index) { + return Collector.collect(new Evaluator.IndexEquals(index), this); + } + + /** + * Find elements that contain the specified string. The search is case insensitive. The text may appear directly + * in the element, or in any of its descendants. + * @param searchText to look for in the element's text + * @return elements that contain the string, case insensitive. + * @see Element#text() + */ + public Elements getElementsContainingText(String searchText) { + return Collector.collect(new Evaluator.ContainsText(searchText), this); + } + + /** + * Find elements that directly contain the specified string. The search is case insensitive. The text must appear directly + * in the element, not in any of its descendants. + * @param searchText to look for in the element's own text + * @return elements that contain the string, case insensitive. + * @see Element#ownText() + */ + public Elements getElementsContainingOwnText(String searchText) { + return Collector.collect(new Evaluator.ContainsOwnText(searchText), this); + } + + /** + * Find elements whose text matches the supplied regular expression. + * @param pattern regular expression to match text against + * @return elements matching the supplied regular expression. + * @see Element#text() + */ + public Elements getElementsMatchingText(Pattern pattern) { + return Collector.collect(new Evaluator.Matches(pattern), this); + } + + /** + * Find elements whose text matches the supplied regular expression. + * @param regex regular expression to match text against. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options. + * @return elements matching the supplied regular expression. 
+ * @see Element#text() + */ + public Elements getElementsMatchingText(String regex) { + Pattern pattern; + try { + pattern = Pattern.compile(regex); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException("Pattern syntax error: " + regex, e); + } + return getElementsMatchingText(pattern); + } + + /** + * Find elements whose own text matches the supplied regular expression. + * @param pattern regular expression to match text against + * @return elements matching the supplied regular expression. + * @see Element#ownText() + */ + public Elements getElementsMatchingOwnText(Pattern pattern) { + return Collector.collect(new Evaluator.MatchesOwn(pattern), this); + } + + /** + * Find elements whose text matches the supplied regular expression. + * @param regex regular expression to match text against. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options. + * @return elements matching the supplied regular expression. + * @see Element#ownText() + */ + public Elements getElementsMatchingOwnText(String regex) { + Pattern pattern; + try { + pattern = Pattern.compile(regex); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException("Pattern syntax error: " + regex, e); + } + return getElementsMatchingOwnText(pattern); + } + + /** + * Find all elements under this element (including self, and children of children). + * + * @return all elements + */ + public Elements getAllElements() { + return Collector.collect(new Evaluator.AllElements(), this); + } + + /** + * Gets the combined text of this element and all its children. + * <p> + * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, {@code p.text()} returns {@code "Hello there now!"} + * + * @return unencoded text, or empty string if none. 
+ * @see #ownText() + * @see #textNodes() + */ + public String text() { + StringBuilder sb = new StringBuilder(); + text(sb); + return sb.toString().trim(); + } + + private void text(StringBuilder accum) { + appendWhitespaceIfBr(this, accum); + + for (Node child : childNodes) { + if (child instanceof TextNode) { + TextNode textNode = (TextNode) child; + appendNormalisedText(accum, textNode); + } else if (child instanceof Element) { + Element element = (Element) child; + if (accum.length() > 0 && element.isBlock() && !TextNode.lastCharIsWhitespace(accum)) + accum.append(" "); + element.text(accum); + } + } + } + + /** + * Gets the text owned by this element only; does not get the combined text of all children. + * <p> + * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, {@code p.ownText()} returns {@code "Hello now!"}, + * whereas {@code p.text()} returns {@code "Hello there now!"}. + * Note that the text within the {@code b} element is not returned, as it is not a direct child of the {@code p} element. + * + * @return unencoded text, or empty string if none. 
+ * @see #text() + * @see #textNodes() + */ + public String ownText() { + StringBuilder sb = new StringBuilder(); + ownText(sb); + return sb.toString().trim(); + } + + private void ownText(StringBuilder accum) { + for (Node child : childNodes) { + if (child instanceof TextNode) { + TextNode textNode = (TextNode) child; + appendNormalisedText(accum, textNode); + } else if (child instanceof Element) { + appendWhitespaceIfBr((Element) child, accum); + } + } + } + + private void appendNormalisedText(StringBuilder accum, TextNode textNode) { + String text = textNode.getWholeText(); + + if (!preserveWhitespace()) { + text = TextNode.normaliseWhitespace(text); + if (TextNode.lastCharIsWhitespace(accum)) + text = TextNode.stripLeadingWhitespace(text); + } + accum.append(text); + } + + private static void appendWhitespaceIfBr(Element element, StringBuilder accum) { + if (element.tag.getName().equals("br") && !TextNode.lastCharIsWhitespace(accum)) + accum.append(" "); + } + + boolean preserveWhitespace() { + return tag.preserveWhitespace() || parent() != null && parent().preserveWhitespace(); + } + + /** + * Set the text of this element. Any existing contents (text or elements) will be cleared + * @param text unencoded text + * @return this element + */ + public Element text(String text) { + Validate.notNull(text); + + empty(); + TextNode textNode = new TextNode(text, baseUri); + appendChild(textNode); + + return this; + } + + /** + Test if this element has any text content (that is not just whitespace). + @return true if element has non-blank text content. + */ + public boolean hasText() { + for (Node child: childNodes) { + if (child instanceof TextNode) { + TextNode textNode = (TextNode) child; + if (!textNode.isBlank()) + return true; + } else if (child instanceof Element) { + Element el = (Element) child; + if (el.hasText()) + return true; + } + } + return false; + } + + /** + * Get the combined data of this element. Data is e.g. the inside of a {@code script} tag. 
+ * @return the data, or empty string if none + * + * @see #dataNodes() + */ + public String data() { + StringBuilder sb = new StringBuilder(); + + for (Node childNode : childNodes) { + if (childNode instanceof DataNode) { + DataNode data = (DataNode) childNode; + sb.append(data.getWholeData()); + } else if (childNode instanceof Element) { + Element element = (Element) childNode; + String elementData = element.data(); + sb.append(elementData); + } + } + return sb.toString(); + } + + /** + * Gets the literal value of this element's "class" attribute, which may include multiple class names, space + * separated. (E.g. on <code><div class="header gray"></code> returns, "<code>header gray</code>") + * @return The literal class attribute, or <b>empty string</b> if no class attribute set. + */ + public String className() { + return attr("class"); + } + + /** + * Get all of the element's class names. E.g. on element {@code <div class="header gray"}>}, + * returns a set of two elements {@code "header", "gray"}. Note that modifications to this set are not pushed to + * the backing {@code class} attribute; use the {@link #classNames(java.util.Set)} method to persist them. + * @return set of classnames, empty if no class attribute + */ + public Set<String> classNames() { + if (classNames == null) { + String[] names = className().split("\\s+"); + classNames = new LinkedHashSet<String>(Arrays.asList(names)); + } + return classNames; + } + + /** + Set the element's {@code class} attribute to the supplied class names. + @param classNames set of classes + @return this element, for chaining + */ + public Element classNames(Set<String> classNames) { + Validate.notNull(classNames); + attributes.put("class", StringUtil.join(classNames, " ")); + return this; + } + + /** + * Tests if this element has a class. Case insensitive. 
+ * @param className name of class to check for + * @return true if it does, false if not + */ + public boolean hasClass(String className) { + Set<String> classNames = classNames(); + for (String name : classNames) { + if (className.equalsIgnoreCase(name)) + return true; + } + return false; + } + + /** + Add a class name to this element's {@code class} attribute. + @param className class name to add + @return this element + */ + public Element addClass(String className) { + Validate.notNull(className); + + Set<String> classes = classNames(); + classes.add(className); + classNames(classes); + + return this; + } + + /** + Remove a class name from this element's {@code class} attribute. + @param className class name to remove + @return this element + */ + public Element removeClass(String className) { + Validate.notNull(className); + + Set<String> classes = classNames(); + classes.remove(className); + classNames(classes); + + return this; + } + + /** + Toggle a class name on this element's {@code class} attribute: if present, remove it; otherwise add it. + @param className class name to toggle + @return this element + */ + public Element toggleClass(String className) { + Validate.notNull(className); + + Set<String> classes = classNames(); + if (classes.contains(className)) + classes.remove(className); + else + classes.add(className); + classNames(classes); + + return this; + } + + /** + * Get the value of a form element (input, textarea, etc). + * @return the value of the form element, or empty string if not set. + */ + public String val() { + if (tagName().equals("textarea")) + return text(); + else + return attr("value"); + } + + /** + * Set the value of a form element (input, textarea, etc). 
+ * @param value value to set + * @return this element (for chaining) + */ + public Element val(String value) { + if (tagName().equals("textarea")) + text(value); + else + attr("value", value); + return this; + } + + void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { + if (accum.length() > 0 && out.prettyPrint() && (tag.formatAsBlock() || (parent() != null && parent().tag().formatAsBlock()))) + indent(accum, depth, out); + accum + .append("<") + .append(tagName()); + attributes.html(accum, out); + + if (childNodes.isEmpty() && tag.isSelfClosing()) + accum.append(" />"); + else + accum.append(">"); + } + + void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) { + if (!(childNodes.isEmpty() && tag.isSelfClosing())) { + if (out.prettyPrint() && !childNodes.isEmpty() && tag.formatAsBlock()) + indent(accum, depth, out); + accum.append("</").append(tagName()).append(">"); + } + } + + /** + * Retrieves the element's inner HTML. E.g. on a {@code <div>} with one empty {@code <p>}, would return + * {@code <p></p>}. (Whereas {@link #outerHtml()} would return {@code <div><p></p></div>}.) + * + * @return String of HTML. + * @see #outerHtml() + */ + public String html() { + StringBuilder accum = new StringBuilder(); + html(accum); + return accum.toString().trim(); + } + + private void html(StringBuilder accum) { + for (Node node : childNodes) + node.outerHtml(accum); + } + + /** + * Set this element's inner HTML. Clears the existing HTML first. + * @param html HTML to parse and set into this element + * @return this element + * @see #append(String) + */ + public Element html(String html) { + empty(); + append(html); + return this; + } + + public String toString() { + return outerHtml(); + } + + @Override + public boolean equals(Object o) { + return this == o; + } + + @Override + public int hashCode() { + // todo: fixup, not very useful + int result = super.hashCode(); + result = 31 * result + (tag != null ? 
tag.hashCode() : 0); + return result; + } + + @Override + public Element clone() { + Element clone = (Element) super.clone(); + clone.classNames(); // creates linked set of class names from class attribute + return clone; + } +} diff --git a/src/org/jsoup/nodes/Entities.java b/src/org/jsoup/nodes/Entities.java new file mode 100644 index 0000000000..0ae83e1fc0 --- /dev/null +++ b/src/org/jsoup/nodes/Entities.java @@ -0,0 +1,184 @@ +package org.jsoup.nodes; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.CharsetEncoder; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * HTML entities, and escape routines. + * Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML + * named character references</a>. + */ +public class Entities { + public enum EscapeMode { + /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */ + xhtml(xhtmlByVal), + /** Default HTML output entities. */ + base(baseByVal), + /** Complete HTML entities. */ + extended(fullByVal); + + private Map<Character, String> map; + + EscapeMode(Map<Character, String> map) { + this.map = map; + } + + public Map<Character, String> getMap() { + return map; + } + } + + private static final Map<String, Character> full; + private static final Map<Character, String> xhtmlByVal; + private static final Map<Character, String> baseByVal; + private static final Map<Character, String> fullByVal; + private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); + private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); + + private Entities() {} + + /** + * Check if the input is a known named entity + * @param name the possible entity name (e.g. 
"lt" or "amp" + * @return true if a known named entity + */ + public static boolean isNamedEntity(String name) { + return full.containsKey(name); + } + + /** + * Get the Character value of the named entity + * @param name named entity (e.g. "lt" or "amp") + * @return the Character value of the named entity (e.g. '<' or '&') + */ + public static Character getCharacterByName(String name) { + return full.get(name); + } + + static String escape(String string, Document.OutputSettings out) { + return escape(string, out.encoder(), out.escapeMode()); + } + + static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) { + StringBuilder accum = new StringBuilder(string.length() * 2); + Map<Character, String> map = escapeMode.getMap(); + + for (int pos = 0; pos < string.length(); pos++) { + Character c = string.charAt(pos); + if (map.containsKey(c)) + accum.append('&').append(map.get(c)).append(';'); + else if (encoder.canEncode(c)) + accum.append(c.charValue()); + else + accum.append("&#").append((int) c).append(';'); + } + + return accum.toString(); + } + + static String unescape(String string) { + return unescape(string, false); + } + + /** + * Unescape the input string. + * @param string + * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) + * @return + */ + static String unescape(String string, boolean strict) { + // todo: change this method to use Tokeniser.consumeCharacterReference + if (!string.contains("&")) + return string; + + Matcher m = strict? strictUnescapePattern.matcher(string) : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? 
+ StringBuffer accum = new StringBuffer(string.length()); // pity matcher can't use stringbuilder, avoid syncs + // todo: replace m.appendReplacement with own impl, so StringBuilder and quoteReplacement not required + + while (m.find()) { + int charval = -1; + String num = m.group(3); + if (num != null) { + try { + int base = m.group(2) != null ? 16 : 10; // 2 is hex indicator + charval = Integer.valueOf(num, base); + } catch (NumberFormatException e) { + } // skip + } else { + String name = m.group(1); + if (full.containsKey(name)) + charval = full.get(name); + } + + if (charval != -1 || charval > 0xFFFF) { // out of range + String c = Character.toString((char) charval); + m.appendReplacement(accum, Matcher.quoteReplacement(c)); + } else { + m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace with original string + } + } + m.appendTail(accum); + return accum.toString(); + } + + // xhtml has restricted entities + private static final Object[][] xhtmlArray = { + {"quot", 0x00022}, + {"amp", 0x00026}, + {"apos", 0x00027}, + {"lt", 0x0003C}, + {"gt", 0x0003E} + }; + + static { + xhtmlByVal = new HashMap<Character, String>(); + baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most common / default + full = loadEntities("entities-full.properties"); // extended and overblown. 
+ fullByVal = toCharacterKey(full); + + for (Object[] entity : xhtmlArray) { + Character c = Character.valueOf((char) ((Integer) entity[1]).intValue()); + xhtmlByVal.put(c, ((String) entity[0])); + } + } + + private static Map<String, Character> loadEntities(String filename) { + Properties properties = new Properties(); + Map<String, Character> entities = new HashMap<String, Character>(); + try { + InputStream in = Entities.class.getResourceAsStream(filename); + properties.load(in); + in.close(); + } catch (IOException e) { + throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename); + } + + for (Map.Entry entry: properties.entrySet()) { + Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16)); + String name = (String) entry.getKey(); + entities.put(name, val); + } + return entities; + } + + private static Map<Character, String> toCharacterKey(Map<String, Character> inMap) { + Map<Character, String> outMap = new HashMap<Character, String>(); + for (Map.Entry<String, Character> entry: inMap.entrySet()) { + Character character = entry.getValue(); + String name = entry.getKey(); + + if (outMap.containsKey(character)) { + // dupe, prefer the lower case version + if (name.toLowerCase().equals(name)) + outMap.put(character, name); + } else { + outMap.put(character, name); + } + } + return outMap; + } +} diff --git a/src/org/jsoup/nodes/Node.java b/src/org/jsoup/nodes/Node.java new file mode 100644 index 0000000000..eb2b40ee73 --- /dev/null +++ b/src/org/jsoup/nodes/Node.java @@ -0,0 +1,615 @@ +package org.jsoup.nodes; + +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; +import org.jsoup.parser.Parser; +import org.jsoup.select.NodeTraversor; +import org.jsoup.select.NodeVisitor; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + The base, abstract Node model. 
Elements, Documents, Comments etc are all Node instances. + + @author Jonathan Hedley, jonathan@hedley.net */ +public abstract class Node implements Cloneable { + Node parentNode; + List<Node> childNodes; + Attributes attributes; + String baseUri; + int siblingIndex; + + /** + Create a new Node. + @param baseUri base URI + @param attributes attributes (not null, but may be empty) + */ + protected Node(String baseUri, Attributes attributes) { + Validate.notNull(baseUri); + Validate.notNull(attributes); + + childNodes = new ArrayList<Node>(4); + this.baseUri = baseUri.trim(); + this.attributes = attributes; + } + + protected Node(String baseUri) { + this(baseUri, new Attributes()); + } + + /** + * Default constructor. Doesn't setup base uri, children, or attributes; use with caution. + */ + protected Node() { + childNodes = Collections.emptyList(); + attributes = null; + } + + /** + Get the node name of this node. Use for debugging purposes and not logic switching (for that, use instanceof). + @return node name + */ + public abstract String nodeName(); + + /** + * Get an attribute's value by its key. + * <p/> + * To get an absolute URL from an attribute that may be a relative URL, prefix the key with <code><b>abs</b></code>, + * which is a shortcut to the {@link #absUrl} method. + * E.g.: <blockquote><code>String url = a.attr("abs:href");</code></blockquote> + * @param attributeKey The attribute key. + * @return The attribute, or empty string if not present (to avoid nulls). + * @see #attributes() + * @see #hasAttr(String) + * @see #absUrl(String) + */ + public String attr(String attributeKey) { + Validate.notNull(attributeKey); + + if (attributes.hasKey(attributeKey)) + return attributes.get(attributeKey); + else if (attributeKey.toLowerCase().startsWith("abs:")) + return absUrl(attributeKey.substring("abs:".length())); + else return ""; + } + + /** + * Get all of the element's attributes. 
+ * @return attributes (which implements iterable, in same order as presented in original HTML). + */ + public Attributes attributes() { + return attributes; + } + + /** + * Set an attribute (key=value). If the attribute already exists, it is replaced. + * @param attributeKey The attribute key. + * @param attributeValue The attribute value. + * @return this (for chaining) + */ + public Node attr(String attributeKey, String attributeValue) { + attributes.put(attributeKey, attributeValue); + return this; + } + + /** + * Test if this element has an attribute. + * @param attributeKey The attribute key to check. + * @return true if the attribute exists, false if not. + */ + public boolean hasAttr(String attributeKey) { + Validate.notNull(attributeKey); + + if (attributeKey.toLowerCase().startsWith("abs:")) { + String key = attributeKey.substring("abs:".length()); + if (attributes.hasKey(key) && !absUrl(key).equals("")) + return true; + } + return attributes.hasKey(attributeKey); + } + + /** + * Remove an attribute from this element. + * @param attributeKey The attribute to remove. + * @return this (for chaining) + */ + public Node removeAttr(String attributeKey) { + Validate.notNull(attributeKey); + attributes.remove(attributeKey); + return this; + } + + /** + Get the base URI of this node. + @return base URI + */ + public String baseUri() { + return baseUri; + } + + /** + Update the base URI of this node and all of its descendants. + @param baseUri base URI to set + */ + public void setBaseUri(final String baseUri) { + Validate.notNull(baseUri); + + traverse(new NodeVisitor() { + public void head(Node node, int depth) { + node.baseUri = baseUri; + } + + public void tail(Node node, int depth) { + } + }); + } + + /** + * Get an absolute URL from a URL attribute that may be relative (i.e. an <code><a href></code> or + * <code><img src></code>). 
+ * <p/> + * E.g.: <code>String absUrl = linkEl.absUrl("href");</code> + * <p/> + * If the attribute value is already absolute (i.e. it starts with a protocol, like + * <code>http://</code> or <code>https://</code> etc), and it successfully parses as a URL, the attribute is + * returned directly. Otherwise, it is treated as a URL relative to the element's {@link #baseUri}, and made + * absolute using that. + * <p/> + * As an alternate, you can use the {@link #attr} method with the <code>abs:</code> prefix, e.g.: + * <code>String absUrl = linkEl.attr("abs:href");</code> + * + * @param attributeKey The attribute key + * @return An absolute URL if one could be made, or an empty string (not null) if the attribute was missing or + * could not be made successfully into a URL. + * @see #attr + * @see java.net.URL#URL(java.net.URL, String) + */ + public String absUrl(String attributeKey) { + Validate.notEmpty(attributeKey); + + String relUrl = attr(attributeKey); + if (!hasAttr(attributeKey)) { + return ""; // nothing to make absolute with + } else { + URL base; + try { + try { + base = new URL(baseUri); + } catch (MalformedURLException e) { + // the base is unsuitable, but the attribute may be abs on its own, so try that + URL abs = new URL(relUrl); + return abs.toExternalForm(); + } + // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired + if (relUrl.startsWith("?")) + relUrl = base.getPath() + relUrl; + URL abs = new URL(base, relUrl); + return abs.toExternalForm(); + } catch (MalformedURLException e) { + return ""; + } + } + } + + /** + Get a child node by index + @param index index of child node + @return the child node at this index. + */ + public Node childNode(int index) { + return childNodes.get(index); + } + + /** + Get this node's children. Presented as an unmodifiable list: new children can not be added, but the child nodes + themselves can be manipulated. + @return list of children. 
If no children, returns an empty list. + */ + public List<Node> childNodes() { + return Collections.unmodifiableList(childNodes); + } + + protected Node[] childNodesAsArray() { + return childNodes.toArray(new Node[childNodes().size()]); + } + + /** + Gets this node's parent node. + @return parent node; or null if no parent. + */ + public Node parent() { + return parentNode; + } + + /** + * Gets the Document associated with this Node. + * @return the Document associated with this Node, or null if there is no such Document. + */ + public Document ownerDocument() { + if (this instanceof Document) + return (Document) this; + else if (parentNode == null) + return null; + else + return parentNode.ownerDocument(); + } + + /** + * Remove (delete) this node from the DOM tree. If this node has children, they are also removed. + */ + public void remove() { + Validate.notNull(parentNode); + parentNode.removeChild(this); + } + + /** + * Insert the specified HTML into the DOM before this node (i.e. as a preceding sibling). + * @param html HTML to add before this node + * @return this node, for chaining + * @see #after(String) + */ + public Node before(String html) { + addSiblingHtml(siblingIndex(), html); + return this; + } + + /** + * Insert the specified node into the DOM before this node (i.e. as a preceding sibling). + * @param node to add before this node + * @return this node, for chaining + * @see #after(Node) + */ + public Node before(Node node) { + Validate.notNull(node); + Validate.notNull(parentNode); + + parentNode.addChildren(siblingIndex(), node); + return this; + } + + /** + * Insert the specified HTML into the DOM after this node (i.e. as a following sibling). + * @param html HTML to add after this node + * @return this node, for chaining + * @see #before(String) + */ + public Node after(String html) { + addSiblingHtml(siblingIndex()+1, html); + return this; + } + + /** + * Insert the specified node into the DOM after this node (i.e. as a following sibling). 
+ * @param node to add after this node + * @return this node, for chaining + * @see #before(Node) + */ + public Node after(Node node) { + Validate.notNull(node); + Validate.notNull(parentNode); + + parentNode.addChildren(siblingIndex()+1, node); + return this; + } + + private void addSiblingHtml(int index, String html) { + Validate.notNull(html); + Validate.notNull(parentNode); + + Element context = parent() instanceof Element ? (Element) parent() : null; + List<Node> nodes = Parser.parseFragment(html, context, baseUri()); + parentNode.addChildren(index, nodes.toArray(new Node[nodes.size()])); + } + + /** + Wrap the supplied HTML around this node. + @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. + @return this node, for chaining. + */ + public Node wrap(String html) { + Validate.notEmpty(html); + + Element context = parent() instanceof Element ? (Element) parent() : null; + List<Node> wrapChildren = Parser.parseFragment(html, context, baseUri()); + Node wrapNode = wrapChildren.get(0); + if (wrapNode == null || !(wrapNode instanceof Element)) // nothing to wrap with; noop + return null; + + Element wrap = (Element) wrapNode; + Element deepest = getDeepChild(wrap); + parentNode.replaceChild(this, wrap); + deepest.addChildren(this); + + // remainder (unbalanced wrap, like <div></div><p></p> -- The <p> is remainder + if (wrapChildren.size() > 0) { + for (int i = 0; i < wrapChildren.size(); i++) { + Node remainder = wrapChildren.get(i); + remainder.parentNode.removeChild(remainder); + wrap.appendChild(remainder); + } + } + return this; + } + + /** + * Removes this node from the DOM, and moves its children up into the node's parent. This has the effect of dropping + * the node but keeping its children. 
+ * <p/> + * For example, with the input html:<br/> + * {@code <div>One <span>Two <b>Three</b></span></div>}<br/> + * Calling {@code element.unwrap()} on the {@code span} element will result in the html:<br/> + * {@code <div>One Two <b>Three</b></div>}<br/> + * and the {@code "Two "} {@link TextNode} being returned. + * @return the first child of this node, after the node has been unwrapped. Null if the node had no children. + * @see #remove() + * @see #wrap(String) + */ + public Node unwrap() { + Validate.notNull(parentNode); + + int index = siblingIndex; + Node firstChild = childNodes.size() > 0 ? childNodes.get(0) : null; + parentNode.addChildren(index, this.childNodesAsArray()); + this.remove(); + + return firstChild; + } + + private Element getDeepChild(Element el) { + List<Element> children = el.children(); + if (children.size() > 0) + return getDeepChild(children.get(0)); + else + return el; + } + + /** + * Replace this node in the DOM with the supplied node. + * @param in the node that will will replace the existing node. + */ + public void replaceWith(Node in) { + Validate.notNull(in); + Validate.notNull(parentNode); + parentNode.replaceChild(this, in); + } + + protected void setParentNode(Node parentNode) { + if (this.parentNode != null) + this.parentNode.removeChild(this); + this.parentNode = parentNode; + } + + protected void replaceChild(Node out, Node in) { + Validate.isTrue(out.parentNode == this); + Validate.notNull(in); + if (in.parentNode != null) + in.parentNode.removeChild(in); + + Integer index = out.siblingIndex(); + childNodes.set(index, in); + in.parentNode = this; + in.setSiblingIndex(index); + out.parentNode = null; + } + + protected void removeChild(Node out) { + Validate.isTrue(out.parentNode == this); + int index = out.siblingIndex(); + childNodes.remove(index); + reindexChildren(); + out.parentNode = null; + } + + protected void addChildren(Node... children) { + //most used. 
short circuit addChildren(int), which hits reindex children and array copy + for (Node child: children) { + reparentChild(child); + childNodes.add(child); + child.setSiblingIndex(childNodes.size()-1); + } + } + + protected void addChildren(int index, Node... children) { + Validate.noNullElements(children); + for (int i = children.length - 1; i >= 0; i--) { + Node in = children[i]; + reparentChild(in); + childNodes.add(index, in); + } + reindexChildren(); + } + + private void reparentChild(Node child) { + if (child.parentNode != null) + child.parentNode.removeChild(child); + child.setParentNode(this); + } + + private void reindexChildren() { + for (int i = 0; i < childNodes.size(); i++) { + childNodes.get(i).setSiblingIndex(i); + } + } + + /** + Retrieves this node's sibling nodes. Similar to {@link #childNodes() node.parent.childNodes()}, but does not + include this node (a node is not a sibling of itself). + @return node siblings. If the node has no parent, returns an empty list. + */ + public List<Node> siblingNodes() { + if (parentNode == null) + return Collections.emptyList(); + + List<Node> nodes = parentNode.childNodes; + List<Node> siblings = new ArrayList<Node>(nodes.size() - 1); + for (Node node: nodes) + if (node != this) + siblings.add(node); + return siblings; + } + + /** + Get this node's next sibling. + @return next sibling, or null if this is the last sibling + */ + public Node nextSibling() { + if (parentNode == null) + return null; // root + + List<Node> siblings = parentNode.childNodes; + Integer index = siblingIndex(); + Validate.notNull(index); + if (siblings.size() > index+1) + return siblings.get(index+1); + else + return null; + } + + /** + Get this node's previous sibling. 
+     @return the previous sibling, or null if this is the first sibling
+     */
+    public Node previousSibling() {
+        if (parentNode == null)
+            return null; // root
+
+        List<Node> siblings = parentNode.childNodes;
+        // a primitive index: there is nothing to box and nothing that could be null
+        int index = siblingIndex();
+        if (index > 0)
+            return siblings.get(index-1);
+        else
+            return null;
+    }
+
+    /**
+     * Get the list index of this node in its node sibling list. I.e. if this is the first node
+     * sibling, returns 0.
+     * @return position in node sibling list
+     * @see org.jsoup.nodes.Element#elementSiblingIndex()
+     */
+    public int siblingIndex() {
+        return siblingIndex;
+    }
+
+    /**
+     Record this node's position in its parent's child list. Only the stored index changes; the list is not touched.
+     @param siblingIndex the new index
+     */
+    protected void setSiblingIndex(int siblingIndex) {
+        this.siblingIndex = siblingIndex;
+    }
+
+    /**
+     * Perform a depth-first traversal through this node and its descendants.
+     * @param nodeVisitor the visitor callbacks to perform on each node
+     * @return this node, for chaining
+     */
+    public Node traverse(NodeVisitor nodeVisitor) {
+        Validate.notNull(nodeVisitor);
+        NodeTraversor traversor = new NodeTraversor(nodeVisitor);
+        traversor.traverse(this);
+        return this;
+    }
+
+    /**
+     Get the outer HTML of this node.
+     @return HTML
+     */
+    public String outerHtml() {
+        StringBuilder accum = new StringBuilder(128);
+        outerHtml(accum);
+        return accum.toString();
+    }
+
+    /**
+     Write the outer HTML of this node and its descendants into the supplied accumulator.
+     @param accum accumulator to place HTML into
+     */
+    protected void outerHtml(StringBuilder accum) {
+        new NodeTraversor(new OuterHtmlVisitor(accum, getOutputSettings())).traverse(this);
+    }
+
+    // if this node has no document (or parent), retrieve the default output settings
+    private Document.OutputSettings getOutputSettings() {
+        // look the owner up once; each lookup walks the parent chain
+        Document owner = ownerDocument();
+        return owner != null ? owner.outputSettings() : (new Document("")).outputSettings();
+    }
+
+    /**
+     Get the outer HTML of this node.
+ @param accum accumulator to place HTML into + */ + abstract void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out); + + abstract void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out); + + public String toString() { + return outerHtml(); + } + + protected void indent(StringBuilder accum, int depth, Document.OutputSettings out) { + accum.append("\n").append(StringUtil.padding(depth * out.indentAmount())); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + // todo: have nodes hold a child index, compare against that and parent (not children) + return false; + } + + @Override + public int hashCode() { + int result = parentNode != null ? parentNode.hashCode() : 0; + // not children, or will block stack as they go back up to parent) + result = 31 * result + (attributes != null ? attributes.hashCode() : 0); + return result; + } + + /** + * Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings or + * parent node. As a stand-alone object, any changes made to the clone or any of its children will not impact the + * original node. + * <p> + * The cloned node may be adopted into another Document or node structure using {@link Element#appendChild(Node)}. + * @return stand-alone cloned node + */ + @Override + public Node clone() { + return doClone(null); // splits for orphan + } + + protected Node doClone(Node parent) { + Node clone; + try { + clone = (Node) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + + clone.parentNode = parent; // can be null, to create an orphan split + clone.siblingIndex = parent == null ? 0 : siblingIndex; + clone.attributes = attributes != null ? 
attributes.clone() : null; + clone.baseUri = baseUri; + clone.childNodes = new ArrayList<Node>(childNodes.size()); + for (Node child: childNodes) + clone.childNodes.add(child.doClone(clone)); // clone() creates orphans, doClone() keeps parent + + return clone; + } + + private static class OuterHtmlVisitor implements NodeVisitor { + private StringBuilder accum; + private Document.OutputSettings out; + + OuterHtmlVisitor(StringBuilder accum, Document.OutputSettings out) { + this.accum = accum; + this.out = out; + } + + public void head(Node node, int depth) { + node.outerHtmlHead(accum, depth, out); + } + + public void tail(Node node, int depth) { + if (!node.nodeName().equals("#text")) // saves a void hit. + node.outerHtmlTail(accum, depth, out); + } + } +} diff --git a/src/org/jsoup/nodes/TextNode.java b/src/org/jsoup/nodes/TextNode.java new file mode 100644 index 0000000000..9fd0feac8f --- /dev/null +++ b/src/org/jsoup/nodes/TextNode.java @@ -0,0 +1,175 @@ +package org.jsoup.nodes; + +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; + +/** + A text node. + + @author Jonathan Hedley, jonathan@hedley.net */ +public class TextNode extends Node { + /* + TextNode is a node, and so by default comes with attributes and children. The attributes are seldom used, but use + memory, and the child nodes are never used. So we don't have them, and override accessors to attributes to create + them as needed on the fly. + */ + private static final String TEXT_KEY = "text"; + String text; + + /** + Create a new TextNode representing the supplied (unencoded) text). + + @param text raw text + @param baseUri base uri + @see #createFromEncoded(String, String) + */ + public TextNode(String text, String baseUri) { + this.baseUri = baseUri; + this.text = text; + } + + public String nodeName() { + return "#text"; + } + + /** + * Get the text content of this text node. + * @return Unencoded, normalised text. 
+ * @see TextNode#getWholeText() + */ + public String text() { + return normaliseWhitespace(getWholeText()); + } + + /** + * Set the text content of this text node. + * @param text unencoded text + * @return this, for chaining + */ + public TextNode text(String text) { + this.text = text; + if (attributes != null) + attributes.put(TEXT_KEY, text); + return this; + } + + /** + Get the (unencoded) text of this text node, including any newlines and spaces present in the original. + @return text + */ + public String getWholeText() { + return attributes == null ? text : attributes.get(TEXT_KEY); + } + + /** + Test if this text node is blank -- that is, empty or only whitespace (including newlines). + @return true if this document is empty or only whitespace, false if it contains any text content. + */ + public boolean isBlank() { + return StringUtil.isBlank(getWholeText()); + } + + /** + * Split this text node into two nodes at the specified string offset. After splitting, this node will contain the + * original text up to the offset, and will have a new text node sibling containing the text after the offset. + * @param offset string offset point to split node at. + * @return the newly created text node containing the text after the offset. 
+ */ + public TextNode splitText(int offset) { + Validate.isTrue(offset >= 0, "Split offset must be not be negative"); + Validate.isTrue(offset < text.length(), "Split offset must not be greater than current text length"); + + String head = getWholeText().substring(0, offset); + String tail = getWholeText().substring(offset); + text(head); + TextNode tailNode = new TextNode(tail, this.baseUri()); + if (parent() != null) + parent().addChildren(siblingIndex()+1, tailNode); + + return tailNode; + } + + void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { + String html = Entities.escape(getWholeText(), out); + if (out.prettyPrint() && parent() instanceof Element && !((Element) parent()).preserveWhitespace()) { + html = normaliseWhitespace(html); + } + + if (out.prettyPrint() && siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) + indent(accum, depth, out); + accum.append(html); + } + + void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} + + public String toString() { + return outerHtml(); + } + + /** + * Create a new TextNode from HTML encoded (aka escaped) data. + * @param encodedText Text containing encoded HTML (e.g. &lt;) + * @return TextNode containing unencoded data (e.g. <) + */ + public static TextNode createFromEncoded(String encodedText, String baseUri) { + String text = Entities.unescape(encodedText); + return new TextNode(text, baseUri); + } + + static String normaliseWhitespace(String text) { + text = StringUtil.normaliseWhitespace(text); + return text; + } + + static String stripLeadingWhitespace(String text) { + return text.replaceFirst("^\\s+", ""); + } + + static boolean lastCharIsWhitespace(StringBuilder sb) { + return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; + } + + // attribute fiddling. create on first access. 
+ private void ensureAttributes() { + if (attributes == null) { + attributes = new Attributes(); + attributes.put(TEXT_KEY, text); + } + } + + @Override + public String attr(String attributeKey) { + ensureAttributes(); + return super.attr(attributeKey); + } + + @Override + public Attributes attributes() { + ensureAttributes(); + return super.attributes(); + } + + @Override + public Node attr(String attributeKey, String attributeValue) { + ensureAttributes(); + return super.attr(attributeKey, attributeValue); + } + + @Override + public boolean hasAttr(String attributeKey) { + ensureAttributes(); + return super.hasAttr(attributeKey); + } + + @Override + public Node removeAttr(String attributeKey) { + ensureAttributes(); + return super.removeAttr(attributeKey); + } + + @Override + public String absUrl(String attributeKey) { + ensureAttributes(); + return super.absUrl(attributeKey); + } +} diff --git a/src/org/jsoup/nodes/XmlDeclaration.java b/src/org/jsoup/nodes/XmlDeclaration.java new file mode 100644 index 0000000000..80d4a0152f --- /dev/null +++ b/src/org/jsoup/nodes/XmlDeclaration.java @@ -0,0 +1,48 @@ +package org.jsoup.nodes; + +/** + An XML Declaration. + + @author Jonathan Hedley, jonathan@hedley.net */ +public class XmlDeclaration extends Node { + private static final String DECL_KEY = "declaration"; + private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?) + + /** + Create a new XML declaration + @param data data + @param baseUri base uri + @param isProcessingInstruction is processing instruction + */ + public XmlDeclaration(String data, String baseUri, boolean isProcessingInstruction) { + super(baseUri); + attributes.put(DECL_KEY, data); + this.isProcessingInstruction = isProcessingInstruction; + } + + public String nodeName() { + return "#declaration"; + } + + /** + Get the unencoded XML declaration. 
+ @return XML declaration + */ + public String getWholeDeclaration() { + return attributes.get(DECL_KEY); + } + + void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { + accum + .append("<") + .append(isProcessingInstruction ? "!" : "?") + .append(getWholeDeclaration()) + .append(">"); + } + + void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} + + public String toString() { + return outerHtml(); + } +} diff --git a/src/org/jsoup/nodes/entities-base.properties b/src/org/jsoup/nodes/entities-base.properties new file mode 100644 index 0000000000..3d1d11e6c4 --- /dev/null +++ b/src/org/jsoup/nodes/entities-base.properties @@ -0,0 +1,106 @@ +AElig=000C6 +AMP=00026 +Aacute=000C1 +Acirc=000C2 +Agrave=000C0 +Aring=000C5 +Atilde=000C3 +Auml=000C4 +COPY=000A9 +Ccedil=000C7 +ETH=000D0 +Eacute=000C9 +Ecirc=000CA +Egrave=000C8 +Euml=000CB +GT=0003E +Iacute=000CD +Icirc=000CE +Igrave=000CC +Iuml=000CF +LT=0003C +Ntilde=000D1 +Oacute=000D3 +Ocirc=000D4 +Ograve=000D2 +Oslash=000D8 +Otilde=000D5 +Ouml=000D6 +QUOT=00022 +REG=000AE +THORN=000DE +Uacute=000DA +Ucirc=000DB +Ugrave=000D9 +Uuml=000DC +Yacute=000DD +aacute=000E1 +acirc=000E2 +acute=000B4 +aelig=000E6 +agrave=000E0 +amp=00026 +aring=000E5 +atilde=000E3 +auml=000E4 +brvbar=000A6 +ccedil=000E7 +cedil=000B8 +cent=000A2 +copy=000A9 +curren=000A4 +deg=000B0 +divide=000F7 +eacute=000E9 +ecirc=000EA +egrave=000E8 +eth=000F0 +euml=000EB +frac12=000BD +frac14=000BC +frac34=000BE +gt=0003E +iacute=000ED +icirc=000EE +iexcl=000A1 +igrave=000EC +iquest=000BF +iuml=000EF +laquo=000AB +lt=0003C +macr=000AF +micro=000B5 +middot=000B7 +nbsp=000A0 +not=000AC +ntilde=000F1 +oacute=000F3 +ocirc=000F4 +ograve=000F2 +ordf=000AA +ordm=000BA +oslash=000F8 +otilde=000F5 +ouml=000F6 +para=000B6 +plusmn=000B1 +pound=000A3 +quot=00022 +raquo=000BB +reg=000AE +sect=000A7 +shy=000AD +sup1=000B9 +sup2=000B2 +sup3=000B3 +szlig=000DF +thorn=000FE +times=000D7 +uacute=000FA +ucirc=000FB 
+ugrave=000F9 +uml=000A8 +uuml=000FC +yacute=000FD +yen=000A5 +yuml=000FF diff --git a/src/org/jsoup/nodes/entities-full.properties b/src/org/jsoup/nodes/entities-full.properties new file mode 100644 index 0000000000..92f124f408 --- /dev/null +++ b/src/org/jsoup/nodes/entities-full.properties @@ -0,0 +1,2032 @@ +AElig=000C6 +AMP=00026 +Aacute=000C1 +Abreve=00102 +Acirc=000C2 +Acy=00410 +Afr=1D504 +Agrave=000C0 +Alpha=00391 +Amacr=00100 +And=02A53 +Aogon=00104 +Aopf=1D538 +ApplyFunction=02061 +Aring=000C5 +Ascr=1D49C +Assign=02254 +Atilde=000C3 +Auml=000C4 +Backslash=02216 +Barv=02AE7 +Barwed=02306 +Bcy=00411 +Because=02235 +Bernoullis=0212C +Beta=00392 +Bfr=1D505 +Bopf=1D539 +Breve=002D8 +Bscr=0212C +Bumpeq=0224E +CHcy=00427 +COPY=000A9 +Cacute=00106 +Cap=022D2 +CapitalDifferentialD=02145 +Cayleys=0212D +Ccaron=0010C +Ccedil=000C7 +Ccirc=00108 +Cconint=02230 +Cdot=0010A +Cedilla=000B8 +CenterDot=000B7 +Cfr=0212D +Chi=003A7 +CircleDot=02299 +CircleMinus=02296 +CirclePlus=02295 +CircleTimes=02297 +ClockwiseContourIntegral=02232 +CloseCurlyDoubleQuote=0201D +CloseCurlyQuote=02019 +Colon=02237 +Colone=02A74 +Congruent=02261 +Conint=0222F +ContourIntegral=0222E +Copf=02102 +Coproduct=02210 +CounterClockwiseContourIntegral=02233 +Cross=02A2F +Cscr=1D49E +Cup=022D3 +CupCap=0224D +DD=02145 +DDotrahd=02911 +DJcy=00402 +DScy=00405 +DZcy=0040F +Dagger=02021 +Darr=021A1 +Dashv=02AE4 +Dcaron=0010E +Dcy=00414 +Del=02207 +Delta=00394 +Dfr=1D507 +DiacriticalAcute=000B4 +DiacriticalDot=002D9 +DiacriticalDoubleAcute=002DD +DiacriticalGrave=00060 +DiacriticalTilde=002DC +Diamond=022C4 +DifferentialD=02146 +Dopf=1D53B +Dot=000A8 +DotDot=020DC +DotEqual=02250 +DoubleContourIntegral=0222F +DoubleDot=000A8 +DoubleDownArrow=021D3 +DoubleLeftArrow=021D0 +DoubleLeftRightArrow=021D4 +DoubleLeftTee=02AE4 +DoubleLongLeftArrow=027F8 +DoubleLongLeftRightArrow=027FA +DoubleLongRightArrow=027F9 +DoubleRightArrow=021D2 +DoubleRightTee=022A8 +DoubleUpArrow=021D1 +DoubleUpDownArrow=021D5 
+DoubleVerticalBar=02225 +DownArrow=02193 +DownArrowBar=02913 +DownArrowUpArrow=021F5 +DownBreve=00311 +DownLeftRightVector=02950 +DownLeftTeeVector=0295E +DownLeftVector=021BD +DownLeftVectorBar=02956 +DownRightTeeVector=0295F +DownRightVector=021C1 +DownRightVectorBar=02957 +DownTee=022A4 +DownTeeArrow=021A7 +Downarrow=021D3 +Dscr=1D49F +Dstrok=00110 +ENG=0014A +ETH=000D0 +Eacute=000C9 +Ecaron=0011A +Ecirc=000CA +Ecy=0042D +Edot=00116 +Efr=1D508 +Egrave=000C8 +Element=02208 +Emacr=00112 +EmptySmallSquare=025FB +EmptyVerySmallSquare=025AB +Eogon=00118 +Eopf=1D53C +Epsilon=00395 +Equal=02A75 +EqualTilde=02242 +Equilibrium=021CC +Escr=02130 +Esim=02A73 +Eta=00397 +Euml=000CB +Exists=02203 +ExponentialE=02147 +Fcy=00424 +Ffr=1D509 +FilledSmallSquare=025FC +FilledVerySmallSquare=025AA +Fopf=1D53D +ForAll=02200 +Fouriertrf=02131 +Fscr=02131 +GJcy=00403 +GT=0003E +Gamma=00393 +Gammad=003DC +Gbreve=0011E +Gcedil=00122 +Gcirc=0011C +Gcy=00413 +Gdot=00120 +Gfr=1D50A +Gg=022D9 +Gopf=1D53E +GreaterEqual=02265 +GreaterEqualLess=022DB +GreaterFullEqual=02267 +GreaterGreater=02AA2 +GreaterLess=02277 +GreaterSlantEqual=02A7E +GreaterTilde=02273 +Gscr=1D4A2 +Gt=0226B +HARDcy=0042A +Hacek=002C7 +Hat=0005E +Hcirc=00124 +Hfr=0210C +HilbertSpace=0210B +Hopf=0210D +HorizontalLine=02500 +Hscr=0210B +Hstrok=00126 +HumpDownHump=0224E +HumpEqual=0224F +IEcy=00415 +IJlig=00132 +IOcy=00401 +Iacute=000CD +Icirc=000CE +Icy=00418 +Idot=00130 +Ifr=02111 +Igrave=000CC +Im=02111 +Imacr=0012A +ImaginaryI=02148 +Implies=021D2 +Int=0222C +Integral=0222B +Intersection=022C2 +InvisibleComma=02063 +InvisibleTimes=02062 +Iogon=0012E +Iopf=1D540 +Iota=00399 +Iscr=02110 +Itilde=00128 +Iukcy=00406 +Iuml=000CF +Jcirc=00134 +Jcy=00419 +Jfr=1D50D +Jopf=1D541 +Jscr=1D4A5 +Jsercy=00408 +Jukcy=00404 +KHcy=00425 +KJcy=0040C +Kappa=0039A +Kcedil=00136 +Kcy=0041A +Kfr=1D50E +Kopf=1D542 +Kscr=1D4A6 +LJcy=00409 +LT=0003C +Lacute=00139 +Lambda=0039B +Lang=027EA +Laplacetrf=02112 +Larr=0219E +Lcaron=0013D +Lcedil=0013B 
+Lcy=0041B +LeftAngleBracket=027E8 +LeftArrow=02190 +LeftArrowBar=021E4 +LeftArrowRightArrow=021C6 +LeftCeiling=02308 +LeftDoubleBracket=027E6 +LeftDownTeeVector=02961 +LeftDownVector=021C3 +LeftDownVectorBar=02959 +LeftFloor=0230A +LeftRightArrow=02194 +LeftRightVector=0294E +LeftTee=022A3 +LeftTeeArrow=021A4 +LeftTeeVector=0295A +LeftTriangle=022B2 +LeftTriangleBar=029CF +LeftTriangleEqual=022B4 +LeftUpDownVector=02951 +LeftUpTeeVector=02960 +LeftUpVector=021BF +LeftUpVectorBar=02958 +LeftVector=021BC +LeftVectorBar=02952 +Leftarrow=021D0 +Leftrightarrow=021D4 +LessEqualGreater=022DA +LessFullEqual=02266 +LessGreater=02276 +LessLess=02AA1 +LessSlantEqual=02A7D +LessTilde=02272 +Lfr=1D50F +Ll=022D8 +Lleftarrow=021DA +Lmidot=0013F +LongLeftArrow=027F5 +LongLeftRightArrow=027F7 +LongRightArrow=027F6 +Longleftarrow=027F8 +Longleftrightarrow=027FA +Longrightarrow=027F9 +Lopf=1D543 +LowerLeftArrow=02199 +LowerRightArrow=02198 +Lscr=02112 +Lsh=021B0 +Lstrok=00141 +Lt=0226A +Map=02905 +Mcy=0041C +MediumSpace=0205F +Mellintrf=02133 +Mfr=1D510 +MinusPlus=02213 +Mopf=1D544 +Mscr=02133 +Mu=0039C +NJcy=0040A +Nacute=00143 +Ncaron=00147 +Ncedil=00145 +Ncy=0041D +NegativeMediumSpace=0200B +NegativeThickSpace=0200B +NegativeThinSpace=0200B +NegativeVeryThinSpace=0200B +NestedGreaterGreater=0226B +NestedLessLess=0226A +NewLine=0000A +Nfr=1D511 +NoBreak=02060 +NonBreakingSpace=000A0 +Nopf=02115 +Not=02AEC +NotCongruent=02262 +NotCupCap=0226D +NotDoubleVerticalBar=02226 +NotElement=02209 +NotEqual=02260 +NotExists=02204 +NotGreater=0226F +NotGreaterEqual=02271 +NotGreaterLess=02279 +NotGreaterTilde=02275 +NotLeftTriangle=022EA +NotLeftTriangleEqual=022EC +NotLess=0226E +NotLessEqual=02270 +NotLessGreater=02278 +NotLessTilde=02274 +NotPrecedes=02280 +NotPrecedesSlantEqual=022E0 +NotReverseElement=0220C +NotRightTriangle=022EB +NotRightTriangleEqual=022ED +NotSquareSubsetEqual=022E2 +NotSquareSupersetEqual=022E3 +NotSubsetEqual=02288 +NotSucceeds=02281 +NotSucceedsSlantEqual=022E1 
+NotSupersetEqual=02289 +NotTilde=02241 +NotTildeEqual=02244 +NotTildeFullEqual=02247 +NotTildeTilde=02249 +NotVerticalBar=02224 +Nscr=1D4A9 +Ntilde=000D1 +Nu=0039D +OElig=00152 +Oacute=000D3 +Ocirc=000D4 +Ocy=0041E +Odblac=00150 +Ofr=1D512 +Ograve=000D2 +Omacr=0014C +Omega=003A9 +Omicron=0039F +Oopf=1D546 +OpenCurlyDoubleQuote=0201C +OpenCurlyQuote=02018 +Or=02A54 +Oscr=1D4AA +Oslash=000D8 +Otilde=000D5 +Otimes=02A37 +Ouml=000D6 +OverBar=0203E +OverBrace=023DE +OverBracket=023B4 +OverParenthesis=023DC +PartialD=02202 +Pcy=0041F +Pfr=1D513 +Phi=003A6 +Pi=003A0 +PlusMinus=000B1 +Poincareplane=0210C +Popf=02119 +Pr=02ABB +Precedes=0227A +PrecedesEqual=02AAF +PrecedesSlantEqual=0227C +PrecedesTilde=0227E +Prime=02033 +Product=0220F +Proportion=02237 +Proportional=0221D +Pscr=1D4AB +Psi=003A8 +QUOT=00022 +Qfr=1D514 +Qopf=0211A +Qscr=1D4AC +RBarr=02910 +REG=000AE +Racute=00154 +Rang=027EB +Rarr=021A0 +Rarrtl=02916 +Rcaron=00158 +Rcedil=00156 +Rcy=00420 +Re=0211C +ReverseElement=0220B +ReverseEquilibrium=021CB +ReverseUpEquilibrium=0296F +Rfr=0211C +Rho=003A1 +RightAngleBracket=027E9 +RightArrow=02192 +RightArrowBar=021E5 +RightArrowLeftArrow=021C4 +RightCeiling=02309 +RightDoubleBracket=027E7 +RightDownTeeVector=0295D +RightDownVector=021C2 +RightDownVectorBar=02955 +RightFloor=0230B +RightTee=022A2 +RightTeeArrow=021A6 +RightTeeVector=0295B +RightTriangle=022B3 +RightTriangleBar=029D0 +RightTriangleEqual=022B5 +RightUpDownVector=0294F +RightUpTeeVector=0295C +RightUpVector=021BE +RightUpVectorBar=02954 +RightVector=021C0 +RightVectorBar=02953 +Rightarrow=021D2 +Ropf=0211D +RoundImplies=02970 +Rrightarrow=021DB +Rscr=0211B +Rsh=021B1 +RuleDelayed=029F4 +SHCHcy=00429 +SHcy=00428 +SOFTcy=0042C +Sacute=0015A +Sc=02ABC +Scaron=00160 +Scedil=0015E +Scirc=0015C +Scy=00421 +Sfr=1D516 +ShortDownArrow=02193 +ShortLeftArrow=02190 +ShortRightArrow=02192 +ShortUpArrow=02191 +Sigma=003A3 +SmallCircle=02218 +Sopf=1D54A +Sqrt=0221A +Square=025A1 +SquareIntersection=02293 
+SquareSubset=0228F +SquareSubsetEqual=02291 +SquareSuperset=02290 +SquareSupersetEqual=02292 +SquareUnion=02294 +Sscr=1D4AE +Star=022C6 +Sub=022D0 +Subset=022D0 +SubsetEqual=02286 +Succeeds=0227B +SucceedsEqual=02AB0 +SucceedsSlantEqual=0227D +SucceedsTilde=0227F +SuchThat=0220B +Sum=02211 +Sup=022D1 +Superset=02283 +SupersetEqual=02287 +Supset=022D1 +THORN=000DE +TRADE=02122 +TSHcy=0040B +TScy=00426 +Tab=00009 +Tau=003A4 +Tcaron=00164 +Tcedil=00162 +Tcy=00422 +Tfr=1D517 +Therefore=02234 +Theta=00398 +ThinSpace=02009 +Tilde=0223C +TildeEqual=02243 +TildeFullEqual=02245 +TildeTilde=02248 +Topf=1D54B +TripleDot=020DB +Tscr=1D4AF +Tstrok=00166 +Uacute=000DA +Uarr=0219F +Uarrocir=02949 +Ubrcy=0040E +Ubreve=0016C +Ucirc=000DB +Ucy=00423 +Udblac=00170 +Ufr=1D518 +Ugrave=000D9 +Umacr=0016A +UnderBar=0005F +UnderBrace=023DF +UnderBracket=023B5 +UnderParenthesis=023DD +Union=022C3 +UnionPlus=0228E +Uogon=00172 +Uopf=1D54C +UpArrow=02191 +UpArrowBar=02912 +UpArrowDownArrow=021C5 +UpDownArrow=02195 +UpEquilibrium=0296E +UpTee=022A5 +UpTeeArrow=021A5 +Uparrow=021D1 +Updownarrow=021D5 +UpperLeftArrow=02196 +UpperRightArrow=02197 +Upsi=003D2 +Upsilon=003A5 +Uring=0016E +Uscr=1D4B0 +Utilde=00168 +Uuml=000DC +VDash=022AB +Vbar=02AEB +Vcy=00412 +Vdash=022A9 +Vdashl=02AE6 +Vee=022C1 +Verbar=02016 +Vert=02016 +VerticalBar=02223 +VerticalLine=0007C +VerticalSeparator=02758 +VerticalTilde=02240 +VeryThinSpace=0200A +Vfr=1D519 +Vopf=1D54D +Vscr=1D4B1 +Vvdash=022AA +Wcirc=00174 +Wedge=022C0 +Wfr=1D51A +Wopf=1D54E +Wscr=1D4B2 +Xfr=1D51B +Xi=0039E +Xopf=1D54F +Xscr=1D4B3 +YAcy=0042F +YIcy=00407 +YUcy=0042E +Yacute=000DD +Ycirc=00176 +Ycy=0042B +Yfr=1D51C +Yopf=1D550 +Yscr=1D4B4 +Yuml=00178 +ZHcy=00416 +Zacute=00179 +Zcaron=0017D +Zcy=00417 +Zdot=0017B +ZeroWidthSpace=0200B +Zeta=00396 +Zfr=02128 +Zopf=02124 +Zscr=1D4B5 +aacute=000E1 +abreve=00103 +ac=0223E +acd=0223F +acirc=000E2 +acute=000B4 +acy=00430 +aelig=000E6 +af=02061 +afr=1D51E +agrave=000E0 +alefsym=02135 +aleph=02135 
+alpha=003B1 +amacr=00101 +amalg=02A3F +amp=00026 +and=02227 +andand=02A55 +andd=02A5C +andslope=02A58 +andv=02A5A +ang=02220 +ange=029A4 +angle=02220 +angmsd=02221 +angmsdaa=029A8 +angmsdab=029A9 +angmsdac=029AA +angmsdad=029AB +angmsdae=029AC +angmsdaf=029AD +angmsdag=029AE +angmsdah=029AF +angrt=0221F +angrtvb=022BE +angrtvbd=0299D +angsph=02222 +angst=000C5 +angzarr=0237C +aogon=00105 +aopf=1D552 +ap=02248 +apE=02A70 +apacir=02A6F +ape=0224A +apid=0224B +apos=00027 +approx=02248 +approxeq=0224A +aring=000E5 +ascr=1D4B6 +ast=0002A +asymp=02248 +asympeq=0224D +atilde=000E3 +auml=000E4 +awconint=02233 +awint=02A11 +bNot=02AED +backcong=0224C +backepsilon=003F6 +backprime=02035 +backsim=0223D +backsimeq=022CD +barvee=022BD +barwed=02305 +barwedge=02305 +bbrk=023B5 +bbrktbrk=023B6 +bcong=0224C +bcy=00431 +bdquo=0201E +becaus=02235 +because=02235 +bemptyv=029B0 +bepsi=003F6 +bernou=0212C +beta=003B2 +beth=02136 +between=0226C +bfr=1D51F +bigcap=022C2 +bigcirc=025EF +bigcup=022C3 +bigodot=02A00 +bigoplus=02A01 +bigotimes=02A02 +bigsqcup=02A06 +bigstar=02605 +bigtriangledown=025BD +bigtriangleup=025B3 +biguplus=02A04 +bigvee=022C1 +bigwedge=022C0 +bkarow=0290D +blacklozenge=029EB +blacksquare=025AA +blacktriangle=025B4 +blacktriangledown=025BE +blacktriangleleft=025C2 +blacktriangleright=025B8 +blank=02423 +blk12=02592 +blk14=02591 +blk34=02593 +block=02588 +bnot=02310 +bopf=1D553 +bot=022A5 +bottom=022A5 +bowtie=022C8 +boxDL=02557 +boxDR=02554 +boxDl=02556 +boxDr=02553 +boxH=02550 +boxHD=02566 +boxHU=02569 +boxHd=02564 +boxHu=02567 +boxUL=0255D +boxUR=0255A +boxUl=0255C +boxUr=02559 +boxV=02551 +boxVH=0256C +boxVL=02563 +boxVR=02560 +boxVh=0256B +boxVl=02562 +boxVr=0255F +boxbox=029C9 +boxdL=02555 +boxdR=02552 +boxdl=02510 +boxdr=0250C +boxh=02500 +boxhD=02565 +boxhU=02568 +boxhd=0252C +boxhu=02534 +boxminus=0229F +boxplus=0229E +boxtimes=022A0 +boxuL=0255B +boxuR=02558 +boxul=02518 +boxur=02514 +boxv=02502 +boxvH=0256A +boxvL=02561 +boxvR=0255E +boxvh=0253C 
+boxvl=02524 +boxvr=0251C +bprime=02035 +breve=002D8 +brvbar=000A6 +bscr=1D4B7 +bsemi=0204F +bsim=0223D +bsime=022CD +bsol=0005C +bsolb=029C5 +bsolhsub=027C8 +bull=02022 +bullet=02022 +bump=0224E +bumpE=02AAE +bumpe=0224F +bumpeq=0224F +cacute=00107 +cap=02229 +capand=02A44 +capbrcup=02A49 +capcap=02A4B +capcup=02A47 +capdot=02A40 +caret=02041 +caron=002C7 +ccaps=02A4D +ccaron=0010D +ccedil=000E7 +ccirc=00109 +ccups=02A4C +ccupssm=02A50 +cdot=0010B +cedil=000B8 +cemptyv=029B2 +cent=000A2 +centerdot=000B7 +cfr=1D520 +chcy=00447 +check=02713 +checkmark=02713 +chi=003C7 +cir=025CB +cirE=029C3 +circ=002C6 +circeq=02257 +circlearrowleft=021BA +circlearrowright=021BB +circledR=000AE +circledS=024C8 +circledast=0229B +circledcirc=0229A +circleddash=0229D +cire=02257 +cirfnint=02A10 +cirmid=02AEF +cirscir=029C2 +clubs=02663 +clubsuit=02663 +colon=0003A +colone=02254 +coloneq=02254 +comma=0002C +commat=00040 +comp=02201 +compfn=02218 +complement=02201 +complexes=02102 +cong=02245 +congdot=02A6D +conint=0222E +copf=1D554 +coprod=02210 +copy=000A9 +copysr=02117 +crarr=021B5 +cross=02717 +cscr=1D4B8 +csub=02ACF +csube=02AD1 +csup=02AD0 +csupe=02AD2 +ctdot=022EF +cudarrl=02938 +cudarrr=02935 +cuepr=022DE +cuesc=022DF +cularr=021B6 +cularrp=0293D +cup=0222A +cupbrcap=02A48 +cupcap=02A46 +cupcup=02A4A +cupdot=0228D +cupor=02A45 +curarr=021B7 +curarrm=0293C +curlyeqprec=022DE +curlyeqsucc=022DF +curlyvee=022CE +curlywedge=022CF +curren=000A4 +curvearrowleft=021B6 +curvearrowright=021B7 +cuvee=022CE +cuwed=022CF +cwconint=02232 +cwint=02231 +cylcty=0232D +dArr=021D3 +dHar=02965 +dagger=02020 +daleth=02138 +darr=02193 +dash=02010 +dashv=022A3 +dbkarow=0290F +dblac=002DD +dcaron=0010F +dcy=00434 +dd=02146 +ddagger=02021 +ddarr=021CA +ddotseq=02A77 +deg=000B0 +delta=003B4 +demptyv=029B1 +dfisht=0297F +dfr=1D521 +dharl=021C3 +dharr=021C2 +diam=022C4 +diamond=022C4 +diamondsuit=02666 +diams=02666 +die=000A8 +digamma=003DD +disin=022F2 +div=000F7 +divide=000F7 +divideontimes=022C7 
+divonx=022C7 +djcy=00452 +dlcorn=0231E +dlcrop=0230D +dollar=00024 +dopf=1D555 +dot=002D9 +doteq=02250 +doteqdot=02251 +dotminus=02238 +dotplus=02214 +dotsquare=022A1 +doublebarwedge=02306 +downarrow=02193 +downdownarrows=021CA +downharpoonleft=021C3 +downharpoonright=021C2 +drbkarow=02910 +drcorn=0231F +drcrop=0230C +dscr=1D4B9 +dscy=00455 +dsol=029F6 +dstrok=00111 +dtdot=022F1 +dtri=025BF +dtrif=025BE +duarr=021F5 +duhar=0296F +dwangle=029A6 +dzcy=0045F +dzigrarr=027FF +eDDot=02A77 +eDot=02251 +eacute=000E9 +easter=02A6E +ecaron=0011B +ecir=02256 +ecirc=000EA +ecolon=02255 +ecy=0044D +edot=00117 +ee=02147 +efDot=02252 +efr=1D522 +eg=02A9A +egrave=000E8 +egs=02A96 +egsdot=02A98 +el=02A99 +elinters=023E7 +ell=02113 +els=02A95 +elsdot=02A97 +emacr=00113 +empty=02205 +emptyset=02205 +emptyv=02205 +emsp13=02004 +emsp14=02005 +emsp=02003 +eng=0014B +ensp=02002 +eogon=00119 +eopf=1D556 +epar=022D5 +eparsl=029E3 +eplus=02A71 +epsi=003B5 +epsilon=003B5 +epsiv=003F5 +eqcirc=02256 +eqcolon=02255 +eqsim=02242 +eqslantgtr=02A96 +eqslantless=02A95 +equals=0003D +equest=0225F +equiv=02261 +equivDD=02A78 +eqvparsl=029E5 +erDot=02253 +erarr=02971 +escr=0212F +esdot=02250 +esim=02242 +eta=003B7 +eth=000F0 +euml=000EB +euro=020AC +excl=00021 +exist=02203 +expectation=02130 +exponentiale=02147 +fallingdotseq=02252 +fcy=00444 +female=02640 +ffilig=0FB03 +fflig=0FB00 +ffllig=0FB04 +ffr=1D523 +filig=0FB01 +flat=0266D +fllig=0FB02 +fltns=025B1 +fnof=00192 +fopf=1D557 +forall=02200 +fork=022D4 +forkv=02AD9 +fpartint=02A0D +frac12=000BD +frac13=02153 +frac14=000BC +frac15=02155 +frac16=02159 +frac18=0215B +frac23=02154 +frac25=02156 +frac34=000BE +frac35=02157 +frac38=0215C +frac45=02158 +frac56=0215A +frac58=0215D +frac78=0215E +frasl=02044 +frown=02322 +fscr=1D4BB +gE=02267 +gEl=02A8C +gacute=001F5 +gamma=003B3 +gammad=003DD +gap=02A86 +gbreve=0011F +gcirc=0011D +gcy=00433 +gdot=00121 +ge=02265 +gel=022DB +geq=02265 +geqq=02267 +geqslant=02A7E +ges=02A7E +gescc=02AA9 +gesdot=02A80 
+gesdoto=02A82 +gesdotol=02A84 +gesles=02A94 +gfr=1D524 +gg=0226B +ggg=022D9 +gimel=02137 +gjcy=00453 +gl=02277 +glE=02A92 +gla=02AA5 +glj=02AA4 +gnE=02269 +gnap=02A8A +gnapprox=02A8A +gne=02A88 +gneq=02A88 +gneqq=02269 +gnsim=022E7 +gopf=1D558 +grave=00060 +gscr=0210A +gsim=02273 +gsime=02A8E +gsiml=02A90 +gt=0003E +gtcc=02AA7 +gtcir=02A7A +gtdot=022D7 +gtlPar=02995 +gtquest=02A7C +gtrapprox=02A86 +gtrarr=02978 +gtrdot=022D7 +gtreqless=022DB +gtreqqless=02A8C +gtrless=02277 +gtrsim=02273 +hArr=021D4 +hairsp=0200A +half=000BD +hamilt=0210B +hardcy=0044A +harr=02194 +harrcir=02948 +harrw=021AD +hbar=0210F +hcirc=00125 +hearts=02665 +heartsuit=02665 +hellip=02026 +hercon=022B9 +hfr=1D525 +hksearow=02925 +hkswarow=02926 +hoarr=021FF +homtht=0223B +hookleftarrow=021A9 +hookrightarrow=021AA +hopf=1D559 +horbar=02015 +hscr=1D4BD +hslash=0210F +hstrok=00127 +hybull=02043 +hyphen=02010 +iacute=000ED +ic=02063 +icirc=000EE +icy=00438 +iecy=00435 +iexcl=000A1 +iff=021D4 +ifr=1D526 +igrave=000EC +ii=02148 +iiiint=02A0C +iiint=0222D +iinfin=029DC +iiota=02129 +ijlig=00133 +imacr=0012B +image=02111 +imagline=02110 +imagpart=02111 +imath=00131 +imof=022B7 +imped=001B5 +in=02208 +incare=02105 +infin=0221E +infintie=029DD +inodot=00131 +int=0222B +intcal=022BA +integers=02124 +intercal=022BA +intlarhk=02A17 +intprod=02A3C +iocy=00451 +iogon=0012F +iopf=1D55A +iota=003B9 +iprod=02A3C +iquest=000BF +iscr=1D4BE +isin=02208 +isinE=022F9 +isindot=022F5 +isins=022F4 +isinsv=022F3 +isinv=02208 +it=02062 +itilde=00129 +iukcy=00456 +iuml=000EF +jcirc=00135 +jcy=00439 +jfr=1D527 +jmath=00237 +jopf=1D55B +jscr=1D4BF +jsercy=00458 +jukcy=00454 +kappa=003BA +kappav=003F0 +kcedil=00137 +kcy=0043A +kfr=1D528 +kgreen=00138 +khcy=00445 +kjcy=0045C +kopf=1D55C +kscr=1D4C0 +lAarr=021DA +lArr=021D0 +lAtail=0291B +lBarr=0290E +lE=02266 +lEg=02A8B +lHar=02962 +lacute=0013A +laemptyv=029B4 +lagran=02112 +lambda=003BB +lang=027E8 +langd=02991 +langle=027E8 +lap=02A85 +laquo=000AB +larr=02190 +larrb=021E4 
+larrbfs=0291F +larrfs=0291D +larrhk=021A9 +larrlp=021AB +larrpl=02939 +larrsim=02973 +larrtl=021A2 +lat=02AAB +latail=02919 +late=02AAD +lbarr=0290C +lbbrk=02772 +lbrace=0007B +lbrack=0005B +lbrke=0298B +lbrksld=0298F +lbrkslu=0298D +lcaron=0013E +lcedil=0013C +lceil=02308 +lcub=0007B +lcy=0043B +ldca=02936 +ldquo=0201C +ldquor=0201E +ldrdhar=02967 +ldrushar=0294B +ldsh=021B2 +le=02264 +leftarrow=02190 +leftarrowtail=021A2 +leftharpoondown=021BD +leftharpoonup=021BC +leftleftarrows=021C7 +leftrightarrow=02194 +leftrightarrows=021C6 +leftrightharpoons=021CB +leftrightsquigarrow=021AD +leftthreetimes=022CB +leg=022DA +leq=02264 +leqq=02266 +leqslant=02A7D +les=02A7D +lescc=02AA8 +lesdot=02A7F +lesdoto=02A81 +lesdotor=02A83 +lesges=02A93 +lessapprox=02A85 +lessdot=022D6 +lesseqgtr=022DA +lesseqqgtr=02A8B +lessgtr=02276 +lesssim=02272 +lfisht=0297C +lfloor=0230A +lfr=1D529 +lg=02276 +lgE=02A91 +lhard=021BD +lharu=021BC +lharul=0296A +lhblk=02584 +ljcy=00459 +ll=0226A +llarr=021C7 +llcorner=0231E +llhard=0296B +lltri=025FA +lmidot=00140 +lmoust=023B0 +lmoustache=023B0 +lnE=02268 +lnap=02A89 +lnapprox=02A89 +lne=02A87 +lneq=02A87 +lneqq=02268 +lnsim=022E6 +loang=027EC +loarr=021FD +lobrk=027E6 +longleftarrow=027F5 +longleftrightarrow=027F7 +longmapsto=027FC +longrightarrow=027F6 +looparrowleft=021AB +looparrowright=021AC +lopar=02985 +lopf=1D55D +loplus=02A2D +lotimes=02A34 +lowast=02217 +lowbar=0005F +loz=025CA +lozenge=025CA +lozf=029EB +lpar=00028 +lparlt=02993 +lrarr=021C6 +lrcorner=0231F +lrhar=021CB +lrhard=0296D +lrm=0200E +lrtri=022BF +lsaquo=02039 +lscr=1D4C1 +lsh=021B0 +lsim=02272 +lsime=02A8D +lsimg=02A8F +lsqb=0005B +lsquo=02018 +lsquor=0201A +lstrok=00142 +lt=0003C +ltcc=02AA6 +ltcir=02A79 +ltdot=022D6 +lthree=022CB +ltimes=022C9 +ltlarr=02976 +ltquest=02A7B +ltrPar=02996 +ltri=025C3 +ltrie=022B4 +ltrif=025C2 +lurdshar=0294A +luruhar=02966 +mDDot=0223A +macr=000AF +male=02642 +malt=02720 +maltese=02720 +map=021A6 +mapsto=021A6 +mapstodown=021A7 
+mapstoleft=021A4 +mapstoup=021A5 +marker=025AE +mcomma=02A29 +mcy=0043C +mdash=02014 +measuredangle=02221 +mfr=1D52A +mho=02127 +micro=000B5 +mid=02223 +midast=0002A +midcir=02AF0 +middot=000B7 +minus=02212 +minusb=0229F +minusd=02238 +minusdu=02A2A +mlcp=02ADB +mldr=02026 +mnplus=02213 +models=022A7 +mopf=1D55E +mp=02213 +mscr=1D4C2 +mstpos=0223E +mu=003BC +multimap=022B8 +mumap=022B8 +nLeftarrow=021CD +nLeftrightarrow=021CE +nRightarrow=021CF +nVDash=022AF +nVdash=022AE +nabla=02207 +nacute=00144 +nap=02249 +napos=00149 +napprox=02249 +natur=0266E +natural=0266E +naturals=02115 +nbsp=000A0 +ncap=02A43 +ncaron=00148 +ncedil=00146 +ncong=02247 +ncup=02A42 +ncy=0043D +ndash=02013 +ne=02260 +neArr=021D7 +nearhk=02924 +nearr=02197 +nearrow=02197 +nequiv=02262 +nesear=02928 +nexist=02204 +nexists=02204 +nfr=1D52B +nge=02271 +ngeq=02271 +ngsim=02275 +ngt=0226F +ngtr=0226F +nhArr=021CE +nharr=021AE +nhpar=02AF2 +ni=0220B +nis=022FC +nisd=022FA +niv=0220B +njcy=0045A +nlArr=021CD +nlarr=0219A +nldr=02025 +nle=02270 +nleftarrow=0219A +nleftrightarrow=021AE +nleq=02270 +nless=0226E +nlsim=02274 +nlt=0226E +nltri=022EA +nltrie=022EC +nmid=02224 +nopf=1D55F +not=000AC +notin=02209 +notinva=02209 +notinvb=022F7 +notinvc=022F6 +notni=0220C +notniva=0220C +notnivb=022FE +notnivc=022FD +npar=02226 +nparallel=02226 +npolint=02A14 +npr=02280 +nprcue=022E0 +nprec=02280 +nrArr=021CF +nrarr=0219B +nrightarrow=0219B +nrtri=022EB +nrtrie=022ED +nsc=02281 +nsccue=022E1 +nscr=1D4C3 +nshortmid=02224 +nshortparallel=02226 +nsim=02241 +nsime=02244 +nsimeq=02244 +nsmid=02224 +nspar=02226 +nsqsube=022E2 +nsqsupe=022E3 +nsub=02284 +nsube=02288 +nsubseteq=02288 +nsucc=02281 +nsup=02285 +nsupe=02289 +nsupseteq=02289 +ntgl=02279 +ntilde=000F1 +ntlg=02278 +ntriangleleft=022EA +ntrianglelefteq=022EC +ntriangleright=022EB +ntrianglerighteq=022ED +nu=003BD +num=00023 +numero=02116 +numsp=02007 +nvDash=022AD +nvHarr=02904 +nvdash=022AC +nvinfin=029DE +nvlArr=02902 +nvrArr=02903 +nwArr=021D6 
+nwarhk=02923 +nwarr=02196 +nwarrow=02196 +nwnear=02927 +oS=024C8 +oacute=000F3 +oast=0229B +ocir=0229A +ocirc=000F4 +ocy=0043E +odash=0229D +odblac=00151 +odiv=02A38 +odot=02299 +odsold=029BC +oelig=00153 +ofcir=029BF +ofr=1D52C +ogon=002DB +ograve=000F2 +ogt=029C1 +ohbar=029B5 +ohm=003A9 +oint=0222E +olarr=021BA +olcir=029BE +olcross=029BB +oline=0203E +olt=029C0 +omacr=0014D +omega=003C9 +omicron=003BF +omid=029B6 +ominus=02296 +oopf=1D560 +opar=029B7 +operp=029B9 +oplus=02295 +or=02228 +orarr=021BB +ord=02A5D +order=02134 +orderof=02134 +ordf=000AA +ordm=000BA +origof=022B6 +oror=02A56 +orslope=02A57 +orv=02A5B +oscr=02134 +oslash=000F8 +osol=02298 +otilde=000F5 +otimes=02297 +otimesas=02A36 +ouml=000F6 +ovbar=0233D +par=02225 +para=000B6 +parallel=02225 +parsim=02AF3 +parsl=02AFD +part=02202 +pcy=0043F +percnt=00025 +period=0002E +permil=02030 +perp=022A5 +pertenk=02031 +pfr=1D52D +phi=003C6 +phiv=003D5 +phmmat=02133 +phone=0260E +pi=003C0 +pitchfork=022D4 +piv=003D6 +planck=0210F +planckh=0210E +plankv=0210F +plus=0002B +plusacir=02A23 +plusb=0229E +pluscir=02A22 +plusdo=02214 +plusdu=02A25 +pluse=02A72 +plusmn=000B1 +plussim=02A26 +plustwo=02A27 +pm=000B1 +pointint=02A15 +popf=1D561 +pound=000A3 +pr=0227A +prE=02AB3 +prap=02AB7 +prcue=0227C +pre=02AAF +prec=0227A +precapprox=02AB7 +preccurlyeq=0227C +preceq=02AAF +precnapprox=02AB9 +precneqq=02AB5 +precnsim=022E8 +precsim=0227E +prime=02032 +primes=02119 +prnE=02AB5 +prnap=02AB9 +prnsim=022E8 +prod=0220F +profalar=0232E +profline=02312 +profsurf=02313 +prop=0221D +propto=0221D +prsim=0227E +prurel=022B0 +pscr=1D4C5 +psi=003C8 +puncsp=02008 +qfr=1D52E +qint=02A0C +qopf=1D562 +qprime=02057 +qscr=1D4C6 +quaternions=0210D +quatint=02A16 +quest=0003F +questeq=0225F +quot=00022 +rAarr=021DB +rArr=021D2 +rAtail=0291C +rBarr=0290F +rHar=02964 +racute=00155 +radic=0221A +raemptyv=029B3 +rang=027E9 +rangd=02992 +range=029A5 +rangle=027E9 +raquo=000BB +rarr=02192 +rarrap=02975 +rarrb=021E5 +rarrbfs=02920 +rarrc=02933 
+rarrfs=0291E +rarrhk=021AA +rarrlp=021AC +rarrpl=02945 +rarrsim=02974 +rarrtl=021A3 +rarrw=0219D +ratail=0291A +ratio=02236 +rationals=0211A +rbarr=0290D +rbbrk=02773 +rbrace=0007D +rbrack=0005D +rbrke=0298C +rbrksld=0298E +rbrkslu=02990 +rcaron=00159 +rcedil=00157 +rceil=02309 +rcub=0007D +rcy=00440 +rdca=02937 +rdldhar=02969 +rdquo=0201D +rdquor=0201D +rdsh=021B3 +real=0211C +realine=0211B +realpart=0211C +reals=0211D +rect=025AD +reg=000AE +rfisht=0297D +rfloor=0230B +rfr=1D52F +rhard=021C1 +rharu=021C0 +rharul=0296C +rho=003C1 +rhov=003F1 +rightarrow=02192 +rightarrowtail=021A3 +rightharpoondown=021C1 +rightharpoonup=021C0 +rightleftarrows=021C4 +rightleftharpoons=021CC +rightrightarrows=021C9 +rightsquigarrow=0219D +rightthreetimes=022CC +ring=002DA +risingdotseq=02253 +rlarr=021C4 +rlhar=021CC +rlm=0200F +rmoust=023B1 +rmoustache=023B1 +rnmid=02AEE +roang=027ED +roarr=021FE +robrk=027E7 +ropar=02986 +ropf=1D563 +roplus=02A2E +rotimes=02A35 +rpar=00029 +rpargt=02994 +rppolint=02A12 +rrarr=021C9 +rsaquo=0203A +rscr=1D4C7 +rsh=021B1 +rsqb=0005D +rsquo=02019 +rsquor=02019 +rthree=022CC +rtimes=022CA +rtri=025B9 +rtrie=022B5 +rtrif=025B8 +rtriltri=029CE +ruluhar=02968 +rx=0211E +sacute=0015B +sbquo=0201A +sc=0227B +scE=02AB4 +scap=02AB8 +scaron=00161 +sccue=0227D +sce=02AB0 +scedil=0015F +scirc=0015D +scnE=02AB6 +scnap=02ABA +scnsim=022E9 +scpolint=02A13 +scsim=0227F +scy=00441 +sdot=022C5 +sdotb=022A1 +sdote=02A66 +seArr=021D8 +searhk=02925 +searr=02198 +searrow=02198 +sect=000A7 +semi=0003B +seswar=02929 +setminus=02216 +setmn=02216 +sext=02736 +sfr=1D530 +sfrown=02322 +sharp=0266F +shchcy=00449 +shcy=00448 +shortmid=02223 +shortparallel=02225 +shy=000AD +sigma=003C3 +sigmaf=003C2 +sigmav=003C2 +sim=0223C +simdot=02A6A +sime=02243 +simeq=02243 +simg=02A9E +simgE=02AA0 +siml=02A9D +simlE=02A9F +simne=02246 +simplus=02A24 +simrarr=02972 +slarr=02190 +smallsetminus=02216 +smashp=02A33 +smeparsl=029E4 +smid=02223 +smile=02323 +smt=02AAA +smte=02AAC +softcy=0044C 
+sol=0002F +solb=029C4 +solbar=0233F +sopf=1D564 +spades=02660 +spadesuit=02660 +spar=02225 +sqcap=02293 +sqcup=02294 +sqsub=0228F +sqsube=02291 +sqsubset=0228F +sqsubseteq=02291 +sqsup=02290 +sqsupe=02292 +sqsupset=02290 +sqsupseteq=02292 +squ=025A1 +square=025A1 +squarf=025AA +squf=025AA +srarr=02192 +sscr=1D4C8 +ssetmn=02216 +ssmile=02323 +sstarf=022C6 +star=02606 +starf=02605 +straightepsilon=003F5 +straightphi=003D5 +strns=000AF +sub=02282 +subE=02AC5 +subdot=02ABD +sube=02286 +subedot=02AC3 +submult=02AC1 +subnE=02ACB +subne=0228A +subplus=02ABF +subrarr=02979 +subset=02282 +subseteq=02286 +subseteqq=02AC5 +subsetneq=0228A +subsetneqq=02ACB +subsim=02AC7 +subsub=02AD5 +subsup=02AD3 +succ=0227B +succapprox=02AB8 +succcurlyeq=0227D +succeq=02AB0 +succnapprox=02ABA +succneqq=02AB6 +succnsim=022E9 +succsim=0227F +sum=02211 +sung=0266A +sup1=000B9 +sup2=000B2 +sup3=000B3 +sup=02283 +supE=02AC6 +supdot=02ABE +supdsub=02AD8 +supe=02287 +supedot=02AC4 +suphsol=027C9 +suphsub=02AD7 +suplarr=0297B +supmult=02AC2 +supnE=02ACC +supne=0228B +supplus=02AC0 +supset=02283 +supseteq=02287 +supseteqq=02AC6 +supsetneq=0228B +supsetneqq=02ACC +supsim=02AC8 +supsub=02AD4 +supsup=02AD6 +swArr=021D9 +swarhk=02926 +swarr=02199 +swarrow=02199 +swnwar=0292A +szlig=000DF +target=02316 +tau=003C4 +tbrk=023B4 +tcaron=00165 +tcedil=00163 +tcy=00442 +tdot=020DB +telrec=02315 +tfr=1D531 +there4=02234 +therefore=02234 +theta=003B8 +thetasym=003D1 +thetav=003D1 +thickapprox=02248 +thicksim=0223C +thinsp=02009 +thkap=02248 +thksim=0223C +thorn=000FE +tilde=002DC +times=000D7 +timesb=022A0 +timesbar=02A31 +timesd=02A30 +tint=0222D +toea=02928 +top=022A4 +topbot=02336 +topcir=02AF1 +topf=1D565 +topfork=02ADA +tosa=02929 +tprime=02034 +trade=02122 +triangle=025B5 +triangledown=025BF +triangleleft=025C3 +trianglelefteq=022B4 +triangleq=0225C +triangleright=025B9 +trianglerighteq=022B5 +tridot=025EC +trie=0225C +triminus=02A3A +triplus=02A39 +trisb=029CD +tritime=02A3B +trpezium=023E2 +tscr=1D4C9 
+tscy=00446 +tshcy=0045B +tstrok=00167 +twixt=0226C +twoheadleftarrow=0219E +twoheadrightarrow=021A0 +uArr=021D1 +uHar=02963 +uacute=000FA +uarr=02191 +ubrcy=0045E +ubreve=0016D +ucirc=000FB +ucy=00443 +udarr=021C5 +udblac=00171 +udhar=0296E +ufisht=0297E +ufr=1D532 +ugrave=000F9 +uharl=021BF +uharr=021BE +uhblk=02580 +ulcorn=0231C +ulcorner=0231C +ulcrop=0230F +ultri=025F8 +umacr=0016B +uml=000A8 +uogon=00173 +uopf=1D566 +uparrow=02191 +updownarrow=02195 +upharpoonleft=021BF +upharpoonright=021BE +uplus=0228E +upsi=003C5 +upsih=003D2 +upsilon=003C5 +upuparrows=021C8 +urcorn=0231D +urcorner=0231D +urcrop=0230E +uring=0016F +urtri=025F9 +uscr=1D4CA +utdot=022F0 +utilde=00169 +utri=025B5 +utrif=025B4 +uuarr=021C8 +uuml=000FC +uwangle=029A7 +vArr=021D5 +vBar=02AE8 +vBarv=02AE9 +vDash=022A8 +vangrt=0299C +varepsilon=003F5 +varkappa=003F0 +varnothing=02205 +varphi=003D5 +varpi=003D6 +varpropto=0221D +varr=02195 +varrho=003F1 +varsigma=003C2 +vartheta=003D1 +vartriangleleft=022B2 +vartriangleright=022B3 +vcy=00432 +vdash=022A2 +vee=02228 +veebar=022BB +veeeq=0225A +vellip=022EE +verbar=0007C +vert=0007C +vfr=1D533 +vltri=022B2 +vopf=1D567 +vprop=0221D +vrtri=022B3 +vscr=1D4CB +vzigzag=0299A +wcirc=00175 +wedbar=02A5F +wedge=02227 +wedgeq=02259 +weierp=02118 +wfr=1D534 +wopf=1D568 +wp=02118 +wr=02240 +wreath=02240 +wscr=1D4CC +xcap=022C2 +xcirc=025EF +xcup=022C3 +xdtri=025BD +xfr=1D535 +xhArr=027FA +xharr=027F7 +xi=003BE +xlArr=027F8 +xlarr=027F5 +xmap=027FC +xnis=022FB +xodot=02A00 +xopf=1D569 +xoplus=02A01 +xotime=02A02 +xrArr=027F9 +xrarr=027F6 +xscr=1D4CD +xsqcup=02A06 +xuplus=02A04 +xutri=025B3 +xvee=022C1 +xwedge=022C0 +yacute=000FD +yacy=0044F +ycirc=00177 +ycy=0044B +yen=000A5 +yfr=1D536 +yicy=00457 +yopf=1D56A +yscr=1D4CE +yucy=0044E +yuml=000FF +zacute=0017A +zcaron=0017E +zcy=00437 +zdot=0017C +zeetrf=02128 +zeta=003B6 +zfr=1D537 +zhcy=00436 +zigrarr=021DD +zopf=1D56B +zscr=1D4CF +zwj=0200D +zwnj=0200C diff --git a/src/org/jsoup/nodes/package-info.java 
b/src/org/jsoup/nodes/package-info.java new file mode 100644 index 0000000000..24b12803ff --- /dev/null +++ b/src/org/jsoup/nodes/package-info.java @@ -0,0 +1,4 @@ +/** + HTML document structure nodes. + */ +package org.jsoup.nodes;
\ No newline at end of file diff --git a/src/org/jsoup/package-info.java b/src/org/jsoup/package-info.java new file mode 100644 index 0000000000..49526116b4 --- /dev/null +++ b/src/org/jsoup/package-info.java @@ -0,0 +1,4 @@ +/** + Contains the main {@link org.jsoup.Jsoup} class, which provides convenient static access to the jsoup functionality. + */ +package org.jsoup;
\ No newline at end of file diff --git a/src/org/jsoup/parser/CharacterReader.java b/src/org/jsoup/parser/CharacterReader.java new file mode 100644 index 0000000000..b549a571a0 --- /dev/null +++ b/src/org/jsoup/parser/CharacterReader.java @@ -0,0 +1,230 @@
package org.jsoup.parser;

import org.jsoup.helper.Validate;

/**
 CharacterReader consumes tokens off a string. To replace the old TokenQueue.
 <p>
 On construction every {@code \r\n} pair and every lone {@code \r} in the input is rewritten to {@code \n}; all
 positions reported by this reader refer to that normalised text. The reader keeps a single cursor ({@code pos})
 and a single saved position ({@code mark}).
 */
class CharacterReader {
    /** Sentinel character handed out by {@link #current()} and {@link #consume()} when no input remains. */
    static final char EOF = (char) -1;

    private final String input;
    private final int length;
    private int pos = 0;
    private int mark = 0;

    /**
     Creates a reader positioned at the start of the given text.
     @param input text to read; checked with {@code Validate.notNull}
     */
    CharacterReader(String input) {
        Validate.notNull(input);
        input = input.replaceAll("\r\n?", "\n"); // normalise carriage returns to newlines

        this.input = input;
        this.length = input.length();
    }

    /** @return the current cursor position within the normalised input */
    int pos() {
        return pos;
    }

    /** @return true if the cursor is at or beyond the end of the input */
    boolean isEmpty() {
        return pos >= length;
    }

    /** @return the character under the cursor without advancing, or {@link #EOF} if none remains */
    char current() {
        return isEmpty() ? EOF : input.charAt(pos);
    }

    /**
     Reads the character under the cursor and advances. The cursor advances even at end of input, so a following
     {@link #unconsume()} always restores the previous position.
     @return the consumed character, or {@link #EOF} if none remains
     */
    char consume() {
        char val = isEmpty() ? EOF : input.charAt(pos);
        pos++;
        return val;
    }

    /** Steps the cursor back by one position. */
    void unconsume() {
        pos--;
    }

    /** Steps the cursor forward by one position without reading. */
    void advance() {
        pos++;
    }

    /** Remembers the current position so that {@link #rewindToMark()} can return to it. */
    void mark() {
        mark = pos;
    }

    /** Moves the cursor back to the position last saved by {@link #mark()}. */
    void rewindToMark() {
        pos = mark;
    }

    /**
     Reads the character under the cursor as a one-character string and advances.
     <p>
     The previous implementation, {@code input.substring(pos, pos++)}, evaluated both arguments to the same value
     (the increment is applied after the second argument is read) and therefore always returned the empty string.
     @return the consumed character as a string, or {@code ""} if no input remains (the cursor still advances, as in
     {@link #consume()})
     */
    String consumeAsString() {
        String consumed = isEmpty() ? "" : String.valueOf(input.charAt(pos));
        pos++;
        return consumed;
    }

    /**
     Reads up to, but not including, the next occurrence of {@code c}. The cursor is left on the match.
     @param c character to stop at
     @return the text before the match, or all remaining input if {@code c} does not occur
     */
    String consumeTo(char c) {
        int offset = input.indexOf(c, pos);
        if (offset == -1)
            return consumeToEnd();

        String consumed = input.substring(pos, offset);
        pos = offset;
        return consumed;
    }

    /**
     Reads up to, but not including, the next occurrence of {@code seq}. The cursor is left at the start of the match.
     @param seq sequence to stop at (case sensitive)
     @return the text before the match, or all remaining input if {@code seq} does not occur
     */
    String consumeTo(String seq) {
        int offset = input.indexOf(seq, pos);
        if (offset == -1)
            return consumeToEnd();

        String consumed = input.substring(pos, offset);
        pos = offset;
        return consumed;
    }

    /**
     Reads until the cursor sits on any of the given characters, or input runs out.
     @param seq characters that terminate the read
     @return the text consumed; empty if the cursor was already on one of the characters or at end of input
     */
    String consumeToAny(char... seq) {
        int start = pos;
        while (!isEmpty() && !isAnyOf(input.charAt(pos), seq))
            pos++;
        return pos > start ? input.substring(start, pos) : "";
    }

    /** @return everything from the cursor to the end of the input; the cursor is moved to the end */
    String consumeToEnd() {
        String data = input.substring(pos, input.length());
        pos = input.length();
        return data;
    }

    /** @return the run of ASCII letters ({@code A-Z}, {@code a-z}) starting at the cursor; may be empty */
    String consumeLetterSequence() {
        int start = pos;
        skipLetters();
        return input.substring(start, pos);
    }

    /** @return a run of ASCII letters immediately followed by a run of ASCII digits; either run may be empty */
    String consumeLetterThenDigitSequence() {
        int start = pos;
        skipLetters();
        skipDigits();
        return input.substring(start, pos);
    }

    /** @return the run of hexadecimal digits ({@code 0-9}, {@code A-F}, {@code a-f}) at the cursor; may be empty */
    String consumeHexSequence() {
        int start = pos;
        while (!isEmpty() && isHexDigit(input.charAt(pos)))
            pos++;
        return input.substring(start, pos);
    }

    /** @return the run of ASCII digits ({@code 0-9}) starting at the cursor; may be empty */
    String consumeDigitSequence() {
        int start = pos;
        skipDigits();
        return input.substring(start, pos);
    }

    /** @return true if input remains and the character under the cursor equals {@code c} */
    boolean matches(char c) {
        return !isEmpty() && input.charAt(pos) == c;
    }

    /** @return true if the input at the cursor starts with {@code seq} (case sensitive) */
    boolean matches(String seq) {
        return input.startsWith(seq, pos);
    }

    /** @return true if the input at the cursor starts with {@code seq}, ignoring case */
    boolean matchesIgnoreCase(String seq) {
        return input.regionMatches(true, pos, seq, 0, seq.length());
    }

    /** @return true if input remains and the character under the cursor is one of {@code seq} */
    boolean matchesAny(char... seq) {
        return !isEmpty() && isAnyOf(input.charAt(pos), seq);
    }

    /** @return true if input remains and the character under the cursor is an ASCII letter */
    boolean matchesLetter() {
        return !isEmpty() && isAsciiLetter(input.charAt(pos));
    }

    /** @return true if input remains and the character under the cursor is an ASCII digit */
    boolean matchesDigit() {
        return !isEmpty() && isAsciiDigit(input.charAt(pos));
    }

    /**
     Consumes {@code seq} if the input at the cursor starts with it (case sensitive).
     @return true if the sequence was present and has been consumed; false (cursor untouched) otherwise
     */
    boolean matchConsume(String seq) {
        if (!matches(seq))
            return false;
        pos += seq.length();
        return true;
    }

    /**
     Consumes {@code seq} if the input at the cursor starts with it, ignoring case.
     @return true if the sequence was present and has been consumed; false (cursor untouched) otherwise
     */
    boolean matchConsumeIgnoreCase(String seq) {
        if (!matchesIgnoreCase(seq))
            return false;
        pos += seq.length();
        return true;
    }

    /**
     Tests whether {@code seq} occurs anywhere at or after the cursor, in either all-lower or all-upper case.
     Used to check presence of &lt;/title&gt;, &lt;/style&gt;; only finds consistent case (mixed case is not found).
     <p>
     Case conversion uses a fixed English locale so that the result does not depend on the JVM default locale
     (e.g. a Turkish default would otherwise upper-case {@code i} to a dotted capital I and miss {@code </TITLE>}).
     */
    boolean containsIgnoreCase(String seq) {
        String loScan = seq.toLowerCase(java.util.Locale.ENGLISH);
        String hiScan = seq.toUpperCase(java.util.Locale.ENGLISH);
        return (input.indexOf(loScan, pos) > -1) || (input.indexOf(hiScan, pos) > -1);
    }

    /** @return the unconsumed remainder of the input; empty once the cursor has reached or passed the end */
    @Override
    public String toString() {
        // consume()/advance() may leave pos beyond length; substring would throw there
        return isEmpty() ? "" : input.substring(pos);
    }

    /** Advances the cursor past any ASCII letters. */
    private void skipLetters() {
        while (!isEmpty() && isAsciiLetter(input.charAt(pos)))
            pos++;
    }

    /** Advances the cursor past any ASCII digits. */
    private void skipDigits() {
        while (!isEmpty() && isAsciiDigit(input.charAt(pos)))
            pos++;
    }

    /** @return true if {@code c} is in {@code A-Z} or {@code a-z} */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    /** @return true if {@code c} is in {@code 0-9} */
    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /** @return true if {@code c} is in {@code 0-9}, {@code A-F} or {@code a-f} */
    private static boolean isHexDigit(char c) {
        return isAsciiDigit(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }

    /** @return true if {@code c} equals any element of {@code candidates} */
    private static boolean isAnyOf(char c, char[] candidates) {
        for (char candidate : candidates) {
            if (candidate == c)
                return true;
        }
        return false;
    }
}
diff --git a/src/org/jsoup/parser/HtmlTreeBuilder.java b/src/org/jsoup/parser/HtmlTreeBuilder.java new file mode 100644 index 0000000000..457a4c3249 --- /dev/null +++ b/src/org/jsoup/parser/HtmlTreeBuilder.java @@ -0,0 +1,672 @@ +package org.jsoup.parser; + +import org.jsoup.helper.DescendableLinkedList; +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.*; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * HTML Tree Builder; creates a DOM from Tokens. 
+ */ +class HtmlTreeBuilder extends TreeBuilder { + + private HtmlTreeBuilderState state; // the current state + private HtmlTreeBuilderState originalState; // original / marked state + + private boolean baseUriSetFromDoc = false; + private Element headElement; // the current head element + private Element formElement; // the current form element + private Element contextElement; // fragment parse context -- could be null even if fragment parsing + private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active (open) formatting elements + private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars in table to be shifted out + + private boolean framesetOk = true; // if ok to go into frameset + private boolean fosterInserts = false; // if next inserts should be fostered + private boolean fragmentParsing = false; // if parsing a fragment of html + + HtmlTreeBuilder() {} + + @Override + Document parse(String input, String baseUri, ParseErrorList errors) { + state = HtmlTreeBuilderState.Initial; + return super.parse(input, baseUri, errors); + } + + List<Node> parseFragment(String inputFragment, Element context, String baseUri, ParseErrorList errors) { + // context may be null + state = HtmlTreeBuilderState.Initial; + initialiseParse(inputFragment, baseUri, errors); + contextElement = context; + fragmentParsing = true; + Element root = null; + + if (context != null) { + if (context.ownerDocument() != null) // quirks setup: + doc.quirksMode(context.ownerDocument().quirksMode()); + + // initialise the tokeniser state: + String contextTag = context.tagName(); + if (StringUtil.in(contextTag, "title", "textarea")) + tokeniser.transition(TokeniserState.Rcdata); + else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp")) + tokeniser.transition(TokeniserState.Rawtext); + else if (contextTag.equals("script")) + tokeniser.transition(TokeniserState.ScriptData); + else if 
(contextTag.equals(("noscript"))) + tokeniser.transition(TokeniserState.Data); // if scripting enabled, rawtext + else if (contextTag.equals("plaintext")) + tokeniser.transition(TokeniserState.Data); + else + tokeniser.transition(TokeniserState.Data); // default + + root = new Element(Tag.valueOf("html"), baseUri); + doc.appendChild(root); + stack.push(root); + resetInsertionMode(); + // todo: setup form element to nearest form on context (up ancestor chain) + } + + runParser(); + if (context != null) + return root.childNodes(); + else + return doc.childNodes(); + } + + @Override + protected boolean process(Token token) { + currentToken = token; + return this.state.process(token, this); + } + + boolean process(Token token, HtmlTreeBuilderState state) { + currentToken = token; + return state.process(token, this); + } + + void transition(HtmlTreeBuilderState state) { + this.state = state; + } + + HtmlTreeBuilderState state() { + return state; + } + + void markInsertionMode() { + originalState = state; + } + + HtmlTreeBuilderState originalState() { + return originalState; + } + + void framesetOk(boolean framesetOk) { + this.framesetOk = framesetOk; + } + + boolean framesetOk() { + return framesetOk; + } + + Document getDocument() { + return doc; + } + + String getBaseUri() { + return baseUri; + } + + void maybeSetBaseUri(Element base) { + if (baseUriSetFromDoc) // only listen to the first <base href> in parse + return; + + String href = base.absUrl("href"); + if (href.length() != 0) { // ignore <base target> etc + baseUri = href; + baseUriSetFromDoc = true; + doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants + } + } + + boolean isFragmentParsing() { + return fragmentParsing; + } + + void error(HtmlTreeBuilderState state) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), "Unexpected token [%s] when in state [%s]", currentToken.tokenType(), state)); + } + + Element 
insert(Token.StartTag startTag) { + // handle empty unknown tags + // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate fake end tag. + if (startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name())) { + Element el = insertEmpty(startTag); + process(new Token.EndTag(el.tagName())); // ensure we get out of whatever state we are in + return el; + } + + Element el = new Element(Tag.valueOf(startTag.name()), baseUri, startTag.attributes); + insert(el); + return el; + } + + Element insert(String startTagName) { + Element el = new Element(Tag.valueOf(startTagName), baseUri); + insert(el); + return el; + } + + void insert(Element el) { + insertNode(el); + stack.add(el); + } + + Element insertEmpty(Token.StartTag startTag) { + Tag tag = Tag.valueOf(startTag.name()); + Element el = new Element(tag, baseUri, startTag.attributes); + insertNode(el); + if (startTag.isSelfClosing()) { + tokeniser.acknowledgeSelfClosingFlag(); + if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output + tag.setSelfClosing(); + } + return el; + } + + void insert(Token.Comment commentToken) { + Comment comment = new Comment(commentToken.getData(), baseUri); + insertNode(comment); + } + + void insert(Token.Character characterToken) { + Node node; + // characters in script and style go in as datanodes, not text nodes + if (StringUtil.in(currentElement().tagName(), "script", "style")) + node = new DataNode(characterToken.getData(), baseUri); + else + node = new TextNode(characterToken.getData(), baseUri); + currentElement().appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 
+ } + + private void insertNode(Node node) { + // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc + if (stack.size() == 0) + doc.appendChild(node); + else if (isFosterInserts()) + insertInFosterParent(node); + else + currentElement().appendChild(node); + } + + Element pop() { + // todo - dev, remove validation check + if (stack.peekLast().nodeName().equals("td") && !state.name().equals("InCell")) + Validate.isFalse(true, "pop td not in cell"); + if (stack.peekLast().nodeName().equals("html")) + Validate.isFalse(true, "popping html!"); + return stack.pollLast(); + } + + void push(Element element) { + stack.add(element); + } + + DescendableLinkedList<Element> getStack() { + return stack; + } + + boolean onStack(Element el) { + return isElementInQueue(stack, el); + } + + private boolean isElementInQueue(DescendableLinkedList<Element> queue, Element element) { + Iterator<Element> it = queue.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next == element) { + return true; + } + } + return false; + } + + Element getFromStack(String elName) { + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next.nodeName().equals(elName)) { + return next; + } + } + return null; + } + + boolean removeFromStack(Element el) { + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next == el) { + it.remove(); + return true; + } + } + return false; + } + + void popStackToClose(String elName) { + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next.nodeName().equals(elName)) { + it.remove(); + break; + } else { + it.remove(); + } + } + } + + void popStackToClose(String... 
elNames) { + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (StringUtil.in(next.nodeName(), elNames)) { + it.remove(); + break; + } else { + it.remove(); + } + } + } + + void popStackToBefore(String elName) { + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next.nodeName().equals(elName)) { + break; + } else { + it.remove(); + } + } + } + + void clearStackToTableContext() { + clearStackToContext("table"); + } + + void clearStackToTableBodyContext() { + clearStackToContext("tbody", "tfoot", "thead"); + } + + void clearStackToTableRowContext() { + clearStackToContext("tr"); + } + + private void clearStackToContext(String... nodeNames) { + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (StringUtil.in(next.nodeName(), nodeNames) || next.nodeName().equals("html")) + break; + else + it.remove(); + } + } + + Element aboveOnStack(Element el) { + assert onStack(el); + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next == el) { + return it.next(); + } + } + return null; + } + + void insertOnStackAfter(Element after, Element in) { + int i = stack.lastIndexOf(after); + Validate.isTrue(i != -1); + stack.add(i+1, in); + } + + void replaceOnStack(Element out, Element in) { + replaceInQueue(stack, out, in); + } + + private void replaceInQueue(LinkedList<Element> queue, Element out, Element in) { + int i = queue.lastIndexOf(out); + Validate.isTrue(i != -1); + queue.remove(i); + queue.add(i, in); + } + + void resetInsertionMode() { + boolean last = false; + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element node = it.next(); + if (!it.hasNext()) { + last = true; + node = contextElement; + } + String name = node.nodeName(); + if ("select".equals(name)) { + transition(HtmlTreeBuilderState.InSelect); + 
break; // frag + } else if (("td".equals(name) || "th".equals(name)) && !last) { + // a td or th cell resumes in-cell parsing, but only when it is not the last node examined + transition(HtmlTreeBuilderState.InCell); + break; + } else if ("tr".equals(name)) { + transition(HtmlTreeBuilderState.InRow); + break; + } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) { + transition(HtmlTreeBuilderState.InTableBody); + break; + } else if ("caption".equals(name)) { + transition(HtmlTreeBuilderState.InCaption); + break; + } else if ("colgroup".equals(name)) { + transition(HtmlTreeBuilderState.InColumnGroup); + break; // frag + } else if ("table".equals(name)) { + transition(HtmlTreeBuilderState.InTable); + break; + } else if ("head".equals(name)) { + transition(HtmlTreeBuilderState.InBody); + break; // frag + } else if ("body".equals(name)) { + transition(HtmlTreeBuilderState.InBody); + break; + } else if ("frameset".equals(name)) { + transition(HtmlTreeBuilderState.InFrameset); + break; // frag + } else if ("html".equals(name)) { + transition(HtmlTreeBuilderState.BeforeHead); + break; // frag + } else if (last) { + transition(HtmlTreeBuilderState.InBody); + break; // frag + } + } + } + + // todo: tidy up in specific scope methods + // scope check: walks stack.descendingIterator() and returns true at the first element named in targetNames, + // false at the first element named in baseTypes or, when non-null, extraTypes + private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { + return inSpecificScope(new String[]{targetName}, baseTypes, extraTypes); + } + + private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) { + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element el = it.next(); + String elName = el.nodeName(); + if (StringUtil.in(elName, targetNames)) + return true; + if (StringUtil.in(elName, baseTypes)) + return false; + if (extraTypes != null && StringUtil.in(elName, extraTypes)) + return false; + } + Validate.fail("Should not be reachable"); + return false; + } + + boolean inScope(String[] targetNames) { + return inSpecificScope(targetNames, new String[]{"applet", "caption", "html", "table", "td", "th", 
"marquee", "object"}, null); + } + + boolean inScope(String targetName) { + return inScope(targetName, null); + } + + // default scope check; extras (may be null) are additional element names that also end the scope + boolean inScope(String targetName, String[] extras) { + return inSpecificScope(targetName, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}, extras); + // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml + // todo: in svg namespace: foreignObject, desc, title + } + + // list item scope: the default scope boundaries plus ol and ul + boolean inListItemScope(String targetName) { + return inScope(targetName, new String[]{"ol", "ul"}); + } + + // button scope: the default scope boundaries plus button + boolean inButtonScope(String targetName) { + return inScope(targetName, new String[]{"button"}); + } + + // table scope: bounded only by html and table + boolean inTableScope(String targetName) { + return inSpecificScope(targetName, new String[]{"html", "table"}, null); + } + + // select scope: any element other than optgroup or option ends the scope + boolean inSelectScope(String targetName) { + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element el = it.next(); + String elName = el.nodeName(); + if (elName.equals(targetName)) + return true; + if (!StringUtil.in(elName, "optgroup", "option")) // all elements except optgroup and option end the scope + return false; + } + Validate.fail("Should not be reachable"); + return false; + } + + void setHeadElement(Element headElement) { + this.headElement = headElement; + } + + Element getHeadElement() { + return headElement; + } + + boolean isFosterInserts() { + return fosterInserts; + } + + void setFosterInserts(boolean fosterInserts) { + this.fosterInserts = fosterInserts; + } + + Element getFormElement() { + return formElement; + } + + void setFormElement(Element formElement) { + this.formElement = formElement; + } + + // starts a fresh, empty list of pending table character tokens + void newPendingTableCharacters() { + pendingTableCharacters = new ArrayList<Token.Character>(); + } + + List<Token.Character> getPendingTableCharacters() { + return pendingTableCharacters; + } + + void setPendingTableCharacters(List<Token.Character> pendingTableCharacters) { + this.pendingTableCharacters = pendingTableCharacters; + } + + /** + 11.2.5.2 Closing elements that have implied end 
tags<p/> + When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a + dt element, an li element, an option element, an optgroup element, a p element, an rp element, or an rt element, + the UA must pop the current node off the stack of open elements. + + @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the + process, then the UA must perform the above steps as if that element was not in the above list. May be null, in + which case no element is excluded. + */ + void generateImpliedEndTags(String excludeTag) { + // a null excludeTag must not stop the loop, otherwise the no-arg overload below would never pop anything + while ((excludeTag == null || !currentElement().nodeName().equals(excludeTag)) && + StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) + pop(); + } + + // generates implied end tags with no excluded element + void generateImpliedEndTags() { + generateImpliedEndTags(null); + } + + // true if the element's node name is in the list of special element names below + boolean isSpecial(Element el) { + // todo: mathml's mi, mo, mn + // todo: svg's foreignObject, desc, title + String name = el.nodeName(); + return StringUtil.in(name, "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", + "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd", + "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", + "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", + "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav", + "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script", + "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", + "title", "tr", "ul", "wbr", "xmp"); + } + + // active formatting elements + void pushActiveFormattingElements(Element in) { + int numSeen = 0; + Iterator<Element> iter = formattingElements.descendingIterator(); + while (iter.hasNext()) { + Element el = iter.next(); + if (el == null) // marker + 
break; + + if (isSameFormattingElement(in, el)) + numSeen++; + + if (numSeen == 3) { + iter.remove(); + break; + } + } + formattingElements.add(in); + } + + private boolean isSameFormattingElement(Element a, Element b) { + // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children + return a.nodeName().equals(b.nodeName()) && + // a.namespace().equals(b.namespace()) && + a.attributes().equals(b.attributes()); + // todo: namespaces + } + + void reconstructFormattingElements() { + int size = formattingElements.size(); + if (size == 0 || formattingElements.getLast() == null || onStack(formattingElements.getLast())) + return; + + Element entry = formattingElements.getLast(); + int pos = size - 1; + boolean skip = false; + while (true) { + if (pos == 0) { // step 4. if none before, skip to 8 + skip = true; + break; + } + entry = formattingElements.get(--pos); // step 5. one earlier than entry + if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack + break; // jump to 8, else continue back to 4 + } + while(true) { + if (!skip) // step 7: on later than entry + entry = formattingElements.get(++pos); + Validate.notNull(entry); // should not occur, as we break at last element + + // 8. create new element from element, 9 insert into current node, onto stack + skip = false; // can only skip increment from 4. + Element newEl = insert(entry.nodeName()); // todo: avoid fostering here? + // newEl.namespace(entry.namespace()); // todo: namespaces + newEl.attributes().addAll(entry.attributes()); + + // 10. 
replace entry with new entry + formattingElements.add(pos, newEl); + formattingElements.remove(pos + 1); + + // 11 + if (pos == size-1) // if not last entry in list, jump to 7 + break; + } + } + + void clearFormattingElementsToLastMarker() { + while (!formattingElements.isEmpty()) { + Element el = formattingElements.peekLast(); + formattingElements.removeLast(); + if (el == null) + break; + } + } + + void removeFromActiveFormattingElements(Element el) { + Iterator<Element> it = formattingElements.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next == el) { + it.remove(); + break; + } + } + } + + boolean isInActiveFormattingElements(Element el) { + return isElementInQueue(formattingElements, el); + } + + Element getActiveFormattingElement(String nodeName) { + Iterator<Element> it = formattingElements.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next == null) // scope marker + break; + else if (next.nodeName().equals(nodeName)) + return next; + } + return null; + } + + void replaceActiveFormattingElement(Element out, Element in) { + replaceInQueue(formattingElements, out, in); + } + + void insertMarkerToFormattingElements() { + formattingElements.add(null); + } + + void insertInFosterParent(Node in) { + Element fosterParent = null; + Element lastTable = getFromStack("table"); + boolean isLastTableParent = false; + if (lastTable != null) { + if (lastTable.parent() != null) { + fosterParent = lastTable.parent(); + isLastTableParent = true; + } else + fosterParent = aboveOnStack(lastTable); + } else { // no table == frag + fosterParent = stack.get(0); + } + + if (isLastTableParent) { + Validate.notNull(lastTable); // last table cannot be null by this point. 
+ lastTable.before(in); + } + else + fosterParent.appendChild(in); + } + + @Override + public String toString() { + return "TreeBuilder{" + + "currentToken=" + currentToken + + ", state=" + state + + ", currentElement=" + currentElement() + + '}'; + } +} diff --git a/src/org/jsoup/parser/HtmlTreeBuilderState.java b/src/org/jsoup/parser/HtmlTreeBuilderState.java new file mode 100644 index 0000000000..ceab9faa5a --- /dev/null +++ b/src/org/jsoup/parser/HtmlTreeBuilderState.java @@ -0,0 +1,1482 @@ +package org.jsoup.parser; + +import org.jsoup.helper.DescendableLinkedList; +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.*; + +import java.util.Iterator; +import java.util.LinkedList; + +/** + * The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states. + */ +enum HtmlTreeBuilderState { + Initial { + boolean process(Token t, HtmlTreeBuilder tb) { + if (isWhitespace(t)) { + return true; // ignore whitespace + } else if (t.isComment()) { + tb.insert(t.asComment()); + } else if (t.isDoctype()) { + // todo: parse error check on expected doctypes + // todo: quirk state check on doctype ids + Token.Doctype d = t.asDoctype(); + DocumentType doctype = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri()); + tb.getDocument().appendChild(doctype); + if (d.isForceQuirks()) + tb.getDocument().quirksMode(Document.QuirksMode.quirks); + tb.transition(BeforeHtml); + } else { + // todo: check not iframe srcdoc + tb.transition(BeforeHtml); + return tb.process(t); // re-process token + } + return true; + } + }, + BeforeHtml { + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isDoctype()) { + tb.error(this); + return false; + } else if (t.isComment()) { + tb.insert(t.asComment()); + } else if (isWhitespace(t)) { + return true; // ignore whitespace + } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { + tb.insert(t.asStartTag()); + 
tb.transition(BeforeHead); + } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { + return anythingElse(t, tb); + } else if (t.isEndTag()) { + tb.error(this); + return false; + } else { + return anythingElse(t, tb); + } + return true; + } + + // no html start tag was seen: insert an html element, move to BeforeHead and re-process the token there + private boolean anythingElse(Token t, HtmlTreeBuilder tb) { + tb.insert("html"); + tb.transition(BeforeHead); + return tb.process(t); + } + }, + BeforeHead { + boolean process(Token t, HtmlTreeBuilder tb) { + if (isWhitespace(t)) { + return true; + } else if (t.isComment()) { + tb.insert(t.asComment()); + } else if (t.isDoctype()) { + tb.error(this); + return false; + } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { + return InBody.process(t, tb); // does not transition + } else if (t.isStartTag() && t.asStartTag().name().equals("head")) { + Element head = tb.insert(t.asStartTag()); + tb.setHeadElement(head); + tb.transition(InHead); + } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { + // implied head: process a synthetic head start tag, then re-process the token + tb.process(new Token.StartTag("head")); + return tb.process(t); + } else if (t.isEndTag()) { + tb.error(this); + return false; + } else { + // implied head, as above + tb.process(new Token.StartTag("head")); + return tb.process(t); + } + return true; + } + }, + InHead { + boolean process(Token t, HtmlTreeBuilder tb) { + if (isWhitespace(t)) { + tb.insert(t.asCharacter()); + return true; + } + switch (t.type) { + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + return false; + case StartTag: + Token.StartTag start = t.asStartTag(); + String name = start.name(); + if (name.equals("html")) { + return InBody.process(t, tb); + } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) { + Element el = tb.insertEmpty(start); + // jsoup special: update base the first time it is seen + if (name.equals("base") && el.hasAttr("href")) + tb.maybeSetBaseUri(el); + } else if (name.equals("meta")) { + Element meta = 
tb.insertEmpty(start); + // todo: charset switches + } else if (name.equals("title")) { + handleRcData(start, tb); + } else if (StringUtil.in(name, "noframes", "style")) { + handleRawtext(start, tb); + } else if (name.equals("noscript")) { + // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript) + tb.insert(start); + tb.transition(InHeadNoscript); + } else if (name.equals("script")) { + // skips some script rules as won't execute them + tb.insert(start); + tb.tokeniser.transition(TokeniserState.ScriptData); + tb.markInsertionMode(); + tb.transition(Text); + } else if (name.equals("head")) { + tb.error(this); + return false; + } else { + return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag end = t.asEndTag(); + name = end.name(); + if (name.equals("head")) { + tb.pop(); + tb.transition(AfterHead); + } else if (StringUtil.in(name, "body", "html", "br")) { + return anythingElse(t, tb); + } else { + tb.error(this); + return false; + } + break; + default: + return anythingElse(t, tb); + } + return true; + } + + private boolean anythingElse(Token t, TreeBuilder tb) { + tb.process(new Token.EndTag("head")); + return tb.process(t); + } + }, + InHeadNoscript { + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isDoctype()) { + tb.error(this); + } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { + return tb.process(t, InBody); + } else if (t.isEndTag() && t.asEndTag().name().equals("noscript")) { + tb.pop(); + tb.transition(InHead); + } else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), + "basefont", "bgsound", "link", "meta", "noframes", "style"))) { + return tb.process(t, InHead); + } else if (t.isEndTag() && t.asEndTag().name().equals("br")) { + return anythingElse(t, tb); + } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), "head", "noscript")) || t.isEndTag()) { + tb.error(this); + return false; + } else { + return 
anythingElse(t, tb); + } + return true; + } + + private boolean anythingElse(Token t, HtmlTreeBuilder tb) { + tb.error(this); + tb.process(new Token.EndTag("noscript")); + return tb.process(t); + } + }, + AfterHead { + boolean process(Token t, HtmlTreeBuilder tb) { + if (isWhitespace(t)) { + tb.insert(t.asCharacter()); + } else if (t.isComment()) { + tb.insert(t.asComment()); + } else if (t.isDoctype()) { + tb.error(this); + } else if (t.isStartTag()) { + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("html")) { + return tb.process(t, InBody); + } else if (name.equals("body")) { + tb.insert(startTag); + tb.framesetOk(false); + tb.transition(InBody); + } else if (name.equals("frameset")) { + tb.insert(startTag); + tb.transition(InFrameset); + } else if (StringUtil.in(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) { + tb.error(this); + Element head = tb.getHeadElement(); + tb.push(head); + tb.process(t, InHead); + tb.removeFromStack(head); + } else if (name.equals("head")) { + tb.error(this); + return false; + } else { + anythingElse(t, tb); + } + } else if (t.isEndTag()) { + if (StringUtil.in(t.asEndTag().name(), "body", "html")) { + anythingElse(t, tb); + } else { + tb.error(this); + return false; + } + } else { + anythingElse(t, tb); + } + return true; + } + + private boolean anythingElse(Token t, HtmlTreeBuilder tb) { + tb.process(new Token.StartTag("body")); + tb.framesetOk(true); + return tb.process(t); + } + }, + InBody { + boolean process(Token t, HtmlTreeBuilder tb) { + switch (t.type) { + case Character: { + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { + // todo confirm that check + tb.error(this); + return false; + } else if (isWhitespace(c)) { + tb.reconstructFormattingElements(); + tb.insert(c); + } else { + tb.reconstructFormattingElements(); + tb.insert(c); + tb.framesetOk(false); + } + break; + } + case Comment: { + 
tb.insert(t.asComment()); + break; + } + case Doctype: { + tb.error(this); + return false; + } + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("html")) { + tb.error(this); + // merge attributes onto real html + Element html = tb.getStack().getFirst(); + for (Attribute attribute : startTag.getAttributes()) { + if (!html.hasAttr(attribute.getKey())) + html.attributes().put(attribute); + } + } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title")) { + return tb.process(t, InHead); + } else if (name.equals("body")) { + tb.error(this); + LinkedList<Element> stack = tb.getStack(); + if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { + // only in fragment case + return false; // ignore + } else { + tb.framesetOk(false); + Element body = stack.get(1); + for (Attribute attribute : startTag.getAttributes()) { + if (!body.hasAttr(attribute.getKey())) + body.attributes().put(attribute); + } + } + } else if (name.equals("frameset")) { + tb.error(this); + LinkedList<Element> stack = tb.getStack(); + if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { + // only in fragment case + return false; // ignore + } else if (!tb.framesetOk()) { + return false; // ignore frameset + } else { + Element second = stack.get(1); + if (second.parent() != null) + second.remove(); + // pop up to html element + while (stack.size() > 1) + stack.removeLast(); + tb.insert(startTag); + tb.transition(InFrameset); + } + } else if (StringUtil.in(name, + "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", + "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", + "p", "section", "summary", "ul")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "h1", 
"h2", "h3", "h4", "h5", "h6")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + if (StringUtil.in(tb.currentElement().nodeName(), "h1", "h2", "h3", "h4", "h5", "h6")) { + tb.error(this); + tb.pop(); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "pre", "listing")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + // todo: ignore LF if next token + tb.framesetOk(false); + } else if (name.equals("form")) { + if (tb.getFormElement() != null) { + tb.error(this); + return false; + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + Element form = tb.insert(startTag); + tb.setFormElement(form); + } else if (name.equals("li")) { + tb.framesetOk(false); + LinkedList<Element> stack = tb.getStack(); + for (int i = stack.size() - 1; i > 0; i--) { + Element el = stack.get(i); + if (el.nodeName().equals("li")) { + tb.process(new Token.EndTag("li")); + break; + } + if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) + break; + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "dd", "dt")) { + tb.framesetOk(false); + LinkedList<Element> stack = tb.getStack(); + for (int i = stack.size() - 1; i > 0; i--) { + Element el = stack.get(i); + if (StringUtil.in(el.nodeName(), "dd", "dt")) { + tb.process(new Token.EndTag(el.nodeName())); + break; + } + if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) + break; + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (name.equals("plaintext")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once in, never gets out + } else if (name.equals("button")) { + if (tb.inButtonScope("button")) { + // close and reprocess + tb.error(this); + 
tb.process(new Token.EndTag("button")); + tb.process(startTag); + } else { + tb.reconstructFormattingElements(); + tb.insert(startTag); + tb.framesetOk(false); + } + } else if (name.equals("a")) { + if (tb.getActiveFormattingElement("a") != null) { + tb.error(this); + tb.process(new Token.EndTag("a")); + + // still on stack? + Element remainingA = tb.getFromStack("a"); + if (remainingA != null) { + tb.removeFromActiveFormattingElements(remainingA); + tb.removeFromStack(remainingA); + } + } + tb.reconstructFormattingElements(); + Element a = tb.insert(startTag); + tb.pushActiveFormattingElements(a); + } else if (StringUtil.in(name, + "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u")) { + tb.reconstructFormattingElements(); + Element el = tb.insert(startTag); + tb.pushActiveFormattingElements(el); + } else if (name.equals("nobr")) { + tb.reconstructFormattingElements(); + if (tb.inScope("nobr")) { + tb.error(this); + tb.process(new Token.EndTag("nobr")); + tb.reconstructFormattingElements(); + } + Element el = tb.insert(startTag); + tb.pushActiveFormattingElements(el); + } else if (StringUtil.in(name, "applet", "marquee", "object")) { + tb.reconstructFormattingElements(); + tb.insert(startTag); + tb.insertMarkerToFormattingElements(); + tb.framesetOk(false); + } else if (name.equals("table")) { + if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + tb.framesetOk(false); + tb.transition(InTable); + } else if (StringUtil.in(name, "area", "br", "embed", "img", "keygen", "wbr")) { + tb.reconstructFormattingElements(); + tb.insertEmpty(startTag); + tb.framesetOk(false); + } else if (name.equals("input")) { + tb.reconstructFormattingElements(); + Element el = tb.insertEmpty(startTag); + if (!el.attr("type").equalsIgnoreCase("hidden")) + tb.framesetOk(false); + } else if (StringUtil.in(name, "param", "source", "track")) { + 
tb.insertEmpty(startTag); + } else if (name.equals("hr")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insertEmpty(startTag); + tb.framesetOk(false); + } else if (name.equals("image")) { + // we're not supposed to ask. + startTag.name("img"); + return tb.process(startTag); + } else if (name.equals("isindex")) { + // how much do we care about the early 90s? + tb.error(this); + if (tb.getFormElement() != null) + return false; + + tb.tokeniser.acknowledgeSelfClosingFlag(); + tb.process(new Token.StartTag("form")); + if (startTag.attributes.hasKey("action")) { + Element form = tb.getFormElement(); + form.attr("action", startTag.attributes.get("action")); + } + tb.process(new Token.StartTag("hr")); + tb.process(new Token.StartTag("label")); + // hope you like english. + String prompt = startTag.attributes.hasKey("prompt") ? + startTag.attributes.get("prompt") : + "This is a searchable index. Enter search keywords: "; + + tb.process(new Token.Character(prompt)); + + // input + Attributes inputAttribs = new Attributes(); + for (Attribute attr : startTag.attributes) { + if (!StringUtil.in(attr.getKey(), "name", "action", "prompt")) + inputAttribs.put(attr); + } + inputAttribs.put("name", "isindex"); + tb.process(new Token.StartTag("input", inputAttribs)); + tb.process(new Token.EndTag("label")); + tb.process(new Token.StartTag("hr")); + tb.process(new Token.EndTag("form")); + } else if (name.equals("textarea")) { + tb.insert(startTag); + // todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.) 
+ tb.tokeniser.transition(TokeniserState.Rcdata); + tb.markInsertionMode(); + tb.framesetOk(false); + tb.transition(Text); + } else if (name.equals("xmp")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.reconstructFormattingElements(); + tb.framesetOk(false); + handleRawtext(startTag, tb); + } else if (name.equals("iframe")) { + tb.framesetOk(false); + handleRawtext(startTag, tb); + } else if (name.equals("noembed")) { + // also handle noscript if script enabled + handleRawtext(startTag, tb); + } else if (name.equals("select")) { + tb.reconstructFormattingElements(); + tb.insert(startTag); + tb.framesetOk(false); + + HtmlTreeBuilderState state = tb.state(); + if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell)) + tb.transition(InSelectInTable); + else + tb.transition(InSelect); + } else if (StringUtil.in("optgroup", "option")) { + if (tb.currentElement().nodeName().equals("option")) + tb.process(new Token.EndTag("option")); + tb.reconstructFormattingElements(); + tb.insert(startTag); + } else if (StringUtil.in("rp", "rt")) { + if (tb.inScope("ruby")) { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals("ruby")) { + tb.error(this); + tb.popStackToBefore("ruby"); // i.e. close up to but not include name + } + tb.insert(startTag); + } + } else if (name.equals("math")) { + tb.reconstructFormattingElements(); + // todo: handle A start tag whose tag name is "math" (i.e. 
foreign, mathml) + tb.insert(startTag); + tb.tokeniser.acknowledgeSelfClosingFlag(); + } else if (name.equals("svg")) { + tb.reconstructFormattingElements(); + // todo: handle A start tag whose tag name is "svg" (xlink, svg) + tb.insert(startTag); + tb.tokeniser.acknowledgeSelfClosingFlag(); + } else if (StringUtil.in(name, + "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr")) { + tb.error(this); + return false; + } else { + tb.reconstructFormattingElements(); + tb.insert(startTag); + } + break; + + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (name.equals("body")) { + if (!tb.inScope("body")) { + tb.error(this); + return false; + } else { + // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html + tb.transition(AfterBody); + } + } else if (name.equals("html")) { + boolean notIgnored = tb.process(new Token.EndTag("body")); + if (notIgnored) + return tb.process(endTag); + } else if (StringUtil.in(name, + "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", + "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", + "nav", "ol", "pre", "section", "summary", "ul")) { + // todo: refactor these lookups + if (!tb.inScope(name)) { + // nothing to close + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + } + } else if (name.equals("form")) { + Element currentForm = tb.getFormElement(); + tb.setFormElement(null); + if (currentForm == null || !tb.inScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + // remove currentForm from stack. will shift anything under up. 
+ tb.removeFromStack(currentForm); + } + } else if (name.equals("p")) { + if (!tb.inButtonScope(name)) { + tb.error(this); + tb.process(new Token.StartTag(name)); // if no p to close, creates an empty <p></p> + return tb.process(endTag); + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + } + } else if (name.equals("li")) { + if (!tb.inListItemScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + } + } else if (StringUtil.in(name, "dd", "dt")) { + if (!tb.inScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + } + } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { + if (!tb.inScope(new String[]{"h1", "h2", "h3", "h4", "h5", "h6"})) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); + } + } else if (name.equals("sarcasm")) { + // *sigh* + return anyOtherEndTag(t, tb); + } else if (StringUtil.in(name, + "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u")) { + // Adoption Agency Algorithm. 
+ OUTER: + for (int i = 0; i < 8; i++) { + Element formatEl = tb.getActiveFormattingElement(name); + if (formatEl == null) + return anyOtherEndTag(t, tb); + else if (!tb.onStack(formatEl)) { + tb.error(this); + tb.removeFromActiveFormattingElements(formatEl); + return true; + } else if (!tb.inScope(formatEl.nodeName())) { + tb.error(this); + return false; + } else if (tb.currentElement() != formatEl) + tb.error(this); + + Element furthestBlock = null; + Element commonAncestor = null; + boolean seenFormattingElement = false; + LinkedList<Element> stack = tb.getStack(); + for (int si = 0; si < stack.size(); si++) { + Element el = stack.get(si); + if (el == formatEl) { + commonAncestor = stack.get(si - 1); + seenFormattingElement = true; + } else if (seenFormattingElement && tb.isSpecial(el)) { + furthestBlock = el; + break; + } + } + if (furthestBlock == null) { + tb.popStackToClose(formatEl.nodeName()); + tb.removeFromActiveFormattingElements(formatEl); + return true; + } + + // todo: Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list. + // does that mean: int pos of format el in list? + Element node = furthestBlock; + Element lastNode = furthestBlock; + INNER: + for (int j = 0; j < 3; j++) { + if (tb.onStack(node)) + node = tb.aboveOnStack(node); + if (!tb.isInActiveFormattingElements(node)) { // note no bookmark check + tb.removeFromStack(node); + continue INNER; + } else if (node == formatEl) + break INNER; + + Element replacement = new Element(Tag.valueOf(node.nodeName()), tb.getBaseUri()); + tb.replaceActiveFormattingElement(node, replacement); + tb.replaceOnStack(node, replacement); + node = replacement; + + if (lastNode == furthestBlock) { + // todo: move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements. + // not getting how this bookmark both straddles the element above, but is inbetween here... 
+ } + if (lastNode.parent() != null) + lastNode.remove(); + node.appendChild(lastNode); + + lastNode = node; + } + + if (StringUtil.in(commonAncestor.nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { + if (lastNode.parent() != null) + lastNode.remove(); + tb.insertInFosterParent(lastNode); + } else { + if (lastNode.parent() != null) + lastNode.remove(); + commonAncestor.appendChild(lastNode); + } + + Element adopter = new Element(Tag.valueOf(name), tb.getBaseUri()); + Node[] childNodes = furthestBlock.childNodes().toArray(new Node[furthestBlock.childNodes().size()]); + for (Node childNode : childNodes) { + adopter.appendChild(childNode); // append will reparent. thus the clone to avoid concurrent mod. + } + furthestBlock.appendChild(adopter); + tb.removeFromActiveFormattingElements(formatEl); + // todo: insert the new element into the list of active formatting elements at the position of the aforementioned bookmark. + tb.removeFromStack(formatEl); + tb.insertOnStackAfter(furthestBlock, adopter); + } + } else if (StringUtil.in(name, "applet", "marquee", "object")) { + if (!tb.inScope("name")) { + if (!tb.inScope(name)) { + tb.error(this); + return false; + } + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + tb.clearFormattingElementsToLastMarker(); + } + } else if (name.equals("br")) { + tb.error(this); + tb.process(new Token.StartTag("br")); + return false; + } else { + return anyOtherEndTag(t, tb); + } + + break; + case EOF: + // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html + // stop parsing + break; + } + return true; + } + + boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) { + String name = t.asEndTag().name(); + DescendableLinkedList<Element> stack = tb.getStack(); + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element node = it.next(); + if (node.nodeName().equals(name)) { + 
tb.generateImpliedEndTags(name); + if (!name.equals(tb.currentElement().nodeName())) + tb.error(this); + tb.popStackToClose(name); + break; + } else { + if (tb.isSpecial(node)) { + tb.error(this); + return false; + } + } + } + return true; + } + }, + Text { + // in script, style etc. normally treated as data tags + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isCharacter()) { + tb.insert(t.asCharacter()); + } else if (t.isEOF()) { + tb.error(this); + // if current node is script: already started + tb.pop(); + tb.transition(tb.originalState()); + return tb.process(t); + } else if (t.isEndTag()) { + // if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts + tb.pop(); + tb.transition(tb.originalState()); + } + return true; + } + }, + InTable { + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isCharacter()) { + tb.newPendingTableCharacters(); + tb.markInsertionMode(); + tb.transition(InTableText); + return tb.process(t); + } else if (t.isComment()) { + tb.insert(t.asComment()); + return true; + } else if (t.isDoctype()) { + tb.error(this); + return false; + } else if (t.isStartTag()) { + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("caption")) { + tb.clearStackToTableContext(); + tb.insertMarkerToFormattingElements(); + tb.insert(startTag); + tb.transition(InCaption); + } else if (name.equals("colgroup")) { + tb.clearStackToTableContext(); + tb.insert(startTag); + tb.transition(InColumnGroup); + } else if (name.equals("col")) { + tb.process(new Token.StartTag("colgroup")); + return tb.process(t); + } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { + tb.clearStackToTableContext(); + tb.insert(startTag); + tb.transition(InTableBody); + } else if (StringUtil.in(name, "td", "th", "tr")) { + tb.process(new Token.StartTag("tbody")); + return tb.process(t); + } else if (name.equals("table")) { + tb.error(this); + boolean processed = tb.process(new 
Token.EndTag("table")); + if (processed) // only ignored if in fragment + return tb.process(t); + } else if (StringUtil.in(name, "style", "script")) { + return tb.process(t, InHead); + } else if (name.equals("input")) { + if (!startTag.attributes.get("type").equalsIgnoreCase("hidden")) { + return anythingElse(t, tb); + } else { + tb.insertEmpty(startTag); + } + } else if (name.equals("form")) { + tb.error(this); + if (tb.getFormElement() != null) + return false; + else { + Element form = tb.insertEmpty(startTag); + tb.setFormElement(form); + } + } else { + return anythingElse(t, tb); + } + } else if (t.isEndTag()) { + Token.EndTag endTag = t.asEndTag(); + String name = endTag.name(); + + if (name.equals("table")) { + if (!tb.inTableScope(name)) { + tb.error(this); + return false; + } else { + tb.popStackToClose("table"); + } + tb.resetInsertionMode(); + } else if (StringUtil.in(name, + "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { + tb.error(this); + return false; + } else { + return anythingElse(t, tb); + } + } else if (t.isEOF()) { + if (tb.currentElement().nodeName().equals("html")) + tb.error(this); + return true; // stops parsing + } + return anythingElse(t, tb); + } + + boolean anythingElse(Token t, HtmlTreeBuilder tb) { + tb.error(this); + boolean processed = true; + if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { + tb.setFosterInserts(true); + processed = tb.process(t, InBody); + tb.setFosterInserts(false); + } else { + processed = tb.process(t, InBody); + } + return processed; + } + }, + InTableText { + boolean process(Token t, HtmlTreeBuilder tb) { + switch (t.type) { + case Character: + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { + tb.error(this); + return false; + } else { + tb.getPendingTableCharacters().add(c); + } + break; + default: + if (tb.getPendingTableCharacters().size() > 0) { + for (Token.Character character : 
tb.getPendingTableCharacters()) { + if (!isWhitespace(character)) { + // InTable anything else section: + tb.error(this); + if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { + tb.setFosterInserts(true); + tb.process(character, InBody); + tb.setFosterInserts(false); + } else { + tb.process(character, InBody); + } + } else + tb.insert(character); + } + tb.newPendingTableCharacters(); + } + tb.transition(tb.originalState()); + return tb.process(t); + } + return true; + } + }, + InCaption { + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isEndTag() && t.asEndTag().name().equals("caption")) { + Token.EndTag endTag = t.asEndTag(); + String name = endTag.name(); + if (!tb.inTableScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals("caption")) + tb.error(this); + tb.popStackToClose("caption"); + tb.clearFormattingElementsToLastMarker(); + tb.transition(InTable); + } + } else if (( + t.isStartTag() && StringUtil.in(t.asStartTag().name(), + "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr") || + t.isEndTag() && t.asEndTag().name().equals("table")) + ) { + tb.error(this); + boolean processed = tb.process(new Token.EndTag("caption")); + if (processed) + return tb.process(t); + } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), + "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { + tb.error(this); + return false; + } else { + return tb.process(t, InBody); + } + return true; + } + }, + InColumnGroup { + boolean process(Token t, HtmlTreeBuilder tb) { + if (isWhitespace(t)) { + tb.insert(t.asCharacter()); + return true; + } + switch (t.type) { + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + break; + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("html")) + return tb.process(t, 
InBody); + else if (name.equals("col")) + tb.insertEmpty(startTag); + else + return anythingElse(t, tb); + break; + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (name.equals("colgroup")) { + if (tb.currentElement().nodeName().equals("html")) { // frag case + tb.error(this); + return false; + } else { + tb.pop(); + tb.transition(InTable); + } + } else + return anythingElse(t, tb); + break; + case EOF: + if (tb.currentElement().nodeName().equals("html")) + return true; // stop parsing; frag case + else + return anythingElse(t, tb); + default: + return anythingElse(t, tb); + } + return true; + } + + private boolean anythingElse(Token t, TreeBuilder tb) { + boolean processed = tb.process(new Token.EndTag("colgroup")); + if (processed) // only ignored in frag case + return tb.process(t); + return true; + } + }, + InTableBody { + boolean process(Token t, HtmlTreeBuilder tb) { + switch (t.type) { + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("tr")) { + tb.clearStackToTableBodyContext(); + tb.insert(startTag); + tb.transition(InRow); + } else if (StringUtil.in(name, "th", "td")) { + tb.error(this); + tb.process(new Token.StartTag("tr")); + return tb.process(startTag); + } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead")) { + return exitTableBody(t, tb); + } else + return anythingElse(t, tb); + break; + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (StringUtil.in(name, "tbody", "tfoot", "thead")) { + if (!tb.inTableScope(name)) { + tb.error(this); + return false; + } else { + tb.clearStackToTableBodyContext(); + tb.pop(); + tb.transition(InTable); + } + } else if (name.equals("table")) { + return exitTableBody(t, tb); + } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) { + tb.error(this); + return false; + } else + return anythingElse(t, tb); + break; + 
default: + return anythingElse(t, tb); + } + return true; + } + + private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { + if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb.inScope("tfoot"))) { + // frag case + tb.error(this); + return false; + } + tb.clearStackToTableBodyContext(); + tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, tfoot, thead + return tb.process(t); + } + + private boolean anythingElse(Token t, HtmlTreeBuilder tb) { + return tb.process(t, InTable); + } + }, + InRow { + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isStartTag()) { + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + + if (StringUtil.in(name, "th", "td")) { + tb.clearStackToTableRowContext(); + tb.insert(startTag); + tb.transition(InCell); + tb.insertMarkerToFormattingElements(); + } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) { + return handleMissingTr(t, tb); + } else { + return anythingElse(t, tb); + } + } else if (t.isEndTag()) { + Token.EndTag endTag = t.asEndTag(); + String name = endTag.name(); + + if (name.equals("tr")) { + if (!tb.inTableScope(name)) { + tb.error(this); // frag + return false; + } + tb.clearStackToTableRowContext(); + tb.pop(); // tr + tb.transition(InTableBody); + } else if (name.equals("table")) { + return handleMissingTr(t, tb); + } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { + if (!tb.inTableScope(name)) { + tb.error(this); + return false; + } + tb.process(new Token.EndTag("tr")); + return tb.process(t); + } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th")) { + tb.error(this); + return false; + } else { + return anythingElse(t, tb); + } + } else { + return anythingElse(t, tb); + } + return true; + } + + private boolean anythingElse(Token t, HtmlTreeBuilder tb) { + return tb.process(t, InTable); + } + + private boolean handleMissingTr(Token t, TreeBuilder tb) { + boolean 
processed = tb.process(new Token.EndTag("tr")); + if (processed) + return tb.process(t); + else + return false; + } + }, + InCell { + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isEndTag()) { + Token.EndTag endTag = t.asEndTag(); + String name = endTag.name(); + + if (StringUtil.in(name, "td", "th")) { + if (!tb.inTableScope(name)) { + tb.error(this); + tb.transition(InRow); // might not be in scope if empty: <td /> and processing fake end tag + return false; + } + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) + tb.error(this); + tb.popStackToClose(name); + tb.clearFormattingElementsToLastMarker(); + tb.transition(InRow); + } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html")) { + tb.error(this); + return false; + } else if (StringUtil.in(name, "table", "tbody", "tfoot", "thead", "tr")) { + if (!tb.inTableScope(name)) { + tb.error(this); + return false; + } + closeCell(tb); + return tb.process(t); + } else { + return anythingElse(t, tb); + } + } else if (t.isStartTag() && + StringUtil.in(t.asStartTag().name(), + "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) { + if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { + tb.error(this); + return false; + } + closeCell(tb); + return tb.process(t); + } else { + return anythingElse(t, tb); + } + return true; + } + + private boolean anythingElse(Token t, HtmlTreeBuilder tb) { + return tb.process(t, InBody); + } + + private void closeCell(HtmlTreeBuilder tb) { + if (tb.inTableScope("td")) + tb.process(new Token.EndTag("td")); + else + tb.process(new Token.EndTag("th")); // only here if th or td in scope + } + }, + InSelect { + boolean process(Token t, HtmlTreeBuilder tb) { + switch (t.type) { + case Character: + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { + tb.error(this); + return false; + } else { + tb.insert(c); + } + break; + case Comment: + tb.insert(t.asComment()); + break; + case 
Doctype: + tb.error(this); + return false; + case StartTag: + Token.StartTag start = t.asStartTag(); + String name = start.name(); + if (name.equals("html")) + return tb.process(start, InBody); + else if (name.equals("option")) { + tb.process(new Token.EndTag("option")); + tb.insert(start); + } else if (name.equals("optgroup")) { + if (tb.currentElement().nodeName().equals("option")) + tb.process(new Token.EndTag("option")); + else if (tb.currentElement().nodeName().equals("optgroup")) + tb.process(new Token.EndTag("optgroup")); + tb.insert(start); + } else if (name.equals("select")) { + tb.error(this); + return tb.process(new Token.EndTag("select")); + } else if (StringUtil.in(name, "input", "keygen", "textarea")) { + tb.error(this); + if (!tb.inSelectScope("select")) + return false; // frag + tb.process(new Token.EndTag("select")); + return tb.process(start); + } else if (name.equals("script")) { + return tb.process(t, InHead); + } else { + return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag end = t.asEndTag(); + name = end.name(); + if (name.equals("optgroup")) { + if (tb.currentElement().nodeName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nodeName().equals("optgroup")) + tb.process(new Token.EndTag("option")); + if (tb.currentElement().nodeName().equals("optgroup")) + tb.pop(); + else + tb.error(this); + } else if (name.equals("option")) { + if (tb.currentElement().nodeName().equals("option")) + tb.pop(); + else + tb.error(this); + } else if (name.equals("select")) { + if (!tb.inSelectScope(name)) { + tb.error(this); + return false; + } else { + tb.popStackToClose(name); + tb.resetInsertionMode(); + } + } else + return anythingElse(t, tb); + break; + case EOF: + if (!tb.currentElement().nodeName().equals("html")) + tb.error(this); + break; + default: + return anythingElse(t, tb); + } + return true; + } + + private boolean anythingElse(Token t, HtmlTreeBuilder tb) { + 
tb.error(this); + return false; + } + }, + InSelectInTable { + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { + tb.error(this); + tb.process(new Token.EndTag("select")); + return tb.process(t); + } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { + tb.error(this); + if (tb.inTableScope(t.asEndTag().name())) { + tb.process(new Token.EndTag("select")); + return (tb.process(t)); + } else + return false; + } else { + return tb.process(t, InSelect); + } + } + }, + AfterBody { + boolean process(Token t, HtmlTreeBuilder tb) { + if (isWhitespace(t)) { + return tb.process(t, InBody); + } else if (t.isComment()) { + tb.insert(t.asComment()); // into html node + } else if (t.isDoctype()) { + tb.error(this); + return false; + } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { + return tb.process(t, InBody); + } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { + if (tb.isFragmentParsing()) { + tb.error(this); + return false; + } else { + tb.transition(AfterAfterBody); + } + } else if (t.isEOF()) { + // chillax! 
we're done + } else { + tb.error(this); + tb.transition(InBody); + return tb.process(t); + } + return true; + } + }, + InFrameset { + boolean process(Token t, HtmlTreeBuilder tb) { + if (isWhitespace(t)) { + tb.insert(t.asCharacter()); + } else if (t.isComment()) { + tb.insert(t.asComment()); + } else if (t.isDoctype()) { + tb.error(this); + return false; + } else if (t.isStartTag()) { + Token.StartTag start = t.asStartTag(); + String name = start.name(); + if (name.equals("html")) { + return tb.process(start, InBody); + } else if (name.equals("frameset")) { + tb.insert(start); + } else if (name.equals("frame")) { + tb.insertEmpty(start); + } else if (name.equals("noframes")) { + return tb.process(start, InHead); + } else { + tb.error(this); + return false; + } + } else if (t.isEndTag() && t.asEndTag().name().equals("frameset")) { + if (tb.currentElement().nodeName().equals("html")) { // frag + tb.error(this); + return false; + } else { + tb.pop(); + if (!tb.isFragmentParsing() && !tb.currentElement().nodeName().equals("frameset")) { + tb.transition(AfterFrameset); + } + } + } else if (t.isEOF()) { + if (!tb.currentElement().nodeName().equals("html")) { + tb.error(this); + return true; + } + } else { + tb.error(this); + return false; + } + return true; + } + }, + AfterFrameset { + boolean process(Token t, HtmlTreeBuilder tb) { + if (isWhitespace(t)) { + tb.insert(t.asCharacter()); + } else if (t.isComment()) { + tb.insert(t.asComment()); + } else if (t.isDoctype()) { + tb.error(this); + return false; + } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { + return tb.process(t, InBody); + } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { + tb.transition(AfterAfterFrameset); + } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { + return tb.process(t, InHead); + } else if (t.isEOF()) { + // cool your heels, we're complete + } else { + tb.error(this); + return false; + } + return true; + } + }, + AfterAfterBody { + 
boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isComment()) { + tb.insert(t.asComment()); + } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { + return tb.process(t, InBody); + } else if (t.isEOF()) { + // nice work chuck + } else { + tb.error(this); + tb.transition(InBody); + return tb.process(t); + } + return true; + } + }, + AfterAfterFrameset { + boolean process(Token t, HtmlTreeBuilder tb) { + if (t.isComment()) { + tb.insert(t.asComment()); + } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { + return tb.process(t, InBody); + } else if (t.isEOF()) { + // nice work chuck + } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { + return tb.process(t, InHead); + } else { + tb.error(this); + return false; + } + return true; + } + }, + ForeignContent { + boolean process(Token t, HtmlTreeBuilder tb) { + return true; + // todo: implement. Also; how do we get here? 
+ } + }; + + private static String nullString = String.valueOf('\u0000'); + + abstract boolean process(Token t, HtmlTreeBuilder tb); + + private static boolean isWhitespace(Token t) { + if (t.isCharacter()) { + String data = t.asCharacter().getData(); + // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " + for (int i = 0; i < data.length(); i++) { + char c = data.charAt(i); + if (!StringUtil.isWhitespace(c)) + return false; + } + return true; + } + return false; + } + + private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) { + tb.insert(startTag); + tb.tokeniser.transition(TokeniserState.Rcdata); + tb.markInsertionMode(); + tb.transition(Text); + } + + private static void handleRawtext(Token.StartTag startTag, HtmlTreeBuilder tb) { + tb.insert(startTag); + tb.tokeniser.transition(TokeniserState.Rawtext); + tb.markInsertionMode(); + tb.transition(Text); + } +} diff --git a/src/org/jsoup/parser/ParseError.java b/src/org/jsoup/parser/ParseError.java new file mode 100644 index 0000000000..dfa090051b --- /dev/null +++ b/src/org/jsoup/parser/ParseError.java @@ -0,0 +1,40 @@ +package org.jsoup.parser; + +/** + * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase. + */ +public class ParseError { + private int pos; + private String errorMsg; + + ParseError(int pos, String errorMsg) { + this.pos = pos; + this.errorMsg = errorMsg; + } + + ParseError(int pos, String errorFormat, Object... args) { + this.errorMsg = String.format(errorFormat, args); + this.pos = pos; + } + + /** + * Retrieve the error message. + * @return the error message. + */ + public String getErrorMessage() { + return errorMsg; + } + + /** + * Retrieves the offset of the error. 
+ * @return error offset within input + */ + public int getPosition() { + return pos; + } + + @Override + public String toString() { + return pos + ": " + errorMsg; + } +} diff --git a/src/org/jsoup/parser/ParseErrorList.java b/src/org/jsoup/parser/ParseErrorList.java new file mode 100644 index 0000000000..3824ffbc4e --- /dev/null +++ b/src/org/jsoup/parser/ParseErrorList.java @@ -0,0 +1,34 @@ +package org.jsoup.parser; + +import java.util.ArrayList; + +/** + * A container for ParseErrors. + * + * @author Jonathan Hedley + */ +class ParseErrorList extends ArrayList<ParseError>{ + private static final int INITIAL_CAPACITY = 16; + private final int maxSize; + + ParseErrorList(int initialCapacity, int maxSize) { + super(initialCapacity); + this.maxSize = maxSize; + } + + boolean canAddError() { + return size() < maxSize; + } + + int getMaxSize() { + return maxSize; + } + + static ParseErrorList noTracking() { + return new ParseErrorList(0, 0); + } + + static ParseErrorList tracking(int maxSize) { + return new ParseErrorList(INITIAL_CAPACITY, maxSize); + } +} diff --git a/src/org/jsoup/parser/Parser.java b/src/org/jsoup/parser/Parser.java new file mode 100644 index 0000000000..2236219c06 --- /dev/null +++ b/src/org/jsoup/parser/Parser.java @@ -0,0 +1,157 @@ +package org.jsoup.parser; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; + +import java.util.List; + +/** + * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods + * in {@link org.jsoup.Jsoup}. + */ +public class Parser { + private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled. + + private TreeBuilder treeBuilder; + private int maxErrors = DEFAULT_MAX_ERRORS; + private ParseErrorList errors; + + /** + * Create a new Parser, using the specified TreeBuilder + * @param treeBuilder TreeBuilder to use to parse input into Documents. 
+ */ + public Parser(TreeBuilder treeBuilder) { + this.treeBuilder = treeBuilder; + } + + public Document parseInput(String html, String baseUri) { + errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); + Document doc = treeBuilder.parse(html, baseUri, errors); + return doc; + } + + // gets & sets + /** + * Get the TreeBuilder currently in use. + * @return current TreeBuilder. + */ + public TreeBuilder getTreeBuilder() { + return treeBuilder; + } + + /** + * Update the TreeBuilder used when parsing content. + * @param treeBuilder current TreeBuilder + * @return this, for chaining + */ + public Parser setTreeBuilder(TreeBuilder treeBuilder) { + this.treeBuilder = treeBuilder; + return this; + } + + /** + * Check if parse error tracking is enabled. + * @return current track error state. + */ + public boolean isTrackErrors() { + return maxErrors > 0; + } + + /** + * Enable or disable parse error tracking for the next parse. + * @param maxErrors the maximum number of errors to track. Set to 0 to disable. + * @return this, for chaining + */ + public Parser setTrackErrors(int maxErrors) { + this.maxErrors = maxErrors; + return this; + } + + /** + * Retrieve the parse errors, if any, from the last parse. + * @return list of parse errors, up to the size of the maximum errors tracked. + */ + public List<ParseError> getErrors() { + return errors; + } + + // static parse functions below + /** + * Parse HTML into a Document. + * + * @param html HTML to parse + * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return parsed Document + */ + public static Document parse(String html, String baseUri) { + TreeBuilder treeBuilder = new HtmlTreeBuilder(); + return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking()); + } + + /** + * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 
+ * + * @param fragmentHtml the fragment of HTML to parse + * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This + * provides stack context (for implicit element creation). + * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. + */ + public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { + HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); + return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking()); + } + + /** + * Parse a fragment of HTML into the {@code body} of a Document. + * + * @param bodyHtml fragment of HTML + * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return Document, with empty head, and HTML parsed into body + */ + public static Document parseBodyFragment(String bodyHtml, String baseUri) { + Document doc = Document.createShell(baseUri); + Element body = doc.body(); + List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); + Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented + for (Node node : nodes) { + body.appendChild(node); + } + return doc; + } + + /** + * @param bodyHtml HTML to parse + * @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return parsed Document + * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead. + */ + public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) { + return parse(bodyHtml, baseUri); + } + + // builders + + /** + * Create a new HTML parser. 
This parser treats input as HTML5, and enforces the creation of a normalised document, + * based on a knowledge of the semantics of the incoming tags. + * @return a new HTML parser. + */ + public static Parser htmlParser() { + return new Parser(new HtmlTreeBuilder()); + } + + /** + * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, + * rather creates a simple tree directly from the input. + * @return a new simple XML parser. + */ + public static Parser xmlParser() { + return new Parser(new XmlTreeBuilder()); + } +} diff --git a/src/org/jsoup/parser/Tag.java b/src/org/jsoup/parser/Tag.java new file mode 100644 index 0000000000..40b7557b39 --- /dev/null +++ b/src/org/jsoup/parser/Tag.java @@ -0,0 +1,262 @@ +package org.jsoup.parser; + +import org.jsoup.helper.Validate; + +import java.util.HashMap; +import java.util.Map; + +/** + * HTML Tag capabilities. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ +public class Tag { + private static final Map<String, Tag> tags = new HashMap<String, Tag>(); // map of known tags + + private String tagName; + private boolean isBlock = true; // block or inline + private boolean formatAsBlock = true; // should be formatted as a block + private boolean canContainBlock = true; // Can this tag hold block level tags? + private boolean canContainInline = true; // only pcdata if not + private boolean empty = false; // can hold nothing; e.g. img + private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty. + private boolean preserveWhitespace = false; // for pre, textarea, script etc + + private Tag(String tagName) { + this.tagName = tagName.toLowerCase(); + } + + /** + * Get this tag's name. + * + * @return the tag's name + */ + public String getName() { + return tagName; + } + + /** + * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. 
+ * <p/> + * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals(). + * + * @param tagName Name of tag, e.g. "p". Case insensitive. + * @return The tag, either defined or new generic. + */ + public static Tag valueOf(String tagName) { + Validate.notNull(tagName); + tagName = tagName.trim().toLowerCase(); + Validate.notEmpty(tagName); + + synchronized (tags) { + Tag tag = tags.get(tagName); + if (tag == null) { + // not defined: create default; go anywhere, do anything! (incl be inside a <p>) + tag = new Tag(tagName); + tag.isBlock = false; + tag.canContainBlock = true; + } + return tag; + } + } + + /** + * Gets if this is a block tag. + * + * @return if block tag + */ + public boolean isBlock() { + return isBlock; + } + + /** + * Gets if this tag should be formatted as a block (or as inline) + * + * @return if should be formatted as block or inline + */ + public boolean formatAsBlock() { + return formatAsBlock; + } + + /** + * Gets if this tag can contain block tags. + * + * @return if tag can contain block tags + */ + public boolean canContainBlock() { + return canContainBlock; + } + + /** + * Gets if this tag is an inline tag. + * + * @return if this tag is an inline tag. + */ + public boolean isInline() { + return !isBlock; + } + + /** + * Gets if this tag is a data only tag. + * + * @return if this tag is a data only tag + */ + public boolean isData() { + return !canContainInline && !isEmpty(); + } + + /** + * Get if this is an empty tag + * + * @return if this is an empty tag + */ + public boolean isEmpty() { + return empty; + } + + /** + * Get if this tag is self closing. + * + * @return if this tag should be output as self closing. + */ + public boolean isSelfClosing() { + return empty || selfClosing; + } + + /** + * Get if this is a pre-defined tag, or was auto created on parsing. 
+ * + * @return if a known tag + */ + public boolean isKnownTag() { + return tags.containsKey(tagName); + } + + /** + * Check if this tagname is a known tag. + * + * @param tagName name of tag + * @return if known HTML tag + */ + public static boolean isKnownTag(String tagName) { + return tags.containsKey(tagName); + } + + /** + * Get if this tag should preserve whitespace within child text nodes. + * + * @return if preserve whitepace + */ + public boolean preserveWhitespace() { + return preserveWhitespace; + } + + Tag setSelfClosing() { + selfClosing = true; + return this; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Tag)) return false; + + Tag tag = (Tag) o; + + if (canContainBlock != tag.canContainBlock) return false; + if (canContainInline != tag.canContainInline) return false; + if (empty != tag.empty) return false; + if (formatAsBlock != tag.formatAsBlock) return false; + if (isBlock != tag.isBlock) return false; + if (preserveWhitespace != tag.preserveWhitespace) return false; + if (selfClosing != tag.selfClosing) return false; + if (!tagName.equals(tag.tagName)) return false; + + return true; + } + + @Override + public int hashCode() { + int result = tagName.hashCode(); + result = 31 * result + (isBlock ? 1 : 0); + result = 31 * result + (formatAsBlock ? 1 : 0); + result = 31 * result + (canContainBlock ? 1 : 0); + result = 31 * result + (canContainInline ? 1 : 0); + result = 31 * result + (empty ? 1 : 0); + result = 31 * result + (selfClosing ? 1 : 0); + result = 31 * result + (preserveWhitespace ? 
1 : 0); + return result; + } + + public String toString() { + return tagName; + } + + // internal static initialisers: + // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources + private static final String[] blockTags = { + "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", + "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", + "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", + "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", + "td", "video", "audio", "canvas", "details", "menu", "plaintext" + }; + private static final String[] inlineTags = { + "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd", + "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q", + "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup", + "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", + "summary", "command", "device" + }; + private static final String[] emptyTags = { + "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", + "device" + }; + private static final String[] formatAsInlineTags = { + "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style" + }; + private static final String[] preserveWhitespaceTags = {"pre", "plaintext", "title"}; + + static { + // creates + for (String tagName : blockTags) { + Tag tag = new Tag(tagName); + register(tag); + } + for (String tagName : inlineTags) { + Tag tag = new Tag(tagName); + tag.isBlock = false; + tag.canContainBlock = false; + tag.formatAsBlock = false; + register(tag); + } + 
+ // mods: + for (String tagName : emptyTags) { + Tag tag = tags.get(tagName); + Validate.notNull(tag); + tag.canContainBlock = false; + tag.canContainInline = false; + tag.empty = true; + } + + for (String tagName : formatAsInlineTags) { + Tag tag = tags.get(tagName); + Validate.notNull(tag); + tag.formatAsBlock = false; + } + + for (String tagName : preserveWhitespaceTags) { + Tag tag = tags.get(tagName); + Validate.notNull(tag); + tag.preserveWhitespace = true; + } + } + + private static Tag register(Tag tag) { + synchronized (tags) { + tags.put(tag.tagName, tag); + } + return tag; + } +} diff --git a/src/org/jsoup/parser/Token.java b/src/org/jsoup/parser/Token.java new file mode 100644 index 0000000000..9f4f9e250d --- /dev/null +++ b/src/org/jsoup/parser/Token.java @@ -0,0 +1,252 @@ +package org.jsoup.parser; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; + +/** + * Parse tokens for the Tokeniser. + */ +abstract class Token { + TokenType type; + + private Token() { + } + + String tokenType() { + return this.getClass().getSimpleName(); + } + + static class Doctype extends Token { + final StringBuilder name = new StringBuilder(); + final StringBuilder publicIdentifier = new StringBuilder(); + final StringBuilder systemIdentifier = new StringBuilder(); + boolean forceQuirks = false; + + Doctype() { + type = TokenType.Doctype; + } + + String getName() { + return name.toString(); + } + + String getPublicIdentifier() { + return publicIdentifier.toString(); + } + + public String getSystemIdentifier() { + return systemIdentifier.toString(); + } + + public boolean isForceQuirks() { + return forceQuirks; + } + } + + static abstract class Tag extends Token { + protected String tagName; + private String pendingAttributeName; + private String pendingAttributeValue; + + boolean selfClosing = false; + Attributes attributes = new Attributes(); // todo: allow nodes to not have attributes + + void newAttribute() { + if 
(pendingAttributeName != null) { + if (pendingAttributeValue == null) + pendingAttributeValue = ""; + Attribute attribute = new Attribute(pendingAttributeName, pendingAttributeValue); + attributes.put(attribute); + } + pendingAttributeName = null; + pendingAttributeValue = null; + } + + void finaliseTag() { + // finalises for emit + if (pendingAttributeName != null) { + // todo: check if attribute name exists; if so, drop and error + newAttribute(); + } + } + + String name() { + Validate.isFalse(tagName.length() == 0); + return tagName; + } + + Tag name(String name) { + tagName = name; + return this; + } + + boolean isSelfClosing() { + return selfClosing; + } + + @SuppressWarnings({"TypeMayBeWeakened"}) + Attributes getAttributes() { + return attributes; + } + + // these appenders are rarely hit in not null state-- caused by null chars. + void appendTagName(String append) { + tagName = tagName == null ? append : tagName.concat(append); + } + + void appendTagName(char append) { + appendTagName(String.valueOf(append)); + } + + void appendAttributeName(String append) { + pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append); + } + + void appendAttributeName(char append) { + appendAttributeName(String.valueOf(append)); + } + + void appendAttributeValue(String append) { + pendingAttributeValue = pendingAttributeValue == null ? 
append : pendingAttributeValue.concat(append); + } + + void appendAttributeValue(char append) { + appendAttributeValue(String.valueOf(append)); + } + } + + static class StartTag extends Tag { + StartTag() { + super(); + type = TokenType.StartTag; + } + + StartTag(String name) { + this(); + this.tagName = name; + } + + StartTag(String name, Attributes attributes) { + this(); + this.tagName = name; + this.attributes = attributes; + } + + @Override + public String toString() { + return "<" + name() + " " + attributes.toString() + ">"; + } + } + + static class EndTag extends Tag{ + EndTag() { + super(); + type = TokenType.EndTag; + } + + EndTag(String name) { + this(); + this.tagName = name; + } + + @Override + public String toString() { + return "</" + name() + " " + attributes.toString() + ">"; + } + } + + static class Comment extends Token { + final StringBuilder data = new StringBuilder(); + + Comment() { + type = TokenType.Comment; + } + + String getData() { + return data.toString(); + } + + @Override + public String toString() { + return "<!--" + getData() + "-->"; + } + } + + static class Character extends Token { + private final String data; + + Character(String data) { + type = TokenType.Character; + this.data = data; + } + + String getData() { + return data; + } + + @Override + public String toString() { + return getData(); + } + } + + static class EOF extends Token { + EOF() { + type = Token.TokenType.EOF; + } + } + + boolean isDoctype() { + return type == TokenType.Doctype; + } + + Doctype asDoctype() { + return (Doctype) this; + } + + boolean isStartTag() { + return type == TokenType.StartTag; + } + + StartTag asStartTag() { + return (StartTag) this; + } + + boolean isEndTag() { + return type == TokenType.EndTag; + } + + EndTag asEndTag() { + return (EndTag) this; + } + + boolean isComment() { + return type == TokenType.Comment; + } + + Comment asComment() { + return (Comment) this; + } + + boolean isCharacter() { + return type == TokenType.Character; + } 
+ + Character asCharacter() { + return (Character) this; + } + + boolean isEOF() { + return type == TokenType.EOF; + } + + enum TokenType { + Doctype, + StartTag, + EndTag, + Comment, + Character, + EOF + } +} diff --git a/src/org/jsoup/parser/TokenQueue.java b/src/org/jsoup/parser/TokenQueue.java new file mode 100644 index 0000000000..a2fdfe621a --- /dev/null +++ b/src/org/jsoup/parser/TokenQueue.java @@ -0,0 +1,393 @@ +package org.jsoup.parser; + +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; + +/** + * A character queue with parsing helpers. + * + * @author Jonathan Hedley + */ +public class TokenQueue { + private String queue; + private int pos = 0; + + private static final char ESC = '\\'; // escape char for chomp balanced. + + /** + Create a new TokenQueue. + @param data string of data to back queue. + */ + public TokenQueue(String data) { + Validate.notNull(data); + queue = data; + } + + /** + * Is the queue empty? + * @return true if no data left in queue. + */ + public boolean isEmpty() { + return remainingLength() == 0; + } + + private int remainingLength() { + return queue.length() - pos; + } + + /** + * Retrieves but does not remove the first character from the queue. + * @return First character, or 0 if empty. + */ + public char peek() { + return isEmpty() ? 0 : queue.charAt(pos); + } + + /** + Add a character to the start of the queue (will be the next character retrieved). + @param c character to add + */ + public void addFirst(Character c) { + addFirst(c.toString()); + } + + /** + Add a string to the start of the queue. + @param seq string to add. + */ + public void addFirst(String seq) { + // not very performant, but an edge case + queue = seq + queue.substring(pos); + pos = 0; + } + + /** + * Tests if the next characters on the queue match the sequence. Case insensitive. + * @param seq String to check queue for. + * @return true if the next characters match. 
+ */ + public boolean matches(String seq) { + return queue.regionMatches(true, pos, seq, 0, seq.length()); + } + + /** + * Case sensitive match test. + * @param seq string to case sensitively check for + * @return true if matched, false if not + */ + public boolean matchesCS(String seq) { + return queue.startsWith(seq, pos); + } + + + /** + Tests if the next characters match any of the sequences. Case insensitive. + @param seq list of strings to case insensitively check for + @return true of any matched, false if none did + */ + public boolean matchesAny(String... seq) { + for (String s : seq) { + if (matches(s)) + return true; + } + return false; + } + + public boolean matchesAny(char... seq) { + if (isEmpty()) + return false; + + for (char c: seq) { + if (queue.charAt(pos) == c) + return true; + } + return false; + } + + public boolean matchesStartTag() { + // micro opt for matching "<x" + return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1))); + } + + /** + * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the + * queue. + * @param seq String to search for, and if found, remove from queue. + * @return true if found and removed, false if not found. + */ + public boolean matchChomp(String seq) { + if (matches(seq)) { + pos += seq.length(); + return true; + } else { + return false; + } + } + + /** + Tests if queue starts with a whitespace character. + @return if starts with whitespace + */ + public boolean matchesWhitespace() { + return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos)); + } + + /** + Test if the queue matches a word character (letter or digit). + @return if matches a word character + */ + public boolean matchesWord() { + return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); + } + + /** + * Drops the next character off the queue. 
+ */ + public void advance() { + if (!isEmpty()) pos++; + } + + /** + * Consume one character off queue. + * @return first character on queue. + */ + public char consume() { + return queue.charAt(pos++); + } + + /** + * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will + * throw an illegal state exception -- but you should be running match() against that condition. + <p> + Case insensitive. + * @param seq sequence to remove from head of queue. + */ + public void consume(String seq) { + if (!matches(seq)) + throw new IllegalStateException("Queue did not match expected sequence"); + int len = seq.length(); + if (len > remainingLength()) + throw new IllegalStateException("Queue not long enough to consume sequence"); + + pos += len; + } + + /** + * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. + * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b> + * @return The matched data consumed from queue. 
+ */ + public String consumeTo(String seq) { + int offset = queue.indexOf(seq, pos); + if (offset != -1) { + String consumed = queue.substring(pos, offset); + pos += consumed.length(); + return consumed; + } else { + return remainder(); + } + } + + public String consumeToIgnoreCase(String seq) { + int start = pos; + String first = seq.substring(0, 1); + boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of + while (!isEmpty()) { + if (matches(seq)) + break; + + if (canScan) { + int skip = queue.indexOf(first, pos) - pos; + if (skip == 0) // this char is the skip char, but not match, so force advance of pos + pos++; + else if (skip < 0) // no chance of finding, grab to end + pos = queue.length(); + else + pos += skip; + } + else + pos++; + } + + String data = queue.substring(start, pos); + return data; + } + + /** + Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. + @param seq any number of terminators to consume to. <b>Case insensitive.</b> + @return consumed string + */ + // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this + // is is a case sensitive time... + public String consumeToAny(String... seq) { + int start = pos; + while (!isEmpty() && !matchesAny(seq)) { + pos++; + } + + String data = queue.substring(start, pos); + return data; + } + + /** + * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). + * <p> + * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go + * isEmpty() == true). + * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b> + * @return Data matched from queue. 
+ */ + public String chompTo(String seq) { + String data = consumeTo(seq); + matchChomp(seq); + return data; + } + + public String chompToIgnoreCase(String seq) { + String data = consumeToIgnoreCase(seq); // case insensitive scan + matchChomp(seq); + return data; + } + + /** + * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", + * and leave " four" on the queue. Unbalanced openers and closers can be escaped (with \). Those escapes will be left + * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for + * contains text strings; use unescape for that. + * @param open opener + * @param close closer + * @return data matched from the queue + */ + public String chompBalanced(char open, char close) { + StringBuilder accum = new StringBuilder(); + int depth = 0; + char last = 0; + + do { + if (isEmpty()) break; + Character c = consume(); + if (last == 0 || last != ESC) { + if (c.equals(open)) + depth++; + else if (c.equals(close)) + depth--; + } + + if (depth > 0 && last != 0) + accum.append(c); // don't include the outer match pair in the return + last = c; + } while (depth > 0); + return accum.toString(); + } + + /** + * Unescaped a \ escaped string. + * @param in backslash escaped string + * @return unescaped string + */ + public static String unescape(String in) { + StringBuilder out = new StringBuilder(); + char last = 0; + for (char c : in.toCharArray()) { + if (c == ESC) { + if (last != 0 && last == ESC) + out.append(c); + } + else + out.append(c); + last = c; + } + return out.toString(); + } + + /** + * Pulls the next run of whitespace characters of the queue. + */ + public boolean consumeWhitespace() { + boolean seen = false; + while (matchesWhitespace()) { + pos++; + seen = true; + } + return seen; + } + + /** + * Retrieves the next run of word type (letter or digit) off the queue. 
+ * @return String of word characters from queue, or empty string if none. + */ + public String consumeWord() { + int start = pos; + while (matchesWord()) + pos++; + return queue.substring(start, pos); + } + + /** + * Consume an tag name off the queue (word or :, _, -) + * + * @return tag name + */ + public String consumeTagName() { + int start = pos; + while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) + pos++; + + return queue.substring(start, pos); + } + + /** + * Consume a CSS element selector (tag name, but | instead of : for namespaces, to not conflict with :pseudo selects). + * + * @return tag name + */ + public String consumeElementSelector() { + int start = pos; + while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) + pos++; + + return queue.substring(start, pos); + } + + /** + Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _) + http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier + @return identifier + */ + public String consumeCssIdentifier() { + int start = pos; + while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) + pos++; + + return queue.substring(start, pos); + } + + /** + Consume an attribute key off the queue (letter, digit, -, _, :") + @return attribute key + */ + public String consumeAttributeKey() { + int start = pos; + while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) + pos++; + + return queue.substring(start, pos); + } + + /** + Consume and return whatever is left on the queue. + @return remained of queue. 
+ */ + public String remainder() { + StringBuilder accum = new StringBuilder(); + while (!isEmpty()) { + accum.append(consume()); + } + return accum.toString(); + } + + public String toString() { + return queue.substring(pos); + } +} diff --git a/src/org/jsoup/parser/Tokeniser.java b/src/org/jsoup/parser/Tokeniser.java new file mode 100644 index 0000000000..ce6ee690d6 --- /dev/null +++ b/src/org/jsoup/parser/Tokeniser.java @@ -0,0 +1,230 @@ +package org.jsoup.parser; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Entities; + +import java.util.ArrayList; +import java.util.List; + +/** + * Readers the input stream into tokens. + */ +class Tokeniser { + static final char replacementChar = '\uFFFD'; // replaces null character + + private CharacterReader reader; // html input + private ParseErrorList errors; // errors found while tokenising + + private TokeniserState state = TokeniserState.Data; // current tokenisation state + private Token emitPending; // the token we are about to emit on next read + private boolean isEmitPending = false; + private StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token + StringBuilder dataBuffer; // buffers data looking for </script> + + Token.Tag tagPending; // tag we are building up + Token.Doctype doctypePending; // doctype building up + Token.Comment commentPending; // comment building up + private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag + private boolean selfClosingFlagAcknowledged = true; + + Tokeniser(CharacterReader reader, ParseErrorList errors) { + this.reader = reader; + this.errors = errors; + } + + Token read() { + if (!selfClosingFlagAcknowledged) { + error("Self closing flag not acknowledged"); + selfClosingFlagAcknowledged = true; + } + + while (!isEmitPending) + state.read(this, reader); + + // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read: + if 
(charBuffer.length() > 0) { + String str = charBuffer.toString(); + charBuffer.delete(0, charBuffer.length()); + return new Token.Character(str); + } else { + isEmitPending = false; + return emitPending; + } + } + + void emit(Token token) { + Validate.isFalse(isEmitPending, "There is an unread token pending!"); + + emitPending = token; + isEmitPending = true; + + if (token.type == Token.TokenType.StartTag) { + Token.StartTag startTag = (Token.StartTag) token; + lastStartTag = startTag; + if (startTag.selfClosing) + selfClosingFlagAcknowledged = false; + } else if (token.type == Token.TokenType.EndTag) { + Token.EndTag endTag = (Token.EndTag) token; + if (endTag.attributes.size() > 0) + error("Attributes incorrectly present on end tag"); + } + } + + void emit(String str) { + // buffer strings up until last string token found, to emit only one token for a run of character refs etc. + // does not set isEmitPending; read checks that + charBuffer.append(str); + } + + void emit(char c) { + charBuffer.append(c); + } + + TokeniserState getState() { + return state; + } + + void transition(TokeniserState state) { + this.state = state; + } + + void advanceTransition(TokeniserState state) { + reader.advance(); + this.state = state; + } + + void acknowledgeSelfClosingFlag() { + selfClosingFlagAcknowledged = true; + } + + Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { + if (reader.isEmpty()) + return null; + if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) + return null; + if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) + return null; + + reader.mark(); + if (reader.matchConsume("#")) { // numbered + boolean isHexMode = reader.matchConsumeIgnoreCase("X"); + String numRef = isHexMode ? 
reader.consumeHexSequence() : reader.consumeDigitSequence(); + if (numRef.length() == 0) { // didn't match anything + characterReferenceError("numeric reference with no numerals"); + reader.rewindToMark(); + return null; + } + if (!reader.matchConsume(";")) + characterReferenceError("missing semicolon"); // missing semi + int charval = -1; + try { + int base = isHexMode ? 16 : 10; + charval = Integer.valueOf(numRef, base); + } catch (NumberFormatException e) { + } // skip + if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { + characterReferenceError("character outside of valid range"); + return replacementChar; + } else { + // todo: implement number replacement table + // todo: check for extra illegal unicode points as parse errors + return (char) charval; + } + } else { // named + // get as many letters as possible, and look for matching entities. unconsume backwards till a match is found + String nameRef = reader.consumeLetterThenDigitSequence(); + String origNameRef = new String(nameRef); // for error reporting. nameRef gets chomped looking for matches + boolean looksLegit = reader.matches(';'); + boolean found = false; + while (nameRef.length() > 0 && !found) { + if (Entities.isNamedEntity(nameRef)) + found = true; + else { + nameRef = nameRef.substring(0, nameRef.length()-1); + reader.unconsume(); + } + } + if (!found) { + if (looksLegit) // named with semicolon + characterReferenceError(String.format("invalid named referenece '%s'", origNameRef)); + reader.rewindToMark(); + return null; + } + if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { + // don't want that to match + reader.rewindToMark(); + return null; + } + if (!reader.matchConsume(";")) + characterReferenceError("missing semicolon"); // missing semi + return Entities.getCharacterByName(nameRef); + } + } + + Token.Tag createTagPending(boolean start) { + tagPending = start ? 
new Token.StartTag() : new Token.EndTag(); + return tagPending; + } + + void emitTagPending() { + tagPending.finaliseTag(); + emit(tagPending); + } + + void createCommentPending() { + commentPending = new Token.Comment(); + } + + void emitCommentPending() { + emit(commentPending); + } + + void createDoctypePending() { + doctypePending = new Token.Doctype(); + } + + void emitDoctypePending() { + emit(doctypePending); + } + + void createTempBuffer() { + dataBuffer = new StringBuilder(); + } + + boolean isAppropriateEndTagToken() { + if (lastStartTag == null) + return false; + return tagPending.tagName.equals(lastStartTag.tagName); + } + + String appropriateEndTagName() { + return lastStartTag.tagName; + } + + void error(TokeniserState state) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), state)); + } + + void eofError(TokeniserState state) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state)); + } + + private void characterReferenceError(String message) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message)); + } + + private void error(String errorMsg) { + if (errors.canAddError()) + errors.add(new ParseError(reader.pos(), errorMsg)); + } + + boolean currentNodeInHtmlNS() { + // todo: implement namespaces correctly + return true; + // Element currentNode = currentNode(); + // return currentNode != null && currentNode.namespace().equals("HTML"); + } +} diff --git a/src/org/jsoup/parser/TokeniserState.java b/src/org/jsoup/parser/TokeniserState.java new file mode 100644 index 0000000000..e3013c73e9 --- /dev/null +++ b/src/org/jsoup/parser/TokeniserState.java @@ -0,0 +1,1778 @@ +package org.jsoup.parser; + +/** + * States and transition activations for the Tokeniser. 
+ */ +enum TokeniserState { + Data { + // in data state, gather characters until a character reference or tag is found + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case '&': + t.advanceTransition(CharacterReferenceInData); + break; + case '<': + t.advanceTransition(TagOpen); + break; + case nullChar: + t.error(this); // NOT replacement character (oddly?) + t.emit(r.consume()); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('&', '<', nullChar); + t.emit(data); + break; + } + } + }, + CharacterReferenceInData { + // from & in data + void read(Tokeniser t, CharacterReader r) { + Character c = t.consumeCharacterReference(null, false); + if (c == null) + t.emit('&'); + else + t.emit(c); + t.transition(Data); + } + }, + Rcdata { + /// handles data in title, textarea etc + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case '&': + t.advanceTransition(CharacterReferenceInRcdata); + break; + case '<': + t.advanceTransition(RcdataLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('&', '<', nullChar); + t.emit(data); + break; + } + } + }, + CharacterReferenceInRcdata { + void read(Tokeniser t, CharacterReader r) { + Character c = t.consumeCharacterReference(null, false); + if (c == null) + t.emit('&'); + else + t.emit(c); + t.transition(Rcdata); + } + }, + Rawtext { + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case '<': + t.advanceTransition(RawtextLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('<', nullChar); + t.emit(data); + break; + } + } + }, + ScriptData { + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case '<': + 
t.advanceTransition(ScriptDataLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('<', nullChar); + t.emit(data); + break; + } + } + }, + PLAINTEXT { + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeTo(nullChar); + t.emit(data); + break; + } + } + }, + TagOpen { + // from < in data + void read(Tokeniser t, CharacterReader r) { + switch (r.current()) { + case '!': + t.advanceTransition(MarkupDeclarationOpen); + break; + case '/': + t.advanceTransition(EndTagOpen); + break; + case '?': + t.advanceTransition(BogusComment); + break; + default: + if (r.matchesLetter()) { + t.createTagPending(true); + t.transition(TagName); + } else { + t.error(this); + t.emit('<'); // char that got us here + t.transition(Data); + } + break; + } + } + }, + EndTagOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.isEmpty()) { + t.eofError(this); + t.emit("</"); + t.transition(Data); + } else if (r.matchesLetter()) { + t.createTagPending(false); + t.transition(TagName); + } else if (r.matches('>')) { + t.error(this); + t.advanceTransition(Data); + } else { + t.error(this); + t.advanceTransition(BogusComment); + } + } + }, + TagName { + // from < or </ in data, will have start or end tag pending + void read(Tokeniser t, CharacterReader r) { + // previous TagOpen state did NOT consume, will have a letter char in current + String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', nullChar).toLowerCase(); + t.tagPending.appendTagName(tagName); + + switch (r.consume()) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + 
t.emitTagPending(); + t.transition(Data); + break; + case nullChar: // replacement + t.tagPending.appendTagName(replacementStr); + break; + case eof: // should emit pending tag? + t.eofError(this); + t.transition(Data); + // no default, as covered with above consumeToAny + } + } + }, + RcdataLessthanSign { + // from < in rcdata + void read(Tokeniser t, CharacterReader r) { + if (r.matches('/')) { + t.createTempBuffer(); + t.advanceTransition(RCDATAEndTagOpen); + } else if (r.matchesLetter() && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) { + // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than + // consuming to EOF; break out here + t.tagPending = new Token.EndTag(t.appropriateEndTagName()); + t.emitTagPending(); + r.unconsume(); // undo "<" + t.transition(Data); + } else { + t.emit("<"); + t.transition(Rcdata); + } + } + }, + RCDATAEndTagOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createTagPending(false); + t.tagPending.appendTagName(Character.toLowerCase(r.current())); + t.dataBuffer.append(Character.toLowerCase(r.current())); + t.advanceTransition(RCDATAEndTagName); + } else { + t.emit("</"); + t.transition(Rcdata); + } + } + }, + RCDATAEndTagName { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.tagPending.appendTagName(name.toLowerCase()); + t.dataBuffer.append(name); + return; + } + + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + if (t.isAppropriateEndTagToken()) + t.transition(BeforeAttributeName); + else + anythingElse(t, r); + break; + case '/': + if (t.isAppropriateEndTagToken()) + t.transition(SelfClosingStartTag); + else + anythingElse(t, r); + break; + case '>': + if (t.isAppropriateEndTagToken()) { + t.emitTagPending(); + t.transition(Data); + } + else + anythingElse(t, r); + break; + default: + anythingElse(t, r); + } + } + + private void 
anythingElse(Tokeniser t, CharacterReader r) { + t.emit("</" + t.dataBuffer.toString()); + t.transition(Rcdata); + } + }, + RawtextLessthanSign { + void read(Tokeniser t, CharacterReader r) { + if (r.matches('/')) { + t.createTempBuffer(); + t.advanceTransition(RawtextEndTagOpen); + } else { + t.emit('<'); + t.transition(Rawtext); + } + } + }, + RawtextEndTagOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createTagPending(false); + t.transition(RawtextEndTagName); + } else { + t.emit("</"); + t.transition(Rawtext); + } + } + }, + RawtextEndTagName { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.tagPending.appendTagName(name.toLowerCase()); + t.dataBuffer.append(name); + return; + } + + if (t.isAppropriateEndTagToken() && !r.isEmpty()) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + default: + t.dataBuffer.append(c); + anythingElse(t, r); + } + } else + anythingElse(t, r); + } + + private void anythingElse(Tokeniser t, CharacterReader r) { + t.emit("</" + t.dataBuffer.toString()); + t.transition(Rawtext); + } + }, + ScriptDataLessthanSign { + void read(Tokeniser t, CharacterReader r) { + switch (r.consume()) { + case '/': + t.createTempBuffer(); + t.transition(ScriptDataEndTagOpen); + break; + case '!': + t.emit("<!"); + t.transition(ScriptDataEscapeStart); + break; + default: + t.emit("<"); + r.unconsume(); + t.transition(ScriptData); + } + } + }, + ScriptDataEndTagOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createTagPending(false); + t.transition(ScriptDataEndTagName); + } else { + t.emit("</"); + t.transition(ScriptData); + } + + } + }, + ScriptDataEndTagName { + void read(Tokeniser t, CharacterReader r) 
{ + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.tagPending.appendTagName(name.toLowerCase()); + t.dataBuffer.append(name); + return; + } + + if (t.isAppropriateEndTagToken() && !r.isEmpty()) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + default: + t.dataBuffer.append(c); + anythingElse(t, r); + } + } else { + anythingElse(t, r); + } + } + + private void anythingElse(Tokeniser t, CharacterReader r) { + t.emit("</" + t.dataBuffer.toString()); + t.transition(ScriptData); + } + }, + ScriptDataEscapeStart { + void read(Tokeniser t, CharacterReader r) { + if (r.matches('-')) { + t.emit('-'); + t.advanceTransition(ScriptDataEscapeStartDash); + } else { + t.transition(ScriptData); + } + } + }, + ScriptDataEscapeStartDash { + void read(Tokeniser t, CharacterReader r) { + if (r.matches('-')) { + t.emit('-'); + t.advanceTransition(ScriptDataEscapedDashDash); + } else { + t.transition(ScriptData); + } + } + }, + ScriptDataEscaped { + void read(Tokeniser t, CharacterReader r) { + if (r.isEmpty()) { + t.eofError(this); + t.transition(Data); + return; + } + + switch (r.current()) { + case '-': + t.emit('-'); + t.advanceTransition(ScriptDataEscapedDash); + break; + case '<': + t.advanceTransition(ScriptDataEscapedLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + default: + String data = r.consumeToAny('-', '<', nullChar); + t.emit(data); + } + } + }, + ScriptDataEscapedDash { + void read(Tokeniser t, CharacterReader r) { + if (r.isEmpty()) { + t.eofError(this); + t.transition(Data); + return; + } + + char c = r.consume(); + switch (c) { + case '-': + t.emit(c); + t.transition(ScriptDataEscapedDashDash); + break; + case '<': + t.transition(ScriptDataEscapedLessthanSign); + break; + 
case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataEscaped); + break; + default: + t.emit(c); + t.transition(ScriptDataEscaped); + } + } + }, + ScriptDataEscapedDashDash { + void read(Tokeniser t, CharacterReader r) { + if (r.isEmpty()) { + t.eofError(this); + t.transition(Data); + return; + } + + char c = r.consume(); + switch (c) { + case '-': + t.emit(c); + break; + case '<': + t.transition(ScriptDataEscapedLessthanSign); + break; + case '>': + t.emit(c); + t.transition(ScriptData); + break; + case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataEscaped); + break; + default: + t.emit(c); + t.transition(ScriptDataEscaped); + } + } + }, + ScriptDataEscapedLessthanSign { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createTempBuffer(); + t.dataBuffer.append(Character.toLowerCase(r.current())); + t.emit("<" + r.current()); + t.advanceTransition(ScriptDataDoubleEscapeStart); + } else if (r.matches('/')) { + t.createTempBuffer(); + t.advanceTransition(ScriptDataEscapedEndTagOpen); + } else { + t.emit('<'); + t.transition(ScriptDataEscaped); + } + } + }, + ScriptDataEscapedEndTagOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createTagPending(false); + t.tagPending.appendTagName(Character.toLowerCase(r.current())); + t.dataBuffer.append(r.current()); + t.advanceTransition(ScriptDataEscapedEndTagName); + } else { + t.emit("</"); + t.transition(ScriptDataEscaped); + } + } + }, + ScriptDataEscapedEndTagName { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.tagPending.appendTagName(name.toLowerCase()); + t.dataBuffer.append(name); + return; + } + + if (t.isAppropriateEndTagToken() && !r.isEmpty()) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + 
t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + default: + t.dataBuffer.append(c); + anythingElse(t, r); + break; + } + } else { + anythingElse(t, r); + } + } + + private void anythingElse(Tokeniser t, CharacterReader r) { + t.emit("</" + t.dataBuffer.toString()); + t.transition(ScriptDataEscaped); + } + }, + ScriptDataDoubleEscapeStart { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.dataBuffer.append(name.toLowerCase()); + t.emit(name); + return; + } + + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + case '/': + case '>': + if (t.dataBuffer.toString().equals("script")) + t.transition(ScriptDataDoubleEscaped); + else + t.transition(ScriptDataEscaped); + t.emit(c); + break; + default: + r.unconsume(); + t.transition(ScriptDataEscaped); + } + } + }, + ScriptDataDoubleEscaped { + void read(Tokeniser t, CharacterReader r) { + char c = r.current(); + switch (c) { + case '-': + t.emit(c); + t.advanceTransition(ScriptDataDoubleEscapedDash); + break; + case '<': + t.emit(c); + t.advanceTransition(ScriptDataDoubleEscapedLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + String data = r.consumeToAny('-', '<', nullChar); + t.emit(data); + } + } + }, + ScriptDataDoubleEscapedDash { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.emit(c); + t.transition(ScriptDataDoubleEscapedDashDash); + break; + case '<': + t.emit(c); + t.transition(ScriptDataDoubleEscapedLessthanSign); + break; + case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataDoubleEscaped); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.emit(c); + t.transition(ScriptDataDoubleEscaped); + 
} + } + }, + ScriptDataDoubleEscapedDashDash { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.emit(c); + break; + case '<': + t.emit(c); + t.transition(ScriptDataDoubleEscapedLessthanSign); + break; + case '>': + t.emit(c); + t.transition(ScriptData); + break; + case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataDoubleEscaped); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.emit(c); + t.transition(ScriptDataDoubleEscaped); + } + } + }, + ScriptDataDoubleEscapedLessthanSign { + void read(Tokeniser t, CharacterReader r) { + if (r.matches('/')) { + t.emit('/'); + t.createTempBuffer(); + t.advanceTransition(ScriptDataDoubleEscapeEnd); + } else { + t.transition(ScriptDataDoubleEscaped); + } + } + }, + ScriptDataDoubleEscapeEnd { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.dataBuffer.append(name.toLowerCase()); + t.emit(name); + return; + } + + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + case '/': + case '>': + if (t.dataBuffer.toString().equals("script")) + t.transition(ScriptDataEscaped); + else + t.transition(ScriptDataDoubleEscaped); + t.emit(c); + break; + default: + r.unconsume(); + t.transition(ScriptDataDoubleEscaped); + } + } + }, + BeforeAttributeName { + // from tagname <xxx + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + break; // ignore whitespace + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.newAttribute(); + r.unconsume(); + t.transition(AttributeName); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + case '=': + t.error(this); + 
t.tagPending.newAttribute(); + t.tagPending.appendAttributeName(c); + t.transition(AttributeName); + break; + default: // A-Z, anything else + t.tagPending.newAttribute(); + r.unconsume(); + t.transition(AttributeName); + } + } + }, + AttributeName { + // from before attribute name + void read(Tokeniser t, CharacterReader r) { + String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<'); + t.tagPending.appendAttributeName(name.toLowerCase()); + + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(AfterAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '=': + t.transition(BeforeAttributeValue); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeName(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + t.error(this); + t.tagPending.appendAttributeName(c); + // no default, as covered in consumeToAny + } + } + }, + AfterAttributeName { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + // ignore + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '=': + t.transition(BeforeAttributeValue); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeName(replacementChar); + t.transition(AttributeName); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + t.error(this); + t.tagPending.newAttribute(); + t.tagPending.appendAttributeName(c); + t.transition(AttributeName); + break; + default: // A-Z, anything else + t.tagPending.newAttribute(); + r.unconsume(); + t.transition(AttributeName); + } + } + }, + BeforeAttributeValue { + void read(Tokeniser t, 
CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + // ignore + break; + case '"': + t.transition(AttributeValue_doubleQuoted); + break; + case '&': + r.unconsume(); + t.transition(AttributeValue_unquoted); + break; + case '\'': + t.transition(AttributeValue_singleQuoted); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + t.transition(AttributeValue_unquoted); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '>': + t.error(this); + t.emitTagPending(); + t.transition(Data); + break; + case '<': + case '=': + case '`': + t.error(this); + t.tagPending.appendAttributeValue(c); + t.transition(AttributeValue_unquoted); + break; + default: + r.unconsume(); + t.transition(AttributeValue_unquoted); + } + } + }, + AttributeValue_doubleQuoted { + void read(Tokeniser t, CharacterReader r) { + String value = r.consumeToAny('"', '&', nullChar); + if (value.length() > 0) + t.tagPending.appendAttributeValue(value); + + char c = r.consume(); + switch (c) { + case '"': + t.transition(AfterAttributeValue_quoted); + break; + case '&': + Character ref = t.consumeCharacterReference('"', true); + if (ref != null) + t.tagPending.appendAttributeValue(ref); + else + t.tagPending.appendAttributeValue('&'); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + // no default, handled in consume to any above + } + } + }, + AttributeValue_singleQuoted { + void read(Tokeniser t, CharacterReader r) { + String value = r.consumeToAny('\'', '&', nullChar); + if (value.length() > 0) + t.tagPending.appendAttributeValue(value); + + char c = r.consume(); + switch (c) { + case '\'': + t.transition(AfterAttributeValue_quoted); + break; + case '&': + Character ref = t.consumeCharacterReference('\'', true); + if (ref != null) + 
t.tagPending.appendAttributeValue(ref); + else + t.tagPending.appendAttributeValue('&'); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + // no default, handled in consume to any above + } + } + }, + AttributeValue_unquoted { + void read(Tokeniser t, CharacterReader r) { + String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`'); + if (value.length() > 0) + t.tagPending.appendAttributeValue(value); + + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '&': + Character ref = t.consumeCharacterReference('>', true); + if (ref != null) + t.tagPending.appendAttributeValue(ref); + else + t.tagPending.appendAttributeValue('&'); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + case '=': + case '`': + t.error(this); + t.tagPending.appendAttributeValue(c); + break; + // no default, handled in consume to any above + } + + } + }, + // CharacterReferenceInAttributeValue state handled inline + AfterAttributeValue_quoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.error(this); + r.unconsume(); + t.transition(BeforeAttributeName); + } + + } + }, + SelfClosingStartTag { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '>': + 
t.tagPending.selfClosing = true; + t.emitTagPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.error(this); + t.transition(BeforeAttributeName); + } + } + }, + BogusComment { + void read(Tokeniser t, CharacterReader r) { + // todo: handle bogus comment starting from eof. when does that trigger? + // rewind to capture character that lead us here + r.unconsume(); + Token.Comment comment = new Token.Comment(); + comment.data.append(r.consumeTo('>')); + // todo: replace nullChar with replaceChar + t.emit(comment); + t.advanceTransition(Data); + } + }, + MarkupDeclarationOpen { + void read(Tokeniser t, CharacterReader r) { + if (r.matchConsume("--")) { + t.createCommentPending(); + t.transition(CommentStart); + } else if (r.matchConsumeIgnoreCase("DOCTYPE")) { + t.transition(Doctype); + } else if (r.matchConsume("[CDATA[")) { + // todo: should actually check current namepspace, and only non-html allows cdata. until namespace + // is implemented properly, keep handling as cdata + //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) { + t.transition(CdataSection); + } else { + t.error(this); + t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind + } + } + }, + CommentStart { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.transition(CommentStartDash); + break; + case nullChar: + t.error(this); + t.commentPending.data.append(replacementChar); + t.transition(Comment); + break; + case '>': + t.error(this); + t.emitCommentPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(c); + t.transition(Comment); + } + } + }, + CommentStartDash { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.transition(CommentStartDash); + break; + 
case nullChar: + t.error(this); + t.commentPending.data.append(replacementChar); + t.transition(Comment); + break; + case '>': + t.error(this); + t.emitCommentPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(c); + t.transition(Comment); + } + } + }, + Comment { + void read(Tokeniser t, CharacterReader r) { + char c = r.current(); + switch (c) { + case '-': + t.advanceTransition(CommentEndDash); + break; + case nullChar: + t.error(this); + r.advance(); + t.commentPending.data.append(replacementChar); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(r.consumeToAny('-', nullChar)); + } + } + }, + CommentEndDash { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '-': + t.transition(CommentEnd); + break; + case nullChar: + t.error(this); + t.commentPending.data.append('-').append(replacementChar); + t.transition(Comment); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append('-').append(c); + t.transition(Comment); + } + } + }, + CommentEnd { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '>': + t.emitCommentPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.commentPending.data.append("--").append(replacementChar); + t.transition(Comment); + break; + case '!': + t.error(this); + t.transition(CommentEndBang); + break; + case '-': + t.error(this); + t.commentPending.data.append('-'); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.error(this); + t.commentPending.data.append("--").append(c); + t.transition(Comment); + } + } + }, + CommentEndBang { + void read(Tokeniser t, CharacterReader r) { + char 
c = r.consume(); + switch (c) { + case '-': + t.commentPending.data.append("--!"); + t.transition(CommentEndDash); + break; + case '>': + t.emitCommentPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.commentPending.data.append("--!").append(replacementChar); + t.transition(Comment); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append("--!").append(c); + t.transition(Comment); + } + } + }, + Doctype { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeDoctypeName); + break; + case eof: + t.eofError(this); + t.createDoctypePending(); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.transition(BeforeDoctypeName); + } + } + }, + BeforeDoctypeName { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + t.createDoctypePending(); + t.transition(DoctypeName); + return; + } + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + break; // ignore whitespace + case nullChar: + t.error(this); + t.doctypePending.name.append(replacementChar); + t.transition(DoctypeName); + break; + case eof: + t.eofError(this); + t.createDoctypePending(); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.createDoctypePending(); + t.doctypePending.name.append(c); + t.transition(DoctypeName); + } + } + }, + DoctypeName { + void read(Tokeniser t, CharacterReader r) { + if (r.matchesLetter()) { + String name = r.consumeLetterSequence(); + t.doctypePending.name.append(name.toLowerCase()); + return; + } + char c = r.consume(); + switch (c) { + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '\t': + case '\n': + case '\f': + case ' ': + 
t.transition(AfterDoctypeName); + break; + case nullChar: + t.error(this); + t.doctypePending.name.append(replacementChar); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.name.append(c); + } + } + }, + AfterDoctypeName { + void read(Tokeniser t, CharacterReader r) { + if (r.isEmpty()) { + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + return; + } + if (r.matchesAny('\t', '\n', '\f', ' ')) + r.advance(); // ignore whitespace + else if (r.matches('>')) { + t.emitDoctypePending(); + t.advanceTransition(Data); + } else if (r.matchConsumeIgnoreCase("PUBLIC")) { + t.transition(AfterDoctypePublicKeyword); + } else if (r.matchConsumeIgnoreCase("SYSTEM")) { + t.transition(AfterDoctypeSystemKeyword); + } else { + t.error(this); + t.doctypePending.forceQuirks = true; + t.advanceTransition(BogusDoctype); + } + + } + }, + AfterDoctypePublicKeyword { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeDoctypePublicIdentifier); + break; + case '"': + t.error(this); + // set public id to empty string + t.transition(DoctypePublicIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // set public id to empty string + t.transition(DoctypePublicIdentifier_singleQuoted); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + BeforeDoctypePublicIdentifier { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + 
case ' ': + break; + case '"': + // set public id to empty string + t.transition(DoctypePublicIdentifier_doubleQuoted); + break; + case '\'': + // set public id to empty string + t.transition(DoctypePublicIdentifier_singleQuoted); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + DoctypePublicIdentifier_doubleQuoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '"': + t.transition(AfterDoctypePublicIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.publicIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.publicIdentifier.append(c); + } + } + }, + DoctypePublicIdentifier_singleQuoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\'': + t.transition(AfterDoctypePublicIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.publicIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.publicIdentifier.append(c); + } + } + }, + AfterDoctypePublicIdentifier { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case 
'\f': + case ' ': + t.transition(BetweenDoctypePublicAndSystemIdentifiers); + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + BetweenDoctypePublicAndSystemIdentifiers { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + AfterDoctypeSystemKeyword { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeDoctypeSystemIdentifier); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + 
t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + } + } + }, + BeforeDoctypeSystemIdentifier { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + break; + case '"': + // set system id to empty string + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + // set public id to empty string + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); + } + } + }, + DoctypeSystemIdentifier_doubleQuoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '"': + t.transition(AfterDoctypeSystemIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.systemIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.systemIdentifier.append(c); + } + } + }, + DoctypeSystemIdentifier_singleQuoted { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\'': + t.transition(AfterDoctypeSystemIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.systemIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); 
+ t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.systemIdentifier.append(c); + } + } + }, + AfterDoctypeSystemIdentifier { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '\t': + case '\n': + case '\f': + case ' ': + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.transition(BogusDoctype); + // NOT force quirks + } + } + }, + BogusDoctype { + void read(Tokeniser t, CharacterReader r) { + char c = r.consume(); + switch (c) { + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.emitDoctypePending(); + t.transition(Data); + break; + default: + // ignore char + break; + } + } + }, + CdataSection { + void read(Tokeniser t, CharacterReader r) { + String data = r.consumeTo("]]>"); + t.emit(data); + r.matchConsume("]]>"); + t.transition(Data); + } + }; + + + abstract void read(Tokeniser t, CharacterReader r); + + private static final char nullChar = '\u0000'; + private static final char replacementChar = Tokeniser.replacementChar; + private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); + private static final char eof = CharacterReader.EOF; +} diff --git a/src/org/jsoup/parser/TreeBuilder.java b/src/org/jsoup/parser/TreeBuilder.java new file mode 100644 index 0000000000..e06caad501 --- /dev/null +++ b/src/org/jsoup/parser/TreeBuilder.java @@ -0,0 +1,60 @@ +package org.jsoup.parser; + +import org.jsoup.helper.DescendableLinkedList; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author Jonathan Hedley + */ +abstract class TreeBuilder { + CharacterReader reader; + Tokeniser tokeniser; + 
protected Document doc; // current doc we are building into + protected DescendableLinkedList<Element> stack; // the stack of open elements + protected String baseUri; // current base uri, for creating new elements + protected Token currentToken; // currentToken is used only for error tracking. + protected ParseErrorList errors; // null when not tracking errors + + protected void initialiseParse(String input, String baseUri, ParseErrorList errors) { + Validate.notNull(input, "String input must not be null"); + Validate.notNull(baseUri, "BaseURI must not be null"); + + doc = new Document(baseUri); + reader = new CharacterReader(input); + this.errors = errors; + tokeniser = new Tokeniser(reader, errors); + stack = new DescendableLinkedList<Element>(); + this.baseUri = baseUri; + } + + Document parse(String input, String baseUri) { + return parse(input, baseUri, ParseErrorList.noTracking()); + } + + Document parse(String input, String baseUri, ParseErrorList errors) { + initialiseParse(input, baseUri, errors); + runParser(); + return doc; + } + + protected void runParser() { + while (true) { + Token token = tokeniser.read(); + process(token); + + if (token.type == Token.TokenType.EOF) + break; + } + } + + protected abstract boolean process(Token token); + + protected Element currentElement() { + return stack.getLast(); + } +} diff --git a/src/org/jsoup/parser/XmlTreeBuilder.java b/src/org/jsoup/parser/XmlTreeBuilder.java new file mode 100644 index 0000000000..3f03ad26ac --- /dev/null +++ b/src/org/jsoup/parser/XmlTreeBuilder.java @@ -0,0 +1,111 @@ +package org.jsoup.parser; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.*; + +import java.util.Iterator; + +/** + * @author Jonathan Hedley + */ +public class XmlTreeBuilder extends TreeBuilder { + @Override + protected void initialiseParse(String input, String baseUri, ParseErrorList errors) { + super.initialiseParse(input, baseUri, errors); + stack.add(doc); // place the document onto the stack. 
differs from HtmlTreeBuilder (not on stack) + } + + @Override + protected boolean process(Token token) { + // start tag, end tag, doctype, comment, character, eof + switch (token.type) { + case StartTag: + insert(token.asStartTag()); + break; + case EndTag: + popStackToClose(token.asEndTag()); + break; + case Comment: + insert(token.asComment()); + break; + case Character: + insert(token.asCharacter()); + break; + case Doctype: + insert(token.asDoctype()); + break; + case EOF: // could put some normalisation here if desired + break; + default: + Validate.fail("Unexpected token type: " + token.type); + } + return true; + } + + private void insertNode(Node node) { + currentElement().appendChild(node); + } + + Element insert(Token.StartTag startTag) { + Tag tag = Tag.valueOf(startTag.name()); + // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html. + Element el = new Element(tag, baseUri, startTag.attributes); + insertNode(el); + if (startTag.isSelfClosing()) { + tokeniser.acknowledgeSelfClosingFlag(); + if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above. + tag.setSelfClosing(); + } else { + stack.add(el); + } + return el; + } + + void insert(Token.Comment commentToken) { + Comment comment = new Comment(commentToken.getData(), baseUri); + insertNode(comment); + } + + void insert(Token.Character characterToken) { + Node node = new TextNode(characterToken.getData(), baseUri); + insertNode(node); + } + + void insert(Token.Doctype d) { + DocumentType doctypeNode = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri); + insertNode(doctypeNode); + } + + /** + * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not + * found, skips. 
+ * + * @param endTag + */ + private void popStackToClose(Token.EndTag endTag) { + String elName = endTag.name(); + Element firstFound = null; + + Iterator<Element> it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next.nodeName().equals(elName)) { + firstFound = next; + break; + } + } + if (firstFound == null) + return; // not found, skip + + it = stack.descendingIterator(); + while (it.hasNext()) { + Element next = it.next(); + if (next == firstFound) { + it.remove(); + break; + } else { + it.remove(); + } + } + } +} diff --git a/src/org/jsoup/parser/package-info.java b/src/org/jsoup/parser/package-info.java new file mode 100644 index 0000000000..168fdf4086 --- /dev/null +++ b/src/org/jsoup/parser/package-info.java @@ -0,0 +1,4 @@ +/** + Contains the HTML parser, tag specifications, and HTML tokeniser. + */ +package org.jsoup.parser; diff --git a/src/org/jsoup/safety/Cleaner.java b/src/org/jsoup/safety/Cleaner.java new file mode 100644 index 0000000000..eda67df86b --- /dev/null +++ b/src/org/jsoup/safety/Cleaner.java @@ -0,0 +1,129 @@ +package org.jsoup.safety; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.*; +import org.jsoup.parser.Tag; + +import java.util.List; + +/** + The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes + that you are expecting; no junk, and no cross-site scripting attacks! + <p/> + The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain + HTML that is allowed by the whitelist. + <p/> + It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the + canned white-lists only allow body contained tags. + <p/> + Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}. 
+ */ +public class Cleaner { + private Whitelist whitelist; + + /** + Create a new cleaner, that sanitizes documents using the supplied whitelist. + @param whitelist white-list to clean with + */ + public Cleaner(Whitelist whitelist) { + Validate.notNull(whitelist); + this.whitelist = whitelist; + } + + /** + Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. + The original document is not modified. Only elements from the dirt document's <code>body</code> are used. + @param dirtyDocument Untrusted base document to clean. + @return cleaned document. + */ + public Document clean(Document dirtyDocument) { + Validate.notNull(dirtyDocument); + + Document clean = Document.createShell(dirtyDocument.baseUri()); + copySafeNodes(dirtyDocument.body(), clean.body()); + + return clean; + } + + /** + Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes + in the input HTML are allowed by the whitelist. + <p/> + This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully + using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document + to ensure enforced attributes are set correctly, and that the output is tidied. + @param dirtyDocument document to test + @return true if no tags or attributes need to be removed; false if they do + */ + public boolean isValid(Document dirtyDocument) { + Validate.notNull(dirtyDocument); + + Document clean = Document.createShell(dirtyDocument.baseUri()); + int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); + return numDiscarded == 0; + } + + /** + Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. 
+ @param source source of HTML + @param dest destination element to copy into + @return number of discarded elements (that were considered unsafe) + */ + private int copySafeNodes(Element source, Element dest) { + List<Node> sourceChildren = source.childNodes(); + int numDiscarded = 0; + + for (Node sourceChild : sourceChildren) { + if (sourceChild instanceof Element) { + Element sourceEl = (Element) sourceChild; + + if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs + ElementMeta meta = createSafeElement(sourceEl); + Element destChild = meta.el; + dest.appendChild(destChild); + + numDiscarded += meta.numAttribsDiscarded; + numDiscarded += copySafeNodes(sourceEl, destChild); // recurs + } else { // not a safe tag, but it may have children (els or text) that are, so recurse + numDiscarded++; + numDiscarded += copySafeNodes(sourceEl, dest); + } + } else if (sourceChild instanceof TextNode) { + TextNode sourceText = (TextNode) sourceChild; + TextNode destText = new TextNode(sourceText.getWholeText(), sourceChild.baseUri()); + dest.appendChild(destText); + } // else, we don't care about comments, xml proc instructions, etc + } + return numDiscarded; + } + + private ElementMeta createSafeElement(Element sourceEl) { + String sourceTag = sourceEl.tagName(); + Attributes destAttrs = new Attributes(); + Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); + int numDiscarded = 0; + + Attributes sourceAttrs = sourceEl.attributes(); + for (Attribute sourceAttr : sourceAttrs) { + if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) + destAttrs.put(sourceAttr); + else + numDiscarded++; + } + Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag); + destAttrs.addAll(enforcedAttrs); + + return new ElementMeta(dest, numDiscarded); + } + + private static class ElementMeta { + Element el; + int numAttribsDiscarded; + + ElementMeta(Element el, int numAttribsDiscarded) { + this.el = el; + 
this.numAttribsDiscarded = numAttribsDiscarded; + } + } + +} diff --git a/src/org/jsoup/safety/Whitelist.java b/src/org/jsoup/safety/Whitelist.java new file mode 100644 index 0000000000..2c1150ce9e --- /dev/null +++ b/src/org/jsoup/safety/Whitelist.java @@ -0,0 +1,451 @@ +package org.jsoup.safety; + +/* + Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/rgrove/sanitize/, which inspired + this whitelist configuration, and the initial defaults. + */ + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Element; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + + +/** + Whitelists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed. + <p/> + Start with one of the defaults: + <ul> + <li>{@link #none} + <li>{@link #simpleText} + <li>{@link #basic} + <li>{@link #basicWithImages} + <li>{@link #relaxed} + </ul> + <p/> + If you need to allow more through (please be careful!), tweak a base whitelist with: + <ul> + <li>{@link #addTags} + <li>{@link #addAttributes} + <li>{@link #addEnforcedAttribute} + <li>{@link #addProtocols} + </ul> + <p/> + The cleaner and these whitelists assume that you want to clean a <code>body</code> fragment of HTML (to add user + supplied HTML into a templated page), and not to clean a full HTML document. If the latter is the case, either wrap the + document HTML around the cleaned body HTML, or create a whitelist that allows <code>html</code> and <code>head</code> + elements as appropriate. + <p/> + If you are going to extend a whitelist, please be very careful. Make sure you understand what attributes may lead to + XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See + http://ha.ckers.org/xss.html for some XSS attack examples. 
+ + @author Jonathan Hedley + */ +public class Whitelist { + private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span] + private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag. + private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values + private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes + private boolean preserveRelativeLinks; // option to preserve relative links + + /** + This whitelist allows only text nodes: all HTML will be stripped. + + @return whitelist + */ + public static Whitelist none() { + return new Whitelist(); + } + + /** + This whitelist allows only simple text formatting: <code>b, em, i, strong, u</code>. All other HTML (tags and + attributes) will be removed. + + @return whitelist + */ + public static Whitelist simpleText() { + return new Whitelist() + .addTags("b", "em", "i", "strong", "u") + ; + } + + /** + This whitelist allows a fuller range of text nodes: <code>a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, + ol, p, pre, q, small, strike, strong, sub, sup, u, ul</code>, and appropriate attributes. + <p/> + Links (<code>a</code> elements) can point to <code>http, https, ftp, mailto</code>, and have an enforced + <code>rel=nofollow</code> attribute. + <p/> + Does not allow images. 
+ + @return whitelist + */ + public static Whitelist basic() { + return new Whitelist() + .addTags( + "a", "b", "blockquote", "br", "cite", "code", "dd", "dl", "dt", "em", + "i", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub", + "sup", "u", "ul") + + .addAttributes("a", "href") + .addAttributes("blockquote", "cite") + .addAttributes("q", "cite") + + .addProtocols("a", "href", "ftp", "http", "https", "mailto") + .addProtocols("blockquote", "cite", "http", "https") + .addProtocols("cite", "cite", "http", "https") + + .addEnforcedAttribute("a", "rel", "nofollow") + ; + + } + + /** + This whitelist allows the same text tags as {@link #basic}, and also allows <code>img</code> tags, with appropriate + attributes, with <code>src</code> pointing to <code>http</code> or <code>https</code>. + + @return whitelist + */ + public static Whitelist basicWithImages() { + return basic() + .addTags("img") + .addAttributes("img", "align", "alt", "height", "src", "title", "width") + .addProtocols("img", "src", "http", "https") + ; + } + + /** + This whitelist allows a full range of text and structural body HTML: <code>a, b, blockquote, br, caption, cite, + code, col, colgroup, dd, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, strike, strong, sub, + sup, table, tbody, td, tfoot, th, thead, tr, u, ul</code> + <p/> + Links do not have an enforced <code>rel=nofollow</code> attribute, but you can add that if desired. 
+ + @return whitelist + */ + public static Whitelist relaxed() { + return new Whitelist() + .addTags( + "a", "b", "blockquote", "br", "caption", "cite", "code", "col", + "colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6", + "i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", + "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", + "ul") + + .addAttributes("a", "href", "title") + .addAttributes("blockquote", "cite") + .addAttributes("col", "span", "width") + .addAttributes("colgroup", "span", "width") + .addAttributes("img", "align", "alt", "height", "src", "title", "width") + .addAttributes("ol", "start", "type") + .addAttributes("q", "cite") + .addAttributes("table", "summary", "width") + .addAttributes("td", "abbr", "axis", "colspan", "rowspan", "width") + .addAttributes( + "th", "abbr", "axis", "colspan", "rowspan", "scope", + "width") + .addAttributes("ul", "type") + + .addProtocols("a", "href", "ftp", "http", "https", "mailto") + .addProtocols("blockquote", "cite", "http", "https") + .addProtocols("img", "src", "http", "https") + .addProtocols("q", "cite", "http", "https") + ; + } + + /** + Create a new, empty whitelist. Generally it will be better to start with a default prepared whitelist instead. + + @see #basic() + @see #basicWithImages() + @see #simpleText() + @see #relaxed() + */ + public Whitelist() { + tagNames = new HashSet<TagName>(); + attributes = new HashMap<TagName, Set<AttributeKey>>(); + enforcedAttributes = new HashMap<TagName, Map<AttributeKey, AttributeValue>>(); + protocols = new HashMap<TagName, Map<AttributeKey, Set<Protocol>>>(); + preserveRelativeLinks = false; + } + + /** + Add a list of allowed elements to a whitelist. (If a tag is not allowed, it will be removed from the HTML.) + + @param tags tag names to allow + @return this (for chaining) + */ + public Whitelist addTags(String... 
tags) { + Validate.notNull(tags); + + for (String tagName : tags) { + Validate.notEmpty(tagName); + tagNames.add(TagName.valueOf(tagName)); + } + return this; + } + + /** + Add a list of allowed attributes to a tag. (If an attribute is not allowed on an element, it will be removed.) + <p/> + E.g.: <code>addAttributes("a", "href", "class")</code> allows <code>href</code> and <code>class</code> attributes + on <code>a</code> tags. + <p/> + To make an attribute valid for <b>all tags</b>, use the pseudo tag <code>:all</code>, e.g. + <code>addAttributes(":all", "class")</code>. + + @param tag The tag the attributes are for. The tag will be added to the allowed tag list if necessary. + @param keys List of valid attributes for the tag + @return this (for chaining) + */ + public Whitelist addAttributes(String tag, String... keys) { + Validate.notEmpty(tag); + Validate.notNull(keys); + Validate.isTrue(keys.length > 0, "No attributes supplied."); + + TagName tagName = TagName.valueOf(tag); + if (!tagNames.contains(tagName)) + tagNames.add(tagName); + Set<AttributeKey> attributeSet = new HashSet<AttributeKey>(); + for (String key : keys) { + Validate.notEmpty(key); + attributeSet.add(AttributeKey.valueOf(key)); + } + if (attributes.containsKey(tagName)) { + Set<AttributeKey> currentSet = attributes.get(tagName); + currentSet.addAll(attributeSet); + } else { + attributes.put(tagName, attributeSet); + } + return this; + } + + /** + Add an enforced attribute to a tag. An enforced attribute will always be added to the element. If the element + already has the attribute set, it will be overridden. + <p/> + E.g.: <code>addEnforcedAttribute("a", "rel", "nofollow")</code> will make all <code>a</code> tags output as + <code><a href="..." rel="nofollow"></code> + + @param tag The tag the enforced attribute is for. The tag will be added to the allowed tag list if necessary. 
+ @param key The attribute key + @param value The enforced attribute value + @return this (for chaining) + */ + public Whitelist addEnforcedAttribute(String tag, String key, String value) { + Validate.notEmpty(tag); + Validate.notEmpty(key); + Validate.notEmpty(value); + + TagName tagName = TagName.valueOf(tag); + if (!tagNames.contains(tagName)) + tagNames.add(tagName); + AttributeKey attrKey = AttributeKey.valueOf(key); + AttributeValue attrVal = AttributeValue.valueOf(value); + + if (enforcedAttributes.containsKey(tagName)) { + enforcedAttributes.get(tagName).put(attrKey, attrVal); + } else { + Map<AttributeKey, AttributeValue> attrMap = new HashMap<AttributeKey, AttributeValue>(); + attrMap.put(attrKey, attrVal); + enforcedAttributes.put(tagName, attrMap); + } + return this; + } + + /** + * Configure this Whitelist to preserve relative links in an element's URL attribute, or convert them to absolute + * links. By default, this is <b>false</b>: URLs will be made absolute (e.g. start with an allowed protocol, like + * e.g. {@code http://}. + * <p /> + * Note that when handling relative links, the input document must have an appropriate {@code base URI} set when + * parsing, so that the link's protocol can be confirmed. Regardless of the setting of the {@code preserve relative + * links} option, the link must be resolvable against the base URI to an allowed protocol; otherwise the attribute + * will be removed. + * + * @param preserve {@code true} to allow relative links, {@code false} (default) to deny + * @return this Whitelist, for chaining. + * @see #addProtocols + */ + public Whitelist preserveRelativeLinks(boolean preserve) { + preserveRelativeLinks = preserve; + return this; + } + + /** + Add allowed URL protocols for an element's URL attribute. This restricts the possible values of the attribute to + URLs with the defined protocol. 
+ <p/> + E.g.: <code>addProtocols("a", "href", "ftp", "http", "https")</code> + + @param tag Tag the URL protocol is for + @param key Attribute key + @param protocols List of valid protocols + @return this, for chaining + */ + public Whitelist addProtocols(String tag, String key, String... protocols) { + Validate.notEmpty(tag); + Validate.notEmpty(key); + Validate.notNull(protocols); + + TagName tagName = TagName.valueOf(tag); + AttributeKey attrKey = AttributeKey.valueOf(key); + Map<AttributeKey, Set<Protocol>> attrMap; + Set<Protocol> protSet; + + if (this.protocols.containsKey(tagName)) { + attrMap = this.protocols.get(tagName); + } else { + attrMap = new HashMap<AttributeKey, Set<Protocol>>(); + this.protocols.put(tagName, attrMap); + } + if (attrMap.containsKey(attrKey)) { + protSet = attrMap.get(attrKey); + } else { + protSet = new HashSet<Protocol>(); + attrMap.put(attrKey, protSet); + } + for (String protocol : protocols) { + Validate.notEmpty(protocol); + Protocol prot = Protocol.valueOf(protocol); + protSet.add(prot); + } + return this; + } + + boolean isSafeTag(String tag) { + return tagNames.contains(TagName.valueOf(tag)); + } + + boolean isSafeAttribute(String tagName, Element el, Attribute attr) { + TagName tag = TagName.valueOf(tagName); + AttributeKey key = AttributeKey.valueOf(attr.getKey()); + + if (attributes.containsKey(tag)) { + if (attributes.get(tag).contains(key)) { + if (protocols.containsKey(tag)) { + Map<AttributeKey, Set<Protocol>> attrProts = protocols.get(tag); + // ok if not defined protocol; otherwise test + return !attrProts.containsKey(key) || testValidProtocol(el, attr, attrProts.get(key)); + } else { // attribute found, no protocols defined, so OK + return true; + } + } + } + // no attributes defined for tag, try :all tag + return !tagName.equals(":all") && isSafeAttribute(":all", el, attr); + } + + private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) { + // try to resolve relative urls to abs, 
and optionally update the attribute so output html has abs. + // rels without a baseuri get removed + String value = el.absUrl(attr.getKey()); + if (value.length() == 0) + value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols + if (!preserveRelativeLinks) + attr.setValue(value); + + for (Protocol protocol : protocols) { + String prot = protocol.toString() + ":"; + if (value.toLowerCase().startsWith(prot)) { + return true; + } + } + return false; + } + + Attributes getEnforcedAttributes(String tagName) { + Attributes attrs = new Attributes(); + TagName tag = TagName.valueOf(tagName); + if (enforcedAttributes.containsKey(tag)) { + Map<AttributeKey, AttributeValue> keyVals = enforcedAttributes.get(tag); + for (Map.Entry<AttributeKey, AttributeValue> entry : keyVals.entrySet()) { + attrs.put(entry.getKey().toString(), entry.getValue().toString()); + } + } + return attrs; + } + + // named types for config. All just hold strings, but here for my sanity. 
+ + static class TagName extends TypedValue { + TagName(String value) { + super(value); + } + + static TagName valueOf(String value) { + return new TagName(value); + } + } + + static class AttributeKey extends TypedValue { + AttributeKey(String value) { + super(value); + } + + static AttributeKey valueOf(String value) { + return new AttributeKey(value); + } + } + + static class AttributeValue extends TypedValue { + AttributeValue(String value) { + super(value); + } + + static AttributeValue valueOf(String value) { + return new AttributeValue(value); + } + } + + static class Protocol extends TypedValue { + Protocol(String value) { + super(value); + } + + static Protocol valueOf(String value) { + return new Protocol(value); + } + } + + abstract static class TypedValue { + private String value; + + TypedValue(String value) { + Validate.notNull(value); + this.value = value; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((value == null) ? 0 : value.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + TypedValue other = (TypedValue) obj; + if (value == null) { + if (other.value != null) return false; + } else if (!value.equals(other.value)) return false; + return true; + } + + @Override + public String toString() { + return value; + } + } +} + diff --git a/src/org/jsoup/safety/package-info.java b/src/org/jsoup/safety/package-info.java new file mode 100644 index 0000000000..ac890f0607 --- /dev/null +++ b/src/org/jsoup/safety/package-info.java @@ -0,0 +1,4 @@ +/** + Contains the jsoup HTML cleaner, and whitelist definitions. 
+ */ +package org.jsoup.safety; diff --git a/src/org/jsoup/select/Collector.java b/src/org/jsoup/select/Collector.java new file mode 100644 index 0000000000..8f01045768 --- /dev/null +++ b/src/org/jsoup/select/Collector.java @@ -0,0 +1,51 @@ +package org.jsoup.select; + +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; + +/** + * Collects a list of elements that match the supplied criteria. + * + * @author Jonathan Hedley + */ +public class Collector { + + private Collector() { + } + + /** + Build a list of elements, by visiting root and every descendant of root, and testing it against the evaluator. + @param eval Evaluator to test elements against + @param root root of tree to descend + @return list of matches; empty if none + */ + public static Elements collect (Evaluator eval, Element root) { + Elements elements = new Elements(); + new NodeTraversor(new Accumulator(root, elements, eval)).traverse(root); + return elements; + } + + private static class Accumulator implements NodeVisitor { + private final Element root; + private final Elements elements; + private final Evaluator eval; + + Accumulator(Element root, Elements elements, Evaluator eval) { + this.root = root; + this.elements = elements; + this.eval = eval; + } + + public void head(Node node, int depth) { + if (node instanceof Element) { + Element el = (Element) node; + if (eval.matches(root, el)) + elements.add(el); + } + } + + public void tail(Node node, int depth) { + // void + } + } +} diff --git a/src/org/jsoup/select/CombiningEvaluator.java b/src/org/jsoup/select/CombiningEvaluator.java new file mode 100644 index 0000000000..a31ed2636f --- /dev/null +++ b/src/org/jsoup/select/CombiningEvaluator.java @@ -0,0 +1,94 @@ +package org.jsoup.select; + +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +/** + * Base combining (and, or) evaluator. 
+ */ +abstract class CombiningEvaluator extends Evaluator { + final List<Evaluator> evaluators; + + CombiningEvaluator() { + super(); + evaluators = new ArrayList<Evaluator>(); + } + + CombiningEvaluator(Collection<Evaluator> evaluators) { + this(); + this.evaluators.addAll(evaluators); + } + + Evaluator rightMostEvaluator() { + return evaluators.size() > 0 ? evaluators.get(evaluators.size() - 1) : null; + } + + void replaceRightMostEvaluator(Evaluator replacement) { + evaluators.set(evaluators.size() - 1, replacement); + } + + static final class And extends CombiningEvaluator { + And(Collection<Evaluator> evaluators) { + super(evaluators); + } + + And(Evaluator... evaluators) { + this(Arrays.asList(evaluators)); + } + + @Override + public boolean matches(Element root, Element node) { + for (Evaluator s : evaluators) { + if (!s.matches(root, node)) + return false; + } + return true; + } + + @Override + public String toString() { + return StringUtil.join(evaluators, " "); + } + } + + static final class Or extends CombiningEvaluator { + /** + * Create a new Or evaluator. The initial evaluators are ANDed together and used as the first clause of the OR. + * @param evaluators initial OR clause (these are wrapped into an AND evaluator). 
+ */ + Or(Collection<Evaluator> evaluators) { + super(); + if (evaluators.size() > 1) + this.evaluators.add(new And(evaluators)); + else // 0 or 1 + this.evaluators.addAll(evaluators); + } + + Or() { + super(); + } + + public void add(Evaluator e) { + evaluators.add(e); + } + + @Override + public boolean matches(Element root, Element node) { + for (Evaluator s : evaluators) { + if (s.matches(root, node)) + return true; + } + return false; + } + + @Override + public String toString() { + return String.format(":or%s", evaluators); + } + } +} diff --git a/src/org/jsoup/select/Elements.java b/src/org/jsoup/select/Elements.java new file mode 100644 index 0000000000..8302da1e53 --- /dev/null +++ b/src/org/jsoup/select/Elements.java @@ -0,0 +1,536 @@ +package org.jsoup.select; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; + +import java.util.*; + +/** + A list of {@link Element Elements}, with methods that act on every element in the list. + <p/> + To get an Elements object, use the {@link Element#select(String)} method. + + @author Jonathan Hedley, jonathan@hedley.net */ +public class Elements implements List<Element>, Cloneable { + private List<Element> contents; + + public Elements() { + contents = new ArrayList<Element>(); + } + + public Elements(int initialCapacity) { + contents = new ArrayList<Element>(initialCapacity); + } + + public Elements(Collection<Element> elements) { + contents = new ArrayList<Element>(elements); + } + + public Elements(List<Element> elements) { + contents = elements; + } + + public Elements(Element... elements) { + this(Arrays.asList(elements)); + } + + @Override + public Elements clone() { + List<Element> elements = new ArrayList<Element>(); + + for(Element e : contents) + elements.add(e.clone()); + + + return new Elements(elements); + } + + // attribute methods + /** + Get an attribute value from the first matched element that has the attribute. + @param attributeKey The attribute key. 
+ @return The attribute value from the first matched element that has the attribute.. If no elements were matched (isEmpty() == true), + or if the no elements have the attribute, returns empty string. + @see #hasAttr(String) + */ + public String attr(String attributeKey) { + for (Element element : contents) { + if (element.hasAttr(attributeKey)) + return element.attr(attributeKey); + } + return ""; + } + + /** + Checks if any of the matched elements have this attribute set. + @param attributeKey attribute key + @return true if any of the elements have the attribute; false if none do. + */ + public boolean hasAttr(String attributeKey) { + for (Element element : contents) { + if (element.hasAttr(attributeKey)) + return true; + } + return false; + } + + /** + * Set an attribute on all matched elements. + * @param attributeKey attribute key + * @param attributeValue attribute value + * @return this + */ + public Elements attr(String attributeKey, String attributeValue) { + for (Element element : contents) { + element.attr(attributeKey, attributeValue); + } + return this; + } + + /** + * Remove an attribute from every matched element. + * @param attributeKey The attribute to remove. + * @return this (for chaining) + */ + public Elements removeAttr(String attributeKey) { + for (Element element : contents) { + element.removeAttr(attributeKey); + } + return this; + } + + /** + Add the class name to every matched element's {@code class} attribute. + @param className class name to add + @return this + */ + public Elements addClass(String className) { + for (Element element : contents) { + element.addClass(className); + } + return this; + } + + /** + Remove the class name from every matched element's {@code class} attribute, if present. 
+ @param className class name to remove + @return this + */ + public Elements removeClass(String className) { + for (Element element : contents) { + element.removeClass(className); + } + return this; + } + + /** + Toggle the class name on every matched element's {@code class} attribute. + @param className class name to add if missing, or remove if present, from every element. + @return this + */ + public Elements toggleClass(String className) { + for (Element element : contents) { + element.toggleClass(className); + } + return this; + } + + /** + Determine if any of the matched elements have this class name set in their {@code class} attribute. + @param className class name to check for + @return true if any do, false if none do + */ + public boolean hasClass(String className) { + for (Element element : contents) { + if (element.hasClass(className)) + return true; + } + return false; + } + + /** + * Get the form element's value of the first matched element. + * @return The form element's value, or empty if not set. + * @see Element#val() + */ + public String val() { + if (size() > 0) + return first().val(); + else + return ""; + } + + /** + * Set the form element's value in each of the matched elements. + * @param value The value to set into each matched element + * @return this (for chaining) + */ + public Elements val(String value) { + for (Element element : contents) + element.val(value); + return this; + } + + /** + * Get the combined text of all the matched elements. + * <p> + * Note that it is possible to get repeats if the matched elements contain both parent elements and their own + * children, as the Element.text() method returns the combined text of a parent and all its children. + * @return string of all text: unescaped and no HTML. 
+ * @see Element#text() + */ + public String text() { + StringBuilder sb = new StringBuilder(); + for (Element element : contents) { + if (sb.length() != 0) + sb.append(" "); + sb.append(element.text()); + } + return sb.toString(); + } + + public boolean hasText() { + for (Element element: contents) { + if (element.hasText()) + return true; + } + return false; + } + + /** + * Get the combined inner HTML of all matched elements. + * @return string of all element's inner HTML. + * @see #text() + * @see #outerHtml() + */ + public String html() { + StringBuilder sb = new StringBuilder(); + for (Element element : contents) { + if (sb.length() != 0) + sb.append("\n"); + sb.append(element.html()); + } + return sb.toString(); + } + + /** + * Get the combined outer HTML of all matched elements. + * @return string of all element's outer HTML. + * @see #text() + * @see #html() + */ + public String outerHtml() { + StringBuilder sb = new StringBuilder(); + for (Element element : contents) { + if (sb.length() != 0) + sb.append("\n"); + sb.append(element.outerHtml()); + } + return sb.toString(); + } + + /** + * Get the combined outer HTML of all matched elements. Alias of {@link #outerHtml()}. + * @return string of all element's outer HTML. + * @see #text() + * @see #html() + */ + public String toString() { + return outerHtml(); + } + + /** + * Update the tag name of each matched element. For example, to change each {@code <i>} to a {@code <em>}, do + * {@code doc.select("i").tagName("em");} + * @param tagName the new tag name + * @return this, for chaining + * @see Element#tagName(String) + */ + public Elements tagName(String tagName) { + for (Element element : contents) { + element.tagName(tagName); + } + return this; + } + + /** + * Set the inner HTML of each matched element. + * @param html HTML to parse and set into each matched element. 
+ * @return this, for chaining + * @see Element#html(String) + */ + public Elements html(String html) { + for (Element element : contents) { + element.html(html); + } + return this; + } + + /** + * Add the supplied HTML to the start of each matched element's inner HTML. + * @param html HTML to add inside each element, before the existing HTML + * @return this, for chaining + * @see Element#prepend(String) + */ + public Elements prepend(String html) { + for (Element element : contents) { + element.prepend(html); + } + return this; + } + + /** + * Add the supplied HTML to the end of each matched element's inner HTML. + * @param html HTML to add inside each element, after the existing HTML + * @return this, for chaining + * @see Element#append(String) + */ + public Elements append(String html) { + for (Element element : contents) { + element.append(html); + } + return this; + } + + /** + * Insert the supplied HTML before each matched element's outer HTML. + * @param html HTML to insert before each element + * @return this, for chaining + * @see Element#before(String) + */ + public Elements before(String html) { + for (Element element : contents) { + element.before(html); + } + return this; + } + + /** + * Insert the supplied HTML after each matched element's outer HTML. + * @param html HTML to insert after each element + * @return this, for chaining + * @see Element#after(String) + */ + public Elements after(String html) { + for (Element element : contents) { + element.after(html); + } + return this; + } + + /** + Wrap the supplied HTML around each matched elements. For example, with HTML + {@code <p><b>This</b> is <b>Jsoup</b></p>}, + <code>doc.select("b").wrap("<i></i>");</code> + becomes {@code <p><i><b>This</b></i> is <i><b>jsoup</b></i></p>} + @param html HTML to wrap around each element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. 
+ @return this (for chaining) + @see Element#wrap + */ + public Elements wrap(String html) { + Validate.notEmpty(html); + for (Element element : contents) { + element.wrap(html); + } + return this; + } + + /** + * Removes the matched elements from the DOM, and moves their children up into their parents. This has the effect of + * dropping the elements but keeping their children. + * <p/> + * This is useful for e.g removing unwanted formatting elements but keeping their contents. + * <p/> + * E.g. with HTML: {@code <div><font>One</font> <font><a href="/">Two</a></font></div>}<br/> + * {@code doc.select("font").unwrap();}<br/> + * HTML = {@code <div>One <a href="/">Two</a></div>} + * + * @return this (for chaining) + * @see Node#unwrap + */ + public Elements unwrap() { + for (Element element : contents) { + element.unwrap(); + } + return this; + } + + /** + * Empty (remove all child nodes from) each matched element. This is similar to setting the inner HTML of each + * element to nothing. + * <p> + * E.g. HTML: {@code <div><p>Hello <b>there</b></p> <p>now</p></div>}<br> + * <code>doc.select("p").empty();</code><br> + * HTML = {@code <div><p></p> <p></p></div>} + * @return this, for chaining + * @see Element#empty() + * @see #remove() + */ + public Elements empty() { + for (Element element : contents) { + element.empty(); + } + return this; + } + + /** + * Remove each matched element from the DOM. This is similar to setting the outer HTML of each element to nothing. + * <p> + * E.g. HTML: {@code <div><p>Hello</p> <p>there</p> <img /></div>}<br> + * <code>doc.select("p").remove();</code><br> + * HTML = {@code <div> <img /></div>} + * <p> + * Note that this method should not be used to clean user-submitted HTML; rather, use {@link org.jsoup.safety.Cleaner} to clean HTML. 
+ * @return this, for chaining + * @see Element#empty() + * @see #empty() + */ + public Elements remove() { + for (Element element : contents) { + element.remove(); + } + return this; + } + + // filters + + /** + * Find matching elements within this element list. + * @param query A {@link Selector} query + * @return the filtered list of elements, or an empty list if none match. + */ + public Elements select(String query) { + return Selector.select(query, this); + } + + /** + * Remove elements from this list that match the {@link Selector} query. + * <p> + * E.g. HTML: {@code <div class=logo>One</div> <div>Two</div>}<br> + * <code>Elements divs = doc.select("div").not("#logo");</code><br> + * Result: {@code divs: [<div>Two</div>]} + * <p> + * @param query the selector query whose results should be removed from these elements + * @return a new elements list that contains only the filtered results + */ + public Elements not(String query) { + Elements out = Selector.select(query, this); + return Selector.filterOut(this, out); + } + + /** + * Get the <i>nth</i> matched element as an Elements object. + * <p> + * See also {@link #get(int)} to retrieve an Element. + * @param index the (zero-based) index of the element in the list to retain + * @return Elements containing only the specified element, or, if that element did not exist, an empty list. + */ + public Elements eq(int index) { + return contents.size() > index ? new Elements(get(index)) : new Elements(); + } + + /** + * Test if any of the matched elements match the supplied query. + * @param query A selector + * @return true if at least one element in the list matches the query. + */ + public boolean is(String query) { + Elements children = select(query); + return !children.isEmpty(); + } + + /** + * Get all of the parents and ancestor elements of the matched elements. 
+ * @return all of the parents and ancestor elements of the matched elements + */ + public Elements parents() { + HashSet<Element> combo = new LinkedHashSet<Element>(); + for (Element e: contents) { + combo.addAll(e.parents()); + } + return new Elements(combo); + } + + // list-like methods + /** + Get the first matched element. + @return The first matched element, or <code>null</code> if contents is empty; + */ + public Element first() { + return contents.isEmpty() ? null : contents.get(0); + } + + /** + Get the last matched element. + @return The last matched element, or <code>null</code> if contents is empty. + */ + public Element last() { + return contents.isEmpty() ? null : contents.get(contents.size() - 1); + } + + /** + * Perform a depth-first traversal on each of the selected elements. + * @param nodeVisitor the visitor callbacks to perform on each node + * @return this, for chaining + */ + public Elements traverse(NodeVisitor nodeVisitor) { + Validate.notNull(nodeVisitor); + NodeTraversor traversor = new NodeTraversor(nodeVisitor); + for (Element el: contents) { + traversor.traverse(el); + } + return this; + } + + // implements List<Element> delegates: + public int size() {return contents.size();} + + public boolean isEmpty() {return contents.isEmpty();} + + public boolean contains(Object o) {return contents.contains(o);} + + public Iterator<Element> iterator() {return contents.iterator();} + + public Object[] toArray() {return contents.toArray();} + + public <T> T[] toArray(T[] a) {return contents.toArray(a);} + + public boolean add(Element element) {return contents.add(element);} + + public boolean remove(Object o) {return contents.remove(o);} + + public boolean containsAll(Collection<?> c) {return contents.containsAll(c);} + + public boolean addAll(Collection<? extends Element> c) {return contents.addAll(c);} + + public boolean addAll(int index, Collection<? 
extends Element> c) {return contents.addAll(index, c);} + + public boolean removeAll(Collection<?> c) {return contents.removeAll(c);} + + public boolean retainAll(Collection<?> c) {return contents.retainAll(c);} + + public void clear() {contents.clear();} + + public boolean equals(Object o) {return contents.equals(o);} + + public int hashCode() {return contents.hashCode();} + + public Element get(int index) {return contents.get(index);} + + public Element set(int index, Element element) {return contents.set(index, element);} + + public void add(int index, Element element) {contents.add(index, element);} + + public Element remove(int index) {return contents.remove(index);} + + public int indexOf(Object o) {return contents.indexOf(o);} + + public int lastIndexOf(Object o) {return contents.lastIndexOf(o);} + + public ListIterator<Element> listIterator() {return contents.listIterator();} + + public ListIterator<Element> listIterator(int index) {return contents.listIterator(index);} + + public List<Element> subList(int fromIndex, int toIndex) {return contents.subList(fromIndex, toIndex);} +} diff --git a/src/org/jsoup/select/Evaluator.java b/src/org/jsoup/select/Evaluator.java new file mode 100644 index 0000000000..16a083bd77 --- /dev/null +++ b/src/org/jsoup/select/Evaluator.java @@ -0,0 +1,454 @@ +package org.jsoup.select; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Element; + +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * Evaluates that an element matches the selector. + */ +public abstract class Evaluator { + protected Evaluator() { + } + + /** + * Test if the element meets the evaluator's requirements. 
+ * + * @param root Root of the matching subtree + * @param element tested element + */ + public abstract boolean matches(Element root, Element element); + + /** + * Evaluator for tag name + */ + public static final class Tag extends Evaluator { + private String tagName; + + public Tag(String tagName) { + this.tagName = tagName; + } + + @Override + public boolean matches(Element root, Element element) { + return (element.tagName().equals(tagName)); + } + + @Override + public String toString() { + return String.format("%s", tagName); + } + } + + /** + * Evaluator for element id + */ + public static final class Id extends Evaluator { + private String id; + + public Id(String id) { + this.id = id; + } + + @Override + public boolean matches(Element root, Element element) { + return (id.equals(element.id())); + } + + @Override + public String toString() { + return String.format("#%s", id); + } + + } + + /** + * Evaluator for element class + */ + public static final class Class extends Evaluator { + private String className; + + public Class(String className) { + this.className = className; + } + + @Override + public boolean matches(Element root, Element element) { + return (element.hasClass(className)); + } + + @Override + public String toString() { + return String.format(".%s", className); + } + + } + + /** + * Evaluator for attribute name matching + */ + public static final class Attribute extends Evaluator { + private String key; + + public Attribute(String key) { + this.key = key; + } + + @Override + public boolean matches(Element root, Element element) { + return element.hasAttr(key); + } + + @Override + public String toString() { + return String.format("[%s]", key); + } + + } + + /** + * Evaluator for attribute name prefix matching + */ + public static final class AttributeStarting extends Evaluator { + private String keyPrefix; + + public AttributeStarting(String keyPrefix) { + this.keyPrefix = keyPrefix; + } + + @Override + public boolean matches(Element root, 
Element element) { + List<org.jsoup.nodes.Attribute> values = element.attributes().asList(); + for (org.jsoup.nodes.Attribute attribute : values) { + if (attribute.getKey().startsWith(keyPrefix)) + return true; + } + return false; + } + + @Override + public String toString() { + return String.format("[^%s]", keyPrefix); + } + + } + + /** + * Evaluator for attribute name/value matching + */ + public static final class AttributeWithValue extends AttributeKeyPair { + public AttributeWithValue(String key, String value) { + super(key, value); + } + + @Override + public boolean matches(Element root, Element element) { + return element.hasAttr(key) && value.equalsIgnoreCase(element.attr(key)); + } + + @Override + public String toString() { + return String.format("[%s=%s]", key, value); + } + + } + + /** + * Evaluator for attribute name != value matching + */ + public static final class AttributeWithValueNot extends AttributeKeyPair { + public AttributeWithValueNot(String key, String value) { + super(key, value); + } + + @Override + public boolean matches(Element root, Element element) { + return !value.equalsIgnoreCase(element.attr(key)); + } + + @Override + public String toString() { + return String.format("[%s!=%s]", key, value); + } + + } + + /** + * Evaluator for attribute name/value matching (value prefix) + */ + public static final class AttributeWithValueStarting extends AttributeKeyPair { + public AttributeWithValueStarting(String key, String value) { + super(key, value); + } + + @Override + public boolean matches(Element root, Element element) { + return element.hasAttr(key) && element.attr(key).toLowerCase().startsWith(value); // value is lower case already + } + + @Override + public String toString() { + return String.format("[%s^=%s]", key, value); + } + + } + + /** + * Evaluator for attribute name/value matching (value ending) + */ + public static final class AttributeWithValueEnding extends AttributeKeyPair { + public AttributeWithValueEnding(String key, 
String value) { + super(key, value); + } + + @Override + public boolean matches(Element root, Element element) { + return element.hasAttr(key) && element.attr(key).toLowerCase().endsWith(value); // value is lower case + } + + @Override + public String toString() { + return String.format("[%s$=%s]", key, value); + } + + } + + /** + * Evaluator for attribute name/value matching (value containing) + */ + public static final class AttributeWithValueContaining extends AttributeKeyPair { + public AttributeWithValueContaining(String key, String value) { + super(key, value); + } + + @Override + public boolean matches(Element root, Element element) { + return element.hasAttr(key) && element.attr(key).toLowerCase().contains(value); // value is lower case + } + + @Override + public String toString() { + return String.format("[%s*=%s]", key, value); + } + + } + + /** + * Evaluator for attribute name/value matching (value regex matching) + */ + public static final class AttributeWithValueMatching extends Evaluator { + String key; + Pattern pattern; + + public AttributeWithValueMatching(String key, Pattern pattern) { + this.key = key.trim().toLowerCase(); + this.pattern = pattern; + } + + @Override + public boolean matches(Element root, Element element) { + return element.hasAttr(key) && pattern.matcher(element.attr(key)).find(); + } + + @Override + public String toString() { + return String.format("[%s~=%s]", key, pattern.toString()); + } + + } + + /** + * Abstract evaluator for attribute name/value matching + */ + public abstract static class AttributeKeyPair extends Evaluator { + String key; + String value; + + public AttributeKeyPair(String key, String value) { + Validate.notEmpty(key); + Validate.notEmpty(value); + + this.key = key.trim().toLowerCase(); + this.value = value.trim().toLowerCase(); + } + } + + /** + * Evaluator for any / all element matching + */ + public static final class AllElements extends Evaluator { + + @Override + public boolean matches(Element root, 
Element element) { + return true; + } + + @Override + public String toString() { + return "*"; + } + } + + /** + * Evaluator for matching by sibling index number (e < idx) + */ + public static final class IndexLessThan extends IndexEvaluator { + public IndexLessThan(int index) { + super(index); + } + + @Override + public boolean matches(Element root, Element element) { + return element.elementSiblingIndex() < index; + } + + @Override + public String toString() { + return String.format(":lt(%d)", index); + } + + } + + /** + * Evaluator for matching by sibling index number (e > idx) + */ + public static final class IndexGreaterThan extends IndexEvaluator { + public IndexGreaterThan(int index) { + super(index); + } + + @Override + public boolean matches(Element root, Element element) { + return element.elementSiblingIndex() > index; + } + + @Override + public String toString() { + return String.format(":gt(%d)", index); + } + + } + + /** + * Evaluator for matching by sibling index number (e = idx) + */ + public static final class IndexEquals extends IndexEvaluator { + public IndexEquals(int index) { + super(index); + } + + @Override + public boolean matches(Element root, Element element) { + return element.elementSiblingIndex() == index; + } + + @Override + public String toString() { + return String.format(":eq(%d)", index); + } + + } + + /** + * Abstract evaluator for sibling index matching + * + * @author ant + */ + public abstract static class IndexEvaluator extends Evaluator { + int index; + + public IndexEvaluator(int index) { + this.index = index; + } + } + + /** + * Evaluator for matching Element (and its descendants) text + */ + public static final class ContainsText extends Evaluator { + private String searchText; + + public ContainsText(String searchText) { + this.searchText = searchText.toLowerCase(); + } + + @Override + public boolean matches(Element root, Element element) { + return (element.text().toLowerCase().contains(searchText)); + } + + @Override + 
public String toString() { + return String.format(":contains(%s", searchText); + } + } + + /** + * Evaluator for matching Element's own text + */ + public static final class ContainsOwnText extends Evaluator { + private String searchText; + + public ContainsOwnText(String searchText) { + this.searchText = searchText.toLowerCase(); + } + + @Override + public boolean matches(Element root, Element element) { + return (element.ownText().toLowerCase().contains(searchText)); + } + + @Override + public String toString() { + return String.format(":containsOwn(%s", searchText); + } + } + + /** + * Evaluator for matching Element (and its descendants) text with regex + */ + public static final class Matches extends Evaluator { + private Pattern pattern; + + public Matches(Pattern pattern) { + this.pattern = pattern; + } + + @Override + public boolean matches(Element root, Element element) { + Matcher m = pattern.matcher(element.text()); + return m.find(); + } + + @Override + public String toString() { + return String.format(":matches(%s", pattern); + } + } + + /** + * Evaluator for matching Element's own text with regex + */ + public static final class MatchesOwn extends Evaluator { + private Pattern pattern; + + public MatchesOwn(Pattern pattern) { + this.pattern = pattern; + } + + @Override + public boolean matches(Element root, Element element) { + Matcher m = pattern.matcher(element.ownText()); + return m.find(); + } + + @Override + public String toString() { + return String.format(":matchesOwn(%s", pattern); + } + } +} diff --git a/src/org/jsoup/select/NodeTraversor.java b/src/org/jsoup/select/NodeTraversor.java new file mode 100644 index 0000000000..9bb081e56c --- /dev/null +++ b/src/org/jsoup/select/NodeTraversor.java @@ -0,0 +1,47 @@ +package org.jsoup.select; + +import org.jsoup.nodes.Node; + +/** + * Depth-first node traversor. Use to iterate through all nodes under and including the specified root node. 
+ * <p/> + * This implementation does not use recursion, so a deep DOM does not risk blowing the stack. + */ +public class NodeTraversor { + private NodeVisitor visitor; + + /** + * Create a new traversor. + * @param visitor a class implementing the {@link NodeVisitor} interface, to be called when visiting each node. + */ + public NodeTraversor(NodeVisitor visitor) { + this.visitor = visitor; + } + + /** + * Start a depth-first traverse of the root and all of its descendants. + * @param root the root node point to traverse. + */ + public void traverse(Node root) { + Node node = root; + int depth = 0; + + while (node != null) { + visitor.head(node, depth); + if (node.childNodes().size() > 0) { + node = node.childNode(0); + depth++; + } else { + while (node.nextSibling() == null && depth > 0) { + visitor.tail(node, depth); + node = node.parent(); + depth--; + } + visitor.tail(node, depth); + if (node == root) + break; + node = node.nextSibling(); + } + } + } +} diff --git a/src/org/jsoup/select/NodeVisitor.java b/src/org/jsoup/select/NodeVisitor.java new file mode 100644 index 0000000000..20112e8d29 --- /dev/null +++ b/src/org/jsoup/select/NodeVisitor.java @@ -0,0 +1,30 @@ +package org.jsoup.select; + +import org.jsoup.nodes.Node; + +/** + * Node visitor interface. Provide an implementing class to {@link NodeTraversor} to iterate through nodes. + * <p/> + * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first + * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to + * create a start tag for a node, and tail to create the end tag. + */ +public interface NodeVisitor { + /** + * Callback for when a node is first visited. + * + * @param node the node being visited. + * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node + * of that will have depth 1. 
+ */ + public void head(Node node, int depth); + + /** + * Callback for when a node is last visited, after all of its descendants have been visited. + * + * @param node the node being visited. + * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node + * of that will have depth 1. + */ + public void tail(Node node, int depth); +} diff --git a/src/org/jsoup/select/QueryParser.java b/src/org/jsoup/select/QueryParser.java new file mode 100644 index 0000000000..d3cc36f91c --- /dev/null +++ b/src/org/jsoup/select/QueryParser.java @@ -0,0 +1,293 @@ +package org.jsoup.select; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; +import org.jsoup.parser.TokenQueue; + +/** + * Parses a CSS selector into an Evaluator tree. + */ +class QueryParser { + private final static String[] combinators = {",", ">", "+", "~", " "}; + + private TokenQueue tq; + private String query; + private List<Evaluator> evals = new ArrayList<Evaluator>(); + + /** + * Create a new QueryParser. + * @param query CSS query + */ + private QueryParser(String query) { + this.query = query; + this.tq = new TokenQueue(query); + } + + /** + * Parse a CSS query into an Evaluator. 
+ * @param query CSS query + * @return Evaluator + */ + public static Evaluator parse(String query) { + QueryParser p = new QueryParser(query); + return p.parse(); + } + + /** + * Parse the query + * @return Evaluator + */ + Evaluator parse() { + tq.consumeWhitespace(); + + if (tq.matchesAny(combinators)) { // if starts with a combinator, use root as elements + evals.add(new StructuralEvaluator.Root()); + combinator(tq.consume()); + } else { + findElements(); + } + + while (!tq.isEmpty()) { + // hierarchy and extras + boolean seenWhite = tq.consumeWhitespace(); + + if (tq.matchesAny(combinators)) { + combinator(tq.consume()); + } else if (seenWhite) { + combinator(' '); + } else { // E.class, E#id, E[attr] etc. AND + findElements(); // take next el, #. etc off queue + } + } + + if (evals.size() == 1) + return evals.get(0); + + return new CombiningEvaluator.And(evals); + } + + private void combinator(char combinator) { + tq.consumeWhitespace(); + String subQuery = consumeSubQuery(); // support multi > childs + + Evaluator rootEval; // the new topmost evaluator + Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or. 
+ Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator + boolean replaceRightMost = false; + + if (evals.size() == 1) { + rootEval = currentEval = evals.get(0); + // make sure OR (,) has precedence: + if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') { + currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator(); + replaceRightMost = true; + } + } + else { + rootEval = currentEval = new CombiningEvaluator.And(evals); + } + evals.clear(); + + // for most combinators: change the current eval into an AND of the current eval and the new eval + if (combinator == '>') + currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediateParent(currentEval)); + else if (combinator == ' ') + currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.Parent(currentEval)); + else if (combinator == '+') + currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediatePreviousSibling(currentEval)); + else if (combinator == '~') + currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.PreviousSibling(currentEval)); + else if (combinator == ',') { // group or. 
+ CombiningEvaluator.Or or; + if (currentEval instanceof CombiningEvaluator.Or) { + or = (CombiningEvaluator.Or) currentEval; + or.add(newEval); + } else { + or = new CombiningEvaluator.Or(); + or.add(currentEval); + or.add(newEval); + } + currentEval = or; + } + else + throw new Selector.SelectorParseException("Unknown combinator: " + combinator); + + if (replaceRightMost) + ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval); + else rootEval = currentEval; + evals.add(rootEval); + } + + private String consumeSubQuery() { + StringBuilder sq = new StringBuilder(); + while (!tq.isEmpty()) { + if (tq.matches("(")) + sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); + else if (tq.matches("[")) + sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); + else if (tq.matchesAny(combinators)) + break; + else + sq.append(tq.consume()); + } + return sq.toString(); + } + + private void findElements() { + if (tq.matchChomp("#")) + byId(); + else if (tq.matchChomp(".")) + byClass(); + else if (tq.matchesWord()) + byTag(); + else if (tq.matches("[")) + byAttribute(); + else if (tq.matchChomp("*")) + allElements(); + else if (tq.matchChomp(":lt(")) + indexLessThan(); + else if (tq.matchChomp(":gt(")) + indexGreaterThan(); + else if (tq.matchChomp(":eq(")) + indexEquals(); + else if (tq.matches(":has(")) + has(); + else if (tq.matches(":contains(")) + contains(false); + else if (tq.matches(":containsOwn(")) + contains(true); + else if (tq.matches(":matches(")) + matches(false); + else if (tq.matches(":matchesOwn(")) + matches(true); + else if (tq.matches(":not(")) + not(); + else // unhandled + throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); + + } + + private void byId() { + String id = tq.consumeCssIdentifier(); + Validate.notEmpty(id); + evals.add(new Evaluator.Id(id)); + } + + private void byClass() { + String className = tq.consumeCssIdentifier(); + 
Validate.notEmpty(className); + evals.add(new Evaluator.Class(className.trim().toLowerCase())); + } + + private void byTag() { + String tagName = tq.consumeElementSelector(); + Validate.notEmpty(tagName); + + // namespaces: if element name is "abc:def", selector must be "abc|def", so flip: + if (tagName.contains("|")) + tagName = tagName.replace("|", ":"); + + evals.add(new Evaluator.Tag(tagName.trim().toLowerCase())); + } + + private void byAttribute() { + TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue + String key = cq.consumeToAny("=", "!=", "^=", "$=", "*=", "~="); // eq, not, start, end, contain, match, (no val) + Validate.notEmpty(key); + cq.consumeWhitespace(); + + if (cq.isEmpty()) { + if (key.startsWith("^")) + evals.add(new Evaluator.AttributeStarting(key.substring(1))); + else + evals.add(new Evaluator.Attribute(key)); + } else { + if (cq.matchChomp("=")) + evals.add(new Evaluator.AttributeWithValue(key, cq.remainder())); + + else if (cq.matchChomp("!=")) + evals.add(new Evaluator.AttributeWithValueNot(key, cq.remainder())); + + else if (cq.matchChomp("^=")) + evals.add(new Evaluator.AttributeWithValueStarting(key, cq.remainder())); + + else if (cq.matchChomp("$=")) + evals.add(new Evaluator.AttributeWithValueEnding(key, cq.remainder())); + + else if (cq.matchChomp("*=")) + evals.add(new Evaluator.AttributeWithValueContaining(key, cq.remainder())); + + else if (cq.matchChomp("~=")) + evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()))); + else + throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder()); + } + } + + private void allElements() { + evals.add(new Evaluator.AllElements()); + } + + // pseudo selectors :lt, :gt, :eq + private void indexLessThan() { + evals.add(new Evaluator.IndexLessThan(consumeIndex())); + } + + private void indexGreaterThan() { + evals.add(new Evaluator.IndexGreaterThan(consumeIndex())); 
+ } + + private void indexEquals() { + evals.add(new Evaluator.IndexEquals(consumeIndex())); + } + + private int consumeIndex() { + String indexS = tq.chompTo(")").trim(); + Validate.isTrue(StringUtil.isNumeric(indexS), "Index must be numeric"); + return Integer.parseInt(indexS); + } + + // pseudo selector :has(el) + private void has() { + tq.consume(":has"); + String subQuery = tq.chompBalanced('(', ')'); + Validate.notEmpty(subQuery, ":has(el) subselect must not be empty"); + evals.add(new StructuralEvaluator.Has(parse(subQuery))); + } + + // pseudo selector :contains(text), containsOwn(text) + private void contains(boolean own) { + tq.consume(own ? ":containsOwn" : ":contains"); + String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); + Validate.notEmpty(searchText, ":contains(text) query must not be empty"); + if (own) + evals.add(new Evaluator.ContainsOwnText(searchText)); + else + evals.add(new Evaluator.ContainsText(searchText)); + } + + // :matches(regex), matchesOwn(regex) + private void matches(boolean own) { + tq.consume(own ? 
":matchesOwn" : ":matches"); + String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped + Validate.notEmpty(regex, ":matches(regex) query must not be empty"); + + if (own) + evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex))); + else + evals.add(new Evaluator.Matches(Pattern.compile(regex))); + } + + // :not(selector) + private void not() { + tq.consume(":not"); + String subQuery = tq.chompBalanced('(', ')'); + Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty"); + + evals.add(new StructuralEvaluator.Not(parse(subQuery))); + } +} diff --git a/src/org/jsoup/select/Selector.java b/src/org/jsoup/select/Selector.java new file mode 100644 index 0000000000..8fc6286798 --- /dev/null +++ b/src/org/jsoup/select/Selector.java @@ -0,0 +1,126 @@ +package org.jsoup.select; + +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Element; + +import java.util.Collection; +import java.util.LinkedHashSet; + +/** + * CSS-like element selector, that finds elements matching a query. + * <p/> + * <h2>Selector syntax</h2> + * A selector is a chain of simple selectors, separated by combinators. Selectors are case insensitive (including against + * elements, attributes, and attribute values). + * <p/> + * The universal selector (*) is implicit when no element selector is supplied (i.e. {@code *.header} and {@code .header} + * is equivalent). 
+ * <p/> + * <table> + * <tr><th>Pattern</th><th>Matches</th><th>Example</th></tr> + * <tr><td><code>*</code></td><td>any element</td><td><code>*</code></td></tr> + * <tr><td><code>tag</code></td><td>elements with the given tag name</td><td><code>div</code></td></tr> + * <tr><td><code>ns|E</code></td><td>elements of type E in the namespace <i>ns</i></td><td><code>fb|name</code> finds <code>&lt;fb:name&gt;</code> elements</td></tr> + * <tr><td><code>#id</code></td><td>elements with attribute ID of "id"</td><td><code>div#wrap</code>, <code>#logo</code></td></tr> + * <tr><td><code>.class</code></td><td>elements with a class name of "class"</td><td><code>div.left</code>, <code>.result</code></td></tr> + * <tr><td><code>[attr]</code></td><td>elements with an attribute named "attr" (with any value)</td><td><code>a[href]</code>, <code>[title]</code></td></tr> + * <tr><td><code>[^attrPrefix]</code></td><td>elements with an attribute name starting with "attrPrefix". Use to find elements with HTML5 datasets</td><td><code>[^data-]</code>, <code>div[^data-]</code></td></tr> + * <tr><td><code>[attr=val]</code></td><td>elements with an attribute named "attr", and value equal to "val"</td><td><code>img[width=500]</code>, <code>a[rel=nofollow]</code></td></tr> + * <tr><td><code>[attr^=valPrefix]</code></td><td>elements with an attribute named "attr", and value starting with "valPrefix"</td><td><code>a[href^=http:]</code></td></tr> + * <tr><td><code>[attr$=valSuffix]</code></td><td>elements with an attribute named "attr", and value ending with "valSuffix"</td><td><code>img[src$=.png]</code></td></tr> + * <tr><td><code>[attr*=valContaining]</code></td><td>elements with an attribute named "attr", and value containing "valContaining"</td><td><code>a[href*=/search/]</code></td></tr> + * <tr><td><code>[attr~=<em>regex</em>]</code></td><td>elements with an attribute named "attr", and value matching the regular expression</td><td><code>img[src~=(?i)\\.(png|jpe?g)]</code></td></tr> + * 
<tr><td></td><td>The above may be combined in any order</td><td><code>div.header[title]</code></td></tr> + * <tr><td colspan="3"><h3>Combinators</h3></td></tr> + * <tr><td><code>E F</code></td><td>an F element descended from an E element</td><td><code>div a</code>, <code>.logo h1</code></td></tr> + * <tr><td><code>E &gt; F</code></td><td>an F direct child of E</td><td><code>ol &gt; li</code></td></tr> + * <tr><td><code>E + F</code></td><td>an F element immediately preceded by sibling E</td><td><code>li + li</code>, <code>div.head + div</code></td></tr> + * <tr><td><code>E ~ F</code></td><td>an F element preceded by sibling E</td><td><code>h1 ~ p</code></td></tr> + * <tr><td><code>E, F, G</code></td><td>all matching elements E, F, or G</td><td><code>a[href], div, h3</code></td></tr> + * <tr><td colspan="3"><h3>Pseudo selectors</h3></td></tr> + * <tr><td><code>:lt(<em>n</em>)</code></td><td>elements whose sibling index is less than <em>n</em></td><td><code>td:lt(3)</code> finds the first 3 cells of each row</td></tr> + * <tr><td><code>:gt(<em>n</em>)</code></td><td>elements whose sibling index is greater than <em>n</em></td><td><code>td:gt(1)</code> finds cells after skipping the first two</td></tr> + * <tr><td><code>:eq(<em>n</em>)</code></td><td>elements whose sibling index is equal to <em>n</em></td><td><code>td:eq(0)</code> finds the first cell of each row</td></tr> + * <tr><td><code>:has(<em>selector</em>)</code></td><td>elements that contain at least one element matching the <em>selector</em></td><td><code>div:has(p)</code> finds divs that contain p elements </td></tr> + * <tr><td><code>:not(<em>selector</em>)</code></td><td>elements that do not match the <em>selector</em>. 
See also {@link Elements#not(String)}</td><td><code>div:not(.logo)</code> finds all divs that do not have the "logo" class.<br /><code>div:not(:has(div))</code> finds divs that do not contain divs.</td></tr> + * <tr><td><code>:contains(<em>text</em>)</code></td><td>elements that contain the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants.</td><td><code>p:contains(jsoup)</code> finds p elements containing the text "jsoup".</td></tr> + * <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements whose text matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr> + * <tr><td><code>:containsOwn(<em>text</em>)</code></td><td>elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.</td><td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup".</td></tr> + * <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. 
<code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr> + * <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr> + * </table> + * + * @author Jonathan Hedley, jonathan@hedley.net + * @see Element#select(String) + */ +public class Selector { + private final Evaluator evaluator; + private final Element root; + + private Selector(String query, Element root) { + Validate.notNull(query); + query = query.trim(); + Validate.notEmpty(query); + Validate.notNull(root); + + this.evaluator = QueryParser.parse(query); + + this.root = root; + } + + /** + * Find elements matching selector. + * + * @param query CSS selector + * @param root root element to descend into + * @return matching elements, empty if not + */ + public static Elements select(String query, Element root) { + return new Selector(query, root).select(); + } + + /** + * Find elements matching selector. + * + * @param query CSS selector + * @param roots root elements to descend into + * @return matching elements, empty if not + */ + public static Elements select(String query, Iterable<Element> roots) { + Validate.notEmpty(query); + Validate.notNull(roots); + LinkedHashSet<Element> elements = new LinkedHashSet<Element>(); + + for (Element root : roots) { + elements.addAll(select(query, root)); + } + return new Elements(elements); + } + + private Elements select() { + return Collector.collect(evaluator, root); + } + + // exclude set. package open so that Elements can implement .not() selector. 
+ static Elements filterOut(Collection<Element> elements, Collection<Element> outs) { + Elements output = new Elements(); + for (Element el : elements) { + boolean found = false; + for (Element out : outs) { + if (el.equals(out)) { + found = true; + break; + } + } + if (!found) + output.add(el); + } + return output; + } + + public static class SelectorParseException extends IllegalStateException { + public SelectorParseException(String msg, Object... params) { + super(String.format(msg, params)); + } + } +} diff --git a/src/org/jsoup/select/StructuralEvaluator.java b/src/org/jsoup/select/StructuralEvaluator.java new file mode 100644 index 0000000000..69e8a62e58 --- /dev/null +++ b/src/org/jsoup/select/StructuralEvaluator.java @@ -0,0 +1,132 @@ +package org.jsoup.select; + +import org.jsoup.nodes.Element; + +/** + * Base structural evaluator. + */ +abstract class StructuralEvaluator extends Evaluator { + Evaluator evaluator; + + static class Root extends Evaluator { + public boolean matches(Element root, Element element) { + return root == element; + } + } + + static class Has extends StructuralEvaluator { + public Has(Evaluator evaluator) { + this.evaluator = evaluator; + } + + public boolean matches(Element root, Element element) { + for (Element e : element.getAllElements()) { + if (e != element && evaluator.matches(root, e)) + return true; + } + return false; + } + + public String toString() { + return String.format(":has(%s)", evaluator); + } + } + + static class Not extends StructuralEvaluator { + public Not(Evaluator evaluator) { + this.evaluator = evaluator; + } + + public boolean matches(Element root, Element node) { + return !evaluator.matches(root, node); + } + + public String toString() { + return String.format(":not%s", evaluator); + } + } + + static class Parent extends StructuralEvaluator { + public Parent(Evaluator evaluator) { + this.evaluator = evaluator; + } + + public boolean matches(Element root, Element element) { + if (root == element) + 
return false; + + Element parent = element.parent(); + while (parent != root) { + if (evaluator.matches(root, parent)) + return true; + parent = parent.parent(); + } + return false; + } + + public String toString() { + return String.format(":parent%s", evaluator); + } + } + + static class ImmediateParent extends StructuralEvaluator { + public ImmediateParent(Evaluator evaluator) { + this.evaluator = evaluator; + } + + public boolean matches(Element root, Element element) { + if (root == element) + return false; + + Element parent = element.parent(); + return parent != null && evaluator.matches(root, parent); + } + + public String toString() { + return String.format(":ImmediateParent%s", evaluator); + } + } + + static class PreviousSibling extends StructuralEvaluator { + public PreviousSibling(Evaluator evaluator) { + this.evaluator = evaluator; + } + + public boolean matches(Element root, Element element) { + if (root == element) + return false; + + Element prev = element.previousElementSibling(); + + while (prev != null) { + if (evaluator.matches(root, prev)) + return true; + + prev = prev.previousElementSibling(); + } + return false; + } + + public String toString() { + return String.format(":prev*%s", evaluator); + } + } + + static class ImmediatePreviousSibling extends StructuralEvaluator { + public ImmediatePreviousSibling(Evaluator evaluator) { + this.evaluator = evaluator; + } + + public boolean matches(Element root, Element element) { + if (root == element) + return false; + + Element prev = element.previousElementSibling(); + return prev != null && evaluator.matches(root, prev); + } + + public String toString() { + return String.format(":prev%s", evaluator); + } + } +} diff --git a/src/org/jsoup/select/package-info.java b/src/org/jsoup/select/package-info.java new file mode 100644 index 0000000000..a6e6a2fa0f --- /dev/null +++ b/src/org/jsoup/select/package-info.java @@ -0,0 +1,4 @@ +/** + Packages to support the CSS-style element selector. 
+ */ +package org.jsoup.select;
\ No newline at end of file |