From 38212596d91e9e167253d7debb154d18e3ff38b0 Mon Sep 17 00:00:00 2001 From: Artur Signell Date: Tue, 28 Aug 2012 20:00:00 +0300 Subject: [PATCH] Jsoup is now declared as a dependency (#9299) --- server/src/org/jsoup/Connection.java | 607 ----- server/src/org/jsoup/Jsoup.java | 293 --- .../org/jsoup/examples/HtmlToPlainText.java | 128 -- server/src/org/jsoup/examples/ListLinks.java | 60 - .../src/org/jsoup/examples/package-info.java | 4 - server/src/org/jsoup/helper/DataUtil.java | 186 -- .../jsoup/helper/DescendableLinkedList.java | 99 - .../src/org/jsoup/helper/HttpConnection.java | 814 ------- server/src/org/jsoup/helper/StringUtil.java | 167 -- server/src/org/jsoup/helper/Validate.java | 150 -- server/src/org/jsoup/nodes/Attribute.java | 167 -- server/src/org/jsoup/nodes/Attributes.java | 315 --- server/src/org/jsoup/nodes/Comment.java | 56 - server/src/org/jsoup/nodes/DataNode.java | 82 - server/src/org/jsoup/nodes/Document.java | 402 ---- server/src/org/jsoup/nodes/DocumentType.java | 56 - server/src/org/jsoup/nodes/Element.java | 1355 ----------- server/src/org/jsoup/nodes/Entities.java | 217 -- server/src/org/jsoup/nodes/Node.java | 727 ------ server/src/org/jsoup/nodes/TextNode.java | 206 -- .../src/org/jsoup/nodes/XmlDeclaration.java | 61 - .../org/jsoup/nodes/entities-base.properties | 106 - .../org/jsoup/nodes/entities-full.properties | 2032 ----------------- server/src/org/jsoup/nodes/package-info.java | 4 - server/src/org/jsoup/package-info.java | 4 - .../src/org/jsoup/parser/CharacterReader.java | 244 -- .../src/org/jsoup/parser/HtmlTreeBuilder.java | 754 ------ .../jsoup/parser/HtmlTreeBuilderState.java | 1671 -------------- server/src/org/jsoup/parser/ParseError.java | 43 - .../src/org/jsoup/parser/ParseErrorList.java | 34 - server/src/org/jsoup/parser/Parser.java | 198 -- server/src/org/jsoup/parser/Tag.java | 298 --- server/src/org/jsoup/parser/Token.java | 253 -- server/src/org/jsoup/parser/TokenQueue.java | 473 ---- server/src/org/jsoup/parser/Tokeniser.java | 264 --- .../src/org/jsoup/parser/TokeniserState.java | 1870 --------------- server/src/org/jsoup/parser/TreeBuilder.java | 61 - .../src/org/jsoup/parser/XmlTreeBuilder.java | 121 - server/src/org/jsoup/parser/package-info.java | 5 - server/src/org/jsoup/safety/Cleaner.java | 161 -- server/src/org/jsoup/safety/Whitelist.java | 509 ----- server/src/org/jsoup/safety/package-info.java | 5 - server/src/org/jsoup/select/Collector.java | 58 - .../org/jsoup/select/CombiningEvaluator.java | 102 - server/src/org/jsoup/select/Elements.java | 704 ------ server/src/org/jsoup/select/Evaluator.java | 474 ---- .../src/org/jsoup/select/NodeTraversor.java | 55 - server/src/org/jsoup/select/NodeVisitor.java | 39 - server/src/org/jsoup/select/QueryParser.java | 334 --- server/src/org/jsoup/select/Selector.java | 278 --- .../org/jsoup/select/StructuralEvaluator.java | 152 -- server/src/org/jsoup/select/package-info.java | 4 - 52 files changed, 17462 deletions(-) delete mode 100644 server/src/org/jsoup/Connection.java delete mode 100644 server/src/org/jsoup/Jsoup.java delete mode 100644 server/src/org/jsoup/examples/HtmlToPlainText.java delete mode 100644 server/src/org/jsoup/examples/ListLinks.java delete mode 100644 server/src/org/jsoup/examples/package-info.java delete mode 100644 server/src/org/jsoup/helper/DataUtil.java delete mode 100644 server/src/org/jsoup/helper/DescendableLinkedList.java delete mode 100644 server/src/org/jsoup/helper/HttpConnection.java delete mode 100644 server/src/org/jsoup/helper/StringUtil.java delete mode 100644 server/src/org/jsoup/helper/Validate.java delete mode 100644 server/src/org/jsoup/nodes/Attribute.java delete mode 100644 server/src/org/jsoup/nodes/Attributes.java delete mode 100644 server/src/org/jsoup/nodes/Comment.java delete mode 100644 server/src/org/jsoup/nodes/DataNode.java delete mode 100644 server/src/org/jsoup/nodes/Document.java delete mode 100644 server/src/org/jsoup/nodes/DocumentType.java delete mode 100644 server/src/org/jsoup/nodes/Element.java delete mode 100644 server/src/org/jsoup/nodes/Entities.java delete mode 100644 server/src/org/jsoup/nodes/Node.java delete mode 100644 server/src/org/jsoup/nodes/TextNode.java delete mode 100644 server/src/org/jsoup/nodes/XmlDeclaration.java delete mode 100644 server/src/org/jsoup/nodes/entities-base.properties delete mode 100644 server/src/org/jsoup/nodes/entities-full.properties delete mode 100644 server/src/org/jsoup/nodes/package-info.java delete mode 100644 server/src/org/jsoup/package-info.java delete mode 100644 server/src/org/jsoup/parser/CharacterReader.java delete mode 100644 server/src/org/jsoup/parser/HtmlTreeBuilder.java delete mode 100644 server/src/org/jsoup/parser/HtmlTreeBuilderState.java delete mode 100644 server/src/org/jsoup/parser/ParseError.java delete mode 100644 server/src/org/jsoup/parser/ParseErrorList.java delete mode 100644 server/src/org/jsoup/parser/Parser.java delete mode 100644 server/src/org/jsoup/parser/Tag.java delete mode 100644 server/src/org/jsoup/parser/Token.java delete mode 100644 server/src/org/jsoup/parser/TokenQueue.java delete mode 100644 server/src/org/jsoup/parser/Tokeniser.java delete mode 100644 server/src/org/jsoup/parser/TokeniserState.java delete mode 100644 server/src/org/jsoup/parser/TreeBuilder.java delete mode 100644 server/src/org/jsoup/parser/XmlTreeBuilder.java delete mode 100644 server/src/org/jsoup/parser/package-info.java delete mode 100644 server/src/org/jsoup/safety/Cleaner.java delete mode 100644 server/src/org/jsoup/safety/Whitelist.java delete mode 100644 server/src/org/jsoup/safety/package-info.java delete mode 100644 server/src/org/jsoup/select/Collector.java delete mode 100644 server/src/org/jsoup/select/CombiningEvaluator.java delete mode 100644 server/src/org/jsoup/select/Elements.java delete mode 100644 server/src/org/jsoup/select/Evaluator.java delete mode 100644 server/src/org/jsoup/select/NodeTraversor.java delete mode 100644 server/src/org/jsoup/select/NodeVisitor.java delete mode 100644 server/src/org/jsoup/select/QueryParser.java delete mode 100644 server/src/org/jsoup/select/Selector.java delete mode 100644 server/src/org/jsoup/select/StructuralEvaluator.java delete mode 100644 server/src/org/jsoup/select/package-info.java diff --git a/server/src/org/jsoup/Connection.java b/server/src/org/jsoup/Connection.java deleted file mode 100644 index 1d9879bfb3..0000000000 --- a/server/src/org/jsoup/Connection.java +++ /dev/null @@ -1,607 +0,0 @@ -package org.jsoup; - -import java.io.IOException; -import java.net.URL; -import java.util.Collection; -import java.util.Map; - -import org.jsoup.nodes.Document; -import org.jsoup.parser.Parser; - -/** - * A Connection provides a convenient interface to fetch content from the web, - * and parse them into Documents. - *

- * To get a new Connection, use {@link org.jsoup.Jsoup#connect(String)}. - * Connections contain {@link Connection.Request} and - * {@link Connection.Response} objects. The request objects are reusable as - * prototype requests. - *

- * Request configuration can be made using either the shortcut methods in - * Connection (e.g. {@link #userAgent(String)}), or by methods in the - * Connection.Request object directly. All request configuration must be made - * before the request is executed. - *

- * The Connection interface is currently in beta and subject to change. - * Comments, suggestions, and bug reports are welcome. - */ -public interface Connection { - - /** - * GET and POST http methods. - */ - public enum Method { - GET, POST - } - - /** - * Set the request URL to fetch. The protocol must be HTTP or HTTPS. - * - * @param url - * URL to connect to - * @return this Connection, for chaining - */ - public Connection url(URL url); - - /** - * Set the request URL to fetch. The protocol must be HTTP or HTTPS. - * - * @param url - * URL to connect to - * @return this Connection, for chaining - */ - public Connection url(String url); - - /** - * Set the request user-agent header. - * - * @param userAgent - * user-agent to use - * @return this Connection, for chaining - */ - public Connection userAgent(String userAgent); - - /** - * Set the request timeouts (connect and read). If a timeout occurs, an - * IOException will be thrown. The default timeout is 3 seconds (3000 - * millis). A timeout of zero is treated as an infinite timeout. - * - * @param millis - * number of milliseconds (thousandths of a second) before timing - * out connects or reads. - * @return this Connection, for chaining - */ - public Connection timeout(int millis); - - /** - * Set the request referrer (aka "referer") header. - * - * @param referrer - * referrer to use - * @return this Connection, for chaining - */ - public Connection referrer(String referrer); - - /** - * Configures the connection to (not) follow server redirects. By default - * this is true. - * - * @param followRedirects - * true if server redirects should be followed. - * @return this Connection, for chaining - */ - public Connection followRedirects(boolean followRedirects); - - /** - * Set the request method to use, GET or POST. Default is GET. - * - * @param method - * HTTP request method - * @return this Connection, for chaining - */ - public Connection method(Method method); - - /** - * Configures the connection to not throw exceptions when a HTTP error - * occurs. (4xx - 5xx, e.g. 404 or 500). By default this is false; an - * IOException is thrown if an error is encountered. If set to true, - * the response is populated with the error body, and the status message - * will reflect the error. - * - * @param ignoreHttpErrors - * - false (default) if HTTP errors should be ignored. - * @return this Connection, for chaining - */ - public Connection ignoreHttpErrors(boolean ignoreHttpErrors); - - /** - * Ignore the document's Content-Type when parsing the response. By default - * this is false, an unrecognised content-type will cause an - * IOException to be thrown. (This is to prevent producing garbage by - * attempting to parse a JPEG binary image, for example.) Set to true to - * force a parse attempt regardless of content type. - * - * @param ignoreContentType - * set to true if you would like the content type ignored on - * parsing the response into a Document. - * @return this Connection, for chaining - */ - public Connection ignoreContentType(boolean ignoreContentType); - - /** - * Add a request data parameter. Request parameters are sent in the request - * query string for GETs, and in the request body for POSTs. A request may - * have multiple values of the same name. - * - * @param key - * data key - * @param value - * data value - * @return this Connection, for chaining - */ - public Connection data(String key, String value); - - /** - * Adds all of the supplied data to the request data parameters - * - * @param data - * map of data parameters - * @return this Connection, for chaining - */ - public Connection data(Map data); - - /** - * Add a number of request data parameters. Multiple parameters may be set - * at once, e.g.: - * .data("name", "jsoup", "language", "Java", "language", "English"); - * creates a query string like: - * ?name=jsoup&language=Java&language=English - * - * @param keyvals - * a set of key value pairs. - * @return this Connection, for chaining - */ - public Connection data(String... keyvals); - - /** - * Set a request header. - * - * @param name - * header name - * @param value - * header value - * @return this Connection, for chaining - * @see org.jsoup.Connection.Request#headers() - */ - public Connection header(String name, String value); - - /** - * Set a cookie to be sent in the request. - * - * @param name - * name of cookie - * @param value - * value of cookie - * @return this Connection, for chaining - */ - public Connection cookie(String name, String value); - - /** - * Adds each of the supplied cookies to the request. - * - * @param cookies - * map of cookie name -> value pairs - * @return this Connection, for chaining - */ - public Connection cookies(Map cookies); - - /** - * Provide an alternate parser to use when parsing the response to a - * Document. - * - * @param parser - * alternate parser - * @return this Connection, for chaining - */ - public Connection parser(Parser parser); - - /** - * Execute the request as a GET, and parse the result. - * - * @return parsed Document - * @throws IOException - * on error - */ - public Document get() throws IOException; - - /** - * Execute the request as a POST, and parse the result. - * - * @return parsed Document - * @throws IOException - * on error - */ - public Document post() throws IOException; - - /** - * Execute the request. - * - * @return a response object - * @throws IOException - * on error - */ - public Response execute() throws IOException; - - /** - * Get the request object associated with this connection - * - * @return request - */ - public Request request(); - - /** - * Set the connection's request - * - * @param request - * new request object - * @return this Connection, for chaining - */ - public Connection request(Request request); - - /** - * Get the response, once the request has been executed - * - * @return response - */ - public Response response(); - - /** - * Set the connection's response - * - * @param response - * new response - * @return this Connection, for chaining - */ - public Connection response(Response response); - - /** - * Common methods for Requests and Responses - * - * @param - * Type of Base, either Request or Response - */ - interface Base { - - /** - * Get the URL - * - * @return URL - */ - public URL url(); - - /** - * Set the URL - * - * @param url - * new URL - * @return this, for chaining - */ - public T url(URL url); - - /** - * Get the request method - * - * @return method - */ - public Method method(); - - /** - * Set the request method - * - * @param method - * new method - * @return this, for chaining - */ - public T method(Method method); - - /** - * Get the value of a header. This is a simplified header model, where a - * header may only have one value. - *

- * Header names are case insensitive. - * - * @param name - * name of header (case insensitive) - * @return value of header, or null if not set. - * @see #hasHeader(String) - * @see #cookie(String) - */ - public String header(String name); - - /** - * Set a header. This method will overwrite any existing header with the - * same case insensitive name. - * - * @param name - * Name of header - * @param value - * Value of header - * @return this, for chaining - */ - public T header(String name, String value); - - /** - * Check if a header is present - * - * @param name - * name of header (case insensitive) - * @return if the header is present in this request/response - */ - public boolean hasHeader(String name); - - /** - * Remove a header by name - * - * @param name - * name of header to remove (case insensitive) - * @return this, for chaining - */ - public T removeHeader(String name); - - /** - * Retrieve all of the request/response headers as a map - * - * @return headers - */ - public Map headers(); - - /** - * Get a cookie value by name from this request/response. - *

- * Response objects have a simplified cookie model. Each cookie set in - * the response is added to the response object's cookie key=value map. - * The cookie's path, domain, and expiry date are ignored. - * - * @param name - * name of cookie to retrieve. - * @return value of cookie, or null if not set - */ - public String cookie(String name); - - /** - * Set a cookie in this request/response. - * - * @param name - * name of cookie - * @param value - * value of cookie - * @return this, for chaining - */ - public T cookie(String name, String value); - - /** - * Check if a cookie is present - * - * @param name - * name of cookie - * @return if the cookie is present in this request/response - */ - public boolean hasCookie(String name); - - /** - * Remove a cookie by name - * - * @param name - * name of cookie to remove - * @return this, for chaining - */ - public T removeCookie(String name); - - /** - * Retrieve all of the request/response cookies as a map - * - * @return cookies - */ - public Map cookies(); - - } - - /** - * Represents a HTTP request. - */ - public interface Request extends Base { - /** - * Get the request timeout, in milliseconds. - * - * @return the timeout in milliseconds. - */ - public int timeout(); - - /** - * Update the request timeout. - * - * @param millis - * timeout, in milliseconds - * @return this Request, for chaining - */ - public Request timeout(int millis); - - /** - * Get the current followRedirects configuration. - * - * @return true if followRedirects is enabled. - */ - public boolean followRedirects(); - - /** - * Configures the request to (not) follow server redirects. By default - * this is true. - * - * @param followRedirects - * true if server redirects should be followed. - * @return this Request, for chaining - */ - public Request followRedirects(boolean followRedirects); - - /** - * Get the current ignoreHttpErrors configuration. - * - * @return true if errors will be ignored; false (default) if HTTP - * errors will cause an IOException to be thrown. - */ - public boolean ignoreHttpErrors(); - - /** - * Configures the request to ignore HTTP errors in the response. - * - * @param ignoreHttpErrors - * set to true to ignore HTTP errors. - * @return this Request, for chaining - */ - public Request ignoreHttpErrors(boolean ignoreHttpErrors); - - /** - * Get the current ignoreContentType configuration. - * - * @return true if invalid content-types will be ignored; false - * (default) if they will cause an IOException to be thrown. - */ - public boolean ignoreContentType(); - - /** - * Configures the request to ignore the Content-Type of the response. - * - * @param ignoreContentType - * set to true to ignore the content type. - * @return this Request, for chaining - */ - public Request ignoreContentType(boolean ignoreContentType); - - /** - * Add a data parameter to the request - * - * @param keyval - * data to add. - * @return this Request, for chaining - */ - public Request data(KeyVal keyval); - - /** - * Get all of the request's data parameters - * - * @return collection of keyvals - */ - public Collection data(); - - /** - * Specify the parser to use when parsing the document. - * - * @param parser - * parser to use. - * @return this Request, for chaining - */ - public Request parser(Parser parser); - - /** - * Get the current parser to use when parsing the document. - * - * @return current Parser - */ - public Parser parser(); - } - - /** - * Represents a HTTP response. - */ - public interface Response extends Base { - - /** - * Get the status code of the response. - * - * @return status code - */ - public int statusCode(); - - /** - * Get the status message of the response. - * - * @return status message - */ - public String statusMessage(); - - /** - * Get the character set name of the response. - * - * @return character set name - */ - public String charset(); - - /** - * Get the response content type (e.g. "text/html"); - * - * @return the response content type - */ - public String contentType(); - - /** - * Parse the body of the response as a Document. - * - * @return a parsed Document - * @throws IOException - * on error - */ - public Document parse() throws IOException; - - /** - * Get the body of the response as a plain string. - * - * @return body - */ - public String body(); - - /** - * Get the body of the response as an array of bytes. - * - * @return body bytes - */ - public byte[] bodyAsBytes(); - } - - /** - * A Key Value tuple. - */ - public interface KeyVal { - - /** - * Update the key of a keyval - * - * @param key - * new key - * @return this KeyVal, for chaining - */ - public KeyVal key(String key); - - /** - * Get the key of a keyval - * - * @return the key - */ - public String key(); - - /** - * Update the value of a keyval - * - * @param value - * the new value - * @return this KeyVal, for chaining - */ - public KeyVal value(String value); - - /** - * Get the value of a keyval - * - * @return the value - */ - public String value(); - } -} diff --git a/server/src/org/jsoup/Jsoup.java b/server/src/org/jsoup/Jsoup.java deleted file mode 100644 index b5429d9410..0000000000 --- a/server/src/org/jsoup/Jsoup.java +++ /dev/null @@ -1,293 +0,0 @@ -package org.jsoup; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; - -import org.jsoup.helper.DataUtil; -import org.jsoup.helper.HttpConnection; -import org.jsoup.nodes.Document; -import org.jsoup.parser.Parser; -import org.jsoup.safety.Cleaner; -import org.jsoup.safety.Whitelist; - -/** - * The core public access point to the jsoup functionality. - * - * @author Jonathan Hedley - */ -public class Jsoup { - private Jsoup() { - } - - /** - * Parse HTML into a Document. The parser will make a sensible, balanced - * document tree out of any HTML. - * - * @param html - * HTML to parse - * @param baseUri - * The URL where the HTML was retrieved from. Used to resolve - * relative URLs to absolute URLs, that occur before the HTML - * declares a {@code } tag. - * @return sane HTML - */ - public static Document parse(String html, String baseUri) { - return Parser.parse(html, baseUri); - } - - /** - * Parse HTML into a Document, using the provided Parser. You can provide an - * alternate parser, such as a simple XML (non-HTML) parser. - * - * @param html - * HTML to parse - * @param baseUri - * The URL where the HTML was retrieved from. Used to resolve - * relative URLs to absolute URLs, that occur before the HTML - * declares a {@code } tag. - * @param parser - * alternate {@link Parser#xmlParser() parser} to use. - * @return sane HTML - */ - public static Document parse(String html, String baseUri, Parser parser) { - return parser.parseInput(html, baseUri); - } - - /** - * Parse HTML into a Document. As no base URI is specified, absolute URL - * detection relies on the HTML including a {@code } tag. - * - * @param html - * HTML to parse - * @return sane HTML - * @see #parse(String, String) - */ - public static Document parse(String html) { - return Parser.parse(html, ""); - } - - /** - * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML - * page. - *

- * Use examples: - *

- * - * @param url - * URL to connect to. The protocol must be {@code http} or - * {@code https}. - * @return the connection. You can add data, cookies, and headers; set the - * user-agent, referrer, method; and then execute. - */ - public static Connection connect(String url) { - return HttpConnection.connect(url); - } - - /** - * Parse the contents of a file as HTML. - * - * @param in - * file to load HTML from - * @param charsetName - * (optional) character set of file contents. Set to {@code null} - * to determine from {@code http-equiv} meta tag, if present, or - * fall back to {@code UTF-8} (which is often safe to do). - * @param baseUri - * The URL where the HTML was retrieved from, to resolve relative - * links against. - * @return sane HTML - * @throws IOException - * if the file could not be found, or read, or if the - * charsetName is invalid. - */ - public static Document parse(File in, String charsetName, String baseUri) - throws IOException { - return DataUtil.load(in, charsetName, baseUri); - } - - /** - * Parse the contents of a file as HTML. The location of the file is used as - * the base URI to qualify relative URLs. - * - * @param in - * file to load HTML from - * @param charsetName - * (optional) character set of file contents. Set to {@code null} - * to determine from {@code http-equiv} meta tag, if present, or - * fall back to {@code UTF-8} (which is often safe to do). - * @return sane HTML - * @throws IOException - * if the file could not be found, or read, or if the - * charsetName is invalid. - * @see #parse(File, String, String) - */ - public static Document parse(File in, String charsetName) - throws IOException { - return DataUtil.load(in, charsetName, in.getAbsolutePath()); - } - - /** - * Read an input stream, and parse it to a Document. - * - * @param in - * input stream to read. Make sure to close it after parsing. - * @param charsetName - * (optional) character set of file contents. Set to {@code null} - * to determine from {@code http-equiv} meta tag, if present, or - * fall back to {@code UTF-8} (which is often safe to do). - * @param baseUri - * The URL where the HTML was retrieved from, to resolve relative - * links against. - * @return sane HTML - * @throws IOException - * if the file could not be found, or read, or if the - * charsetName is invalid. - */ - public static Document parse(InputStream in, String charsetName, - String baseUri) throws IOException { - return DataUtil.load(in, charsetName, baseUri); - } - - /** - * Read an input stream, and parse it to a Document. You can provide an - * alternate parser, such as a simple XML (non-HTML) parser. - * - * @param in - * input stream to read. Make sure to close it after parsing. - * @param charsetName - * (optional) character set of file contents. Set to {@code null} - * to determine from {@code http-equiv} meta tag, if present, or - * fall back to {@code UTF-8} (which is often safe to do). - * @param baseUri - * The URL where the HTML was retrieved from, to resolve relative - * links against. - * @param parser - * alternate {@link Parser#xmlParser() parser} to use. - * @return sane HTML - * @throws IOException - * if the file could not be found, or read, or if the - * charsetName is invalid. - */ - public static Document parse(InputStream in, String charsetName, - String baseUri, Parser parser) throws IOException { - return DataUtil.load(in, charsetName, baseUri, parser); - } - - /** - * Parse a fragment of HTML, with the assumption that it forms the - * {@code body} of the HTML. - * - * @param bodyHtml - * body HTML fragment - * @param baseUri - * URL to resolve relative URLs against. - * @return sane HTML document - * @see Document#body() - */ - public static Document parseBodyFragment(String bodyHtml, String baseUri) { - return Parser.parseBodyFragment(bodyHtml, baseUri); - } - - /** - * Parse a fragment of HTML, with the assumption that it forms the - * {@code body} of the HTML. - * - * @param bodyHtml - * body HTML fragment - * @return sane HTML document - * @see Document#body() - */ - public static Document parseBodyFragment(String bodyHtml) { - return Parser.parseBodyFragment(bodyHtml, ""); - } - - /** - * Fetch a URL, and parse it as HTML. Provided for compatibility; in most - * cases use {@link #connect(String)} instead. - *

- * The encoding character set is determined by the content-type header or - * http-equiv meta tag, or falls back to {@code UTF-8}. - * - * @param url - * URL to fetch (with a GET). The protocol must be {@code http} - * or {@code https}. - * @param timeoutMillis - * Connection and read timeout, in milliseconds. If exceeded, - * IOException is thrown. - * @return The parsed HTML. - * @throws IOException - * If the final server response != 200 OK (redirects are - * followed), or if there's an error reading the response - * stream. - * @see #connect(String) - */ - public static Document parse(URL url, int timeoutMillis) throws IOException { - Connection con = HttpConnection.connect(url); - con.timeout(timeoutMillis); - return con.get(); - } - - /** - * Get safe HTML from untrusted input HTML, by parsing input HTML and - * filtering it through a white-list of permitted tags and attributes. - * - * @param bodyHtml - * input untrusted HTML - * @param baseUri - * URL to resolve relative URLs against - * @param whitelist - * white-list of permitted HTML elements - * @return safe HTML - * @see Cleaner#clean(Document) - */ - public static String clean(String bodyHtml, String baseUri, - Whitelist whitelist) { - Document dirty = parseBodyFragment(bodyHtml, baseUri); - Cleaner cleaner = new Cleaner(whitelist); - Document clean = cleaner.clean(dirty); - return clean.body().html(); - } - - /** - * Get safe HTML from untrusted input HTML, by parsing input HTML and - * filtering it through a white-list of permitted tags and attributes. - * - * @param bodyHtml - * input untrusted HTML - * @param whitelist - * white-list of permitted HTML elements - * @return safe HTML - * @see Cleaner#clean(Document) - */ - public static String clean(String bodyHtml, Whitelist whitelist) { - return clean(bodyHtml, "", whitelist); - } - - /** - * Test if the input HTML has only tags and attributes allowed by the - * Whitelist. Useful for form validation. The input HTML should still be run - * through the cleaner to set up enforced attributes, and to tidy the - * output. - * - * @param bodyHtml - * HTML to test - * @param whitelist - * whitelist to test against - * @return true if no tags or attributes were removed; false otherwise - * @see #clean(String, org.jsoup.safety.Whitelist) - */ - public static boolean isValid(String bodyHtml, Whitelist whitelist) { - Document dirty = parseBodyFragment(bodyHtml, ""); - Cleaner cleaner = new Cleaner(whitelist); - return cleaner.isValid(dirty); - } - -} diff --git a/server/src/org/jsoup/examples/HtmlToPlainText.java b/server/src/org/jsoup/examples/HtmlToPlainText.java deleted file mode 100644 index 53e485be34..0000000000 --- a/server/src/org/jsoup/examples/HtmlToPlainText.java +++ /dev/null @@ -1,128 +0,0 @@ -package org.jsoup.examples; - -import java.io.IOException; - -import org.jsoup.Jsoup; -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; -import org.jsoup.select.NodeTraversor; -import org.jsoup.select.NodeVisitor; - -/** - * HTML to plain-text. This example program demonstrates the use of jsoup to - * convert HTML input to lightly-formatted plain-text. That is divergent from - * the general goal of jsoup's .text() methods, which is to get clean data from - * a scrape. - *

- * Note that this is a fairly simplistic formatter -- for real world use you'll - * want to embrace and extend. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class HtmlToPlainText { - public static void main(String... args) throws IOException { - Validate.isTrue(args.length == 1, "usage: supply url to fetch"); - String url = args[0]; - - // fetch the specified URL and parse to a HTML DOM - Document doc = Jsoup.connect(url).get(); - - HtmlToPlainText formatter = new HtmlToPlainText(); - String plainText = formatter.getPlainText(doc); - System.out.println(plainText); - } - - /** - * Format an Element to plain-text - * - * @param element - * the root element to format - * @return formatted text - */ - public String getPlainText(Element element) { - FormattingVisitor formatter = new FormattingVisitor(); - NodeTraversor traversor = new NodeTraversor(formatter); - traversor.traverse(element); // walk the DOM, and call .head() and - // .tail() for each node - - return formatter.toString(); - } - - // the formatting rules, implemented in a breadth-first DOM traverse - private class FormattingVisitor implements NodeVisitor { - private static final int maxWidth = 80; - private int width = 0; - private StringBuilder accum = new StringBuilder(); // holds the - // accumulated text - - // hit when the node is first seen - @Override - public void head(Node node, int depth) { - String name = node.nodeName(); - if (node instanceof TextNode) { - append(((TextNode) node).text()); // TextNodes carry all - // user-readable text in the - // DOM. - } else if (name.equals("li")) { - append("\n * "); - } - } - - // hit when all of the node's children (if any) have been visited - @Override - public void tail(Node node, int depth) { - String name = node.nodeName(); - if (name.equals("br")) { - append("\n"); - } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5")) { - append("\n\n"); - } else if (name.equals("a")) { - append(String.format(" <%s>", node.absUrl("href"))); - } - } - - // appends text to the string builder with a simple word wrap method - private void append(String text) { - if (text.startsWith("\n")) { - width = 0; // reset counter if starts with a newline. only from - // formats above, not in natural text - } - if (text.equals(" ") - && (accum.length() == 0 || StringUtil.in( - accum.substring(accum.length() - 1), " ", "\n"))) { - return; // don't accumulate long runs of empty spaces - } - - if (text.length() + width > maxWidth) { // won't fit, needs to wrap - String words[] = text.split("\\s+"); - for (int i = 0; i < words.length; i++) { - String word = words[i]; - boolean last = i == words.length - 1; - if (!last) { - word = word + " "; - } - if (word.length() + width > maxWidth) { // wrap and reset - // counter - accum.append("\n").append(word); - width = word.length(); - } else { - accum.append(word); - width += word.length(); - } - } - } else { // fits as is, without need to wrap text - accum.append(text); - width += text.length(); - } - } - - @Override - public String toString() { - return accum.toString(); - } - } -} diff --git a/server/src/org/jsoup/examples/ListLinks.java b/server/src/org/jsoup/examples/ListLinks.java deleted file mode 100644 index d57a488435..0000000000 --- a/server/src/org/jsoup/examples/ListLinks.java +++ /dev/null @@ -1,60 +0,0 @@ -package org.jsoup.examples; - -import java.io.IOException; - -import org.jsoup.Jsoup; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -/** - * Example program to list links from a URL. - */ -public class ListLinks { - public static void main(String[] args) throws IOException { - Validate.isTrue(args.length == 1, "usage: supply url to fetch"); - String url = args[0]; - print("Fetching %s...", url); - - Document doc = Jsoup.connect(url).get(); - Elements links = doc.select("a[href]"); - Elements media = doc.select("[src]"); - Elements imports = doc.select("link[href]"); - - print("\nMedia: (%d)", media.size()); - for (Element src : media) { - if (src.tagName().equals("img")) { - print(" * %s: <%s> %sx%s (%s)", src.tagName(), - src.attr("abs:src"), src.attr("width"), - src.attr("height"), trim(src.attr("alt"), 20)); - } else { - print(" * %s: <%s>", src.tagName(), src.attr("abs:src")); - } - } - - print("\nImports: (%d)", imports.size()); - for (Element link : imports) { - print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), - link.attr("rel")); - } - - print("\nLinks: (%d)", links.size()); - for (Element link : links) { - print(" * a: <%s> (%s)", link.attr("abs:href"), - trim(link.text(), 35)); - } - } - - private static void print(String msg, Object... args) { - System.out.println(String.format(msg, args)); - } - - private static String trim(String s, int width) { - if (s.length() > width) { - return s.substring(0, width - 1) + "."; - } else { - return s; - } - } -} diff --git a/server/src/org/jsoup/examples/package-info.java b/server/src/org/jsoup/examples/package-info.java deleted file mode 100644 index c312f430d4..0000000000 --- a/server/src/org/jsoup/examples/package-info.java +++ /dev/null @@ -1,4 +0,0 @@ -/** - Contains example programs and use of jsoup. See the jsoup cookbook. - */ -package org.jsoup.examples; \ No newline at end of file diff --git a/server/src/org/jsoup/helper/DataUtil.java b/server/src/org/jsoup/helper/DataUtil.java deleted file mode 100644 index 26b85ea7dc..0000000000 --- a/server/src/org/jsoup/helper/DataUtil.java +++ /dev/null @@ -1,186 +0,0 @@ -package org.jsoup.helper; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.parser.Parser; - -/** - * Internal static utilities for handling data. - * - */ -public class DataUtil { - private static final Pattern charsetPattern = Pattern - .compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)"); - static final String defaultCharset = "UTF-8"; // used if not found in header - // or meta charset - private static final int bufferSize = 0x20000; // ~130K. - - private DataUtil() { - } - - /** - * Loads a file to a Document. - * - * @param in - * file to load - * @param charsetName - * character set of input - * @param baseUri - * base URI of document, to resolve relative links against - * @return Document - * @throws IOException - * on IO error - */ - public static Document load(File in, String charsetName, String baseUri) - throws IOException { - FileInputStream inStream = null; - try { - inStream = new FileInputStream(in); - ByteBuffer byteData = readToByteBuffer(inStream); - return parseByteData(byteData, charsetName, baseUri, - Parser.htmlParser()); - } finally { - if (inStream != null) { - inStream.close(); - } - } - } - - /** - * Parses a Document from an input steam. - * - * @param in - * input stream to parse. You will need to close it. - * @param charsetName - * character set of input - * @param baseUri - * base URI of document, to resolve relative links against - * @return Document - * @throws IOException - * on IO error - */ - public static Document load(InputStream in, String charsetName, - String baseUri) throws IOException { - ByteBuffer byteData = readToByteBuffer(in); - return parseByteData(byteData, charsetName, baseUri, - Parser.htmlParser()); - } - - /** - * Parses a Document from an input steam, using the provided Parser. - * - * @param in - * input stream to parse. You will need to close it. - * @param charsetName - * character set of input - * @param baseUri - * base URI of document, to resolve relative links against - * @param parser - * alternate {@link Parser#xmlParser() parser} to use. - * @return Document - * @throws IOException - * on IO error - */ - public static Document load(InputStream in, String charsetName, - String baseUri, Parser parser) throws IOException { - ByteBuffer byteData = readToByteBuffer(in); - return parseByteData(byteData, charsetName, baseUri, parser); - } - - // reads bytes first into a buffer, then decodes with the appropriate - // charset. done this way to support - // switching the chartset midstream when a meta http-equiv tag defines the - // charset. - static Document parseByteData(ByteBuffer byteData, String charsetName, - String baseUri, Parser parser) { - String docData; - Document doc = null; - if (charsetName == null) { // determine from meta. safe parse as UTF-8 - // look for or HTML5 - docData = Charset.forName(defaultCharset).decode(byteData) - .toString(); - doc = parser.parseInput(docData, baseUri); - Element meta = doc.select( - "meta[http-equiv=content-type], meta[charset]").first(); - if (meta != null) { // if not found, will keep utf-8 as best attempt - String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta - .attr("content")) : meta.attr("charset"); - if (foundCharset != null && foundCharset.length() != 0 - && !foundCharset.equals(defaultCharset)) { // need to - // re-decode - charsetName = foundCharset; - byteData.rewind(); - docData = Charset.forName(foundCharset).decode(byteData) - .toString(); - doc = null; - } - } - } else { // specified by content type header (or by user on file load) - Validate.notEmpty( - charsetName, - "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); - docData = Charset.forName(charsetName).decode(byteData).toString(); - } - if (doc == null) { - // there are times where there is a spurious byte-order-mark at the - // start of the text. Shouldn't be present - // in utf-8. If after decoding, there is a BOM, strip it; otherwise - // will cause the parser to go straight - // into head mode - if (docData.charAt(0) == 65279) { - docData = docData.substring(1); - } - - doc = parser.parseInput(docData, baseUri); - doc.outputSettings().charset(charsetName); - } - return doc; - } - - static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException { - byte[] buffer = new byte[bufferSize]; - ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize); - int read; - while (true) { - read = inStream.read(buffer); - if (read == -1) { - break; - } - outStream.write(buffer, 0, read); - } - ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray()); - return byteData; - } - - /** - * Parse out a charset from a content type header. - * - * @param contentType - * e.g. "text/html; charset=EUC-JP" - * @return "EUC-JP", or null if not found. Charset is trimmed and - * uppercased. - */ - static String getCharsetFromContentType(String contentType) { - if (contentType == null) { - return null; - } - Matcher m = charsetPattern.matcher(contentType); - if (m.find()) { - return m.group(1).trim().toUpperCase(); - } - return null; - } - -} diff --git a/server/src/org/jsoup/helper/DescendableLinkedList.java b/server/src/org/jsoup/helper/DescendableLinkedList.java deleted file mode 100644 index 97595c34e6..0000000000 --- a/server/src/org/jsoup/helper/DescendableLinkedList.java +++ /dev/null @@ -1,99 +0,0 @@ -package org.jsoup.helper; - -import java.util.Iterator; -import java.util.LinkedList; -import java.util.ListIterator; - -/** - * Provides a descending iterator and other 1.6 methods to allow support on the - * 1.5 JRE. - */ -public class DescendableLinkedList extends LinkedList { - - /** - * Create a new DescendableLinkedList. - */ - public DescendableLinkedList() { - super(); - } - - /** - * Add a new element to the start of the list. - * - * @param e - * element to add - */ - @Override - public void push(E e) { - addFirst(e); - } - - /** - * Look at the last element, if there is one. - * - * @return the last element, or null - */ - @Override - public E peekLast() { - return size() == 0 ? null : getLast(); - } - - /** - * Remove and return the last element, if there is one - * - * @return the last element, or null - */ - @Override - public E pollLast() { - return size() == 0 ? null : removeLast(); - } - - /** - * Get an iterator that starts and the end of the list and works towards the - * start. - * - * @return an iterator that starts and the end of the list and works towards - * the start. - */ - @Override - public Iterator descendingIterator() { - return new DescendingIterator(size()); - } - - private class DescendingIterator implements Iterator { - private final ListIterator iter; - - @SuppressWarnings("unchecked") - private DescendingIterator(int index) { - iter = (ListIterator) listIterator(index); - } - - /** - * Check if there is another element on the list. - * - * @return if another element - */ - @Override - public boolean hasNext() { - return iter.hasPrevious(); - } - - /** - * Get the next element. - * - * @return the next element. - */ - @Override - public E next() { - return iter.previous(); - } - - /** - * Remove the current element. - */ - @Override - public void remove() { - iter.remove(); - } - } -} diff --git a/server/src/org/jsoup/helper/HttpConnection.java b/server/src/org/jsoup/helper/HttpConnection.java deleted file mode 100644 index a48f8972c2..0000000000 --- a/server/src/org/jsoup/helper/HttpConnection.java +++ /dev/null @@ -1,814 +0,0 @@ -package org.jsoup.helper; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.net.HttpURLConnection; -import java.net.MalformedURLException; -import java.net.URL; -import java.net.URLEncoder; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Collection; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.zip.GZIPInputStream; - -import org.jsoup.Connection; -import org.jsoup.nodes.Document; -import org.jsoup.parser.Parser; -import org.jsoup.parser.TokenQueue; - -/** - * Implementation of {@link Connection}. - * - * @see org.jsoup.Jsoup#connect(String) - */ -public class HttpConnection implements Connection { - public static Connection connect(String url) { - Connection con = new HttpConnection(); - con.url(url); - return con; - } - - public static Connection connect(URL url) { - Connection con = new HttpConnection(); - con.url(url); - return con; - } - - private Connection.Request req; - private Connection.Response res; - - private HttpConnection() { - req = new Request(); - res = new Response(); - } - - @Override - public Connection url(URL url) { - req.url(url); - return this; - } - - @Override - public Connection url(String url) { - Validate.notEmpty(url, "Must supply a valid URL"); - try { - req.url(new URL(url)); - } catch (MalformedURLException e) { - throw new IllegalArgumentException("Malformed URL: " + url, e); - } - return this; - } - - @Override - public Connection userAgent(String userAgent) { - Validate.notNull(userAgent, "User agent must not be null"); - req.header("User-Agent", userAgent); - return this; - } - - @Override - public Connection timeout(int millis) { - req.timeout(millis); - return this; - } - - @Override - public Connection followRedirects(boolean followRedirects) { - req.followRedirects(followRedirects); - return this; - } - - @Override - public Connection referrer(String referrer) { - Validate.notNull(referrer, "Referrer must not be null"); - req.header("Referer", referrer); - return this; - } - - @Override - public Connection method(Method method) { - req.method(method); - return this; - } - - @Override - public Connection ignoreHttpErrors(boolean ignoreHttpErrors) { - req.ignoreHttpErrors(ignoreHttpErrors); - return this; - } - - @Override - public Connection ignoreContentType(boolean ignoreContentType) { - req.ignoreContentType(ignoreContentType); - return this; - } - - @Override - public Connection data(String key, String value) { - req.data(KeyVal.create(key, value)); - return this; - } - - @Override - public Connection data(Map data) { - Validate.notNull(data, "Data map must not be null"); - for (Map.Entry entry : data.entrySet()) { - req.data(KeyVal.create(entry.getKey(), entry.getValue())); - } - return this; - } - - @Override - public Connection data(String... keyvals) { - Validate.notNull(keyvals, "Data key value pairs must not be null"); - Validate.isTrue(keyvals.length % 2 == 0, - "Must supply an even number of key value pairs"); - for (int i = 0; i < keyvals.length; i += 2) { - String key = keyvals[i]; - String value = keyvals[i + 1]; - Validate.notEmpty(key, "Data key must not be empty"); - Validate.notNull(value, "Data value must not be null"); - req.data(KeyVal.create(key, value)); - } - return this; - } - - @Override - public Connection header(String name, String value) { - req.header(name, value); - return this; - } - - @Override - public Connection cookie(String name, String value) { - req.cookie(name, value); - return this; - } - - @Override - public Connection cookies(Map cookies) { - Validate.notNull(cookies, "Cookie map must not be null"); - for (Map.Entry entry : cookies.entrySet()) { - req.cookie(entry.getKey(), entry.getValue()); - } - return this; - } - - @Override - public Connection parser(Parser parser) { - req.parser(parser); - return this; - } - - @Override - public Document get() throws IOException { - req.method(Method.GET); - execute(); - return res.parse(); - } - - @Override - public Document post() throws IOException { - req.method(Method.POST); - execute(); - return res.parse(); - } - - @Override - public Connection.Response execute() throws IOException { - res = Response.execute(req); - return res; - } - - @Override - public Connection.Request request() { - return req; - } - - @Override - public Connection request(Connection.Request request) { - req = request; - return this; - } - - @Override - public Connection.Response response() { - return res; - } - - @Override - public Connection response(Connection.Response response) { - res = response; - return this; - } - - @SuppressWarnings({ "unchecked" }) - private static abstract class Base implements - Connection.Base { - URL url; - Method method; - Map headers; - Map cookies; - - private Base() { - headers = new LinkedHashMap(); - cookies = new LinkedHashMap(); - } - - @Override - public URL url() { - return url; - } - - @Override - public T url(URL url) { - Validate.notNull(url, "URL must not be null"); - this.url = url; - return (T) this; - } - - @Override - public Method method() { - return method; - } - - @Override - public T method(Method method) { - Validate.notNull(method, "Method must not be null"); - this.method = method; - return (T) this; - } - - @Override - public String header(String name) { - Validate.notNull(name, "Header name must not be null"); - return getHeaderCaseInsensitive(name); - } - - @Override - public T header(String name, String value) { - Validate.notEmpty(name, "Header name must not be empty"); - Validate.notNull(value, "Header value must not be null"); - removeHeader(name); // ensures we don't get an "accept-encoding" and - // a "Accept-Encoding" - headers.put(name, value); - return (T) this; - } - - @Override - public boolean hasHeader(String name) { - Validate.notEmpty(name, "Header name must not be empty"); - return getHeaderCaseInsensitive(name) != null; - } - - @Override - public T removeHeader(String name) { - Validate.notEmpty(name, "Header name must not be empty"); - Map.Entry entry = scanHeaders(name); // remove is - // case - // insensitive - // too - if (entry != null) { - headers.remove(entry.getKey()); // ensures correct case - } - return (T) this; - } - - @Override - public Map headers() { - return headers; - } - - private String getHeaderCaseInsensitive(String name) { - Validate.notNull(name, "Header name must not be null"); - // quick evals for common case of title case, lower case, then scan - // for mixed - String value = headers.get(name); - if (value == null) { - value = headers.get(name.toLowerCase()); - } - if (value == null) { - Map.Entry entry = scanHeaders(name); - if (entry != null) { - value = entry.getValue(); - } - } - return value; - } - - private Map.Entry scanHeaders(String name) { - String lc = name.toLowerCase(); - for (Map.Entry entry : headers.entrySet()) { - if (entry.getKey().toLowerCase().equals(lc)) { - return entry; - } - } - return null; - } - - @Override - public String cookie(String name) { - Validate.notNull(name, "Cookie name must not be null"); - return cookies.get(name); - } - - @Override - public T cookie(String name, String value) { - Validate.notEmpty(name, "Cookie name must not be empty"); - Validate.notNull(value, "Cookie value must not be null"); - cookies.put(name, value); - return (T) this; - } - - @Override - public boolean hasCookie(String name) { - Validate.notEmpty("Cookie name must not be empty"); - return cookies.containsKey(name); - } - - @Override - public T removeCookie(String name) { - Validate.notEmpty("Cookie name must not be empty"); - cookies.remove(name); - return (T) this; - } - - @Override - public Map cookies() { - return cookies; - } - } - - public static class Request extends Base implements - Connection.Request { - private int timeoutMilliseconds; - private boolean followRedirects; - private Collection data; - private boolean ignoreHttpErrors = false; - private boolean ignoreContentType = false; - private Parser parser; - - private Request() { - timeoutMilliseconds = 3000; - followRedirects = true; - data = new ArrayList(); - method = Connection.Method.GET; - headers.put("Accept-Encoding", "gzip"); - parser = Parser.htmlParser(); - } - - @Override - public int timeout() { - return timeoutMilliseconds; - } - - @Override - public Request timeout(int millis) { - Validate.isTrue(millis >= 0, - "Timeout milliseconds must be 0 (infinite) or greater"); - timeoutMilliseconds = millis; - return this; - } - - @Override - public boolean followRedirects() { - return followRedirects; - } - - @Override - public Connection.Request followRedirects(boolean followRedirects) { - this.followRedirects = followRedirects; - return this; - } - - @Override - public boolean ignoreHttpErrors() { - return ignoreHttpErrors; - } - - @Override - public Connection.Request ignoreHttpErrors(boolean ignoreHttpErrors) { - this.ignoreHttpErrors = ignoreHttpErrors; - return this; - } - - @Override - public boolean ignoreContentType() { - return ignoreContentType; - } - - @Override - public Connection.Request ignoreContentType(boolean ignoreContentType) { - this.ignoreContentType = ignoreContentType; - return this; - } - - @Override - public Request data(Connection.KeyVal keyval) { - Validate.notNull(keyval, "Key val must not be null"); - data.add(keyval); - return this; - } - - @Override - public Collection data() { - return data; - } - - @Override - public Request parser(Parser parser) { - this.parser = parser; - return this; - } - - @Override - public Parser parser() { - return parser; - } - } - - public static class Response extends Base implements - Connection.Response { - private static final int MAX_REDIRECTS = 20; - private int statusCode; - private String statusMessage; - private ByteBuffer byteData; - private String charset; - private String contentType; - private boolean executed = false; - private int numRedirects = 0; - private Connection.Request req; - - Response() { - super(); - } - - private Response(Response previousResponse) throws IOException { - super(); - if (previousResponse != null) { - numRedirects = previousResponse.numRedirects + 1; - if (numRedirects >= MAX_REDIRECTS) { - throw new IOException( - String.format( - "Too many redirects occurred trying to load URL %s", - previousResponse.url())); - } - } - } - - static Response execute(Connection.Request req) throws IOException { - return execute(req, null); - } - - static Response execute(Connection.Request req, - Response previousResponse) throws IOException { - Validate.notNull(req, "Request must not be null"); - String protocol = req.url().getProtocol(); - Validate.isTrue( - protocol.equals("http") || protocol.equals("https"), - "Only http & https protocols supported"); - - // set up the request for execution - if (req.method() == Connection.Method.GET && req.data().size() > 0) { - serialiseRequestUrl(req); // appends query string - } - HttpURLConnection conn = createConnection(req); - conn.connect(); - if (req.method() == Connection.Method.POST) { - writePost(req.data(), conn.getOutputStream()); - } - - int status = conn.getResponseCode(); - boolean needsRedirect = false; - if (status != HttpURLConnection.HTTP_OK) { - if (status == HttpURLConnection.HTTP_MOVED_TEMP - || status == HttpURLConnection.HTTP_MOVED_PERM - || status == HttpURLConnection.HTTP_SEE_OTHER) { - needsRedirect = true; - } else if (!req.ignoreHttpErrors()) { - throw new IOException(status + " error loading URL " - + req.url().toString()); - } - } - Response res = new Response(previousResponse); - res.setupFromConnection(conn, previousResponse); - if (needsRedirect && req.followRedirects()) { - req.method(Method.GET); // always redirect with a get. any data - // param from original req are dropped. - req.data().clear(); - req.url(new URL(req.url(), res.header("Location"))); - for (Map.Entry cookie : res.cookies.entrySet()) { // add - // response - // cookies - // to - // request - // (for - // e.g. - // login - // posts) - req.cookie(cookie.getKey(), cookie.getValue()); - } - return execute(req, res); - } - res.req = req; - - InputStream bodyStream = null; - InputStream dataStream = null; - try { - dataStream = conn.getErrorStream() != null ? conn - .getErrorStream() : conn.getInputStream(); - bodyStream = res.hasHeader("Content-Encoding") - && res.header("Content-Encoding").equalsIgnoreCase( - "gzip") ? new BufferedInputStream( - new GZIPInputStream(dataStream)) - : new BufferedInputStream(dataStream); - - res.byteData = DataUtil.readToByteBuffer(bodyStream); - res.charset = DataUtil - .getCharsetFromContentType(res.contentType); // may be - // null, - // readInputStream - // deals - // with it - } finally { - if (bodyStream != null) { - bodyStream.close(); - } - if (dataStream != null) { - dataStream.close(); - } - } - - res.executed = true; - return res; - } - - @Override - public int statusCode() { - return statusCode; - } - - @Override - public String statusMessage() { - return statusMessage; - } - - @Override - public String charset() { - return charset; - } - - @Override - public String contentType() { - return contentType; - } - - @Override - public Document parse() throws IOException { - Validate.isTrue( - executed, - "Request must be executed (with .execute(), .get(), or .post() before parsing response"); - if (!req.ignoreContentType() - && (contentType == null || !(contentType - .startsWith("text/") - || contentType.startsWith("application/xml") || contentType - .startsWith("application/xhtml+xml")))) { - throw new IOException( - String.format( - "Unhandled content type \"%s\" on URL %s. Must be text/*, application/xml, or application/xhtml+xml", - contentType, url.toString())); - } - Document doc = DataUtil.parseByteData(byteData, charset, - url.toExternalForm(), req.parser()); - byteData.rewind(); - charset = doc.outputSettings().charset().name(); // update charset - // from meta-equiv, - // possibly - return doc; - } - - @Override - public String body() { - Validate.isTrue( - executed, - "Request must be executed (with .execute(), .get(), or .post() before getting response body"); - // charset gets set from header on execute, and from meta-equiv on - // parse. parse may not have happened yet - String body; - if (charset == null) { - body = Charset.forName(DataUtil.defaultCharset) - .decode(byteData).toString(); - } else { - body = Charset.forName(charset).decode(byteData).toString(); - } - byteData.rewind(); - return body; - } - - @Override - public byte[] bodyAsBytes() { - Validate.isTrue( - executed, - "Request must be executed (with .execute(), .get(), or .post() before getting response body"); - return byteData.array(); - } - - // set up connection defaults, and details from request - private static HttpURLConnection createConnection(Connection.Request req) - throws IOException { - HttpURLConnection conn = (HttpURLConnection) req.url() - .openConnection(); - conn.setRequestMethod(req.method().name()); - conn.setInstanceFollowRedirects(false); // don't rely on native - // redirection support - conn.setConnectTimeout(req.timeout()); - conn.setReadTimeout(req.timeout()); - if (req.method() == Method.POST) { - conn.setDoOutput(true); - } - if (req.cookies().size() > 0) { - conn.addRequestProperty("Cookie", getRequestCookieString(req)); - } - for (Map.Entry header : req.headers().entrySet()) { - conn.addRequestProperty(header.getKey(), header.getValue()); - } - return conn; - } - - // set up url, method, header, cookies - private void setupFromConnection(HttpURLConnection conn, - Connection.Response previousResponse) throws IOException { - method = Connection.Method.valueOf(conn.getRequestMethod()); - url = conn.getURL(); - statusCode = conn.getResponseCode(); - statusMessage = conn.getResponseMessage(); - contentType = conn.getContentType(); - - Map> resHeaders = conn.getHeaderFields(); - processResponseHeaders(resHeaders); - - // if from a redirect, map previous response cookies into this - // response - if (previousResponse != null) { - for (Map.Entry prevCookie : previousResponse - .cookies().entrySet()) { - if (!hasCookie(prevCookie.getKey())) { - cookie(prevCookie.getKey(), prevCookie.getValue()); - } - } - } - } - - void processResponseHeaders(Map> resHeaders) { - for (Map.Entry> entry : resHeaders.entrySet()) { - String name = entry.getKey(); - if (name == null) { - continue; // http/1.1 line - } - - List values = entry.getValue(); - if (name.equalsIgnoreCase("Set-Cookie")) { - for (String value : values) { - if (value == null) { - continue; - } - TokenQueue cd = new TokenQueue(value); - String cookieName = cd.chompTo("=").trim(); - String cookieVal = cd.consumeTo(";").trim(); - if (cookieVal == null) { - cookieVal = ""; - } - // ignores path, date, domain, secure et al. req'd? - // name not blank, value not null - if (cookieName != null && cookieName.length() > 0) { - cookie(cookieName, cookieVal); - } - } - } else { // only take the first instance of each header - if (!values.isEmpty()) { - header(name, values.get(0)); - } - } - } - } - - private static void writePost(Collection data, - OutputStream outputStream) throws IOException { - OutputStreamWriter w = new OutputStreamWriter(outputStream, - DataUtil.defaultCharset); - boolean first = true; - for (Connection.KeyVal keyVal : data) { - if (!first) { - w.append('&'); - } else { - first = false; - } - - w.write(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)); - w.write('='); - w.write(URLEncoder.encode(keyVal.value(), - DataUtil.defaultCharset)); - } - w.close(); - } - - private static String getRequestCookieString(Connection.Request req) { - StringBuilder sb = new StringBuilder(); - boolean first = true; - for (Map.Entry cookie : req.cookies().entrySet()) { - if (!first) { - sb.append("; "); - } else { - first = false; - } - sb.append(cookie.getKey()).append('=') - .append(cookie.getValue()); - // todo: spec says only ascii, no escaping / encoding defined. - // validate on set? or escape somehow here? - } - return sb.toString(); - } - - // for get url reqs, serialise the data map into the url - private static void serialiseRequestUrl(Connection.Request req) - throws IOException { - URL in = req.url(); - StringBuilder url = new StringBuilder(); - boolean first = true; - // reconstitute the query, ready for appends - url.append(in.getProtocol()).append("://") - .append(in.getAuthority()) // includes host, port - .append(in.getPath()).append("?"); - if (in.getQuery() != null) { - url.append(in.getQuery()); - first = false; - } - for (Connection.KeyVal keyVal : req.data()) { - if (!first) { - url.append('&'); - } else { - first = false; - } - url.append( - URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)) - .append('=') - .append(URLEncoder.encode(keyVal.value(), - DataUtil.defaultCharset)); - } - req.url(new URL(url.toString())); - req.data().clear(); // moved into url as get params - } - } - - public static class KeyVal implements Connection.KeyVal { - private String key; - private String value; - - public static KeyVal create(String key, String value) { - Validate.notEmpty(key, "Data key must not be empty"); - Validate.notNull(value, "Data value must not be null"); - return new KeyVal(key, value); - } - - private KeyVal(String key, String value) { - this.key = key; - this.value = value; - } - - @Override - public KeyVal key(String key) { - Validate.notEmpty(key, "Data key must not be empty"); - this.key = key; - return this; - } - - @Override - public String key() { - return key; - } - - @Override - public KeyVal value(String value) { - Validate.notNull(value, "Data value must not be null"); - this.value = value; - return this; - } - - @Override - public String value() { - return value; - } - - @Override - public String toString() { - return key + "=" + value; - } - } -} diff --git a/server/src/org/jsoup/helper/StringUtil.java b/server/src/org/jsoup/helper/StringUtil.java deleted file mode 100644 index 5a3d19b0aa..0000000000 --- a/server/src/org/jsoup/helper/StringUtil.java +++ /dev/null @@ -1,167 +0,0 @@ -package org.jsoup.helper; - -import java.util.Collection; -import java.util.Iterator; - -/** - * A minimal String utility class. Designed for internal jsoup use only. - */ -public final class StringUtil { - // memoised padding up to 10 - private static final String[] padding = { "", " ", " ", " ", " ", - " ", " ", " ", " ", " ", " " }; - - /** - * Join a collection of strings by a seperator - * - * @param strings - * collection of string objects - * @param sep - * string to place between strings - * @return joined string - */ - public static String join(Collection strings, String sep) { - return join(strings.iterator(), sep); - } - - /** - * Join a collection of strings by a seperator - * - * @param strings - * iterator of string objects - * @param sep - * string to place between strings - * @return joined string - */ - public static String join(Iterator strings, String sep) { - if (!strings.hasNext()) { - return ""; - } - - String start = strings.next().toString(); - if (!strings.hasNext()) { - return start; - } - - StringBuilder sb = new StringBuilder(64).append(start); - while (strings.hasNext()) { - sb.append(sep); - sb.append(strings.next()); - } - return sb.toString(); - } - - /** - * Returns space padding - * - * @param width - * amount of padding desired - * @return string of spaces * width - */ - public static String padding(int width) { - if (width < 0) { - throw new IllegalArgumentException("width must be > 0"); - } - - if (width < padding.length) { - return padding[width]; - } - - char[] out = new char[width]; - for (int i = 0; i < width; i++) { - out[i] = ' '; - } - return String.valueOf(out); - } - - /** - * Tests if a string is blank: null, emtpy, or only whitespace (" ", \r\n, - * \t, etc) - * - * @param string - * string to test - * @return if string is blank - */ - public static boolean isBlank(String string) { - if (string == null || string.length() == 0) { - return true; - } - - int l = string.length(); - for (int i = 0; i < l; i++) { - if (!StringUtil.isWhitespace(string.codePointAt(i))) { - return false; - } - } - return true; - } - - /** - * Tests if a string is numeric, i.e. contains only digit characters - * - * @param string - * string to test - * @return true if only digit chars, false if empty or null or contains - * non-digit chrs - */ - public static boolean isNumeric(String string) { - if (string == null || string.length() == 0) { - return false; - } - - int l = string.length(); - for (int i = 0; i < l; i++) { - if (!Character.isDigit(string.codePointAt(i))) { - return false; - } - } - return true; - } - - /** - * Tests if a code point is "whitespace" as defined in the HTML spec. - * - * @param c - * code point to test - * @return true if code point is whitespace, false otherwise - */ - public static boolean isWhitespace(int c) { - return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'; - } - - public static String normaliseWhitespace(String string) { - StringBuilder sb = new StringBuilder(string.length()); - - boolean lastWasWhite = false; - boolean modified = false; - - int l = string.length(); - for (int i = 0; i < l; i++) { - int c = string.codePointAt(i); - if (isWhitespace(c)) { - if (lastWasWhite) { - modified = true; - continue; - } - if (c != ' ') { - modified = true; - } - sb.append(' '); - lastWasWhite = true; - } else { - sb.appendCodePoint(c); - lastWasWhite = false; - } - } - return modified ? sb.toString() : string; - } - - public static boolean in(String needle, String... haystack) { - for (String hay : haystack) { - if (hay.equals(needle)) { - return true; - } - } - return false; - } -} diff --git a/server/src/org/jsoup/helper/Validate.java b/server/src/org/jsoup/helper/Validate.java deleted file mode 100644 index e9fe04f87b..0000000000 --- a/server/src/org/jsoup/helper/Validate.java +++ /dev/null @@ -1,150 +0,0 @@ -package org.jsoup.helper; - -/** - * Simple validation methods. Designed for jsoup internal use - */ -public final class Validate { - - private Validate() { - } - - /** - * Validates that the object is not null - * - * @param obj - * object to test - */ - public static void notNull(Object obj) { - if (obj == null) { - throw new IllegalArgumentException("Object must not be null"); - } - } - - /** - * Validates that the object is not null - * - * @param obj - * object to test - * @param msg - * message to output if validation fails - */ - public static void notNull(Object obj, String msg) { - if (obj == null) { - throw new IllegalArgumentException(msg); - } - } - - /** - * Validates that the value is true - * - * @param val - * object to test - */ - public static void isTrue(boolean val) { - if (!val) { - throw new IllegalArgumentException("Must be true"); - } - } - - /** - * Validates that the value is true - * - * @param val - * object to test - * @param msg - * message to output if validation fails - */ - public static void isTrue(boolean val, String msg) { - if (!val) { - throw new IllegalArgumentException(msg); - } - } - - /** - * Validates that the value is false - * - * @param val - * object to test - */ - public static void isFalse(boolean val) { - if (val) { - throw new IllegalArgumentException("Must be false"); - } - } - - /** - * Validates that the value is false - * - * @param val - * object to test - * @param msg - * message to output if validation fails - */ - public static void isFalse(boolean val, String msg) { - if (val) { - throw new IllegalArgumentException(msg); - } - } - - /** - * Validates that the array contains no null elements - * - * @param objects - * the array to test - */ - public static void noNullElements(Object[] objects) { - noNullElements(objects, "Array must not contain any null objects"); - } - - /** - * Validates that the array contains no null elements - * - * @param objects - * the array to test - * @param msg - * message to output if validation fails - */ - public static void noNullElements(Object[] objects, String msg) { - for (Object obj : objects) { - if (obj == null) { - throw new IllegalArgumentException(msg); - } - } - } - - /** - * Validates that the string is not empty - * - * @param string - * the string to test - */ - public static void notEmpty(String string) { - if (string == null || string.length() == 0) { - throw new IllegalArgumentException("String must not be empty"); - } - } - - /** - * Validates that the string is not empty - * - * @param string - * the string to test - * @param msg - * message to output if validation fails - */ - public static void notEmpty(String string, String msg) { - if (string == null || string.length() == 0) { - throw new IllegalArgumentException(msg); - } - } - - /** - * Cause a failure. - * - * @param msg - * message to output. - */ - public static void fail(String msg) { - throw new IllegalArgumentException(msg); - } -} diff --git a/server/src/org/jsoup/nodes/Attribute.java b/server/src/org/jsoup/nodes/Attribute.java deleted file mode 100644 index 5f27b4fcc4..0000000000 --- a/server/src/org/jsoup/nodes/Attribute.java +++ /dev/null @@ -1,167 +0,0 @@ -package org.jsoup.nodes; - -import java.util.Map; - -import org.jsoup.helper.Validate; - -/** - * A single key + value attribute. Keys are trimmed and normalised to - * lower-case. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Attribute implements Map.Entry, Cloneable { - private String key; - private String value; - - /** - * Create a new attribute from unencoded (raw) key and value. - * - * @param key - * attribute key - * @param value - * attribute value - * @see #createFromEncoded - */ - public Attribute(String key, String value) { - Validate.notEmpty(key); - Validate.notNull(value); - this.key = key.trim().toLowerCase(); - this.value = value; - } - - /** - * Get the attribute key. - * - * @return the attribute key - */ - @Override - public String getKey() { - return key; - } - - /** - * Set the attribute key. Gets normalised as per the constructor method. - * - * @param key - * the new key; must not be null - */ - public void setKey(String key) { - Validate.notEmpty(key); - this.key = key.trim().toLowerCase(); - } - - /** - * Get the attribute value. - * - * @return the attribute value - */ - @Override - public String getValue() { - return value; - } - - /** - * Set the attribute value. - * - * @param value - * the new attribute value; must not be null - */ - @Override - public String setValue(String value) { - Validate.notNull(value); - String old = this.value; - this.value = value; - return old; - } - - /** - * Get the HTML representation of this attribute; e.g. - * {@code href="index.html"}. - * - * @return HTML - */ - public String html() { - return key + "=\"" - + Entities.escape(value, (new Document("")).outputSettings()) - + "\""; - } - - protected void html(StringBuilder accum, Document.OutputSettings out) { - accum.append(key).append("=\"").append(Entities.escape(value, out)) - .append("\""); - } - - /** - * Get the string representation of this attribute, implemented as - * {@link #html()}. - * - * @return string - */ - @Override - public String toString() { - return html(); - } - - /** - * Create a new Attribute from an unencoded key and a HTML attribute encoded - * value. - * - * @param unencodedKey - * assumes the key is not encoded, as can be only run of simple - * \w chars. - * @param encodedValue - * HTML attribute encoded value - * @return attribute - */ - public static Attribute createFromEncoded(String unencodedKey, - String encodedValue) { - String value = Entities.unescape(encodedValue, true); - return new Attribute(unencodedKey, value); - } - - protected boolean isDataAttribute() { - return key.startsWith(Attributes.dataPrefix) - && key.length() > Attributes.dataPrefix.length(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof Attribute)) { - return false; - } - - Attribute attribute = (Attribute) o; - - if (key != null ? !key.equals(attribute.key) : attribute.key != null) { - return false; - } - if (value != null ? !value.equals(attribute.value) - : attribute.value != null) { - return false; - } - - return true; - } - - @Override - public int hashCode() { - int result = key != null ? key.hashCode() : 0; - result = 31 * result + (value != null ? value.hashCode() : 0); - return result; - } - - @Override - public Attribute clone() { - try { - return (Attribute) super.clone(); // only fields are immutable - // strings key and value, so no - // more deep copy required - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); - } - } -} diff --git a/server/src/org/jsoup/nodes/Attributes.java b/server/src/org/jsoup/nodes/Attributes.java deleted file mode 100644 index 8757d1bf97..0000000000 --- a/server/src/org/jsoup/nodes/Attributes.java +++ /dev/null @@ -1,315 +0,0 @@ -package org.jsoup.nodes; - -import java.util.AbstractMap; -import java.util.AbstractSet; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.jsoup.helper.Validate; - -/** - * The attributes of an Element. - *

- * Attributes are treated as a map: there can be only one value associated with - * an attribute key. - *

- * Attribute key and value comparisons are done case insensitively, and keys are - * normalised to lower-case. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Attributes implements Iterable, Cloneable { - protected static final String dataPrefix = "data-"; - - private LinkedHashMap attributes = null; - - // linked hash map to preserve insertion order. - // null be default as so many elements have no attributes -- saves a good - // chunk of memory - - /** - * Get an attribute value by key. - * - * @param key - * the attribute key - * @return the attribute value if set; or empty string if not set. - * @see #hasKey(String) - */ - public String get(String key) { - Validate.notEmpty(key); - - if (attributes == null) { - return ""; - } - - Attribute attr = attributes.get(key.toLowerCase()); - return attr != null ? attr.getValue() : ""; - } - - /** - * Set a new attribute, or replace an existing one by key. - * - * @param key - * attribute key - * @param value - * attribute value - */ - public void put(String key, String value) { - Attribute attr = new Attribute(key, value); - put(attr); - } - - /** - * Set a new attribute, or replace an existing one by key. - * - * @param attribute - * attribute - */ - public void put(Attribute attribute) { - Validate.notNull(attribute); - if (attributes == null) { - attributes = new LinkedHashMap(2); - } - attributes.put(attribute.getKey(), attribute); - } - - /** - * Remove an attribute by key. - * - * @param key - * attribute key to remove - */ - public void remove(String key) { - Validate.notEmpty(key); - if (attributes == null) { - return; - } - attributes.remove(key.toLowerCase()); - } - - /** - * Tests if these attributes contain an attribute with this key. - * - * @param key - * key to check for - * @return true if key exists, false otherwise - */ - public boolean hasKey(String key) { - return attributes != null && attributes.containsKey(key.toLowerCase()); - } - - /** - * Get the number of attributes in this set. - * - * @return size - */ - public int size() { - if (attributes == null) { - return 0; - } - return attributes.size(); - } - - /** - * Add all the attributes from the incoming set to this set. - * - * @param incoming - * attributes to add to these attributes. - */ - public void addAll(Attributes incoming) { - if (incoming.size() == 0) { - return; - } - if (attributes == null) { - attributes = new LinkedHashMap(incoming.size()); - } - attributes.putAll(incoming.attributes); - } - - @Override - public Iterator iterator() { - return asList().iterator(); - } - - /** - * Get the attributes as a List, for iteration. Do not modify the keys of - * the attributes via this view, as changes to keys will not be recognised - * in the containing set. - * - * @return an view of the attributes as a List. - */ - public List asList() { - if (attributes == null) { - return Collections.emptyList(); - } - - List list = new ArrayList(attributes.size()); - for (Map.Entry entry : attributes.entrySet()) { - list.add(entry.getValue()); - } - return Collections.unmodifiableList(list); - } - - /** - * Retrieves a filtered view of attributes that are HTML5 custom data - * attributes; that is, attributes with keys starting with {@code data-}. - * - * @return map of custom data attributes. - */ - public Map dataset() { - return new Dataset(); - } - - /** - * Get the HTML representation of these attributes. - * - * @return HTML - */ - public String html() { - StringBuilder accum = new StringBuilder(); - html(accum, (new Document("")).outputSettings()); // output settings a - // bit funky, but this - // html() seldom used - return accum.toString(); - } - - void html(StringBuilder accum, Document.OutputSettings out) { - if (attributes == null) { - return; - } - - for (Map.Entry entry : attributes.entrySet()) { - Attribute attribute = entry.getValue(); - accum.append(" "); - attribute.html(accum, out); - } - } - - @Override - public String toString() { - return html(); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof Attributes)) { - return false; - } - - Attributes that = (Attributes) o; - - if (attributes != null ? !attributes.equals(that.attributes) - : that.attributes != null) { - return false; - } - - return true; - } - - @Override - public int hashCode() { - return attributes != null ? attributes.hashCode() : 0; - } - - @Override - public Attributes clone() { - if (attributes == null) { - return new Attributes(); - } - - Attributes clone; - try { - clone = (Attributes) super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); - } - clone.attributes = new LinkedHashMap( - attributes.size()); - for (Attribute attribute : this) { - clone.attributes.put(attribute.getKey(), attribute.clone()); - } - return clone; - } - - private class Dataset extends AbstractMap { - - private Dataset() { - if (attributes == null) { - attributes = new LinkedHashMap(2); - } - } - - @Override - public Set> entrySet() { - return new EntrySet(); - } - - @Override - public String put(String key, String value) { - String dataKey = dataKey(key); - String oldValue = hasKey(dataKey) ? attributes.get(dataKey) - .getValue() : null; - Attribute attr = new Attribute(dataKey, value); - attributes.put(dataKey, attr); - return oldValue; - } - - private class EntrySet extends AbstractSet> { - @Override - public Iterator> iterator() { - return new DatasetIterator(); - } - - @Override - public int size() { - int count = 0; - Iterator iter = new DatasetIterator(); - while (iter.hasNext()) { - count++; - } - return count; - } - } - - private class DatasetIterator implements - Iterator> { - private Iterator attrIter = attributes.values() - .iterator(); - private Attribute attr; - - @Override - public boolean hasNext() { - while (attrIter.hasNext()) { - attr = attrIter.next(); - if (attr.isDataAttribute()) { - return true; - } - } - return false; - } - - @Override - public Entry next() { - return new Attribute(attr.getKey().substring( - dataPrefix.length()), attr.getValue()); - } - - @Override - public void remove() { - attributes.remove(attr.getKey()); - } - } - } - - private static String dataKey(String key) { - return dataPrefix + key; - } -} diff --git a/server/src/org/jsoup/nodes/Comment.java b/server/src/org/jsoup/nodes/Comment.java deleted file mode 100644 index 6abe0e3066..0000000000 --- a/server/src/org/jsoup/nodes/Comment.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.jsoup.nodes; - -/** - * A comment node. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Comment extends Node { - private static final String COMMENT_KEY = "comment"; - - /** - * Create a new comment node. - * - * @param data - * The contents of the comment - * @param baseUri - * base URI - */ - public Comment(String data, String baseUri) { - super(baseUri); - attributes.put(COMMENT_KEY, data); - } - - @Override - public String nodeName() { - return "#comment"; - } - - /** - * Get the contents of the comment. - * - * @return comment content - */ - public String getData() { - return attributes.get(COMMENT_KEY); - } - - @Override - void outerHtmlHead(StringBuilder accum, int depth, - Document.OutputSettings out) { - if (out.prettyPrint()) { - indent(accum, depth, out); - } - accum.append(""); - } - - @Override - void outerHtmlTail(StringBuilder accum, int depth, - Document.OutputSettings out) { - } - - @Override - public String toString() { - return outerHtml(); - } -} diff --git a/server/src/org/jsoup/nodes/DataNode.java b/server/src/org/jsoup/nodes/DataNode.java deleted file mode 100644 index cc377a4cc8..0000000000 --- a/server/src/org/jsoup/nodes/DataNode.java +++ /dev/null @@ -1,82 +0,0 @@ -package org.jsoup.nodes; - -/** - * A data node, for contents of style, script tags etc, where contents should - * not show in text(). - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class DataNode extends Node { - private static final String DATA_KEY = "data"; - - /** - * Create a new DataNode. - * - * @param data - * data contents - * @param baseUri - * base URI - */ - public DataNode(String data, String baseUri) { - super(baseUri); - attributes.put(DATA_KEY, data); - } - - @Override - public String nodeName() { - return "#data"; - } - - /** - * Get the data contents of this node. Will be unescaped and with original - * new lines, space etc. - * - * @return data - */ - public String getWholeData() { - return attributes.get(DATA_KEY); - } - - /** - * Set the data contents of this node. - * - * @param data - * unencoded data - * @return this node, for chaining - */ - public DataNode setWholeData(String data) { - attributes.put(DATA_KEY, data); - return this; - } - - @Override - void outerHtmlHead(StringBuilder accum, int depth, - Document.OutputSettings out) { - accum.append(getWholeData()); // data is not escaped in return from data - // nodes, so " in script, style is plain - } - - @Override - void outerHtmlTail(StringBuilder accum, int depth, - Document.OutputSettings out) { - } - - @Override - public String toString() { - return outerHtml(); - } - - /** - * Create a new DataNode from HTML encoded data. - * - * @param encodedData - * encoded data - * @param baseUri - * bass URI - * @return new DataNode - */ - public static DataNode createFromEncoded(String encodedData, String baseUri) { - String data = Entities.unescape(encodedData); - return new DataNode(data, baseUri); - } -} diff --git a/server/src/org/jsoup/nodes/Document.java b/server/src/org/jsoup/nodes/Document.java deleted file mode 100644 index f1c4595faa..0000000000 --- a/server/src/org/jsoup/nodes/Document.java +++ /dev/null @@ -1,402 +0,0 @@ -package org.jsoup.nodes; - -import java.nio.charset.Charset; -import java.nio.charset.CharsetEncoder; -import java.util.ArrayList; -import java.util.List; - -import org.jsoup.helper.Validate; -import org.jsoup.parser.Tag; -import org.jsoup.select.Elements; - -/** - * A HTML Document. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Document extends Element { - private OutputSettings outputSettings = new OutputSettings(); - private QuirksMode quirksMode = QuirksMode.noQuirks; - - /** - * Create a new, empty Document. - * - * @param baseUri - * base URI of document - * @see org.jsoup.Jsoup#parse - * @see #createShell - */ - public Document(String baseUri) { - super(Tag.valueOf("#root"), baseUri); - } - - /** - * Create a valid, empty shell of a document, suitable for adding more - * elements to. - * - * @param baseUri - * baseUri of document - * @return document with html, head, and body elements. - */ - static public Document createShell(String baseUri) { - Validate.notNull(baseUri); - - Document doc = new Document(baseUri); - Element html = doc.appendElement("html"); - html.appendElement("head"); - html.appendElement("body"); - - return doc; - } - - /** - * Accessor to the document's {@code head} element. - * - * @return {@code head} - */ - public Element head() { - return findFirstElementByTagName("head", this); - } - - /** - * Accessor to the document's {@code body} element. - * - * @return {@code body} - */ - public Element body() { - return findFirstElementByTagName("body", this); - } - - /** - * Get the string contents of the document's {@code title} element. - * - * @return Trimmed title, or empty string if none set. - */ - public String title() { - Element titleEl = getElementsByTag("title").first(); - return titleEl != null ? titleEl.text().trim() : ""; - } - - /** - * Set the document's {@code title} element. Updates the existing element, - * or adds {@code title} to {@code head} if not present - * - * @param title - * string to set as title - */ - public void title(String title) { - Validate.notNull(title); - Element titleEl = getElementsByTag("title").first(); - if (titleEl == null) { // add to head - head().appendElement("title").text(title); - } else { - titleEl.text(title); - } - } - - /** - * Create a new Element, with this document's base uri. Does not make the - * new element a child of this document. - * - * @param tagName - * element tag name (e.g. {@code a}) - * @return new element - */ - public Element createElement(String tagName) { - return new Element(Tag.valueOf(tagName), baseUri()); - } - - /** - * Normalise the document. This happens after the parse phase so generally - * does not need to be called. Moves any text content that is not in the - * body element into the body. - * - * @return this document after normalisation - */ - public Document normalise() { - Element htmlEl = findFirstElementByTagName("html", this); - if (htmlEl == null) { - htmlEl = appendElement("html"); - } - if (head() == null) { - htmlEl.prependElement("head"); - } - if (body() == null) { - htmlEl.appendElement("body"); - } - - // pull text nodes out of root, html, and head els, and push into body. - // non-text nodes are already taken care - // of. do in inverse order to maintain text order. - normaliseTextNodes(head()); - normaliseTextNodes(htmlEl); - normaliseTextNodes(this); - - normaliseStructure("head", htmlEl); - normaliseStructure("body", htmlEl); - - return this; - } - - // does not recurse. - private void normaliseTextNodes(Element element) { - List toMove = new ArrayList(); - for (Node node : element.childNodes) { - if (node instanceof TextNode) { - TextNode tn = (TextNode) node; - if (!tn.isBlank()) { - toMove.add(tn); - } - } - } - - for (int i = toMove.size() - 1; i >= 0; i--) { - Node node = toMove.get(i); - element.removeChild(node); - body().prependChild(new TextNode(" ", "")); - body().prependChild(node); - } - } - - // merge multiple or contents into one, delete the remainder, - // and ensure they are owned by - private void normaliseStructure(String tag, Element htmlEl) { - Elements elements = getElementsByTag(tag); - Element master = elements.first(); // will always be available as - // created above if not existent - if (elements.size() > 1) { // dupes, move contents to master - List toMove = new ArrayList(); - for (int i = 1; i < elements.size(); i++) { - Node dupe = elements.get(i); - for (Node node : dupe.childNodes) { - toMove.add(node); - } - dupe.remove(); - } - - for (Node dupe : toMove) { - master.appendChild(dupe); - } - } - // ensure parented by - if (!master.parent().equals(htmlEl)) { - htmlEl.appendChild(master); // includes remove() - } - } - - // fast method to get first by tag name, used for html, head, body finders - private Element findFirstElementByTagName(String tag, Node node) { - if (node.nodeName().equals(tag)) { - return (Element) node; - } else { - for (Node child : node.childNodes) { - Element found = findFirstElementByTagName(tag, child); - if (found != null) { - return found; - } - } - } - return null; - } - - @Override - public String outerHtml() { - return super.html(); // no outer wrapper tag - } - - /** - * Set the text of the {@code body} of this document. Any existing nodes - * within the body will be cleared. - * - * @param text - * unencoded text - * @return this document - */ - @Override - public Element text(String text) { - body().text(text); // overridden to not nuke doc structure - return this; - } - - @Override - public String nodeName() { - return "#document"; - } - - @Override - public Document clone() { - Document clone = (Document) super.clone(); - clone.outputSettings = outputSettings.clone(); - return clone; - } - - /** - * A Document's output settings control the form of the text() and html() - * methods. - */ - public static class OutputSettings implements Cloneable { - private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; - private Charset charset = Charset.forName("UTF-8"); - private CharsetEncoder charsetEncoder = charset.newEncoder(); - private boolean prettyPrint = true; - private int indentAmount = 1; - - public OutputSettings() { - } - - /** - * Get the document's current HTML escape mode: base, which - * provides a limited set of named HTML entities and escapes other - * characters as numbered entities for maximum compatibility; or - * extended, which uses the complete set of HTML named - * entities. - *

- * The default escape mode is base. - * - * @return the document's current escape mode - */ - public Entities.EscapeMode escapeMode() { - return escapeMode; - } - - /** - * Set the document's escape mode - * - * @param escapeMode - * the new escape mode to use - * @return the document's output settings, for chaining - */ - public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { - this.escapeMode = escapeMode; - return this; - } - - /** - * Get the document's current output charset, which is used to control - * which characters are escaped when generating HTML (via the - * html() methods), and which are kept intact. - *

- * Where possible (when parsing from a URL or File), the document's - * output charset is automatically set to the input charset. Otherwise, - * it defaults to UTF-8. - * - * @return the document's current charset. - */ - public Charset charset() { - return charset; - } - - /** - * Update the document's output charset. - * - * @param charset - * the new charset to use. - * @return the document's output settings, for chaining - */ - public OutputSettings charset(Charset charset) { - // todo: this should probably update the doc's meta charset - this.charset = charset; - charsetEncoder = charset.newEncoder(); - return this; - } - - /** - * Update the document's output charset. - * - * @param charset - * the new charset (by name) to use. - * @return the document's output settings, for chaining - */ - public OutputSettings charset(String charset) { - charset(Charset.forName(charset)); - return this; - } - - CharsetEncoder encoder() { - return charsetEncoder; - } - - /** - * Get if pretty printing is enabled. Default is true. If disabled, the - * HTML output methods will not re-format the output, and the output - * will generally look like the input. - * - * @return if pretty printing is enabled. - */ - public boolean prettyPrint() { - return prettyPrint; - } - - /** - * Enable or disable pretty printing. - * - * @param pretty - * new pretty print setting - * @return this, for chaining - */ - public OutputSettings prettyPrint(boolean pretty) { - prettyPrint = pretty; - return this; - } - - /** - * Get the current tag indent amount, used when pretty printing. - * - * @return the current indent amount - */ - public int indentAmount() { - return indentAmount; - } - - /** - * Set the indent amount for pretty printing - * - * @param indentAmount - * number of spaces to use for indenting each level. Must be - * >= 0. - * @return this, for chaining - */ - public OutputSettings indentAmount(int indentAmount) { - Validate.isTrue(indentAmount >= 0); - this.indentAmount = indentAmount; - return this; - } - - @Override - public OutputSettings clone() { - OutputSettings clone; - try { - clone = (OutputSettings) super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); - } - clone.charset(charset.name()); // new charset and charset encoder - clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); - // indentAmount, prettyPrint are primitives so object.clone() will - // handle - return clone; - } - } - - /** - * Get the document's current output settings. - * - * @return the document's current output settings. - */ - public OutputSettings outputSettings() { - return outputSettings; - } - - public enum QuirksMode { - noQuirks, quirks, limitedQuirks; - } - - public QuirksMode quirksMode() { - return quirksMode; - } - - public Document quirksMode(QuirksMode quirksMode) { - this.quirksMode = quirksMode; - return this; - } -} diff --git a/server/src/org/jsoup/nodes/DocumentType.java b/server/src/org/jsoup/nodes/DocumentType.java deleted file mode 100644 index 13ff78dc8b..0000000000 --- a/server/src/org/jsoup/nodes/DocumentType.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.jsoup.nodes; - -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; - -/** - * A {@code } node. - */ -public class DocumentType extends Node { - // todo: quirk mode from publicId and systemId - - /** - * Create a new doctype element. - * - * @param name - * the doctype's name - * @param publicId - * the doctype's public ID - * @param systemId - * the doctype's system ID - * @param baseUri - * the doctype's base URI - */ - public DocumentType(String name, String publicId, String systemId, - String baseUri) { - super(baseUri); - - Validate.notEmpty(name); - attr("name", name); - attr("publicId", publicId); - attr("systemId", systemId); - } - - @Override - public String nodeName() { - return "#doctype"; - } - - @Override - void outerHtmlHead(StringBuilder accum, int depth, - Document.OutputSettings out) { - accum.append("'); - } - - @Override - void outerHtmlTail(StringBuilder accum, int depth, - Document.OutputSettings out) { - } -} diff --git a/server/src/org/jsoup/nodes/Element.java b/server/src/org/jsoup/nodes/Element.java deleted file mode 100644 index ff9e68b962..0000000000 --- a/server/src/org/jsoup/nodes/Element.java +++ /dev/null @@ -1,1355 +0,0 @@ -package org.jsoup.nodes; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.parser.Parser; -import org.jsoup.parser.Tag; -import org.jsoup.select.Collector; -import org.jsoup.select.Elements; -import org.jsoup.select.Evaluator; -import org.jsoup.select.Selector; - -/** - * A HTML element consists of a tag name, attributes, and child nodes (including - * text nodes and other elements). - * - * From an Element, you can extract data, traverse the node graph, and - * manipulate the HTML. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Element extends Node { - private Tag tag; - private Set classNames; - - /** - * Create a new, standalone Element. (Standalone in that is has no parent.) - * - * @param tag - * tag of this element - * @param baseUri - * the base URI - * @param attributes - * initial attributes - * @see #appendChild(Node) - * @see #appendElement(String) - */ - public Element(Tag tag, String baseUri, Attributes attributes) { - super(baseUri, attributes); - - Validate.notNull(tag); - this.tag = tag; - } - - /** - * Create a new Element from a tag and a base URI. - * - * @param tag - * element tag - * @param baseUri - * the base URI of this element. It is acceptable for the base - * URI to be an empty string, but not null. - * @see Tag#valueOf(String) - */ - public Element(Tag tag, String baseUri) { - this(tag, baseUri, new Attributes()); - } - - @Override - public String nodeName() { - return tag.getName(); - } - - /** - * Get the name of the tag for this element. E.g. {@code div} - * - * @return the tag name - */ - public String tagName() { - return tag.getName(); - } - - /** - * Change the tag of this element. For example, convert a {@code } to - * a {@code

} with {@code el.tagName("div");}. - * - * @param tagName - * new tag name for this element - * @return this element, for chaining - */ - public Element tagName(String tagName) { - Validate.notEmpty(tagName, "Tag name must not be empty."); - tag = Tag.valueOf(tagName); - return this; - } - - /** - * Get the Tag for this element. - * - * @return the tag object - */ - public Tag tag() { - return tag; - } - - /** - * Test if this element is a block-level element. (E.g. - * {@code
== true} or an inline element {@code

== false}). - * - * @return true if block, false if not (and thus inline) - */ - public boolean isBlock() { - return tag.isBlock(); - } - - /** - * Get the {@code id} attribute of this element. - * - * @return The id attribute, if present, or an empty string if not. - */ - public String id() { - String id = attr("id"); - return id == null ? "" : id; - } - - /** - * Set an attribute value on this element. If this element already has an - * attribute with the key, its value is updated; otherwise, a new attribute - * is added. - * - * @return this element - */ - @Override - public Element attr(String attributeKey, String attributeValue) { - super.attr(attributeKey, attributeValue); - return this; - } - - /** - * Get this element's HTML5 custom data attributes. Each attribute in the - * element that has a key starting with "data-" is included the dataset. - *

- * E.g., the element - * {@code

...} - * has the dataset {@code package=jsoup, language=java}. - *

- * This map is a filtered view of the element's attribute map. Changes to - * one map (add, remove, update) are reflected in the other map. - *

- * You can find elements that have data attributes using the - * {@code [^data-]} attribute key prefix selector. - * - * @return a map of {@code key=value} custom data attributes. - */ - public Map dataset() { - return attributes.dataset(); - } - - @Override - public final Element parent() { - return (Element) parentNode; - } - - /** - * Get this element's parent and ancestors, up to the document root. - * - * @return this element's stack of parents, closest first. - */ - public Elements parents() { - Elements parents = new Elements(); - accumulateParents(this, parents); - return parents; - } - - private static void accumulateParents(Element el, Elements parents) { - Element parent = el.parent(); - if (parent != null && !parent.tagName().equals("#root")) { - parents.add(parent); - accumulateParents(parent, parents); - } - } - - /** - * Get a child element of this element, by its 0-based index number. - *

- * Note that an element can have both mixed Nodes and Elements as children. - * This method inspects a filtered list of children that are elements, and - * the index is based on that filtered list. - * - * @param index - * the index number of the element to retrieve - * @return the child element, if it exists, or {@code null} if absent. - * @see #childNode(int) - */ - public Element child(int index) { - return children().get(index); - } - - /** - * Get this element's child elements. - *

- * This is effectively a filter on {@link #childNodes()} to get Element - * nodes. - * - * @return child elements. If this element has no children, returns an empty - * list. - * @see #childNodes() - */ - public Elements children() { - // create on the fly rather than maintaining two lists. if gets slow, - // memoize, and mark dirty on change - List elements = new ArrayList(); - for (Node node : childNodes) { - if (node instanceof Element) { - elements.add((Element) node); - } - } - return new Elements(elements); - } - - /** - * Get this element's child text nodes. The list is unmodifiable but the - * text nodes may be manipulated. - *

- * This is effectively a filter on {@link #childNodes()} to get Text nodes. - * - * @return child text nodes. If this element has no text nodes, returns an - * empty list. - *

- * For example, with the input HTML: - * {@code

One Two Three
Four

} with the - * {@code p} element selected:
    - *
  • {@code p.text()} = {@code "One Two Three Four"}
  • - *
  • {@code p.ownText()} = {@code "One Three Four"}
  • - *
  • {@code p.children()} = {@code Elements[,
    - * ]}
  • - *
  • {@code p.childNodes()} = {@code List["One ", , " Three ", - *
    - * , " Four"]}
  • - *
  • {@code p.textNodes()} = - * {@code List["One ", " Three ", " Four"]}
  • - *
- */ - public List textNodes() { - List textNodes = new ArrayList(); - for (Node node : childNodes) { - if (node instanceof TextNode) { - textNodes.add((TextNode) node); - } - } - return Collections.unmodifiableList(textNodes); - } - - /** - * Get this element's child data nodes. The list is unmodifiable but the - * data nodes may be manipulated. - *

- * This is effectively a filter on {@link #childNodes()} to get Data nodes. - * - * @return child data nodes. If this element has no data nodes, returns an - * empty list. - * @see #data() - */ - public List dataNodes() { - List dataNodes = new ArrayList(); - for (Node node : childNodes) { - if (node instanceof DataNode) { - dataNodes.add((DataNode) node); - } - } - return Collections.unmodifiableList(dataNodes); - } - - /** - * Find elements that match the {@link Selector} CSS query, with this - * element as the starting context. Matched elements may include this - * element, or any of its children. - *

- * This method is generally more powerful to use than the DOM-type - * {@code getElementBy*} methods, because multiple filters can be combined, - * e.g.: - *

    - *
  • {@code el.select("a[href]")} - finds links ({@code a} tags with - * {@code href} attributes) - *
  • {@code el.select("a[href*=example.com]")} - finds links pointing to - * example.com (loosely) - *
- *

- * See the query syntax documentation in {@link org.jsoup.select.Selector}. - * - * @param cssQuery - * a {@link Selector} CSS-like query - * @return elements that match the query (empty if none match) - * @see org.jsoup.select.Selector - */ - public Elements select(String cssQuery) { - return Selector.select(cssQuery, this); - } - - /** - * Add a node child node to this element. - * - * @param child - * node to add. Must not already have a parent. - * @return this element, so that you can add more child nodes or elements. - */ - public Element appendChild(Node child) { - Validate.notNull(child); - - addChildren(child); - return this; - } - - /** - * Add a node to the start of this element's children. - * - * @param child - * node to add. Must not already have a parent. - * @return this element, so that you can add more child nodes or elements. - */ - public Element prependChild(Node child) { - Validate.notNull(child); - - addChildren(0, child); - return this; - } - - /** - * Create a new element by tag name, and add it as the last child. - * - * @param tagName - * the name of the tag (e.g. {@code div}). - * @return the new element, to allow you to add content to it, e.g.: - * {@code parent.appendElement("h1").attr("id", "header").text("Welcome");} - */ - public Element appendElement(String tagName) { - Element child = new Element(Tag.valueOf(tagName), baseUri()); - appendChild(child); - return child; - } - - /** - * Create a new element by tag name, and add it as the first child. - * - * @param tagName - * the name of the tag (e.g. {@code div}). - * @return the new element, to allow you to add content to it, e.g.: - * {@code parent.prependElement("h1").attr("id", "header").text("Welcome");} - */ - public Element prependElement(String tagName) { - Element child = new Element(Tag.valueOf(tagName), baseUri()); - prependChild(child); - return child; - } - - /** - * Create and append a new TextNode to this element. - * - * @param text - * the unencoded text to add - * @return this element - */ - public Element appendText(String text) { - TextNode node = new TextNode(text, baseUri()); - appendChild(node); - return this; - } - - /** - * Create and prepend a new TextNode to this element. - * - * @param text - * the unencoded text to add - * @return this element - */ - public Element prependText(String text) { - TextNode node = new TextNode(text, baseUri()); - prependChild(node); - return this; - } - - /** - * Add inner HTML to this element. The supplied HTML will be parsed, and - * each node appended to the end of the children. - * - * @param html - * HTML to add inside this element, after the existing HTML - * @return this element - * @see #html(String) - */ - public Element append(String html) { - Validate.notNull(html); - - List nodes = Parser.parseFragment(html, this, baseUri()); - addChildren(nodes.toArray(new Node[nodes.size()])); - return this; - } - - /** - * Add inner HTML into this element. The supplied HTML will be parsed, and - * each node prepended to the start of the element's children. - * - * @param html - * HTML to add inside this element, before the existing HTML - * @return this element - * @see #html(String) - */ - public Element prepend(String html) { - Validate.notNull(html); - - List nodes = Parser.parseFragment(html, this, baseUri()); - addChildren(0, nodes.toArray(new Node[nodes.size()])); - return this; - } - - /** - * Insert the specified HTML into the DOM before this element (i.e. as a - * preceding sibling). - * - * @param html - * HTML to add before this element - * @return this element, for chaining - * @see #after(String) - */ - @Override - public Element before(String html) { - return (Element) super.before(html); - } - - /** - * Insert the specified node into the DOM before this node (i.e. as a - * preceding sibling). - * - * @param node - * to add before this element - * @return this Element, for chaining - * @see #after(Node) - */ - @Override - public Element before(Node node) { - return (Element) super.before(node); - } - - /** - * Insert the specified HTML into the DOM after this element (i.e. as a - * following sibling). - * - * @param html - * HTML to add after this element - * @return this element, for chaining - * @see #before(String) - */ - @Override - public Element after(String html) { - return (Element) super.after(html); - } - - /** - * Insert the specified node into the DOM after this node (i.e. as a - * following sibling). - * - * @param node - * to add after this element - * @return this element, for chaining - * @see #before(Node) - */ - @Override - public Element after(Node node) { - return (Element) super.after(node); - } - - /** - * Remove all of the element's child nodes. Any attributes are left as-is. - * - * @return this element - */ - public Element empty() { - childNodes.clear(); - return this; - } - - /** - * Wrap the supplied HTML around this element. - * - * @param html - * HTML to wrap around this element, e.g. - * {@code

}. Can be arbitrarily deep. - * @return this element, for chaining. - */ - @Override - public Element wrap(String html) { - return (Element) super.wrap(html); - } - - /** - * Get sibling elements. If the element has no sibling elements, returns an - * empty list. An element is not a sibling of itself, so will not be - * included in the returned list. - * - * @return sibling elements - */ - public Elements siblingElements() { - if (parentNode == null) { - return new Elements(0); - } - - List elements = parent().children(); - Elements siblings = new Elements(elements.size() - 1); - for (Element el : elements) { - if (el != this) { - siblings.add(el); - } - } - return siblings; - } - - /** - * Gets the next sibling element of this element. E.g., if a {@code div} - * contains two {@code p}s, the {@code nextElementSibling} of the first - * {@code p} is the second {@code p}. - *

- * This is similar to {@link #nextSibling()}, but specifically finds only - * Elements - * - * @return the next element, or null if there is no next element - * @see #previousElementSibling() - */ - public Element nextElementSibling() { - if (parentNode == null) { - return null; - } - List siblings = parent().children(); - Integer index = indexInList(this, siblings); - Validate.notNull(index); - if (siblings.size() > index + 1) { - return siblings.get(index + 1); - } else { - return null; - } - } - - /** - * Gets the previous element sibling of this element. - * - * @return the previous element, or null if there is no previous element - * @see #nextElementSibling() - */ - public Element previousElementSibling() { - if (parentNode == null) { - return null; - } - List siblings = parent().children(); - Integer index = indexInList(this, siblings); - Validate.notNull(index); - if (index > 0) { - return siblings.get(index - 1); - } else { - return null; - } - } - - /** - * Gets the first element sibling of this element. - * - * @return the first sibling that is an element (aka the parent's first - * element child) - */ - public Element firstElementSibling() { - // todo: should firstSibling() exclude this? - List siblings = parent().children(); - return siblings.size() > 1 ? siblings.get(0) : null; - } - - /** - * Get the list index of this element in its element sibling list. I.e. if - * this is the first element sibling, returns 0. - * - * @return position in element sibling list - */ - public Integer elementSiblingIndex() { - if (parent() == null) { - return 0; - } - return indexInList(this, parent().children()); - } - - /** - * Gets the last element sibling of this element - * - * @return the last sibling that is an element (aka the parent's last - * element child) - */ - public Element lastElementSibling() { - List siblings = parent().children(); - return siblings.size() > 1 ? siblings.get(siblings.size() - 1) : null; - } - - private static Integer indexInList(Element search, - List elements) { - Validate.notNull(search); - Validate.notNull(elements); - - for (int i = 0; i < elements.size(); i++) { - E element = elements.get(i); - if (element.equals(search)) { - return i; - } - } - return null; - } - - // DOM type methods - - /** - * Finds elements, including and recursively under this element, with the - * specified tag name. - * - * @param tagName - * The tag name to search for (case insensitively). - * @return a matching unmodifiable list of elements. Will be empty if this - * element and none of its children match. - */ - public Elements getElementsByTag(String tagName) { - Validate.notEmpty(tagName); - tagName = tagName.toLowerCase().trim(); - - return Collector.collect(new Evaluator.Tag(tagName), this); - } - - /** - * Find an element by ID, including or under this element. - *

- * Note that this finds the first matching ID, starting with this element. - * If you search down from a different starting point, it is possible to - * find a different element by ID. For unique element by ID within a - * Document, use {@link Document#getElementById(String)} - * - * @param id - * The ID to search for. - * @return The first matching element by ID, starting with this element, or - * null if none found. - */ - public Element getElementById(String id) { - Validate.notEmpty(id); - - Elements elements = Collector.collect(new Evaluator.Id(id), this); - if (elements.size() > 0) { - return elements.get(0); - } else { - return null; - } - } - - /** - * Find elements that have this class, including or under this element. Case - * insensitive. - *

- * Elements can have multiple classes (e.g. - * {@code

}. This method checks each class, - * so you can find the above with {@code el.getElementsByClass("header");}. - * - * @param className - * the name of the class to search for. - * @return elements with the supplied class name, empty if none - * @see #hasClass(String) - * @see #classNames() - */ - public Elements getElementsByClass(String className) { - Validate.notEmpty(className); - - return Collector.collect(new Evaluator.Class(className), this); - } - - /** - * Find elements that have a named attribute set. Case insensitive. - * - * @param key - * name of the attribute, e.g. {@code href} - * @return elements that have this attribute, empty if none - */ - public Elements getElementsByAttribute(String key) { - Validate.notEmpty(key); - key = key.trim().toLowerCase(); - - return Collector.collect(new Evaluator.Attribute(key), this); - } - - /** - * Find elements that have an attribute name starting with the supplied - * prefix. Use {@code data-} to find elements that have HTML5 datasets. - * - * @param keyPrefix - * name prefix of the attribute e.g. {@code data-} - * @return elements that have attribute names that start with with the - * prefix, empty if none. - */ - public Elements getElementsByAttributeStarting(String keyPrefix) { - Validate.notEmpty(keyPrefix); - keyPrefix = keyPrefix.trim().toLowerCase(); - - return Collector.collect(new Evaluator.AttributeStarting(keyPrefix), - this); - } - - /** - * Find elements that have an attribute with the specific value. Case - * insensitive. - * - * @param key - * name of the attribute - * @param value - * value of the attribute - * @return elements that have this attribute with this value, empty if none - */ - public Elements getElementsByAttributeValue(String key, String value) { - return Collector.collect(new Evaluator.AttributeWithValue(key, value), - this); - } - - /** - * Find elements that either do not have this attribute, or have it with a - * different value. Case insensitive. - * - * @param key - * name of the attribute - * @param value - * value of the attribute - * @return elements that do not have a matching attribute - */ - public Elements getElementsByAttributeValueNot(String key, String value) { - return Collector.collect( - new Evaluator.AttributeWithValueNot(key, value), this); - } - - /** - * Find elements that have attributes that start with the value prefix. Case - * insensitive. - * - * @param key - * name of the attribute - * @param valuePrefix - * start of attribute value - * @return elements that have attributes that start with the value prefix - */ - public Elements getElementsByAttributeValueStarting(String key, - String valuePrefix) { - return Collector.collect(new Evaluator.AttributeWithValueStarting(key, - valuePrefix), this); - } - - /** - * Find elements that have attributes that end with the value suffix. Case - * insensitive. - * - * @param key - * name of the attribute - * @param valueSuffix - * end of the attribute value - * @return elements that have attributes that end with the value suffix - */ - public Elements getElementsByAttributeValueEnding(String key, - String valueSuffix) { - return Collector.collect(new Evaluator.AttributeWithValueEnding(key, - valueSuffix), this); - } - - /** - * Find elements that have attributes whose value contains the match string. - * Case insensitive. - * - * @param key - * name of the attribute - * @param match - * substring of value to search for - * @return elements that have attributes containing this text - */ - public Elements getElementsByAttributeValueContaining(String key, - String match) { - return Collector.collect(new Evaluator.AttributeWithValueContaining( - key, match), this); - } - - /** - * Find elements that have attributes whose values match the supplied - * regular expression. - * - * @param key - * name of the attribute - * @param pattern - * compiled regular expression to match against attribute values - * @return elements that have attributes matching this regular expression - */ - public Elements getElementsByAttributeValueMatching(String key, - Pattern pattern) { - return Collector.collect(new Evaluator.AttributeWithValueMatching(key, - pattern), this); - - } - - /** - * Find elements that have attributes whose values match the supplied - * regular expression. - * - * @param key - * name of the attribute - * @param regex - * regular expression to match against attribute values. You can - * use embedded flags (such as (?i) and (?m) to control regex - * options. - * @return elements that have attributes matching this regular expression - */ - public Elements getElementsByAttributeValueMatching(String key, String regex) { - Pattern pattern; - try { - pattern = Pattern.compile(regex); - } catch (PatternSyntaxException e) { - throw new IllegalArgumentException( - "Pattern syntax error: " + regex, e); - } - return getElementsByAttributeValueMatching(key, pattern); - } - - /** - * Find elements whose sibling index is less than the supplied index. - * - * @param index - * 0-based index - * @return elements less than index - */ - public Elements getElementsByIndexLessThan(int index) { - return Collector.collect(new Evaluator.IndexLessThan(index), this); - } - - /** - * Find elements whose sibling index is greater than the supplied index. - * - * @param index - * 0-based index - * @return elements greater than index - */ - public Elements getElementsByIndexGreaterThan(int index) { - return Collector.collect(new Evaluator.IndexGreaterThan(index), this); - } - - /** - * Find elements whose sibling index is equal to the supplied index. - * - * @param index - * 0-based index - * @return elements equal to index - */ - public Elements getElementsByIndexEquals(int index) { - return Collector.collect(new Evaluator.IndexEquals(index), this); - } - - /** - * Find elements that contain the specified string. The search is case - * insensitive. The text may appear directly in the element, or in any of - * its descendants. - * - * @param searchText - * to look for in the element's text - * @return elements that contain the string, case insensitive. - * @see Element#text() - */ - public Elements getElementsContainingText(String searchText) { - return Collector.collect(new Evaluator.ContainsText(searchText), this); - } - - /** - * Find elements that directly contain the specified string. The search is - * case insensitive. The text must appear directly in the element, not in - * any of its descendants. - * - * @param searchText - * to look for in the element's own text - * @return elements that contain the string, case insensitive. - * @see Element#ownText() - */ - public Elements getElementsContainingOwnText(String searchText) { - return Collector.collect(new Evaluator.ContainsOwnText(searchText), - this); - } - - /** - * Find elements whose text matches the supplied regular expression. - * - * @param pattern - * regular expression to match text against - * @return elements matching the supplied regular expression. - * @see Element#text() - */ - public Elements getElementsMatchingText(Pattern pattern) { - return Collector.collect(new Evaluator.Matches(pattern), this); - } - - /** - * Find elements whose text matches the supplied regular expression. - * - * @param regex - * regular expression to match text against. You can use embedded flags (such as (?i) and (?m) to control regex - * options. - * @return elements matching the supplied regular expression. - * @see Element#text() - */ - public Elements getElementsMatchingText(String regex) { - Pattern pattern; - try { - pattern = Pattern.compile(regex); - } catch (PatternSyntaxException e) { - throw new IllegalArgumentException( - "Pattern syntax error: " + regex, e); - } - return getElementsMatchingText(pattern); - } - - /** - * Find elements whose own text matches the supplied regular expression. - * - * @param pattern - * regular expression to match text against - * @return elements matching the supplied regular expression. - * @see Element#ownText() - */ - public Elements getElementsMatchingOwnText(Pattern pattern) { - return Collector.collect(new Evaluator.MatchesOwn(pattern), this); - } - - /** - * Find elements whose text matches the supplied regular expression. - * - * @param regex - * regular expression to match text against. You can use embedded flags (such as (?i) and (?m) to control regex - * options. - * @return elements matching the supplied regular expression. - * @see Element#ownText() - */ - public Elements getElementsMatchingOwnText(String regex) { - Pattern pattern; - try { - pattern = Pattern.compile(regex); - } catch (PatternSyntaxException e) { - throw new IllegalArgumentException( - "Pattern syntax error: " + regex, e); - } - return getElementsMatchingOwnText(pattern); - } - - /** - * Find all elements under this element (including self, and children of - * children). - * - * @return all elements - */ - public Elements getAllElements() { - return Collector.collect(new Evaluator.AllElements(), this); - } - - /** - * Gets the combined text of this element and all its children. - *

- * For example, given HTML {@code

Hello there now!

}, - * {@code p.text()} returns {@code "Hello there now!"} - * - * @return unencoded text, or empty string if none. - * @see #ownText() - * @see #textNodes() - */ - public String text() { - StringBuilder sb = new StringBuilder(); - text(sb); - return sb.toString().trim(); - } - - private void text(StringBuilder accum) { - appendWhitespaceIfBr(this, accum); - - for (Node child : childNodes) { - if (child instanceof TextNode) { - TextNode textNode = (TextNode) child; - appendNormalisedText(accum, textNode); - } else if (child instanceof Element) { - Element element = (Element) child; - if (accum.length() > 0 && element.isBlock() - && !TextNode.lastCharIsWhitespace(accum)) { - accum.append(" "); - } - element.text(accum); - } - } - } - - /** - * Gets the text owned by this element only; does not get the combined text - * of all children. - *

- * For example, given HTML {@code

Hello there now!

}, - * {@code p.ownText()} returns {@code "Hello now!"}, whereas - * {@code p.text()} returns {@code "Hello there now!"}. Note that the text - * within the {@code b} element is not returned, as it is not a direct child - * of the {@code p} element. - * - * @return unencoded text, or empty string if none. - * @see #text() - * @see #textNodes() - */ - public String ownText() { - StringBuilder sb = new StringBuilder(); - ownText(sb); - return sb.toString().trim(); - } - - private void ownText(StringBuilder accum) { - for (Node child : childNodes) { - if (child instanceof TextNode) { - TextNode textNode = (TextNode) child; - appendNormalisedText(accum, textNode); - } else if (child instanceof Element) { - appendWhitespaceIfBr((Element) child, accum); - } - } - } - - private void appendNormalisedText(StringBuilder accum, TextNode textNode) { - String text = textNode.getWholeText(); - - if (!preserveWhitespace()) { - text = TextNode.normaliseWhitespace(text); - if (TextNode.lastCharIsWhitespace(accum)) { - text = TextNode.stripLeadingWhitespace(text); - } - } - accum.append(text); - } - - private static void appendWhitespaceIfBr(Element element, - StringBuilder accum) { - if (element.tag.getName().equals("br") - && !TextNode.lastCharIsWhitespace(accum)) { - accum.append(" "); - } - } - - boolean preserveWhitespace() { - return tag.preserveWhitespace() || parent() != null - && parent().preserveWhitespace(); - } - - /** - * Set the text of this element. Any existing contents (text or elements) - * will be cleared - * - * @param text - * unencoded text - * @return this element - */ - public Element text(String text) { - Validate.notNull(text); - - empty(); - TextNode textNode = new TextNode(text, baseUri); - appendChild(textNode); - - return this; - } - - /** - * Test if this element has any text content (that is not just whitespace). - * - * @return true if element has non-blank text content. - */ - public boolean hasText() { - for (Node child : childNodes) { - if (child instanceof TextNode) { - TextNode textNode = (TextNode) child; - if (!textNode.isBlank()) { - return true; - } - } else if (child instanceof Element) { - Element el = (Element) child; - if (el.hasText()) { - return true; - } - } - } - return false; - } - - /** - * Get the combined data of this element. Data is e.g. the inside of a - * {@code script} tag. - * - * @return the data, or empty string if none - * - * @see #dataNodes() - */ - public String data() { - StringBuilder sb = new StringBuilder(); - - for (Node childNode : childNodes) { - if (childNode instanceof DataNode) { - DataNode data = (DataNode) childNode; - sb.append(data.getWholeData()); - } else if (childNode instanceof Element) { - Element element = (Element) childNode; - String elementData = element.data(); - sb.append(elementData); - } - } - return sb.toString(); - } - - /** - * Gets the literal value of this element's "class" attribute, which may - * include multiple class names, space separated. (E.g. on - * <div class="header gray"> returns, " - * header gray") - * - * @return The literal class attribute, or empty string if no class - * attribute set. - */ - public String className() { - return attr("class"); - } - - /** - * Get all of the element's class names. E.g. on element - * {@code
}, returns a set of two elements - * {@code "header", "gray"}. Note that modifications to this set are not - * pushed to the backing {@code class} attribute; use the - * {@link #classNames(java.util.Set)} method to persist them. - * - * @return set of classnames, empty if no class attribute - */ - public Set classNames() { - if (classNames == null) { - String[] names = className().split("\\s+"); - classNames = new LinkedHashSet(Arrays.asList(names)); - } - return classNames; - } - - /** - * Set the element's {@code class} attribute to the supplied class names. - * - * @param classNames - * set of classes - * @return this element, for chaining - */ - public Element classNames(Set classNames) { - Validate.notNull(classNames); - attributes.put("class", StringUtil.join(classNames, " ")); - return this; - } - - /** - * Tests if this element has a class. Case insensitive. - * - * @param className - * name of class to check for - * @return true if it does, false if not - */ - public boolean hasClass(String className) { - Set classNames = classNames(); - for (String name : classNames) { - if (className.equalsIgnoreCase(name)) { - return true; - } - } - return false; - } - - /** - * Add a class name to this element's {@code class} attribute. - * - * @param className - * class name to add - * @return this element - */ - public Element addClass(String className) { - Validate.notNull(className); - - Set classes = classNames(); - classes.add(className); - classNames(classes); - - return this; - } - - /** - * Remove a class name from this element's {@code class} attribute. - * - * @param className - * class name to remove - * @return this element - */ - public Element removeClass(String className) { - Validate.notNull(className); - - Set classes = classNames(); - classes.remove(className); - classNames(classes); - - return this; - } - - /** - * Toggle a class name on this element's {@code class} attribute: if - * present, remove it; otherwise add it. - * - * @param className - * class name to toggle - * @return this element - */ - public Element toggleClass(String className) { - Validate.notNull(className); - - Set classes = classNames(); - if (classes.contains(className)) { - classes.remove(className); - } else { - classes.add(className); - } - classNames(classes); - - return this; - } - - /** - * Get the value of a form element (input, textarea, etc). - * - * @return the value of the form element, or empty string if not set. - */ - public String val() { - if (tagName().equals("textarea")) { - return text(); - } else { - return attr("value"); - } - } - - /** - * Set the value of a form element (input, textarea, etc). - * - * @param value - * value to set - * @return this element (for chaining) - */ - public Element val(String value) { - if (tagName().equals("textarea")) { - text(value); - } else { - attr("value", value); - } - return this; - } - - @Override - void outerHtmlHead(StringBuilder accum, int depth, - Document.OutputSettings out) { - if (accum.length() > 0 - && out.prettyPrint() - && (tag.formatAsBlock() || (parent() != null && parent().tag() - .formatAsBlock()))) { - indent(accum, depth, out); - } - accum.append("<").append(tagName()); - attributes.html(accum, out); - - if (childNodes.isEmpty() && tag.isSelfClosing()) { - accum.append(" />"); - } else { - accum.append(">"); - } - } - - @Override - void outerHtmlTail(StringBuilder accum, int depth, - Document.OutputSettings out) { - if (!(childNodes.isEmpty() && tag.isSelfClosing())) { - if (out.prettyPrint() && !childNodes.isEmpty() - && tag.formatAsBlock()) { - indent(accum, depth, out); - } - accum.append(""); - } - } - - /** - * Retrieves the element's inner HTML. E.g. on a {@code
} with one - * empty {@code

}, would return {@code

}. (Whereas - * {@link #outerHtml()} would return {@code
- *

- *
}.) - * - * @return String of HTML. - * @see #outerHtml() - */ - public String html() { - StringBuilder accum = new StringBuilder(); - html(accum); - return accum.toString().trim(); - } - - private void html(StringBuilder accum) { - for (Node node : childNodes) { - node.outerHtml(accum); - } - } - - /** - * Set this element's inner HTML. Clears the existing HTML first. - * - * @param html - * HTML to parse and set into this element - * @return this element - * @see #append(String) - */ - public Element html(String html) { - empty(); - append(html); - return this; - } - - @Override - public String toString() { - return outerHtml(); - } - - @Override - public boolean equals(Object o) { - return this == o; - } - - @Override - public int hashCode() { - // todo: fixup, not very useful - int result = super.hashCode(); - result = 31 * result + (tag != null ? tag.hashCode() : 0); - return result; - } - - @Override - public Element clone() { - Element clone = (Element) super.clone(); - clone.classNames(); // creates linked set of class names from class - // attribute - return clone; - } -} diff --git a/server/src/org/jsoup/nodes/Entities.java b/server/src/org/jsoup/nodes/Entities.java deleted file mode 100644 index 24b50d7344..0000000000 --- a/server/src/org/jsoup/nodes/Entities.java +++ /dev/null @@ -1,217 +0,0 @@ -package org.jsoup.nodes; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.CharsetEncoder; -import java.util.HashMap; -import java.util.Map; -import java.util.MissingResourceException; -import java.util.Properties; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * HTML entities, and escape routines. Source: W3C HTML named character references. - */ -public class Entities { - public enum EscapeMode { - /** - * Restricted entities suitable for XHTML output: lt, gt, amp, apos, and - * quot only. - */ - xhtml(xhtmlByVal), - /** Default HTML output entities. */ - base(baseByVal), - /** Complete HTML entities. */ - extended(fullByVal); - - private Map map; - - EscapeMode(Map map) { - this.map = map; - } - - public Map getMap() { - return map; - } - } - - private static final Map full; - private static final Map xhtmlByVal; - private static final Map baseByVal; - private static final Map fullByVal; - private static final Pattern unescapePattern = Pattern - .compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); - private static final Pattern strictUnescapePattern = Pattern - .compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); - - private Entities() { - } - - /** - * Check if the input is a known named entity - * - * @param name - * the possible entity name (e.g. "lt" or "amp" - * @return true if a known named entity - */ - public static boolean isNamedEntity(String name) { - return full.containsKey(name); - } - -/** - * Get the Character value of the named entity - * @param name named entity (e.g. "lt" or "amp") - * @return the Character value of the named entity (e.g. '<' or '&') - */ - public static Character getCharacterByName(String name) { - return full.get(name); - } - - static String escape(String string, Document.OutputSettings out) { - return escape(string, out.encoder(), out.escapeMode()); - } - - static String escape(String string, CharsetEncoder encoder, - EscapeMode escapeMode) { - StringBuilder accum = new StringBuilder(string.length() * 2); - Map map = escapeMode.getMap(); - - for (int pos = 0; pos < string.length(); pos++) { - Character c = string.charAt(pos); - if (map.containsKey(c)) { - accum.append('&').append(map.get(c)).append(';'); - } else if (encoder.canEncode(c)) { - accum.append(c.charValue()); - } else { - accum.append("&#").append((int) c).append(';'); - } - } - - return accum.toString(); - } - - static String unescape(String string) { - return unescape(string, false); - } - - /** - * Unescape the input string. - * - * @param string - * @param strict - * if "strict" (that is, requires trailing ';' char, otherwise - * that's optional) - * @return - */ - static String unescape(String string, boolean strict) { - // todo: change this method to use Tokeniser.consumeCharacterReference - if (!string.contains("&")) { - return string; - } - - Matcher m = strict ? strictUnescapePattern.matcher(string) - : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? - StringBuffer accum = new StringBuffer(string.length()); // pity matcher - // can't use - // stringbuilder, - // avoid syncs - // todo: replace m.appendReplacement with own impl, so StringBuilder and - // quoteReplacement not required - - while (m.find()) { - int charval = -1; - String num = m.group(3); - if (num != null) { - try { - int base = m.group(2) != null ? 16 : 10; // 2 is hex - // indicator - charval = Integer.valueOf(num, base); - } catch (NumberFormatException e) { - } // skip - } else { - String name = m.group(1); - if (full.containsKey(name)) { - charval = full.get(name); - } - } - - if (charval != -1 || charval > 0xFFFF) { // out of range - String c = Character.toString((char) charval); - m.appendReplacement(accum, Matcher.quoteReplacement(c)); - } else { - m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace - // with - // original - // string - } - } - m.appendTail(accum); - return accum.toString(); - } - - // xhtml has restricted entities - private static final Object[][] xhtmlArray = { { "quot", 0x00022 }, - { "amp", 0x00026 }, { "apos", 0x00027 }, { "lt", 0x0003C }, - { "gt", 0x0003E } }; - - static { - xhtmlByVal = new HashMap(); - baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most - // common - // / - // default - full = loadEntities("entities-full.properties"); // extended and - // overblown. - fullByVal = toCharacterKey(full); - - for (Object[] entity : xhtmlArray) { - Character c = Character.valueOf((char) ((Integer) entity[1]) - .intValue()); - xhtmlByVal.put(c, ((String) entity[0])); - } - } - - private static Map loadEntities(String filename) { - Properties properties = new Properties(); - Map entities = new HashMap(); - try { - InputStream in = Entities.class.getResourceAsStream(filename); - properties.load(in); - in.close(); - } catch (IOException e) { - throw new MissingResourceException( - "Error loading entities resource: " + e.getMessage(), - "Entities", filename); - } - - for (Map.Entry entry : properties.entrySet()) { - Character val = Character.valueOf((char) Integer.parseInt( - (String) entry.getValue(), 16)); - String name = (String) entry.getKey(); - entities.put(name, val); - } - return entities; - } - - private static Map toCharacterKey( - Map inMap) { - Map outMap = new HashMap(); - for (Map.Entry entry : inMap.entrySet()) { - Character character = entry.getValue(); - String name = entry.getKey(); - - if (outMap.containsKey(character)) { - // dupe, prefer the lower case version - if (name.toLowerCase().equals(name)) { - outMap.put(character, name); - } - } else { - outMap.put(character, name); - } - } - return outMap; - } -} diff --git a/server/src/org/jsoup/nodes/Node.java b/server/src/org/jsoup/nodes/Node.java deleted file mode 100644 index 72b8dcbd47..0000000000 --- a/server/src/org/jsoup/nodes/Node.java +++ /dev/null @@ -1,727 +0,0 @@ -package org.jsoup.nodes; - -import java.net.MalformedURLException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.parser.Parser; -import org.jsoup.select.NodeTraversor; -import org.jsoup.select.NodeVisitor; - -/** - * The base, abstract Node model. Elements, Documents, Comments etc are all Node - * instances. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public abstract class Node implements Cloneable { - Node parentNode; - List childNodes; - Attributes attributes; - String baseUri; - int siblingIndex; - - /** - * Create a new Node. - * - * @param baseUri - * base URI - * @param attributes - * attributes (not null, but may be empty) - */ - protected Node(String baseUri, Attributes attributes) { - Validate.notNull(baseUri); - Validate.notNull(attributes); - - childNodes = new ArrayList(4); - this.baseUri = baseUri.trim(); - this.attributes = attributes; - } - - protected Node(String baseUri) { - this(baseUri, new Attributes()); - } - - /** - * Default constructor. Doesn't setup base uri, children, or attributes; use - * with caution. - */ - protected Node() { - childNodes = Collections.emptyList(); - attributes = null; - } - - /** - * Get the node name of this node. Use for debugging purposes and not logic - * switching (for that, use instanceof). - * - * @return node name - */ - public abstract String nodeName(); - - /** - * Get an attribute's value by its key. - *

- * To get an absolute URL from an attribute that may be a relative URL, - * prefix the key with abs, which is a shortcut to the - * {@link #absUrl} method. E.g.:

- * String url = a.attr("abs:href");
- * - * @param attributeKey - * The attribute key. - * @return The attribute, or empty string if not present (to avoid nulls). - * @see #attributes() - * @see #hasAttr(String) - * @see #absUrl(String) - */ - public String attr(String attributeKey) { - Validate.notNull(attributeKey); - - if (attributes.hasKey(attributeKey)) { - return attributes.get(attributeKey); - } else if (attributeKey.toLowerCase().startsWith("abs:")) { - return absUrl(attributeKey.substring("abs:".length())); - } else { - return ""; - } - } - - /** - * Get all of the element's attributes. - * - * @return attributes (which implements iterable, in same order as presented - * in original HTML). - */ - public Attributes attributes() { - return attributes; - } - - /** - * Set an attribute (key=value). If the attribute already exists, it is - * replaced. - * - * @param attributeKey - * The attribute key. - * @param attributeValue - * The attribute value. - * @return this (for chaining) - */ - public Node attr(String attributeKey, String attributeValue) { - attributes.put(attributeKey, attributeValue); - return this; - } - - /** - * Test if this element has an attribute. - * - * @param attributeKey - * The attribute key to check. - * @return true if the attribute exists, false if not. - */ - public boolean hasAttr(String attributeKey) { - Validate.notNull(attributeKey); - - if (attributeKey.toLowerCase().startsWith("abs:")) { - String key = attributeKey.substring("abs:".length()); - if (attributes.hasKey(key) && !absUrl(key).equals("")) { - return true; - } - } - return attributes.hasKey(attributeKey); - } - - /** - * Remove an attribute from this element. - * - * @param attributeKey - * The attribute to remove. - * @return this (for chaining) - */ - public Node removeAttr(String attributeKey) { - Validate.notNull(attributeKey); - attributes.remove(attributeKey); - return this; - } - - /** - * Get the base URI of this node. - * - * @return base URI - */ - public String baseUri() { - return baseUri; - } - - /** - * Update the base URI of this node and all of its descendants. - * - * @param baseUri - * base URI to set - */ - public void setBaseUri(final String baseUri) { - Validate.notNull(baseUri); - - traverse(new NodeVisitor() { - @Override - public void head(Node node, int depth) { - node.baseUri = baseUri; - } - - @Override - public void tail(Node node, int depth) { - } - }); - } - - /** - * Get an absolute URL from a URL attribute that may be relative (i.e. an - * <a href> or <img src>). - *

- * E.g.: String absUrl = linkEl.absUrl("href"); - *

- * If the attribute value is already absolute (i.e. it starts with a - * protocol, like http:// or https:// etc), and it - * successfully parses as a URL, the attribute is returned directly. - * Otherwise, it is treated as a URL relative to the element's - * {@link #baseUri}, and made absolute using that. - *

- * As an alternate, you can use the {@link #attr} method with the - * abs: prefix, e.g.: - * String absUrl = linkEl.attr("abs:href"); - * - * @param attributeKey - * The attribute key - * @return An absolute URL if one could be made, or an empty string (not - * null) if the attribute was missing or could not be made - * successfully into a URL. - * @see #attr - * @see java.net.URL#URL(java.net.URL, String) - */ - public String absUrl(String attributeKey) { - Validate.notEmpty(attributeKey); - - String relUrl = attr(attributeKey); - if (!hasAttr(attributeKey)) { - return ""; // nothing to make absolute with - } else { - URL base; - try { - try { - base = new URL(baseUri); - } catch (MalformedURLException e) { - // the base is unsuitable, but the attribute may be abs on - // its own, so try that - URL abs = new URL(relUrl); - return abs.toExternalForm(); - } - // workaround: java resolves '//path/file + ?foo' to - // '//path/?foo', not '//path/file?foo' as desired - if (relUrl.startsWith("?")) { - relUrl = base.getPath() + relUrl; - } - URL abs = new URL(base, relUrl); - return abs.toExternalForm(); - } catch (MalformedURLException e) { - return ""; - } - } - } - - /** - * Get a child node by index - * - * @param index - * index of child node - * @return the child node at this index. - */ - public Node childNode(int index) { - return childNodes.get(index); - } - - /** - * Get this node's children. Presented as an unmodifiable list: new children - * can not be added, but the child nodes themselves can be manipulated. - * - * @return list of children. If no children, returns an empty list. - */ - public List childNodes() { - return Collections.unmodifiableList(childNodes); - } - - protected Node[] childNodesAsArray() { - return childNodes.toArray(new Node[childNodes().size()]); - } - - /** - * Gets this node's parent node. - * - * @return parent node; or null if no parent. - */ - public Node parent() { - return parentNode; - } - - /** - * Gets the Document associated with this Node. - * - * @return the Document associated with this Node, or null if there is no - * such Document. - */ - public Document ownerDocument() { - if (this instanceof Document) { - return (Document) this; - } else if (parentNode == null) { - return null; - } else { - return parentNode.ownerDocument(); - } - } - - /** - * Remove (delete) this node from the DOM tree. If this node has children, - * they are also removed. - */ - public void remove() { - Validate.notNull(parentNode); - parentNode.removeChild(this); - } - - /** - * Insert the specified HTML into the DOM before this node (i.e. as a - * preceding sibling). - * - * @param html - * HTML to add before this node - * @return this node, for chaining - * @see #after(String) - */ - public Node before(String html) { - addSiblingHtml(siblingIndex(), html); - return this; - } - - /** - * Insert the specified node into the DOM before this node (i.e. as a - * preceding sibling). - * - * @param node - * to add before this node - * @return this node, for chaining - * @see #after(Node) - */ - public Node before(Node node) { - Validate.notNull(node); - Validate.notNull(parentNode); - - parentNode.addChildren(siblingIndex(), node); - return this; - } - - /** - * Insert the specified HTML into the DOM after this node (i.e. as a - * following sibling). - * - * @param html - * HTML to add after this node - * @return this node, for chaining - * @see #before(String) - */ - public Node after(String html) { - addSiblingHtml(siblingIndex() + 1, html); - return this; - } - - /** - * Insert the specified node into the DOM after this node (i.e. as a - * following sibling). - * - * @param node - * to add after this node - * @return this node, for chaining - * @see #before(Node) - */ - public Node after(Node node) { - Validate.notNull(node); - Validate.notNull(parentNode); - - parentNode.addChildren(siblingIndex() + 1, node); - return this; - } - - private void addSiblingHtml(int index, String html) { - Validate.notNull(html); - Validate.notNull(parentNode); - - Element context = parent() instanceof Element ? (Element) parent() - : null; - List nodes = Parser.parseFragment(html, context, baseUri()); - parentNode.addChildren(index, nodes.toArray(new Node[nodes.size()])); - } - - /** - * Wrap the supplied HTML around this node. - * - * @param html - * HTML to wrap around this element, e.g. - * {@code

}. Can be arbitrarily deep. - * @return this node, for chaining. - */ - public Node wrap(String html) { - Validate.notEmpty(html); - - Element context = parent() instanceof Element ? (Element) parent() - : null; - List wrapChildren = Parser - .parseFragment(html, context, baseUri()); - Node wrapNode = wrapChildren.get(0); - if (wrapNode == null || !(wrapNode instanceof Element)) { - return null; - } - - Element wrap = (Element) wrapNode; - Element deepest = getDeepChild(wrap); - parentNode.replaceChild(this, wrap); - deepest.addChildren(this); - - // remainder (unbalanced wrap, like

-- The

is - // remainder - if (wrapChildren.size() > 0) { - for (int i = 0; i < wrapChildren.size(); i++) { - Node remainder = wrapChildren.get(i); - remainder.parentNode.removeChild(remainder); - wrap.appendChild(remainder); - } - } - return this; - } - - /** - * Removes this node from the DOM, and moves its children up into the node's - * parent. This has the effect of dropping the node but keeping its - * children. - *

- * For example, with the input html:
- * {@code

One Two Three
}
- * Calling {@code element.unwrap()} on the {@code span} element will result - * in the html:
- * {@code
One Two Three
}
- * and the {@code "Two "} {@link TextNode} being returned. - * - * @return the first child of this node, after the node has been unwrapped. - * Null if the node had no children. - * @see #remove() - * @see #wrap(String) - */ - public Node unwrap() { - Validate.notNull(parentNode); - - int index = siblingIndex; - Node firstChild = childNodes.size() > 0 ? childNodes.get(0) : null; - parentNode.addChildren(index, childNodesAsArray()); - remove(); - - return firstChild; - } - - private Element getDeepChild(Element el) { - List children = el.children(); - if (children.size() > 0) { - return getDeepChild(children.get(0)); - } else { - return el; - } - } - - /** - * Replace this node in the DOM with the supplied node. - * - * @param in - * the node that will will replace the existing node. - */ - public void replaceWith(Node in) { - Validate.notNull(in); - Validate.notNull(parentNode); - parentNode.replaceChild(this, in); - } - - protected void setParentNode(Node parentNode) { - if (this.parentNode != null) { - this.parentNode.removeChild(this); - } - this.parentNode = parentNode; - } - - protected void replaceChild(Node out, Node in) { - Validate.isTrue(out.parentNode == this); - Validate.notNull(in); - if (in.parentNode != null) { - in.parentNode.removeChild(in); - } - - Integer index = out.siblingIndex(); - childNodes.set(index, in); - in.parentNode = this; - in.setSiblingIndex(index); - out.parentNode = null; - } - - protected void removeChild(Node out) { - Validate.isTrue(out.parentNode == this); - int index = out.siblingIndex(); - childNodes.remove(index); - reindexChildren(); - out.parentNode = null; - } - - protected void addChildren(Node... children) { - // most used. short circuit addChildren(int), which hits reindex - // children and array copy - for (Node child : children) { - reparentChild(child); - childNodes.add(child); - child.setSiblingIndex(childNodes.size() - 1); - } - } - - protected void addChildren(int index, Node... children) { - Validate.noNullElements(children); - for (int i = children.length - 1; i >= 0; i--) { - Node in = children[i]; - reparentChild(in); - childNodes.add(index, in); - } - reindexChildren(); - } - - private void reparentChild(Node child) { - if (child.parentNode != null) { - child.parentNode.removeChild(child); - } - child.setParentNode(this); - } - - private void reindexChildren() { - for (int i = 0; i < childNodes.size(); i++) { - childNodes.get(i).setSiblingIndex(i); - } - } - - /** - * Retrieves this node's sibling nodes. Similar to {@link #childNodes() - * node.parent.childNodes()}, but does not include this node (a node is not - * a sibling of itself). - * - * @return node siblings. If the node has no parent, returns an empty list. - */ - public List siblingNodes() { - if (parentNode == null) { - return Collections.emptyList(); - } - - List nodes = parentNode.childNodes; - List siblings = new ArrayList(nodes.size() - 1); - for (Node node : nodes) { - if (node != this) { - siblings.add(node); - } - } - return siblings; - } - - /** - * Get this node's next sibling. - * - * @return next sibling, or null if this is the last sibling - */ - public Node nextSibling() { - if (parentNode == null) { - return null; // root - } - - List siblings = parentNode.childNodes; - Integer index = siblingIndex(); - Validate.notNull(index); - if (siblings.size() > index + 1) { - return siblings.get(index + 1); - } else { - return null; - } - } - - /** - * Get this node's previous sibling. - * - * @return the previous sibling, or null if this is the first sibling - */ - public Node previousSibling() { - if (parentNode == null) { - return null; // root - } - - List siblings = parentNode.childNodes; - Integer index = siblingIndex(); - Validate.notNull(index); - if (index > 0) { - return siblings.get(index - 1); - } else { - return null; - } - } - - /** - * Get the list index of this node in its node sibling list. I.e. if this is - * the first node sibling, returns 0. - * - * @return position in node sibling list - * @see org.jsoup.nodes.Element#elementSiblingIndex() - */ - public int siblingIndex() { - return siblingIndex; - } - - protected void setSiblingIndex(int siblingIndex) { - this.siblingIndex = siblingIndex; - } - - /** - * Perform a depth-first traversal through this node and its descendants. - * - * @param nodeVisitor - * the visitor callbacks to perform on each node - * @return this node, for chaining - */ - public Node traverse(NodeVisitor nodeVisitor) { - Validate.notNull(nodeVisitor); - NodeTraversor traversor = new NodeTraversor(nodeVisitor); - traversor.traverse(this); - return this; - } - - /** - * Get the outer HTML of this node. - * - * @return HTML - */ - public String outerHtml() { - StringBuilder accum = new StringBuilder(128); - outerHtml(accum); - return accum.toString(); - } - - protected void outerHtml(StringBuilder accum) { - new NodeTraversor(new OuterHtmlVisitor(accum, getOutputSettings())) - .traverse(this); - } - - // if this node has no document (or parent), retrieve the default output - // settings - private Document.OutputSettings getOutputSettings() { - return ownerDocument() != null ? ownerDocument().outputSettings() - : (new Document("")).outputSettings(); - } - - /** - * Get the outer HTML of this node. - * - * @param accum - * accumulator to place HTML into - */ - abstract void outerHtmlHead(StringBuilder accum, int depth, - Document.OutputSettings out); - - abstract void outerHtmlTail(StringBuilder accum, int depth, - Document.OutputSettings out); - - @Override - public String toString() { - return outerHtml(); - } - - protected void indent(StringBuilder accum, int depth, - Document.OutputSettings out) { - accum.append("\n").append( - StringUtil.padding(depth * out.indentAmount())); - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - // todo: have nodes hold a child index, compare against that and parent - // (not children) - return false; - } - - @Override - public int hashCode() { - int result = parentNode != null ? parentNode.hashCode() : 0; - // not children, or will block stack as they go back up to parent) - result = 31 * result + (attributes != null ? attributes.hashCode() : 0); - return result; - } - - /** - * Create a stand-alone, deep copy of this node, and all of its children. - * The cloned node will have no siblings or parent node. As a stand-alone - * object, any changes made to the clone or any of its children will not - * impact the original node. - *

- * The cloned node may be adopted into another Document or node structure - * using {@link Element#appendChild(Node)}. - * - * @return stand-alone cloned node - */ - @Override - public Node clone() { - return doClone(null); // splits for orphan - } - - protected Node doClone(Node parent) { - Node clone; - try { - clone = (Node) super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); - } - - clone.parentNode = parent; // can be null, to create an orphan split - clone.siblingIndex = parent == null ? 0 : siblingIndex; - clone.attributes = attributes != null ? attributes.clone() : null; - clone.baseUri = baseUri; - clone.childNodes = new ArrayList(childNodes.size()); - for (Node child : childNodes) { - clone.childNodes.add(child.doClone(clone)); // clone() creates - // orphans, doClone() - // keeps parent - } - - return clone; - } - - private static class OuterHtmlVisitor implements NodeVisitor { - private StringBuilder accum; - private Document.OutputSettings out; - - OuterHtmlVisitor(StringBuilder accum, Document.OutputSettings out) { - this.accum = accum; - this.out = out; - } - - @Override - public void head(Node node, int depth) { - node.outerHtmlHead(accum, depth, out); - } - - @Override - public void tail(Node node, int depth) { - if (!node.nodeName().equals("#text")) { - node.outerHtmlTail(accum, depth, out); - } - } - } -} diff --git a/server/src/org/jsoup/nodes/TextNode.java b/server/src/org/jsoup/nodes/TextNode.java deleted file mode 100644 index 594e38593e..0000000000 --- a/server/src/org/jsoup/nodes/TextNode.java +++ /dev/null @@ -1,206 +0,0 @@ -package org.jsoup.nodes; - -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; - -/** - * A text node. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class TextNode extends Node { - /* - * TextNode is a node, and so by default comes with attributes and children. - * The attributes are seldom used, but use memory, and the child nodes are - * never used. So we don't have them, and override accessors to attributes - * to create them as needed on the fly. - */ - private static final String TEXT_KEY = "text"; - String text; - - /** - * Create a new TextNode representing the supplied (unencoded) text). - * - * @param text - * raw text - * @param baseUri - * base uri - * @see #createFromEncoded(String, String) - */ - public TextNode(String text, String baseUri) { - this.baseUri = baseUri; - this.text = text; - } - - @Override - public String nodeName() { - return "#text"; - } - - /** - * Get the text content of this text node. - * - * @return Unencoded, normalised text. - * @see TextNode#getWholeText() - */ - public String text() { - return normaliseWhitespace(getWholeText()); - } - - /** - * Set the text content of this text node. - * - * @param text - * unencoded text - * @return this, for chaining - */ - public TextNode text(String text) { - this.text = text; - if (attributes != null) { - attributes.put(TEXT_KEY, text); - } - return this; - } - - /** - * Get the (unencoded) text of this text node, including any newlines and - * spaces present in the original. - * - * @return text - */ - public String getWholeText() { - return attributes == null ? text : attributes.get(TEXT_KEY); - } - - /** - * Test if this text node is blank -- that is, empty or only whitespace - * (including newlines). - * - * @return true if this document is empty or only whitespace, false if it - * contains any text content. - */ - public boolean isBlank() { - return StringUtil.isBlank(getWholeText()); - } - - /** - * Split this text node into two nodes at the specified string offset. After - * splitting, this node will contain the original text up to the offset, and - * will have a new text node sibling containing the text after the offset. - * - * @param offset - * string offset point to split node at. - * @return the newly created text node containing the text after the offset. - */ - public TextNode splitText(int offset) { - Validate.isTrue(offset >= 0, "Split offset must be not be negative"); - Validate.isTrue(offset < text.length(), - "Split offset must not be greater than current text length"); - - String head = getWholeText().substring(0, offset); - String tail = getWholeText().substring(offset); - text(head); - TextNode tailNode = new TextNode(tail, baseUri()); - if (parent() != null) { - parent().addChildren(siblingIndex() + 1, tailNode); - } - - return tailNode; - } - - @Override - void outerHtmlHead(StringBuilder accum, int depth, - Document.OutputSettings out) { - String html = Entities.escape(getWholeText(), out); - if (out.prettyPrint() && parent() instanceof Element - && !((Element) parent()).preserveWhitespace()) { - html = normaliseWhitespace(html); - } - - if (out.prettyPrint() && siblingIndex() == 0 - && parentNode instanceof Element - && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) { - indent(accum, depth, out); - } - accum.append(html); - } - - @Override - void outerHtmlTail(StringBuilder accum, int depth, - Document.OutputSettings out) { - } - - @Override - public String toString() { - return outerHtml(); - } - - /** - * Create a new TextNode from HTML encoded (aka escaped) data. - * - * @param encodedText - * Text containing encoded HTML (e.g. &lt;) - * @return TextNode containing unencoded data (e.g. <) - */ - public static TextNode createFromEncoded(String encodedText, String baseUri) { - String text = Entities.unescape(encodedText); - return new TextNode(text, baseUri); - } - - static String normaliseWhitespace(String text) { - text = StringUtil.normaliseWhitespace(text); - return text; - } - - static String stripLeadingWhitespace(String text) { - return text.replaceFirst("^\\s+", ""); - } - - static boolean lastCharIsWhitespace(StringBuilder sb) { - return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; - } - - // attribute fiddling. create on first access. - private void ensureAttributes() { - if (attributes == null) { - attributes = new Attributes(); - attributes.put(TEXT_KEY, text); - } - } - - @Override - public String attr(String attributeKey) { - ensureAttributes(); - return super.attr(attributeKey); - } - - @Override - public Attributes attributes() { - ensureAttributes(); - return super.attributes(); - } - - @Override - public Node attr(String attributeKey, String attributeValue) { - ensureAttributes(); - return super.attr(attributeKey, attributeValue); - } - - @Override - public boolean hasAttr(String attributeKey) { - ensureAttributes(); - return super.hasAttr(attributeKey); - } - - @Override - public Node removeAttr(String attributeKey) { - ensureAttributes(); - return super.removeAttr(attributeKey); - } - - @Override - public String absUrl(String attributeKey) { - ensureAttributes(); - return super.absUrl(attributeKey); - } -} diff --git a/server/src/org/jsoup/nodes/XmlDeclaration.java b/server/src/org/jsoup/nodes/XmlDeclaration.java deleted file mode 100644 index ce6ac678a5..0000000000 --- a/server/src/org/jsoup/nodes/XmlDeclaration.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.jsoup.nodes; - -/** - * An XML Declaration. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class XmlDeclaration extends Node { - private static final String DECL_KEY = "declaration"; - private final boolean isProcessingInstruction; // "); - } - - @Override - void outerHtmlTail(StringBuilder accum, int depth, - Document.OutputSettings out) { - } - - @Override - public String toString() { - return outerHtml(); - } -} diff --git a/server/src/org/jsoup/nodes/entities-base.properties b/server/src/org/jsoup/nodes/entities-base.properties deleted file mode 100644 index 3d1d11e6c4..0000000000 --- a/server/src/org/jsoup/nodes/entities-base.properties +++ /dev/null @@ -1,106 +0,0 @@ -AElig=000C6 -AMP=00026 -Aacute=000C1 -Acirc=000C2 -Agrave=000C0 -Aring=000C5 -Atilde=000C3 -Auml=000C4 -COPY=000A9 -Ccedil=000C7 -ETH=000D0 -Eacute=000C9 -Ecirc=000CA -Egrave=000C8 -Euml=000CB -GT=0003E -Iacute=000CD -Icirc=000CE -Igrave=000CC -Iuml=000CF -LT=0003C -Ntilde=000D1 -Oacute=000D3 -Ocirc=000D4 -Ograve=000D2 -Oslash=000D8 -Otilde=000D5 -Ouml=000D6 -QUOT=00022 -REG=000AE -THORN=000DE -Uacute=000DA -Ucirc=000DB -Ugrave=000D9 -Uuml=000DC -Yacute=000DD -aacute=000E1 -acirc=000E2 -acute=000B4 -aelig=000E6 -agrave=000E0 -amp=00026 -aring=000E5 -atilde=000E3 -auml=000E4 -brvbar=000A6 -ccedil=000E7 -cedil=000B8 -cent=000A2 -copy=000A9 -curren=000A4 -deg=000B0 -divide=000F7 -eacute=000E9 -ecirc=000EA -egrave=000E8 -eth=000F0 -euml=000EB -frac12=000BD -frac14=000BC -frac34=000BE -gt=0003E -iacute=000ED -icirc=000EE -iexcl=000A1 -igrave=000EC -iquest=000BF -iuml=000EF -laquo=000AB -lt=0003C -macr=000AF -micro=000B5 -middot=000B7 -nbsp=000A0 -not=000AC -ntilde=000F1 -oacute=000F3 -ocirc=000F4 -ograve=000F2 -ordf=000AA -ordm=000BA -oslash=000F8 -otilde=000F5 -ouml=000F6 -para=000B6 -plusmn=000B1 -pound=000A3 -quot=00022 -raquo=000BB -reg=000AE -sect=000A7 -shy=000AD -sup1=000B9 -sup2=000B2 -sup3=000B3 -szlig=000DF -thorn=000FE -times=000D7 -uacute=000FA -ucirc=000FB -ugrave=000F9 -uml=000A8 -uuml=000FC -yacute=000FD -yen=000A5 -yuml=000FF diff --git a/server/src/org/jsoup/nodes/entities-full.properties b/server/src/org/jsoup/nodes/entities-full.properties deleted file mode 100644 index 92f124f408..0000000000 --- a/server/src/org/jsoup/nodes/entities-full.properties +++ /dev/null @@ -1,2032 +0,0 @@ -AElig=000C6 -AMP=00026 -Aacute=000C1 -Abreve=00102 -Acirc=000C2 -Acy=00410 -Afr=1D504 -Agrave=000C0 -Alpha=00391 -Amacr=00100 -And=02A53 -Aogon=00104 -Aopf=1D538 -ApplyFunction=02061 -Aring=000C5 -Ascr=1D49C -Assign=02254 -Atilde=000C3 -Auml=000C4 -Backslash=02216 -Barv=02AE7 -Barwed=02306 -Bcy=00411 -Because=02235 -Bernoullis=0212C -Beta=00392 -Bfr=1D505 -Bopf=1D539 -Breve=002D8 -Bscr=0212C -Bumpeq=0224E -CHcy=00427 -COPY=000A9 -Cacute=00106 -Cap=022D2 -CapitalDifferentialD=02145 -Cayleys=0212D -Ccaron=0010C -Ccedil=000C7 -Ccirc=00108 -Cconint=02230 -Cdot=0010A -Cedilla=000B8 -CenterDot=000B7 -Cfr=0212D -Chi=003A7 -CircleDot=02299 -CircleMinus=02296 -CirclePlus=02295 -CircleTimes=02297 -ClockwiseContourIntegral=02232 -CloseCurlyDoubleQuote=0201D -CloseCurlyQuote=02019 -Colon=02237 -Colone=02A74 -Congruent=02261 -Conint=0222F -ContourIntegral=0222E -Copf=02102 -Coproduct=02210 -CounterClockwiseContourIntegral=02233 -Cross=02A2F -Cscr=1D49E -Cup=022D3 -CupCap=0224D -DD=02145 -DDotrahd=02911 -DJcy=00402 -DScy=00405 -DZcy=0040F -Dagger=02021 -Darr=021A1 -Dashv=02AE4 -Dcaron=0010E -Dcy=00414 -Del=02207 -Delta=00394 -Dfr=1D507 -DiacriticalAcute=000B4 -DiacriticalDot=002D9 -DiacriticalDoubleAcute=002DD -DiacriticalGrave=00060 -DiacriticalTilde=002DC -Diamond=022C4 -DifferentialD=02146 -Dopf=1D53B -Dot=000A8 -DotDot=020DC -DotEqual=02250 -DoubleContourIntegral=0222F -DoubleDot=000A8 -DoubleDownArrow=021D3 -DoubleLeftArrow=021D0 -DoubleLeftRightArrow=021D4 -DoubleLeftTee=02AE4 -DoubleLongLeftArrow=027F8 -DoubleLongLeftRightArrow=027FA -DoubleLongRightArrow=027F9 -DoubleRightArrow=021D2 -DoubleRightTee=022A8 -DoubleUpArrow=021D1 -DoubleUpDownArrow=021D5 -DoubleVerticalBar=02225 -DownArrow=02193 -DownArrowBar=02913 -DownArrowUpArrow=021F5 -DownBreve=00311 -DownLeftRightVector=02950 -DownLeftTeeVector=0295E -DownLeftVector=021BD -DownLeftVectorBar=02956 -DownRightTeeVector=0295F -DownRightVector=021C1 -DownRightVectorBar=02957 -DownTee=022A4 -DownTeeArrow=021A7 -Downarrow=021D3 -Dscr=1D49F -Dstrok=00110 -ENG=0014A -ETH=000D0 -Eacute=000C9 -Ecaron=0011A -Ecirc=000CA -Ecy=0042D -Edot=00116 -Efr=1D508 -Egrave=000C8 -Element=02208 -Emacr=00112 -EmptySmallSquare=025FB -EmptyVerySmallSquare=025AB -Eogon=00118 -Eopf=1D53C -Epsilon=00395 -Equal=02A75 -EqualTilde=02242 -Equilibrium=021CC -Escr=02130 -Esim=02A73 -Eta=00397 -Euml=000CB -Exists=02203 -ExponentialE=02147 -Fcy=00424 -Ffr=1D509 -FilledSmallSquare=025FC -FilledVerySmallSquare=025AA -Fopf=1D53D -ForAll=02200 -Fouriertrf=02131 -Fscr=02131 -GJcy=00403 -GT=0003E -Gamma=00393 -Gammad=003DC -Gbreve=0011E -Gcedil=00122 -Gcirc=0011C -Gcy=00413 -Gdot=00120 -Gfr=1D50A -Gg=022D9 -Gopf=1D53E -GreaterEqual=02265 -GreaterEqualLess=022DB -GreaterFullEqual=02267 -GreaterGreater=02AA2 -GreaterLess=02277 -GreaterSlantEqual=02A7E -GreaterTilde=02273 -Gscr=1D4A2 -Gt=0226B -HARDcy=0042A -Hacek=002C7 -Hat=0005E -Hcirc=00124 -Hfr=0210C -HilbertSpace=0210B -Hopf=0210D -HorizontalLine=02500 -Hscr=0210B -Hstrok=00126 -HumpDownHump=0224E -HumpEqual=0224F -IEcy=00415 -IJlig=00132 -IOcy=00401 -Iacute=000CD -Icirc=000CE -Icy=00418 -Idot=00130 -Ifr=02111 -Igrave=000CC -Im=02111 -Imacr=0012A -ImaginaryI=02148 -Implies=021D2 -Int=0222C -Integral=0222B -Intersection=022C2 -InvisibleComma=02063 -InvisibleTimes=02062 -Iogon=0012E -Iopf=1D540 -Iota=00399 -Iscr=02110 -Itilde=00128 -Iukcy=00406 -Iuml=000CF -Jcirc=00134 -Jcy=00419 -Jfr=1D50D -Jopf=1D541 -Jscr=1D4A5 -Jsercy=00408 -Jukcy=00404 -KHcy=00425 -KJcy=0040C -Kappa=0039A -Kcedil=00136 -Kcy=0041A -Kfr=1D50E -Kopf=1D542 -Kscr=1D4A6 -LJcy=00409 -LT=0003C -Lacute=00139 -Lambda=0039B -Lang=027EA -Laplacetrf=02112 -Larr=0219E -Lcaron=0013D -Lcedil=0013B -Lcy=0041B -LeftAngleBracket=027E8 -LeftArrow=02190 -LeftArrowBar=021E4 -LeftArrowRightArrow=021C6 -LeftCeiling=02308 -LeftDoubleBracket=027E6 -LeftDownTeeVector=02961 -LeftDownVector=021C3 -LeftDownVectorBar=02959 -LeftFloor=0230A -LeftRightArrow=02194 -LeftRightVector=0294E -LeftTee=022A3 -LeftTeeArrow=021A4 -LeftTeeVector=0295A -LeftTriangle=022B2 -LeftTriangleBar=029CF -LeftTriangleEqual=022B4 -LeftUpDownVector=02951 -LeftUpTeeVector=02960 -LeftUpVector=021BF -LeftUpVectorBar=02958 -LeftVector=021BC -LeftVectorBar=02952 -Leftarrow=021D0 -Leftrightarrow=021D4 -LessEqualGreater=022DA -LessFullEqual=02266 -LessGreater=02276 -LessLess=02AA1 -LessSlantEqual=02A7D -LessTilde=02272 -Lfr=1D50F -Ll=022D8 -Lleftarrow=021DA -Lmidot=0013F -LongLeftArrow=027F5 -LongLeftRightArrow=027F7 -LongRightArrow=027F6 -Longleftarrow=027F8 -Longleftrightarrow=027FA -Longrightarrow=027F9 -Lopf=1D543 -LowerLeftArrow=02199 -LowerRightArrow=02198 -Lscr=02112 -Lsh=021B0 -Lstrok=00141 -Lt=0226A -Map=02905 -Mcy=0041C -MediumSpace=0205F -Mellintrf=02133 -Mfr=1D510 -MinusPlus=02213 -Mopf=1D544 -Mscr=02133 -Mu=0039C -NJcy=0040A -Nacute=00143 -Ncaron=00147 -Ncedil=00145 -Ncy=0041D -NegativeMediumSpace=0200B -NegativeThickSpace=0200B -NegativeThinSpace=0200B -NegativeVeryThinSpace=0200B -NestedGreaterGreater=0226B -NestedLessLess=0226A -NewLine=0000A -Nfr=1D511 -NoBreak=02060 -NonBreakingSpace=000A0 -Nopf=02115 -Not=02AEC -NotCongruent=02262 -NotCupCap=0226D -NotDoubleVerticalBar=02226 -NotElement=02209 -NotEqual=02260 -NotExists=02204 -NotGreater=0226F -NotGreaterEqual=02271 -NotGreaterLess=02279 -NotGreaterTilde=02275 -NotLeftTriangle=022EA -NotLeftTriangleEqual=022EC -NotLess=0226E -NotLessEqual=02270 -NotLessGreater=02278 -NotLessTilde=02274 -NotPrecedes=02280 -NotPrecedesSlantEqual=022E0 -NotReverseElement=0220C -NotRightTriangle=022EB -NotRightTriangleEqual=022ED -NotSquareSubsetEqual=022E2 -NotSquareSupersetEqual=022E3 -NotSubsetEqual=02288 -NotSucceeds=02281 -NotSucceedsSlantEqual=022E1 -NotSupersetEqual=02289 -NotTilde=02241 -NotTildeEqual=02244 -NotTildeFullEqual=02247 -NotTildeTilde=02249 -NotVerticalBar=02224 -Nscr=1D4A9 -Ntilde=000D1 -Nu=0039D -OElig=00152 -Oacute=000D3 -Ocirc=000D4 -Ocy=0041E -Odblac=00150 -Ofr=1D512 -Ograve=000D2 -Omacr=0014C -Omega=003A9 -Omicron=0039F -Oopf=1D546 -OpenCurlyDoubleQuote=0201C -OpenCurlyQuote=02018 -Or=02A54 -Oscr=1D4AA -Oslash=000D8 -Otilde=000D5 -Otimes=02A37 -Ouml=000D6 -OverBar=0203E -OverBrace=023DE -OverBracket=023B4 -OverParenthesis=023DC -PartialD=02202 -Pcy=0041F -Pfr=1D513 -Phi=003A6 -Pi=003A0 -PlusMinus=000B1 -Poincareplane=0210C -Popf=02119 -Pr=02ABB -Precedes=0227A -PrecedesEqual=02AAF -PrecedesSlantEqual=0227C -PrecedesTilde=0227E -Prime=02033 -Product=0220F -Proportion=02237 -Proportional=0221D -Pscr=1D4AB -Psi=003A8 -QUOT=00022 -Qfr=1D514 -Qopf=0211A -Qscr=1D4AC -RBarr=02910 -REG=000AE -Racute=00154 -Rang=027EB -Rarr=021A0 -Rarrtl=02916 -Rcaron=00158 -Rcedil=00156 -Rcy=00420 -Re=0211C -ReverseElement=0220B -ReverseEquilibrium=021CB -ReverseUpEquilibrium=0296F -Rfr=0211C -Rho=003A1 -RightAngleBracket=027E9 -RightArrow=02192 -RightArrowBar=021E5 -RightArrowLeftArrow=021C4 -RightCeiling=02309 -RightDoubleBracket=027E7 -RightDownTeeVector=0295D -RightDownVector=021C2 -RightDownVectorBar=02955 -RightFloor=0230B -RightTee=022A2 -RightTeeArrow=021A6 -RightTeeVector=0295B -RightTriangle=022B3 -RightTriangleBar=029D0 -RightTriangleEqual=022B5 -RightUpDownVector=0294F -RightUpTeeVector=0295C -RightUpVector=021BE -RightUpVectorBar=02954 -RightVector=021C0 -RightVectorBar=02953 -Rightarrow=021D2 -Ropf=0211D -RoundImplies=02970 -Rrightarrow=021DB -Rscr=0211B -Rsh=021B1 -RuleDelayed=029F4 -SHCHcy=00429 -SHcy=00428 -SOFTcy=0042C -Sacute=0015A -Sc=02ABC -Scaron=00160 -Scedil=0015E -Scirc=0015C -Scy=00421 -Sfr=1D516 -ShortDownArrow=02193 -ShortLeftArrow=02190 -ShortRightArrow=02192 -ShortUpArrow=02191 -Sigma=003A3 -SmallCircle=02218 -Sopf=1D54A -Sqrt=0221A -Square=025A1 -SquareIntersection=02293 -SquareSubset=0228F -SquareSubsetEqual=02291 -SquareSuperset=02290 -SquareSupersetEqual=02292 -SquareUnion=02294 -Sscr=1D4AE -Star=022C6 -Sub=022D0 -Subset=022D0 -SubsetEqual=02286 -Succeeds=0227B -SucceedsEqual=02AB0 -SucceedsSlantEqual=0227D -SucceedsTilde=0227F -SuchThat=0220B -Sum=02211 -Sup=022D1 -Superset=02283 -SupersetEqual=02287 -Supset=022D1 -THORN=000DE -TRADE=02122 -TSHcy=0040B -TScy=00426 -Tab=00009 -Tau=003A4 -Tcaron=00164 -Tcedil=00162 -Tcy=00422 -Tfr=1D517 -Therefore=02234 -Theta=00398 -ThinSpace=02009 -Tilde=0223C -TildeEqual=02243 -TildeFullEqual=02245 -TildeTilde=02248 -Topf=1D54B -TripleDot=020DB -Tscr=1D4AF -Tstrok=00166 -Uacute=000DA -Uarr=0219F -Uarrocir=02949 -Ubrcy=0040E -Ubreve=0016C -Ucirc=000DB -Ucy=00423 -Udblac=00170 -Ufr=1D518 -Ugrave=000D9 -Umacr=0016A -UnderBar=0005F -UnderBrace=023DF -UnderBracket=023B5 -UnderParenthesis=023DD -Union=022C3 -UnionPlus=0228E -Uogon=00172 -Uopf=1D54C -UpArrow=02191 -UpArrowBar=02912 -UpArrowDownArrow=021C5 -UpDownArrow=02195 -UpEquilibrium=0296E -UpTee=022A5 -UpTeeArrow=021A5 -Uparrow=021D1 -Updownarrow=021D5 -UpperLeftArrow=02196 -UpperRightArrow=02197 -Upsi=003D2 -Upsilon=003A5 -Uring=0016E -Uscr=1D4B0 -Utilde=00168 -Uuml=000DC -VDash=022AB -Vbar=02AEB -Vcy=00412 -Vdash=022A9 -Vdashl=02AE6 -Vee=022C1 -Verbar=02016 -Vert=02016 -VerticalBar=02223 -VerticalLine=0007C -VerticalSeparator=02758 -VerticalTilde=02240 -VeryThinSpace=0200A -Vfr=1D519 -Vopf=1D54D -Vscr=1D4B1 -Vvdash=022AA -Wcirc=00174 -Wedge=022C0 -Wfr=1D51A -Wopf=1D54E -Wscr=1D4B2 -Xfr=1D51B -Xi=0039E -Xopf=1D54F -Xscr=1D4B3 -YAcy=0042F -YIcy=00407 -YUcy=0042E -Yacute=000DD -Ycirc=00176 -Ycy=0042B -Yfr=1D51C -Yopf=1D550 -Yscr=1D4B4 -Yuml=00178 -ZHcy=00416 -Zacute=00179 -Zcaron=0017D -Zcy=00417 -Zdot=0017B -ZeroWidthSpace=0200B -Zeta=00396 -Zfr=02128 -Zopf=02124 -Zscr=1D4B5 -aacute=000E1 -abreve=00103 -ac=0223E -acd=0223F -acirc=000E2 -acute=000B4 -acy=00430 -aelig=000E6 -af=02061 -afr=1D51E -agrave=000E0 -alefsym=02135 -aleph=02135 -alpha=003B1 -amacr=00101 -amalg=02A3F -amp=00026 -and=02227 -andand=02A55 -andd=02A5C -andslope=02A58 -andv=02A5A -ang=02220 -ange=029A4 -angle=02220 -angmsd=02221 -angmsdaa=029A8 -angmsdab=029A9 -angmsdac=029AA -angmsdad=029AB -angmsdae=029AC -angmsdaf=029AD -angmsdag=029AE -angmsdah=029AF -angrt=0221F -angrtvb=022BE -angrtvbd=0299D -angsph=02222 -angst=000C5 -angzarr=0237C -aogon=00105 -aopf=1D552 -ap=02248 -apE=02A70 -apacir=02A6F -ape=0224A -apid=0224B -apos=00027 -approx=02248 -approxeq=0224A -aring=000E5 -ascr=1D4B6 -ast=0002A -asymp=02248 -asympeq=0224D -atilde=000E3 -auml=000E4 -awconint=02233 -awint=02A11 -bNot=02AED -backcong=0224C -backepsilon=003F6 -backprime=02035 -backsim=0223D -backsimeq=022CD -barvee=022BD -barwed=02305 -barwedge=02305 -bbrk=023B5 -bbrktbrk=023B6 -bcong=0224C -bcy=00431 -bdquo=0201E -becaus=02235 -because=02235 -bemptyv=029B0 -bepsi=003F6 -bernou=0212C -beta=003B2 -beth=02136 -between=0226C -bfr=1D51F -bigcap=022C2 -bigcirc=025EF -bigcup=022C3 -bigodot=02A00 -bigoplus=02A01 -bigotimes=02A02 -bigsqcup=02A06 -bigstar=02605 -bigtriangledown=025BD -bigtriangleup=025B3 -biguplus=02A04 -bigvee=022C1 -bigwedge=022C0 -bkarow=0290D -blacklozenge=029EB -blacksquare=025AA -blacktriangle=025B4 -blacktriangledown=025BE -blacktriangleleft=025C2 -blacktriangleright=025B8 -blank=02423 -blk12=02592 -blk14=02591 -blk34=02593 -block=02588 -bnot=02310 -bopf=1D553 -bot=022A5 -bottom=022A5 -bowtie=022C8 -boxDL=02557 -boxDR=02554 -boxDl=02556 -boxDr=02553 -boxH=02550 -boxHD=02566 -boxHU=02569 -boxHd=02564 -boxHu=02567 -boxUL=0255D -boxUR=0255A -boxUl=0255C -boxUr=02559 -boxV=02551 -boxVH=0256C -boxVL=02563 -boxVR=02560 -boxVh=0256B -boxVl=02562 -boxVr=0255F -boxbox=029C9 -boxdL=02555 -boxdR=02552 -boxdl=02510 -boxdr=0250C -boxh=02500 -boxhD=02565 -boxhU=02568 -boxhd=0252C -boxhu=02534 -boxminus=0229F -boxplus=0229E -boxtimes=022A0 -boxuL=0255B -boxuR=02558 -boxul=02518 -boxur=02514 -boxv=02502 -boxvH=0256A -boxvL=02561 -boxvR=0255E -boxvh=0253C -boxvl=02524 -boxvr=0251C -bprime=02035 -breve=002D8 -brvbar=000A6 -bscr=1D4B7 -bsemi=0204F -bsim=0223D -bsime=022CD -bsol=0005C -bsolb=029C5 -bsolhsub=027C8 -bull=02022 -bullet=02022 -bump=0224E -bumpE=02AAE -bumpe=0224F -bumpeq=0224F -cacute=00107 -cap=02229 -capand=02A44 -capbrcup=02A49 -capcap=02A4B -capcup=02A47 -capdot=02A40 -caret=02041 -caron=002C7 -ccaps=02A4D -ccaron=0010D -ccedil=000E7 -ccirc=00109 -ccups=02A4C -ccupssm=02A50 -cdot=0010B -cedil=000B8 -cemptyv=029B2 -cent=000A2 -centerdot=000B7 -cfr=1D520 -chcy=00447 -check=02713 -checkmark=02713 -chi=003C7 -cir=025CB -cirE=029C3 -circ=002C6 -circeq=02257 -circlearrowleft=021BA -circlearrowright=021BB -circledR=000AE -circledS=024C8 -circledast=0229B -circledcirc=0229A -circleddash=0229D -cire=02257 -cirfnint=02A10 -cirmid=02AEF -cirscir=029C2 -clubs=02663 -clubsuit=02663 -colon=0003A -colone=02254 -coloneq=02254 -comma=0002C -commat=00040 -comp=02201 -compfn=02218 -complement=02201 -complexes=02102 -cong=02245 -congdot=02A6D -conint=0222E -copf=1D554 -coprod=02210 -copy=000A9 -copysr=02117 -crarr=021B5 -cross=02717 -cscr=1D4B8 -csub=02ACF -csube=02AD1 -csup=02AD0 -csupe=02AD2 -ctdot=022EF -cudarrl=02938 -cudarrr=02935 -cuepr=022DE -cuesc=022DF -cularr=021B6 -cularrp=0293D -cup=0222A -cupbrcap=02A48 -cupcap=02A46 -cupcup=02A4A -cupdot=0228D -cupor=02A45 -curarr=021B7 -curarrm=0293C -curlyeqprec=022DE -curlyeqsucc=022DF -curlyvee=022CE -curlywedge=022CF -curren=000A4 -curvearrowleft=021B6 -curvearrowright=021B7 -cuvee=022CE -cuwed=022CF -cwconint=02232 -cwint=02231 -cylcty=0232D -dArr=021D3 -dHar=02965 -dagger=02020 -daleth=02138 -darr=02193 -dash=02010 -dashv=022A3 -dbkarow=0290F -dblac=002DD -dcaron=0010F -dcy=00434 -dd=02146 -ddagger=02021 -ddarr=021CA -ddotseq=02A77 -deg=000B0 -delta=003B4 -demptyv=029B1 -dfisht=0297F -dfr=1D521 -dharl=021C3 -dharr=021C2 -diam=022C4 -diamond=022C4 -diamondsuit=02666 -diams=02666 -die=000A8 -digamma=003DD -disin=022F2 -div=000F7 -divide=000F7 -divideontimes=022C7 -divonx=022C7 -djcy=00452 -dlcorn=0231E -dlcrop=0230D -dollar=00024 -dopf=1D555 -dot=002D9 -doteq=02250 -doteqdot=02251 -dotminus=02238 -dotplus=02214 -dotsquare=022A1 -doublebarwedge=02306 -downarrow=02193 -downdownarrows=021CA -downharpoonleft=021C3 -downharpoonright=021C2 -drbkarow=02910 -drcorn=0231F -drcrop=0230C -dscr=1D4B9 -dscy=00455 -dsol=029F6 -dstrok=00111 -dtdot=022F1 -dtri=025BF -dtrif=025BE -duarr=021F5 -duhar=0296F -dwangle=029A6 -dzcy=0045F -dzigrarr=027FF -eDDot=02A77 -eDot=02251 -eacute=000E9 -easter=02A6E -ecaron=0011B -ecir=02256 -ecirc=000EA -ecolon=02255 -ecy=0044D -edot=00117 -ee=02147 -efDot=02252 -efr=1D522 -eg=02A9A -egrave=000E8 -egs=02A96 -egsdot=02A98 -el=02A99 -elinters=023E7 -ell=02113 -els=02A95 -elsdot=02A97 -emacr=00113 -empty=02205 -emptyset=02205 -emptyv=02205 -emsp13=02004 -emsp14=02005 -emsp=02003 -eng=0014B -ensp=02002 -eogon=00119 -eopf=1D556 -epar=022D5 -eparsl=029E3 -eplus=02A71 -epsi=003B5 -epsilon=003B5 -epsiv=003F5 -eqcirc=02256 -eqcolon=02255 -eqsim=02242 -eqslantgtr=02A96 -eqslantless=02A95 -equals=0003D -equest=0225F -equiv=02261 -equivDD=02A78 -eqvparsl=029E5 -erDot=02253 -erarr=02971 -escr=0212F -esdot=02250 -esim=02242 -eta=003B7 -eth=000F0 -euml=000EB -euro=020AC -excl=00021 -exist=02203 -expectation=02130 -exponentiale=02147 -fallingdotseq=02252 -fcy=00444 -female=02640 -ffilig=0FB03 -fflig=0FB00 -ffllig=0FB04 -ffr=1D523 -filig=0FB01 -flat=0266D -fllig=0FB02 -fltns=025B1 -fnof=00192 -fopf=1D557 -forall=02200 -fork=022D4 -forkv=02AD9 -fpartint=02A0D -frac12=000BD -frac13=02153 -frac14=000BC -frac15=02155 -frac16=02159 -frac18=0215B -frac23=02154 -frac25=02156 -frac34=000BE -frac35=02157 -frac38=0215C -frac45=02158 -frac56=0215A -frac58=0215D -frac78=0215E -frasl=02044 -frown=02322 -fscr=1D4BB -gE=02267 -gEl=02A8C -gacute=001F5 -gamma=003B3 -gammad=003DD -gap=02A86 -gbreve=0011F -gcirc=0011D -gcy=00433 -gdot=00121 -ge=02265 -gel=022DB -geq=02265 -geqq=02267 -geqslant=02A7E -ges=02A7E -gescc=02AA9 -gesdot=02A80 -gesdoto=02A82 -gesdotol=02A84 -gesles=02A94 -gfr=1D524 -gg=0226B -ggg=022D9 -gimel=02137 -gjcy=00453 -gl=02277 -glE=02A92 -gla=02AA5 -glj=02AA4 -gnE=02269 -gnap=02A8A -gnapprox=02A8A -gne=02A88 -gneq=02A88 -gneqq=02269 -gnsim=022E7 -gopf=1D558 -grave=00060 -gscr=0210A -gsim=02273 -gsime=02A8E -gsiml=02A90 -gt=0003E -gtcc=02AA7 -gtcir=02A7A -gtdot=022D7 -gtlPar=02995 -gtquest=02A7C -gtrapprox=02A86 -gtrarr=02978 -gtrdot=022D7 -gtreqless=022DB -gtreqqless=02A8C -gtrless=02277 -gtrsim=02273 -hArr=021D4 -hairsp=0200A -half=000BD -hamilt=0210B -hardcy=0044A -harr=02194 -harrcir=02948 -harrw=021AD -hbar=0210F -hcirc=00125 -hearts=02665 -heartsuit=02665 -hellip=02026 -hercon=022B9 -hfr=1D525 -hksearow=02925 -hkswarow=02926 -hoarr=021FF -homtht=0223B -hookleftarrow=021A9 -hookrightarrow=021AA -hopf=1D559 -horbar=02015 -hscr=1D4BD -hslash=0210F -hstrok=00127 -hybull=02043 -hyphen=02010 -iacute=000ED -ic=02063 -icirc=000EE -icy=00438 -iecy=00435 -iexcl=000A1 -iff=021D4 -ifr=1D526 -igrave=000EC -ii=02148 -iiiint=02A0C -iiint=0222D -iinfin=029DC -iiota=02129 -ijlig=00133 -imacr=0012B -image=02111 -imagline=02110 -imagpart=02111 -imath=00131 -imof=022B7 -imped=001B5 -in=02208 -incare=02105 -infin=0221E -infintie=029DD -inodot=00131 -int=0222B -intcal=022BA -integers=02124 -intercal=022BA -intlarhk=02A17 -intprod=02A3C -iocy=00451 -iogon=0012F -iopf=1D55A -iota=003B9 -iprod=02A3C -iquest=000BF -iscr=1D4BE -isin=02208 -isinE=022F9 -isindot=022F5 -isins=022F4 -isinsv=022F3 -isinv=02208 -it=02062 -itilde=00129 -iukcy=00456 -iuml=000EF -jcirc=00135 -jcy=00439 -jfr=1D527 -jmath=00237 -jopf=1D55B -jscr=1D4BF -jsercy=00458 -jukcy=00454 -kappa=003BA -kappav=003F0 -kcedil=00137 -kcy=0043A -kfr=1D528 -kgreen=00138 -khcy=00445 -kjcy=0045C -kopf=1D55C -kscr=1D4C0 -lAarr=021DA -lArr=021D0 -lAtail=0291B -lBarr=0290E -lE=02266 -lEg=02A8B -lHar=02962 -lacute=0013A -laemptyv=029B4 -lagran=02112 -lambda=003BB -lang=027E8 -langd=02991 -langle=027E8 -lap=02A85 -laquo=000AB -larr=02190 -larrb=021E4 -larrbfs=0291F -larrfs=0291D -larrhk=021A9 -larrlp=021AB -larrpl=02939 -larrsim=02973 -larrtl=021A2 -lat=02AAB -latail=02919 -late=02AAD -lbarr=0290C -lbbrk=02772 -lbrace=0007B -lbrack=0005B -lbrke=0298B -lbrksld=0298F -lbrkslu=0298D -lcaron=0013E -lcedil=0013C -lceil=02308 -lcub=0007B -lcy=0043B -ldca=02936 -ldquo=0201C -ldquor=0201E -ldrdhar=02967 -ldrushar=0294B -ldsh=021B2 -le=02264 -leftarrow=02190 -leftarrowtail=021A2 -leftharpoondown=021BD -leftharpoonup=021BC -leftleftarrows=021C7 -leftrightarrow=02194 -leftrightarrows=021C6 -leftrightharpoons=021CB -leftrightsquigarrow=021AD -leftthreetimes=022CB -leg=022DA -leq=02264 -leqq=02266 -leqslant=02A7D -les=02A7D -lescc=02AA8 -lesdot=02A7F -lesdoto=02A81 -lesdotor=02A83 -lesges=02A93 -lessapprox=02A85 -lessdot=022D6 -lesseqgtr=022DA -lesseqqgtr=02A8B -lessgtr=02276 -lesssim=02272 -lfisht=0297C -lfloor=0230A -lfr=1D529 -lg=02276 -lgE=02A91 -lhard=021BD -lharu=021BC -lharul=0296A -lhblk=02584 -ljcy=00459 -ll=0226A -llarr=021C7 -llcorner=0231E -llhard=0296B -lltri=025FA -lmidot=00140 -lmoust=023B0 -lmoustache=023B0 -lnE=02268 -lnap=02A89 -lnapprox=02A89 -lne=02A87 -lneq=02A87 -lneqq=02268 -lnsim=022E6 -loang=027EC -loarr=021FD -lobrk=027E6 -longleftarrow=027F5 -longleftrightarrow=027F7 -longmapsto=027FC -longrightarrow=027F6 -looparrowleft=021AB -looparrowright=021AC -lopar=02985 -lopf=1D55D -loplus=02A2D -lotimes=02A34 -lowast=02217 -lowbar=0005F -loz=025CA -lozenge=025CA -lozf=029EB -lpar=00028 -lparlt=02993 -lrarr=021C6 -lrcorner=0231F -lrhar=021CB -lrhard=0296D -lrm=0200E -lrtri=022BF -lsaquo=02039 -lscr=1D4C1 -lsh=021B0 -lsim=02272 -lsime=02A8D -lsimg=02A8F -lsqb=0005B -lsquo=02018 -lsquor=0201A -lstrok=00142 -lt=0003C -ltcc=02AA6 -ltcir=02A79 -ltdot=022D6 -lthree=022CB -ltimes=022C9 -ltlarr=02976 -ltquest=02A7B -ltrPar=02996 -ltri=025C3 -ltrie=022B4 -ltrif=025C2 -lurdshar=0294A -luruhar=02966 -mDDot=0223A -macr=000AF -male=02642 -malt=02720 -maltese=02720 -map=021A6 -mapsto=021A6 -mapstodown=021A7 -mapstoleft=021A4 -mapstoup=021A5 -marker=025AE -mcomma=02A29 -mcy=0043C -mdash=02014 -measuredangle=02221 -mfr=1D52A -mho=02127 -micro=000B5 -mid=02223 -midast=0002A -midcir=02AF0 -middot=000B7 -minus=02212 -minusb=0229F -minusd=02238 -minusdu=02A2A -mlcp=02ADB -mldr=02026 -mnplus=02213 -models=022A7 -mopf=1D55E -mp=02213 -mscr=1D4C2 -mstpos=0223E -mu=003BC -multimap=022B8 -mumap=022B8 -nLeftarrow=021CD -nLeftrightarrow=021CE -nRightarrow=021CF -nVDash=022AF -nVdash=022AE -nabla=02207 -nacute=00144 -nap=02249 -napos=00149 -napprox=02249 -natur=0266E -natural=0266E -naturals=02115 -nbsp=000A0 -ncap=02A43 -ncaron=00148 -ncedil=00146 -ncong=02247 -ncup=02A42 -ncy=0043D -ndash=02013 -ne=02260 -neArr=021D7 -nearhk=02924 -nearr=02197 -nearrow=02197 -nequiv=02262 -nesear=02928 -nexist=02204 -nexists=02204 -nfr=1D52B -nge=02271 -ngeq=02271 -ngsim=02275 -ngt=0226F -ngtr=0226F -nhArr=021CE -nharr=021AE -nhpar=02AF2 -ni=0220B -nis=022FC -nisd=022FA -niv=0220B -njcy=0045A -nlArr=021CD -nlarr=0219A -nldr=02025 -nle=02270 -nleftarrow=0219A -nleftrightarrow=021AE -nleq=02270 -nless=0226E -nlsim=02274 -nlt=0226E -nltri=022EA -nltrie=022EC -nmid=02224 -nopf=1D55F -not=000AC -notin=02209 -notinva=02209 -notinvb=022F7 -notinvc=022F6 -notni=0220C -notniva=0220C -notnivb=022FE -notnivc=022FD -npar=02226 -nparallel=02226 -npolint=02A14 -npr=02280 -nprcue=022E0 -nprec=02280 -nrArr=021CF -nrarr=0219B -nrightarrow=0219B -nrtri=022EB -nrtrie=022ED -nsc=02281 -nsccue=022E1 -nscr=1D4C3 -nshortmid=02224 -nshortparallel=02226 -nsim=02241 -nsime=02244 -nsimeq=02244 -nsmid=02224 -nspar=02226 -nsqsube=022E2 -nsqsupe=022E3 -nsub=02284 -nsube=02288 -nsubseteq=02288 -nsucc=02281 -nsup=02285 -nsupe=02289 -nsupseteq=02289 -ntgl=02279 -ntilde=000F1 -ntlg=02278 -ntriangleleft=022EA -ntrianglelefteq=022EC -ntriangleright=022EB -ntrianglerighteq=022ED -nu=003BD -num=00023 -numero=02116 -numsp=02007 -nvDash=022AD -nvHarr=02904 -nvdash=022AC -nvinfin=029DE -nvlArr=02902 -nvrArr=02903 -nwArr=021D6 -nwarhk=02923 -nwarr=02196 -nwarrow=02196 -nwnear=02927 -oS=024C8 -oacute=000F3 -oast=0229B -ocir=0229A -ocirc=000F4 -ocy=0043E -odash=0229D -odblac=00151 -odiv=02A38 -odot=02299 -odsold=029BC -oelig=00153 -ofcir=029BF -ofr=1D52C -ogon=002DB -ograve=000F2 -ogt=029C1 -ohbar=029B5 -ohm=003A9 -oint=0222E -olarr=021BA -olcir=029BE -olcross=029BB -oline=0203E -olt=029C0 -omacr=0014D -omega=003C9 -omicron=003BF -omid=029B6 -ominus=02296 -oopf=1D560 -opar=029B7 -operp=029B9 -oplus=02295 -or=02228 -orarr=021BB -ord=02A5D -order=02134 -orderof=02134 -ordf=000AA -ordm=000BA -origof=022B6 -oror=02A56 -orslope=02A57 -orv=02A5B -oscr=02134 -oslash=000F8 -osol=02298 -otilde=000F5 -otimes=02297 -otimesas=02A36 -ouml=000F6 -ovbar=0233D -par=02225 -para=000B6 -parallel=02225 -parsim=02AF3 -parsl=02AFD -part=02202 -pcy=0043F -percnt=00025 -period=0002E -permil=02030 -perp=022A5 -pertenk=02031 -pfr=1D52D -phi=003C6 -phiv=003D5 -phmmat=02133 -phone=0260E -pi=003C0 -pitchfork=022D4 -piv=003D6 -planck=0210F -planckh=0210E -plankv=0210F -plus=0002B -plusacir=02A23 -plusb=0229E -pluscir=02A22 -plusdo=02214 -plusdu=02A25 -pluse=02A72 -plusmn=000B1 -plussim=02A26 -plustwo=02A27 -pm=000B1 -pointint=02A15 -popf=1D561 -pound=000A3 -pr=0227A -prE=02AB3 -prap=02AB7 -prcue=0227C -pre=02AAF -prec=0227A -precapprox=02AB7 -preccurlyeq=0227C -preceq=02AAF -precnapprox=02AB9 -precneqq=02AB5 -precnsim=022E8 -precsim=0227E -prime=02032 -primes=02119 -prnE=02AB5 -prnap=02AB9 -prnsim=022E8 -prod=0220F -profalar=0232E -profline=02312 -profsurf=02313 -prop=0221D -propto=0221D -prsim=0227E -prurel=022B0 -pscr=1D4C5 -psi=003C8 -puncsp=02008 -qfr=1D52E -qint=02A0C -qopf=1D562 -qprime=02057 -qscr=1D4C6 -quaternions=0210D -quatint=02A16 -quest=0003F -questeq=0225F -quot=00022 -rAarr=021DB -rArr=021D2 -rAtail=0291C -rBarr=0290F -rHar=02964 -racute=00155 -radic=0221A -raemptyv=029B3 -rang=027E9 -rangd=02992 -range=029A5 -rangle=027E9 -raquo=000BB -rarr=02192 -rarrap=02975 -rarrb=021E5 -rarrbfs=02920 -rarrc=02933 -rarrfs=0291E -rarrhk=021AA -rarrlp=021AC -rarrpl=02945 -rarrsim=02974 -rarrtl=021A3 -rarrw=0219D -ratail=0291A -ratio=02236 -rationals=0211A -rbarr=0290D -rbbrk=02773 -rbrace=0007D -rbrack=0005D -rbrke=0298C -rbrksld=0298E -rbrkslu=02990 -rcaron=00159 -rcedil=00157 -rceil=02309 -rcub=0007D -rcy=00440 -rdca=02937 -rdldhar=02969 -rdquo=0201D -rdquor=0201D -rdsh=021B3 -real=0211C -realine=0211B -realpart=0211C -reals=0211D -rect=025AD -reg=000AE -rfisht=0297D -rfloor=0230B -rfr=1D52F -rhard=021C1 -rharu=021C0 -rharul=0296C -rho=003C1 -rhov=003F1 -rightarrow=02192 -rightarrowtail=021A3 -rightharpoondown=021C1 -rightharpoonup=021C0 -rightleftarrows=021C4 -rightleftharpoons=021CC -rightrightarrows=021C9 -rightsquigarrow=0219D -rightthreetimes=022CC -ring=002DA -risingdotseq=02253 -rlarr=021C4 -rlhar=021CC -rlm=0200F -rmoust=023B1 -rmoustache=023B1 -rnmid=02AEE -roang=027ED -roarr=021FE -robrk=027E7 -ropar=02986 -ropf=1D563 -roplus=02A2E -rotimes=02A35 -rpar=00029 -rpargt=02994 -rppolint=02A12 -rrarr=021C9 -rsaquo=0203A -rscr=1D4C7 -rsh=021B1 -rsqb=0005D -rsquo=02019 -rsquor=02019 -rthree=022CC -rtimes=022CA -rtri=025B9 -rtrie=022B5 -rtrif=025B8 -rtriltri=029CE -ruluhar=02968 -rx=0211E -sacute=0015B -sbquo=0201A -sc=0227B -scE=02AB4 -scap=02AB8 -scaron=00161 -sccue=0227D -sce=02AB0 -scedil=0015F -scirc=0015D -scnE=02AB6 -scnap=02ABA -scnsim=022E9 -scpolint=02A13 -scsim=0227F -scy=00441 -sdot=022C5 -sdotb=022A1 -sdote=02A66 -seArr=021D8 -searhk=02925 -searr=02198 -searrow=02198 -sect=000A7 -semi=0003B -seswar=02929 -setminus=02216 -setmn=02216 -sext=02736 -sfr=1D530 -sfrown=02322 -sharp=0266F -shchcy=00449 -shcy=00448 -shortmid=02223 -shortparallel=02225 -shy=000AD -sigma=003C3 -sigmaf=003C2 -sigmav=003C2 -sim=0223C -simdot=02A6A -sime=02243 -simeq=02243 -simg=02A9E -simgE=02AA0 -siml=02A9D -simlE=02A9F -simne=02246 -simplus=02A24 -simrarr=02972 -slarr=02190 -smallsetminus=02216 -smashp=02A33 -smeparsl=029E4 -smid=02223 -smile=02323 -smt=02AAA -smte=02AAC -softcy=0044C -sol=0002F -solb=029C4 -solbar=0233F -sopf=1D564 -spades=02660 -spadesuit=02660 -spar=02225 -sqcap=02293 -sqcup=02294 -sqsub=0228F -sqsube=02291 -sqsubset=0228F -sqsubseteq=02291 -sqsup=02290 -sqsupe=02292 -sqsupset=02290 -sqsupseteq=02292 -squ=025A1 -square=025A1 -squarf=025AA -squf=025AA -srarr=02192 -sscr=1D4C8 -ssetmn=02216 -ssmile=02323 -sstarf=022C6 -star=02606 -starf=02605 -straightepsilon=003F5 -straightphi=003D5 -strns=000AF -sub=02282 -subE=02AC5 -subdot=02ABD -sube=02286 -subedot=02AC3 -submult=02AC1 -subnE=02ACB -subne=0228A -subplus=02ABF -subrarr=02979 -subset=02282 -subseteq=02286 -subseteqq=02AC5 -subsetneq=0228A -subsetneqq=02ACB -subsim=02AC7 -subsub=02AD5 -subsup=02AD3 -succ=0227B -succapprox=02AB8 -succcurlyeq=0227D -succeq=02AB0 -succnapprox=02ABA -succneqq=02AB6 -succnsim=022E9 -succsim=0227F -sum=02211 -sung=0266A -sup1=000B9 -sup2=000B2 -sup3=000B3 -sup=02283 -supE=02AC6 -supdot=02ABE -supdsub=02AD8 -supe=02287 -supedot=02AC4 -suphsol=027C9 -suphsub=02AD7 -suplarr=0297B -supmult=02AC2 -supnE=02ACC -supne=0228B -supplus=02AC0 -supset=02283 -supseteq=02287 -supseteqq=02AC6 -supsetneq=0228B -supsetneqq=02ACC -supsim=02AC8 -supsub=02AD4 -supsup=02AD6 -swArr=021D9 -swarhk=02926 -swarr=02199 -swarrow=02199 -swnwar=0292A -szlig=000DF -target=02316 -tau=003C4 -tbrk=023B4 -tcaron=00165 -tcedil=00163 -tcy=00442 -tdot=020DB -telrec=02315 -tfr=1D531 -there4=02234 -therefore=02234 -theta=003B8 -thetasym=003D1 -thetav=003D1 -thickapprox=02248 -thicksim=0223C -thinsp=02009 -thkap=02248 -thksim=0223C -thorn=000FE -tilde=002DC -times=000D7 -timesb=022A0 -timesbar=02A31 -timesd=02A30 -tint=0222D -toea=02928 -top=022A4 -topbot=02336 -topcir=02AF1 -topf=1D565 -topfork=02ADA -tosa=02929 -tprime=02034 -trade=02122 -triangle=025B5 -triangledown=025BF -triangleleft=025C3 -trianglelefteq=022B4 -triangleq=0225C -triangleright=025B9 -trianglerighteq=022B5 -tridot=025EC -trie=0225C -triminus=02A3A -triplus=02A39 -trisb=029CD -tritime=02A3B -trpezium=023E2 -tscr=1D4C9 -tscy=00446 -tshcy=0045B -tstrok=00167 -twixt=0226C -twoheadleftarrow=0219E -twoheadrightarrow=021A0 -uArr=021D1 -uHar=02963 -uacute=000FA -uarr=02191 -ubrcy=0045E -ubreve=0016D -ucirc=000FB -ucy=00443 -udarr=021C5 -udblac=00171 -udhar=0296E -ufisht=0297E -ufr=1D532 -ugrave=000F9 -uharl=021BF -uharr=021BE -uhblk=02580 -ulcorn=0231C -ulcorner=0231C -ulcrop=0230F -ultri=025F8 -umacr=0016B -uml=000A8 -uogon=00173 -uopf=1D566 -uparrow=02191 -updownarrow=02195 -upharpoonleft=021BF -upharpoonright=021BE -uplus=0228E -upsi=003C5 -upsih=003D2 -upsilon=003C5 -upuparrows=021C8 -urcorn=0231D -urcorner=0231D -urcrop=0230E -uring=0016F -urtri=025F9 -uscr=1D4CA -utdot=022F0 -utilde=00169 -utri=025B5 -utrif=025B4 -uuarr=021C8 -uuml=000FC -uwangle=029A7 -vArr=021D5 -vBar=02AE8 -vBarv=02AE9 -vDash=022A8 -vangrt=0299C -varepsilon=003F5 -varkappa=003F0 -varnothing=02205 -varphi=003D5 -varpi=003D6 -varpropto=0221D -varr=02195 -varrho=003F1 -varsigma=003C2 -vartheta=003D1 -vartriangleleft=022B2 -vartriangleright=022B3 -vcy=00432 -vdash=022A2 -vee=02228 -veebar=022BB -veeeq=0225A -vellip=022EE -verbar=0007C -vert=0007C -vfr=1D533 -vltri=022B2 -vopf=1D567 -vprop=0221D -vrtri=022B3 -vscr=1D4CB -vzigzag=0299A -wcirc=00175 -wedbar=02A5F -wedge=02227 -wedgeq=02259 -weierp=02118 -wfr=1D534 -wopf=1D568 -wp=02118 -wr=02240 -wreath=02240 -wscr=1D4CC -xcap=022C2 -xcirc=025EF -xcup=022C3 -xdtri=025BD -xfr=1D535 -xhArr=027FA -xharr=027F7 -xi=003BE -xlArr=027F8 -xlarr=027F5 -xmap=027FC -xnis=022FB -xodot=02A00 -xopf=1D569 -xoplus=02A01 -xotime=02A02 -xrArr=027F9 -xrarr=027F6 -xscr=1D4CD -xsqcup=02A06 -xuplus=02A04 -xutri=025B3 -xvee=022C1 -xwedge=022C0 -yacute=000FD -yacy=0044F -ycirc=00177 -ycy=0044B -yen=000A5 -yfr=1D536 -yicy=00457 -yopf=1D56A -yscr=1D4CE -yucy=0044E -yuml=000FF -zacute=0017A -zcaron=0017E -zcy=00437 -zdot=0017C -zeetrf=02128 -zeta=003B6 -zfr=1D537 -zhcy=00436 -zigrarr=021DD -zopf=1D56B -zscr=1D4CF -zwj=0200D -zwnj=0200C diff --git a/server/src/org/jsoup/nodes/package-info.java b/server/src/org/jsoup/nodes/package-info.java deleted file mode 100644 index 24b12803ff..0000000000 --- a/server/src/org/jsoup/nodes/package-info.java +++ /dev/null @@ -1,4 +0,0 @@ -/** - HTML document structure nodes. - */ -package org.jsoup.nodes; \ No newline at end of file diff --git a/server/src/org/jsoup/package-info.java b/server/src/org/jsoup/package-info.java deleted file mode 100644 index 49526116b4..0000000000 --- a/server/src/org/jsoup/package-info.java +++ /dev/null @@ -1,4 +0,0 @@ -/** - Contains the main {@link org.jsoup.Jsoup} class, which provides convenient static access to the jsoup functionality. - */ -package org.jsoup; \ No newline at end of file diff --git a/server/src/org/jsoup/parser/CharacterReader.java b/server/src/org/jsoup/parser/CharacterReader.java deleted file mode 100644 index 30fbca07f1..0000000000 --- a/server/src/org/jsoup/parser/CharacterReader.java +++ /dev/null @@ -1,244 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; - -/** - * CharacterReader consumes tokens off a string. To replace the old TokenQueue. - */ -class CharacterReader { - static final char EOF = (char) -1; - - private final String input; - private final int length; - private int pos = 0; - private int mark = 0; - - CharacterReader(String input) { - Validate.notNull(input); - input = input.replaceAll("\r\n?", "\n"); // normalise carriage returns - // to newlines - - this.input = input; - length = input.length(); - } - - int pos() { - return pos; - } - - boolean isEmpty() { - return pos >= length; - } - - char current() { - return isEmpty() ? EOF : input.charAt(pos); - } - - char consume() { - char val = isEmpty() ? EOF : input.charAt(pos); - pos++; - return val; - } - - void unconsume() { - pos--; - } - - void advance() { - pos++; - } - - void mark() { - mark = pos; - } - - void rewindToMark() { - pos = mark; - } - - String consumeAsString() { - return input.substring(pos, pos++); - } - - String consumeTo(char c) { - int offset = input.indexOf(c, pos); - if (offset != -1) { - String consumed = input.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return consumeToEnd(); - } - } - - String consumeTo(String seq) { - int offset = input.indexOf(seq, pos); - if (offset != -1) { - String consumed = input.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return consumeToEnd(); - } - } - - String consumeToAny(char... seq) { - int start = pos; - - OUTER: while (!isEmpty()) { - char c = input.charAt(pos); - for (char seek : seq) { - if (seek == c) { - break OUTER; - } - } - pos++; - } - - return pos > start ? input.substring(start, pos) : ""; - } - - String consumeToEnd() { - String data = input.substring(pos, input.length()); - pos = input.length(); - return data; - } - - String consumeLetterSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { - pos++; - } else { - break; - } - } - - return input.substring(start, pos); - } - - String consumeLetterThenDigitSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { - pos++; - } else { - break; - } - } - while (!isEmpty()) { - char c = input.charAt(pos); - if (c >= '0' && c <= '9') { - pos++; - } else { - break; - } - } - - return input.substring(start, pos); - } - - String consumeHexSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') - || (c >= 'a' && c <= 'f')) { - pos++; - } else { - break; - } - } - return input.substring(start, pos); - } - - String consumeDigitSequence() { - int start = pos; - while (!isEmpty()) { - char c = input.charAt(pos); - if (c >= '0' && c <= '9') { - pos++; - } else { - break; - } - } - return input.substring(start, pos); - } - - boolean matches(char c) { - return !isEmpty() && input.charAt(pos) == c; - - } - - boolean matches(String seq) { - return input.startsWith(seq, pos); - } - - boolean matchesIgnoreCase(String seq) { - return input.regionMatches(true, pos, seq, 0, seq.length()); - } - - boolean matchesAny(char... seq) { - if (isEmpty()) { - return false; - } - - char c = input.charAt(pos); - for (char seek : seq) { - if (seek == c) { - return true; - } - } - return false; - } - - boolean matchesLetter() { - if (isEmpty()) { - return false; - } - char c = input.charAt(pos); - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - } - - boolean matchesDigit() { - if (isEmpty()) { - return false; - } - char c = input.charAt(pos); - return (c >= '0' && c <= '9'); - } - - boolean matchConsume(String seq) { - if (matches(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - boolean matchConsumeIgnoreCase(String seq) { - if (matchesIgnoreCase(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - boolean containsIgnoreCase(String seq) { - // used to check presence of , . only finds consistent - // case. - String loScan = seq.toLowerCase(); - String hiScan = seq.toUpperCase(); - return (input.indexOf(loScan, pos) > -1) - || (input.indexOf(hiScan, pos) > -1); - } - - @Override - public String toString() { - return input.substring(pos); - } -} diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilder.java b/server/src/org/jsoup/parser/HtmlTreeBuilder.java deleted file mode 100644 index f09ab8794c..0000000000 --- a/server/src/org/jsoup/parser/HtmlTreeBuilder.java +++ /dev/null @@ -1,754 +0,0 @@ -package org.jsoup.parser; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Comment; -import org.jsoup.nodes.DataNode; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; - -/** - * HTML Tree Builder; creates a DOM from Tokens. - */ -class HtmlTreeBuilder extends TreeBuilder { - - private HtmlTreeBuilderState state; // the current state - private HtmlTreeBuilderState originalState; // original / marked state - - private boolean baseUriSetFromDoc = false; - private Element headElement; // the current head element - private Element formElement; // the current form element - private Element contextElement; // fragment parse context -- could be null - // even if fragment parsing - private DescendableLinkedList formattingElements = new DescendableLinkedList(); // active - // (open) - // formatting - // elements - private List pendingTableCharacters = new ArrayList(); // chars - // in - // table - // to - // be - // shifted - // out - - private boolean framesetOk = true; // if ok to go into frameset - private boolean fosterInserts = false; // if next inserts should be fostered - private boolean fragmentParsing = false; // if parsing a fragment of html - - HtmlTreeBuilder() { - } - - @Override - Document parse(String input, String baseUri, ParseErrorList errors) { - state = HtmlTreeBuilderState.Initial; - return super.parse(input, baseUri, errors); - } - - List parseFragment(String inputFragment, Element context, - String baseUri, ParseErrorList errors) { - // context may be null - state = HtmlTreeBuilderState.Initial; - initialiseParse(inputFragment, baseUri, errors); - contextElement = context; - fragmentParsing = true; - Element root = null; - - if (context != null) { - if (context.ownerDocument() != null) { - doc.quirksMode(context.ownerDocument().quirksMode()); - } - - // initialise the tokeniser state: - String contextTag = context.tagName(); - if (StringUtil.in(contextTag, "title", "textarea")) { - tokeniser.transition(TokeniserState.Rcdata); - } else if (StringUtil.in(contextTag, "iframe", "noembed", - "noframes", "style", "xmp")) { - tokeniser.transition(TokeniserState.Rawtext); - } else if (contextTag.equals("script")) { - tokeniser.transition(TokeniserState.ScriptData); - } else if (contextTag.equals(("noscript"))) { - tokeniser.transition(TokeniserState.Data); // if scripting - // enabled, rawtext - } else if (contextTag.equals("plaintext")) { - tokeniser.transition(TokeniserState.Data); - } else { - tokeniser.transition(TokeniserState.Data); // default - } - - root = new Element(Tag.valueOf("html"), baseUri); - doc.appendChild(root); - stack.push(root); - resetInsertionMode(); - // todo: setup form element to nearest form on context (up ancestor - // chain) - } - - runParser(); - if (context != null) { - return root.childNodes(); - } else { - return doc.childNodes(); - } - } - - @Override - protected boolean process(Token token) { - currentToken = token; - return state.process(token, this); - } - - boolean process(Token token, HtmlTreeBuilderState state) { - currentToken = token; - return state.process(token, this); - } - - void transition(HtmlTreeBuilderState state) { - this.state = state; - } - - HtmlTreeBuilderState state() { - return state; - } - - void markInsertionMode() { - originalState = state; - } - - HtmlTreeBuilderState originalState() { - return originalState; - } - - void framesetOk(boolean framesetOk) { - this.framesetOk = framesetOk; - } - - boolean framesetOk() { - return framesetOk; - } - - Document getDocument() { - return doc; - } - - String getBaseUri() { - return baseUri; - } - - void maybeSetBaseUri(Element base) { - if (baseUriSetFromDoc) { - return; - } - - String href = base.absUrl("href"); - if (href.length() != 0) { // ignore etc - baseUri = href; - baseUriSetFromDoc = true; - doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) - // will get updated base, and to update all - // descendants - } - } - - boolean isFragmentParsing() { - return fragmentParsing; - } - - void error(HtmlTreeBuilderState state) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), - "Unexpected token [%s] when in state [%s]", currentToken - .tokenType(), state)); - } - } - - Element insert(Token.StartTag startTag) { - // handle empty unknown tags - // when the spec expects an empty tag, will directly hit insertEmpty, so - // won't generate fake end tag. - if (startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name())) { - Element el = insertEmpty(startTag); - process(new Token.EndTag(el.tagName())); // ensure we get out of - // whatever state we are in - return el; - } - - Element el = new Element(Tag.valueOf(startTag.name()), baseUri, - startTag.attributes); - insert(el); - return el; - } - - Element insert(String startTagName) { - Element el = new Element(Tag.valueOf(startTagName), baseUri); - insert(el); - return el; - } - - void insert(Element el) { - insertNode(el); - stack.add(el); - } - - Element insertEmpty(Token.StartTag startTag) { - Tag tag = Tag.valueOf(startTag.name()); - Element el = new Element(tag, baseUri, startTag.attributes); - insertNode(el); - if (startTag.isSelfClosing()) { - tokeniser.acknowledgeSelfClosingFlag(); - if (!tag.isKnownTag()) { - tag.setSelfClosing(); - } - } - return el; - } - - void insert(Token.Comment commentToken) { - Comment comment = new Comment(commentToken.getData(), baseUri); - insertNode(comment); - } - - void insert(Token.Character characterToken) { - Node node; - // characters in script and style go in as datanodes, not text nodes - if (StringUtil.in(currentElement().tagName(), "script", "style")) { - node = new DataNode(characterToken.getData(), baseUri); - } else { - node = new TextNode(characterToken.getData(), baseUri); - } - currentElement().appendChild(node); // doesn't use insertNode, because - // we don't foster these; and will - // always have a stack. - } - - private void insertNode(Node node) { - // if the stack hasn't been set up yet, elements (doctype, comments) go - // into the doc - if (stack.size() == 0) { - doc.appendChild(node); - } else if (isFosterInserts()) { - insertInFosterParent(node); - } else { - currentElement().appendChild(node); - } - } - - Element pop() { - // todo - dev, remove validation check - if (stack.peekLast().nodeName().equals("td") - && !state.name().equals("InCell")) { - Validate.isFalse(true, "pop td not in cell"); - } - if (stack.peekLast().nodeName().equals("html")) { - Validate.isFalse(true, "popping html!"); - } - return stack.pollLast(); - } - - void push(Element element) { - stack.add(element); - } - - DescendableLinkedList getStack() { - return stack; - } - - boolean onStack(Element el) { - return isElementInQueue(stack, el); - } - - private boolean isElementInQueue(DescendableLinkedList queue, - Element element) { - Iterator it = queue.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == element) { - return true; - } - } - return false; - } - - Element getFromStack(String elName) { - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - return next; - } - } - return null; - } - - boolean removeFromStack(Element el) { - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - it.remove(); - return true; - } - } - return false; - } - - void popStackToClose(String elName) { - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - it.remove(); - break; - } else { - it.remove(); - } - } - } - - void popStackToClose(String... elNames) { - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (StringUtil.in(next.nodeName(), elNames)) { - it.remove(); - break; - } else { - it.remove(); - } - } - } - - void popStackToBefore(String elName) { - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - break; - } else { - it.remove(); - } - } - } - - void clearStackToTableContext() { - clearStackToContext("table"); - } - - void clearStackToTableBodyContext() { - clearStackToContext("tbody", "tfoot", "thead"); - } - - void clearStackToTableRowContext() { - clearStackToContext("tr"); - } - - private void clearStackToContext(String... nodeNames) { - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (StringUtil.in(next.nodeName(), nodeNames) - || next.nodeName().equals("html")) { - break; - } else { - it.remove(); - } - } - } - - Element aboveOnStack(Element el) { - assert onStack(el); - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - return it.next(); - } - } - return null; - } - - void insertOnStackAfter(Element after, Element in) { - int i = stack.lastIndexOf(after); - Validate.isTrue(i != -1); - stack.add(i + 1, in); - } - - void replaceOnStack(Element out, Element in) { - replaceInQueue(stack, out, in); - } - - private void replaceInQueue(LinkedList queue, Element out, - Element in) { - int i = queue.lastIndexOf(out); - Validate.isTrue(i != -1); - queue.remove(i); - queue.add(i, in); - } - - void resetInsertionMode() { - boolean last = false; - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element node = it.next(); - if (!it.hasNext()) { - last = true; - node = contextElement; - } - String name = node.nodeName(); - if ("select".equals(name)) { - transition(HtmlTreeBuilderState.InSelect); - break; // frag - } else if (("td".equals(name) || "td".equals(name) && !last)) { - transition(HtmlTreeBuilderState.InCell); - break; - } else if ("tr".equals(name)) { - transition(HtmlTreeBuilderState.InRow); - break; - } else if ("tbody".equals(name) || "thead".equals(name) - || "tfoot".equals(name)) { - transition(HtmlTreeBuilderState.InTableBody); - break; - } else if ("caption".equals(name)) { - transition(HtmlTreeBuilderState.InCaption); - break; - } else if ("colgroup".equals(name)) { - transition(HtmlTreeBuilderState.InColumnGroup); - break; // frag - } else if ("table".equals(name)) { - transition(HtmlTreeBuilderState.InTable); - break; - } else if ("head".equals(name)) { - transition(HtmlTreeBuilderState.InBody); - break; // frag - } else if ("body".equals(name)) { - transition(HtmlTreeBuilderState.InBody); - break; - } else if ("frameset".equals(name)) { - transition(HtmlTreeBuilderState.InFrameset); - break; // frag - } else if ("html".equals(name)) { - transition(HtmlTreeBuilderState.BeforeHead); - break; // frag - } else if (last) { - transition(HtmlTreeBuilderState.InBody); - break; // frag - } - } - } - - // todo: tidy up in specific scope methods - private boolean inSpecificScope(String targetName, String[] baseTypes, - String[] extraTypes) { - return inSpecificScope(new String[] { targetName }, baseTypes, - extraTypes); - } - - private boolean inSpecificScope(String[] targetNames, String[] baseTypes, - String[] extraTypes) { - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element el = it.next(); - String elName = el.nodeName(); - if (StringUtil.in(elName, targetNames)) { - return true; - } - if (StringUtil.in(elName, baseTypes)) { - return false; - } - if (extraTypes != null && StringUtil.in(elName, extraTypes)) { - return false; - } - } - Validate.fail("Should not be reachable"); - return false; - } - - boolean inScope(String[] targetNames) { - return inSpecificScope(targetNames, new String[] { "applet", "caption", - "html", "table", "td", "th", "marquee", "object" }, null); - } - - boolean inScope(String targetName) { - return inScope(targetName, null); - } - - boolean inScope(String targetName, String[] extras) { - return inSpecificScope(targetName, new String[] { "applet", "caption", - "html", "table", "td", "th", "marquee", "object" }, extras); - // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml - // todo: in svg namespace: forignOjbect, desc, title - } - - boolean inListItemScope(String targetName) { - return inScope(targetName, new String[] { "ol", "ul" }); - } - - boolean inButtonScope(String targetName) { - return inScope(targetName, new String[] { "button" }); - } - - boolean inTableScope(String targetName) { - return inSpecificScope(targetName, new String[] { "html", "table" }, - null); - } - - boolean inSelectScope(String targetName) { - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element el = it.next(); - String elName = el.nodeName(); - if (elName.equals(targetName)) { - return true; - } - if (!StringUtil.in(elName, "optgroup", "option")) { - return false; - } - } - Validate.fail("Should not be reachable"); - return false; - } - - void setHeadElement(Element headElement) { - this.headElement = headElement; - } - - Element getHeadElement() { - return headElement; - } - - boolean isFosterInserts() { - return fosterInserts; - } - - void setFosterInserts(boolean fosterInserts) { - this.fosterInserts = fosterInserts; - } - - Element getFormElement() { - return formElement; - } - - void setFormElement(Element formElement) { - this.formElement = formElement; - } - - void newPendingTableCharacters() { - pendingTableCharacters = new ArrayList(); - } - - List getPendingTableCharacters() { - return pendingTableCharacters; - } - - void setPendingTableCharacters(List pendingTableCharacters) { - this.pendingTableCharacters = pendingTableCharacters; - } - - /** - * 11.2.5.2 Closing elements that have implied end tags - *

- * When the steps below require the UA to generate implied end tags, then, - * while the current node is a dd element, a dt element, an li element, an - * option element, an optgroup element, a p element, an rp element, or an rt - * element, the UA must pop the current node off the stack of open elements. - * - * @param excludeTag - * If a step requires the UA to generate implied end tags but - * lists an element to exclude from the process, then the UA must - * perform the above steps as if that element was not in the - * above list. - */ - void generateImpliedEndTags(String excludeTag) { - while ((excludeTag != null && !currentElement().nodeName().equals( - excludeTag)) - && StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", - "option", "optgroup", "p", "rp", "rt")) { - pop(); - } - } - - void generateImpliedEndTags() { - generateImpliedEndTags(null); - } - - boolean isSpecial(Element el) { - // todo: mathml's mi, mo, mn - // todo: svg's foreigObject, desc, title - String name = el.nodeName(); - return StringUtil.in(name, "address", "applet", "area", "article", - "aside", "base", "basefont", "bgsound", "blockquote", "body", - "br", "button", "caption", "center", "col", "colgroup", - "command", "dd", "details", "dir", "div", "dl", "dt", "embed", - "fieldset", "figcaption", "figure", "footer", "form", "frame", - "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", - "header", "hgroup", "hr", "html", "iframe", "img", "input", - "isindex", "li", "link", "listing", "marquee", "menu", "meta", - "nav", "noembed", "noframes", "noscript", "object", "ol", "p", - "param", "plaintext", "pre", "script", "section", "select", - "style", "summary", "table", "tbody", "td", "textarea", - "tfoot", "th", "thead", "title", "tr", "ul", "wbr", "xmp"); - } - - // active formatting elements - void pushActiveFormattingElements(Element in) { - int numSeen = 0; - Iterator iter = formattingElements.descendingIterator(); - while (iter.hasNext()) { - Element el = iter.next(); - if (el == null) { - break; - } - - if (isSameFormattingElement(in, el)) { - numSeen++; - } - - if (numSeen == 3) { - iter.remove(); - break; - } - } - formattingElements.add(in); - } - - private boolean isSameFormattingElement(Element a, Element b) { - // same if: same namespace, tag, and attributes. Element.equals only - // checks tag, might in future check children - return a.nodeName().equals(b.nodeName()) && - // a.namespace().equals(b.namespace()) && - a.attributes().equals(b.attributes()); - // todo: namespaces - } - - void reconstructFormattingElements() { - int size = formattingElements.size(); - if (size == 0 || formattingElements.getLast() == null - || onStack(formattingElements.getLast())) { - return; - } - - Element entry = formattingElements.getLast(); - int pos = size - 1; - boolean skip = false; - while (true) { - if (pos == 0) { // step 4. if none before, skip to 8 - skip = true; - break; - } - entry = formattingElements.get(--pos); // step 5. one earlier than - // entry - if (entry == null || onStack(entry)) { - break; // jump to 8, else continue back to 4 - } - } - while (true) { - if (!skip) { - entry = formattingElements.get(++pos); - } - Validate.notNull(entry); // should not occur, as we break at last - // element - - // 8. create new element from element, 9 insert into current node, - // onto stack - skip = false; // can only skip increment from 4. - Element newEl = insert(entry.nodeName()); // todo: avoid fostering - // here? - // newEl.namespace(entry.namespace()); // todo: namespaces - newEl.attributes().addAll(entry.attributes()); - - // 10. replace entry with new entry - formattingElements.add(pos, newEl); - formattingElements.remove(pos + 1); - - // 11 - if (pos == size - 1) { - break; - } - } - } - - void clearFormattingElementsToLastMarker() { - while (!formattingElements.isEmpty()) { - Element el = formattingElements.peekLast(); - formattingElements.removeLast(); - if (el == null) { - break; - } - } - } - - void removeFromActiveFormattingElements(Element el) { - Iterator it = formattingElements.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == el) { - it.remove(); - break; - } - } - } - - boolean isInActiveFormattingElements(Element el) { - return isElementInQueue(formattingElements, el); - } - - Element getActiveFormattingElement(String nodeName) { - Iterator it = formattingElements.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == null) { - break; - } else if (next.nodeName().equals(nodeName)) { - return next; - } - } - return null; - } - - void replaceActiveFormattingElement(Element out, Element in) { - replaceInQueue(formattingElements, out, in); - } - - void insertMarkerToFormattingElements() { - formattingElements.add(null); - } - - void insertInFosterParent(Node in) { - Element fosterParent = null; - Element lastTable = getFromStack("table"); - boolean isLastTableParent = false; - if (lastTable != null) { - if (lastTable.parent() != null) { - fosterParent = lastTable.parent(); - isLastTableParent = true; - } else { - fosterParent = aboveOnStack(lastTable); - } - } else { // no table == frag - fosterParent = stack.get(0); - } - - if (isLastTableParent) { - Validate.notNull(lastTable); // last table cannot be null by this - // point. - lastTable.before(in); - } else { - fosterParent.appendChild(in); - } - } - - @Override - public String toString() { - return "TreeBuilder{" + "currentToken=" + currentToken + ", state=" - + state + ", currentElement=" + currentElement() + '}'; - } -} diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java b/server/src/org/jsoup/parser/HtmlTreeBuilderState.java deleted file mode 100644 index 258d547a49..0000000000 --- a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java +++ /dev/null @@ -1,1671 +0,0 @@ -package org.jsoup.parser; - -import java.util.Iterator; -import java.util.LinkedList; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.DocumentType; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; - -/** - * The Tree Builder's current state. Each state embodies the processing for the - * state, and transitions to other states. - */ -enum HtmlTreeBuilderState { - Initial { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return true; // ignore whitespace - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - // todo: parse error check on expected doctypes - // todo: quirk state check on doctype ids - Token.Doctype d = t.asDoctype(); - DocumentType doctype = new DocumentType(d.getName(), - d.getPublicIdentifier(), d.getSystemIdentifier(), - tb.getBaseUri()); - tb.getDocument().appendChild(doctype); - if (d.isForceQuirks()) { - tb.getDocument().quirksMode(Document.QuirksMode.quirks); - } - tb.transition(BeforeHtml); - } else { - // todo: check not iframe srcdoc - tb.transition(BeforeHtml); - return tb.process(t); // re-process token - } - return true; - } - }, - BeforeHtml { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (isWhitespace(t)) { - return true; // ignore whitespace - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - tb.insert(t.asStartTag()); - tb.transition(BeforeHead); - } else if (t.isEndTag() - && (StringUtil.in(t.asEndTag().name(), "head", "body", - "html", "br"))) { - return anythingElse(t, tb); - } else if (t.isEndTag()) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.insert("html"); - tb.transition(BeforeHead); - return tb.process(t); - } - }, - BeforeHead { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return true; - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return InBody.process(t, tb); // does not transition - } else if (t.isStartTag() && t.asStartTag().name().equals("head")) { - Element head = tb.insert(t.asStartTag()); - tb.setHeadElement(head); - tb.transition(InHead); - } else if (t.isEndTag() - && (StringUtil.in(t.asEndTag().name(), "head", "body", - "html", "br"))) { - tb.process(new Token.StartTag("head")); - return tb.process(t); - } else if (t.isEndTag()) { - tb.error(this); - return false; - } else { - tb.process(new Token.StartTag("head")); - return tb.process(t); - } - return true; - } - }, - InHead { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - return true; - } - switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return InBody.process(t, tb); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", - "command", "link")) { - Element el = tb.insertEmpty(start); - // jsoup special: update base the frist time it is seen - if (name.equals("base") && el.hasAttr("href")) { - tb.maybeSetBaseUri(el); - } - } else if (name.equals("meta")) { - Element meta = tb.insertEmpty(start); - // todo: charset switches - } else if (name.equals("title")) { - handleRcData(start, tb); - } else if (StringUtil.in(name, "noframes", "style")) { - handleRawtext(start, tb); - } else if (name.equals("noscript")) { - // else if noscript && scripting flag = true: rawtext (jsoup - // doesn't run script, to handle as noscript) - tb.insert(start); - tb.transition(InHeadNoscript); - } else if (name.equals("script")) { - // skips some script rules as won't execute them - tb.insert(start); - tb.tokeniser.transition(TokeniserState.ScriptData); - tb.markInsertionMode(); - tb.transition(Text); - } else if (name.equals("head")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("head")) { - tb.pop(); - tb.transition(AfterHead); - } else if (StringUtil.in(name, "body", "html", "br")) { - return anythingElse(t, tb); - } else { - tb.error(this); - return false; - } - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, TreeBuilder tb) { - tb.process(new Token.EndTag("head")); - return tb.process(t); - } - }, - InHeadNoscript { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isDoctype()) { - tb.error(this); - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("noscript")) { - tb.pop(); - tb.transition(InHead); - } else if (isWhitespace(t) - || t.isComment() - || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "basefont", "bgsound", "link", "meta", "noframes", - "style"))) { - return tb.process(t, InHead); - } else if (t.isEndTag() && t.asEndTag().name().equals("br")) { - return anythingElse(t, tb); - } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "head", "noscript")) || t.isEndTag()) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - tb.process(new Token.EndTag("noscript")); - return tb.process(t); - } - }, - AfterHead { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - } else if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - return tb.process(t, InBody); - } else if (name.equals("body")) { - tb.insert(startTag); - tb.framesetOk(false); - tb.transition(InBody); - } else if (name.equals("frameset")) { - tb.insert(startTag); - tb.transition(InFrameset); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", - "link", "meta", "noframes", "script", "style", "title")) { - tb.error(this); - Element head = tb.getHeadElement(); - tb.push(head); - tb.process(t, InHead); - tb.removeFromStack(head); - } else if (name.equals("head")) { - tb.error(this); - return false; - } else { - anythingElse(t, tb); - } - } else if (t.isEndTag()) { - if (StringUtil.in(t.asEndTag().name(), "body", "html")) { - anythingElse(t, tb); - } else { - tb.error(this); - return false; - } - } else { - anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.process(new Token.StartTag("body")); - tb.framesetOk(true); - return tb.process(t); - } - }, - InBody { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: { - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - // todo confirm that check - tb.error(this); - return false; - } else if (isWhitespace(c)) { - tb.reconstructFormattingElements(); - tb.insert(c); - } else { - tb.reconstructFormattingElements(); - tb.insert(c); - tb.framesetOk(false); - } - break; - } - case Comment: { - tb.insert(t.asComment()); - break; - } - case Doctype: { - tb.error(this); - return false; - } - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - tb.error(this); - // merge attributes onto real html - Element html = tb.getStack().getFirst(); - for (Attribute attribute : startTag.getAttributes()) { - if (!html.hasAttr(attribute.getKey())) { - html.attributes().put(attribute); - } - } - } else if (StringUtil.in(name, "base", "basefont", "bgsound", - "command", "link", "meta", "noframes", "script", - "style", "title")) { - return tb.process(t, InHead); - } else if (name.equals("body")) { - tb.error(this); - LinkedList stack = tb.getStack(); - if (stack.size() == 1 - || (stack.size() > 2 && !stack.get(1).nodeName() - .equals("body"))) { - // only in fragment case - return false; // ignore - } else { - tb.framesetOk(false); - Element body = stack.get(1); - for (Attribute attribute : startTag.getAttributes()) { - if (!body.hasAttr(attribute.getKey())) { - body.attributes().put(attribute); - } - } - } - } else if (name.equals("frameset")) { - tb.error(this); - LinkedList stack = tb.getStack(); - if (stack.size() == 1 - || (stack.size() > 2 && !stack.get(1).nodeName() - .equals("body"))) { - // only in fragment case - return false; // ignore - } else if (!tb.framesetOk()) { - return false; // ignore frameset - } else { - Element second = stack.get(1); - if (second.parent() != null) { - second.remove(); - } - // pop up to html element - while (stack.size() > 1) { - stack.removeLast(); - } - tb.insert(startTag); - tb.transition(InFrameset); - } - } else if (StringUtil.in(name, "address", "article", "aside", - "blockquote", "center", "details", "dir", "div", "dl", - "fieldset", "figcaption", "figure", "footer", "header", - "hgroup", "menu", "nav", "ol", "p", "section", - "summary", "ul")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", - "h6")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - if (StringUtil.in(tb.currentElement().nodeName(), "h1", - "h2", "h3", "h4", "h5", "h6")) { - tb.error(this); - tb.pop(); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "pre", "listing")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - // todo: ignore LF if next token - tb.framesetOk(false); - } else if (name.equals("form")) { - if (tb.getFormElement() != null) { - tb.error(this); - return false; - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - Element form = tb.insert(startTag); - tb.setFormElement(form); - } else if (name.equals("li")) { - tb.framesetOk(false); - LinkedList stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (el.nodeName().equals("li")) { - tb.process(new Token.EndTag("li")); - break; - } - if (tb.isSpecial(el) - && !StringUtil.in(el.nodeName(), "address", - "div", "p")) { - break; - } - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "dd", "dt")) { - tb.framesetOk(false); - LinkedList stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (StringUtil.in(el.nodeName(), "dd", "dt")) { - tb.process(new Token.EndTag(el.nodeName())); - break; - } - if (tb.isSpecial(el) - && !StringUtil.in(el.nodeName(), "address", - "div", "p")) { - break; - } - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (name.equals("plaintext")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once - // in, - // never - // gets - // out - } else if (name.equals("button")) { - if (tb.inButtonScope("button")) { - // close and reprocess - tb.error(this); - tb.process(new Token.EndTag("button")); - tb.process(startTag); - } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); - } - } else if (name.equals("a")) { - if (tb.getActiveFormattingElement("a") != null) { - tb.error(this); - tb.process(new Token.EndTag("a")); - - // still on stack? - Element remainingA = tb.getFromStack("a"); - if (remainingA != null) { - tb.removeFromActiveFormattingElements(remainingA); - tb.removeFromStack(remainingA); - } - } - tb.reconstructFormattingElements(); - Element a = tb.insert(startTag); - tb.pushActiveFormattingElements(a); - } else if (StringUtil.in(name, "b", "big", "code", "em", - "font", "i", "s", "small", "strike", "strong", "tt", - "u")) { - tb.reconstructFormattingElements(); - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (name.equals("nobr")) { - tb.reconstructFormattingElements(); - if (tb.inScope("nobr")) { - tb.error(this); - tb.process(new Token.EndTag("nobr")); - tb.reconstructFormattingElements(); - } - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (StringUtil.in(name, "applet", "marquee", "object")) { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.insertMarkerToFormattingElements(); - tb.framesetOk(false); - } else if (name.equals("table")) { - if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks - && tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - tb.framesetOk(false); - tb.transition(InTable); - } else if (StringUtil.in(name, "area", "br", "embed", "img", - "keygen", "wbr")) { - tb.reconstructFormattingElements(); - tb.insertEmpty(startTag); - tb.framesetOk(false); - } else if (name.equals("input")) { - tb.reconstructFormattingElements(); - Element el = tb.insertEmpty(startTag); - if (!el.attr("type").equalsIgnoreCase("hidden")) { - tb.framesetOk(false); - } - } else if (StringUtil.in(name, "param", "source", "track")) { - tb.insertEmpty(startTag); - } else if (name.equals("hr")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insertEmpty(startTag); - tb.framesetOk(false); - } else if (name.equals("image")) { - // we're not supposed to ask. - startTag.name("img"); - return tb.process(startTag); - } else if (name.equals("isindex")) { - // how much do we care about the early 90s? - tb.error(this); - if (tb.getFormElement() != null) { - return false; - } - - tb.tokeniser.acknowledgeSelfClosingFlag(); - tb.process(new Token.StartTag("form")); - if (startTag.attributes.hasKey("action")) { - Element form = tb.getFormElement(); - form.attr("action", startTag.attributes.get("action")); - } - tb.process(new Token.StartTag("hr")); - tb.process(new Token.StartTag("label")); - // hope you like english. - String prompt = startTag.attributes.hasKey("prompt") ? startTag.attributes - .get("prompt") - : "This is a searchable index. Enter search keywords: "; - - tb.process(new Token.Character(prompt)); - - // input - Attributes inputAttribs = new Attributes(); - for (Attribute attr : startTag.attributes) { - if (!StringUtil.in(attr.getKey(), "name", "action", - "prompt")) { - inputAttribs.put(attr); - } - } - inputAttribs.put("name", "isindex"); - tb.process(new Token.StartTag("input", inputAttribs)); - tb.process(new Token.EndTag("label")); - tb.process(new Token.StartTag("hr")); - tb.process(new Token.EndTag("form")); - } else if (name.equals("textarea")) { - tb.insert(startTag); - // todo: If the next token is a U+000A LINE FEED (LF) - // character token, then ignore that token and move on to - // the next one. (Newlines at the start of textarea elements - // are ignored as an authoring convenience.) - tb.tokeniser.transition(TokeniserState.Rcdata); - tb.markInsertionMode(); - tb.framesetOk(false); - tb.transition(Text); - } else if (name.equals("xmp")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.reconstructFormattingElements(); - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("iframe")) { - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("noembed")) { - // also handle noscript if script enabled - handleRawtext(startTag, tb); - } else if (name.equals("select")) { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); - - HtmlTreeBuilderState state = tb.state(); - if (state.equals(InTable) || state.equals(InCaption) - || state.equals(InTableBody) || state.equals(InRow) - || state.equals(InCell)) { - tb.transition(InSelectInTable); - } else { - tb.transition(InSelect); - } - } else if (StringUtil.in("optgroup", "option")) { - if (tb.currentElement().nodeName().equals("option")) { - tb.process(new Token.EndTag("option")); - } - tb.reconstructFormattingElements(); - tb.insert(startTag); - } else if (StringUtil.in("rp", "rt")) { - if (tb.inScope("ruby")) { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("ruby")) { - tb.error(this); - tb.popStackToBefore("ruby"); // i.e. close up to but - // not include name - } - tb.insert(startTag); - } - } else if (name.equals("math")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "math" (i.e. - // foreign, mathml) - tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (name.equals("svg")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "svg" (xlink, - // svg) - tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (StringUtil.in(name, "caption", "col", "colgroup", - "frame", "head", "tbody", "td", "tfoot", "th", "thead", - "tr")) { - tb.error(this); - return false; - } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); - } - break; - - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("body")) { - if (!tb.inScope("body")) { - tb.error(this); - return false; - } else { - // todo: error if stack contains something not dd, dt, - // li, optgroup, option, p, rp, rt, tbody, td, tfoot, - // th, thead, tr, body, html - tb.transition(AfterBody); - } - } else if (name.equals("html")) { - boolean notIgnored = tb.process(new Token.EndTag("body")); - if (notIgnored) { - return tb.process(endTag); - } - } else if (StringUtil.in(name, "address", "article", "aside", - "blockquote", "button", "center", "details", "dir", - "div", "dl", "fieldset", "figcaption", "figure", - "footer", "header", "hgroup", "listing", "menu", "nav", - "ol", "pre", "section", "summary", "ul")) { - // todo: refactor these lookups - if (!tb.inScope(name)) { - // nothing to close - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - } - } else if (name.equals("form")) { - Element currentForm = tb.getFormElement(); - tb.setFormElement(null); - if (currentForm == null || !tb.inScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - // remove currentForm from stack. will shift anything - // under up. - tb.removeFromStack(currentForm); - } - } else if (name.equals("p")) { - if (!tb.inButtonScope(name)) { - tb.error(this); - tb.process(new Token.StartTag(name)); // if no p to - // close, creates - // an empty - //

- return tb.process(endTag); - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - } - } else if (name.equals("li")) { - if (!tb.inListItemScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - } - } else if (StringUtil.in(name, "dd", "dt")) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - } - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", - "h6")) { - if (!tb.inScope(new String[] { "h1", "h2", "h3", "h4", - "h5", "h6" })) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); - } - } else if (name.equals("sarcasm")) { - // *sigh* - return anyOtherEndTag(t, tb); - } else if (StringUtil.in(name, "a", "b", "big", "code", "em", - "font", "i", "nobr", "s", "small", "strike", "strong", - "tt", "u")) { - // Adoption Agency Algorithm. - OUTER: for (int i = 0; i < 8; i++) { - Element formatEl = tb.getActiveFormattingElement(name); - if (formatEl == null) { - return anyOtherEndTag(t, tb); - } else if (!tb.onStack(formatEl)) { - tb.error(this); - tb.removeFromActiveFormattingElements(formatEl); - return true; - } else if (!tb.inScope(formatEl.nodeName())) { - tb.error(this); - return false; - } else if (tb.currentElement() != formatEl) { - tb.error(this); - } - - Element furthestBlock = null; - Element commonAncestor = null; - boolean seenFormattingElement = false; - LinkedList stack = tb.getStack(); - for (int si = 0; si < stack.size(); si++) { - Element el = stack.get(si); - if (el == formatEl) { - commonAncestor = stack.get(si - 1); - seenFormattingElement = true; - } else if (seenFormattingElement - && tb.isSpecial(el)) { - furthestBlock = el; - break; - } - } - if (furthestBlock == null) { - tb.popStackToClose(formatEl.nodeName()); - tb.removeFromActiveFormattingElements(formatEl); - return true; - } - - // todo: Let a bookmark note the position of the - // formatting element in the list of active formatting - // elements relative to the elements on either side of - // it in the list. - // does that mean: int pos of format el in list? - Element node = furthestBlock; - Element lastNode = furthestBlock; - INNER: for (int j = 0; j < 3; j++) { - if (tb.onStack(node)) { - node = tb.aboveOnStack(node); - } - if (!tb.isInActiveFormattingElements(node)) { // note - // no - // bookmark - // check - tb.removeFromStack(node); - continue INNER; - } else if (node == formatEl) { - break INNER; - } - - Element replacement = new Element(Tag.valueOf(node - .nodeName()), tb.getBaseUri()); - tb.replaceActiveFormattingElement(node, replacement); - tb.replaceOnStack(node, replacement); - node = replacement; - - if (lastNode == furthestBlock) { - // todo: move the aforementioned bookmark to be - // immediately after the new node in the list of - // active formatting elements. - // not getting how this bookmark both straddles - // the element above, but is inbetween here... - } - if (lastNode.parent() != null) { - lastNode.remove(); - } - node.appendChild(lastNode); - - lastNode = node; - } - - if (StringUtil.in(commonAncestor.nodeName(), "table", - "tbody", "tfoot", "thead", "tr")) { - if (lastNode.parent() != null) { - lastNode.remove(); - } - tb.insertInFosterParent(lastNode); - } else { - if (lastNode.parent() != null) { - lastNode.remove(); - } - commonAncestor.appendChild(lastNode); - } - - Element adopter = new Element(Tag.valueOf(name), - tb.getBaseUri()); - Node[] childNodes = furthestBlock.childNodes().toArray( - new Node[furthestBlock.childNodes().size()]); - for (Node childNode : childNodes) { - adopter.appendChild(childNode); // append will - // reparent. thus - // the clone to - // avoid concurrent - // mod. - } - furthestBlock.appendChild(adopter); - tb.removeFromActiveFormattingElements(formatEl); - // todo: insert the new element into the list of active - // formatting elements at the position of the - // aforementioned bookmark. - tb.removeFromStack(formatEl); - tb.insertOnStackAfter(furthestBlock, adopter); - } - } else if (StringUtil.in(name, "applet", "marquee", "object")) { - if (!tb.inScope("name")) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - tb.clearFormattingElementsToLastMarker(); - } - } else if (name.equals("br")) { - tb.error(this); - tb.process(new Token.StartTag("br")); - return false; - } else { - return anyOtherEndTag(t, tb); - } - - break; - case EOF: - // todo: error if stack contains something not dd, dt, li, p, - // tbody, td, tfoot, th, thead, tr, body, html - // stop parsing - break; - } - return true; - } - - boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) { - String name = t.asEndTag().name(); - DescendableLinkedList stack = tb.getStack(); - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element node = it.next(); - if (node.nodeName().equals(name)) { - tb.generateImpliedEndTags(name); - if (!name.equals(tb.currentElement().nodeName())) { - tb.error(this); - } - tb.popStackToClose(name); - break; - } else { - if (tb.isSpecial(node)) { - tb.error(this); - return false; - } - } - } - return true; - } - }, - Text { - // in script, style etc. normally treated as data tags - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isCharacter()) { - tb.insert(t.asCharacter()); - } else if (t.isEOF()) { - tb.error(this); - // if current node is script: already started - tb.pop(); - tb.transition(tb.originalState()); - return tb.process(t); - } else if (t.isEndTag()) { - // if: An end tag whose tag name is "script" -- scripting - // nesting level, if evaluating scripts - tb.pop(); - tb.transition(tb.originalState()); - } - return true; - } - }, - InTable { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isCharacter()) { - tb.newPendingTableCharacters(); - tb.markInsertionMode(); - tb.transition(InTableText); - return tb.process(t); - } else if (t.isComment()) { - tb.insert(t.asComment()); - return true; - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("caption")) { - tb.clearStackToTableContext(); - tb.insertMarkerToFormattingElements(); - tb.insert(startTag); - tb.transition(InCaption); - } else if (name.equals("colgroup")) { - tb.clearStackToTableContext(); - tb.insert(startTag); - tb.transition(InColumnGroup); - } else if (name.equals("col")) { - tb.process(new Token.StartTag("colgroup")); - return tb.process(t); - } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - tb.clearStackToTableContext(); - tb.insert(startTag); - tb.transition(InTableBody); - } else if (StringUtil.in(name, "td", "th", "tr")) { - tb.process(new Token.StartTag("tbody")); - return tb.process(t); - } else if (name.equals("table")) { - tb.error(this); - boolean processed = tb.process(new Token.EndTag("table")); - if (processed) { - return tb.process(t); - } - } else if (StringUtil.in(name, "style", "script")) { - return tb.process(t, InHead); - } else if (name.equals("input")) { - if (!startTag.attributes.get("type").equalsIgnoreCase( - "hidden")) { - return anythingElse(t, tb); - } else { - tb.insertEmpty(startTag); - } - } else if (name.equals("form")) { - tb.error(this); - if (tb.getFormElement() != null) { - return false; - } else { - Element form = tb.insertEmpty(startTag); - tb.setFormElement(form); - } - } else { - return anythingElse(t, tb); - } - } else if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (name.equals("table")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.popStackToClose("table"); - } - tb.resetInsertionMode(); - } else if (StringUtil.in(name, "body", "caption", "col", - "colgroup", "html", "tbody", "td", "tfoot", "th", - "thead", "tr")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - } else if (t.isEOF()) { - if (tb.currentElement().nodeName().equals("html")) { - tb.error(this); - } - return true; // stops parsing - } - return anythingElse(t, tb); - } - - boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - boolean processed = true; - if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", - "tfoot", "thead", "tr")) { - tb.setFosterInserts(true); - processed = tb.process(t, InBody); - tb.setFosterInserts(false); - } else { - processed = tb.process(t, InBody); - } - return processed; - } - }, - InTableText { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.getPendingTableCharacters().add(c); - } - break; - default: - if (tb.getPendingTableCharacters().size() > 0) { - for (Token.Character character : tb - .getPendingTableCharacters()) { - if (!isWhitespace(character)) { - // InTable anything else section: - tb.error(this); - if (StringUtil.in(tb.currentElement().nodeName(), - "table", "tbody", "tfoot", "thead", "tr")) { - tb.setFosterInserts(true); - tb.process(character, InBody); - tb.setFosterInserts(false); - } else { - tb.process(character, InBody); - } - } else { - tb.insert(character); - } - } - tb.newPendingTableCharacters(); - } - tb.transition(tb.originalState()); - return tb.process(t); - } - return true; - } - }, - InCaption { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isEndTag() && t.asEndTag().name().equals("caption")) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("caption")) { - tb.error(this); - } - tb.popStackToClose("caption"); - tb.clearFormattingElementsToLastMarker(); - tb.transition(InTable); - } - } else if ((t.isStartTag() - && StringUtil.in(t.asStartTag().name(), "caption", "col", - "colgroup", "tbody", "td", "tfoot", "th", "thead", - "tr") || t.isEndTag() - && t.asEndTag().name().equals("table"))) { - tb.error(this); - boolean processed = tb.process(new Token.EndTag("caption")); - if (processed) { - return tb.process(t); - } - } else if (t.isEndTag() - && StringUtil.in(t.asEndTag().name(), "body", "col", - "colgroup", "html", "tbody", "td", "tfoot", "th", - "thead", "tr")) { - tb.error(this); - return false; - } else { - return tb.process(t, InBody); - } - return true; - } - }, - InColumnGroup { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - return true; - } - switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - break; - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - return tb.process(t, InBody); - } else if (name.equals("col")) { - tb.insertEmpty(startTag); - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("colgroup")) { - if (tb.currentElement().nodeName().equals("html")) { // frag - // case - tb.error(this); - return false; - } else { - tb.pop(); - tb.transition(InTable); - } - } else { - return anythingElse(t, tb); - } - break; - case EOF: - if (tb.currentElement().nodeName().equals("html")) { - return true; // stop parsing; frag case - } else { - return anythingElse(t, tb); - } - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, TreeBuilder tb) { - boolean processed = tb.process(new Token.EndTag("colgroup")); - if (processed) { - return tb.process(t); - } - return true; - } - }, - InTableBody { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("tr")) { - tb.clearStackToTableBodyContext(); - tb.insert(startTag); - tb.transition(InRow); - } else if (StringUtil.in(name, "th", "td")) { - tb.error(this); - tb.process(new Token.StartTag("tr")); - return tb.process(startTag); - } else if (StringUtil.in(name, "caption", "col", "colgroup", - "tbody", "tfoot", "thead")) { - return exitTableBody(t, tb); - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.clearStackToTableBodyContext(); - tb.pop(); - tb.transition(InTable); - } - } else if (name.equals("table")) { - return exitTableBody(t, tb); - } else if (StringUtil.in(name, "body", "caption", "col", - "colgroup", "html", "td", "th", "tr")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { - if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb - .inScope("tfoot"))) { - // frag case - tb.error(this); - return false; - } - tb.clearStackToTableBodyContext(); - tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, - // tfoot, - // thead - return tb.process(t); - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InTable); - } - }, - InRow { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isStartTag()) { - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - - if (StringUtil.in(name, "th", "td")) { - tb.clearStackToTableRowContext(); - tb.insert(startTag); - tb.transition(InCell); - tb.insertMarkerToFormattingElements(); - } else if (StringUtil.in(name, "caption", "col", "colgroup", - "tbody", "tfoot", "thead", "tr")) { - return handleMissingTr(t, tb); - } else { - return anythingElse(t, tb); - } - } else if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (name.equals("tr")) { - if (!tb.inTableScope(name)) { - tb.error(this); // frag - return false; - } - tb.clearStackToTableRowContext(); - tb.pop(); // tr - tb.transition(InTableBody); - } else if (name.equals("table")) { - return handleMissingTr(t, tb); - } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } - tb.process(new Token.EndTag("tr")); - return tb.process(t); - } else if (StringUtil.in(name, "body", "caption", "col", - "colgroup", "html", "td", "th")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InTable); - } - - private boolean handleMissingTr(Token t, TreeBuilder tb) { - boolean processed = tb.process(new Token.EndTag("tr")); - if (processed) { - return tb.process(t); - } else { - return false; - } - } - }, - InCell { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isEndTag()) { - Token.EndTag endTag = t.asEndTag(); - String name = endTag.name(); - - if (StringUtil.in(name, "td", "th")) { - if (!tb.inTableScope(name)) { - tb.error(this); - tb.transition(InRow); // might not be in scope if empty: - // and processing fake end - // tag - return false; - } - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) { - tb.error(this); - } - tb.popStackToClose(name); - tb.clearFormattingElementsToLastMarker(); - tb.transition(InRow); - } else if (StringUtil.in(name, "body", "caption", "col", - "colgroup", "html")) { - tb.error(this); - return false; - } else if (StringUtil.in(name, "table", "tbody", "tfoot", - "thead", "tr")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } - closeCell(tb); - return tb.process(t); - } else { - return anythingElse(t, tb); - } - } else if (t.isStartTag() - && StringUtil.in(t.asStartTag().name(), "caption", "col", - "colgroup", "tbody", "td", "tfoot", "th", "thead", - "tr")) { - if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { - tb.error(this); - return false; - } - closeCell(tb); - return tb.process(t); - } else { - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - return tb.process(t, InBody); - } - - private void closeCell(HtmlTreeBuilder tb) { - if (tb.inTableScope("td")) { - tb.process(new Token.EndTag("td")); - } else { - tb.process(new Token.EndTag("th")); // only here if th or td in - // scope - } - } - }, - InSelect { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.insert(c); - } - break; - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return tb.process(start, InBody); - } else if (name.equals("option")) { - tb.process(new Token.EndTag("option")); - tb.insert(start); - } else if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option")) { - tb.process(new Token.EndTag("option")); - } else if (tb.currentElement().nodeName() - .equals("optgroup")) { - tb.process(new Token.EndTag("optgroup")); - } - tb.insert(start); - } else if (name.equals("select")) { - tb.error(this); - return tb.process(new Token.EndTag("select")); - } else if (StringUtil.in(name, "input", "keygen", "textarea")) { - tb.error(this); - if (!tb.inSelectScope("select")) { - return false; // frag - } - tb.process(new Token.EndTag("select")); - return tb.process(start); - } else if (name.equals("script")) { - return tb.process(t, InHead); - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option") - && tb.aboveOnStack(tb.currentElement()) != null - && tb.aboveOnStack(tb.currentElement()).nodeName() - .equals("optgroup")) { - tb.process(new Token.EndTag("option")); - } - if (tb.currentElement().nodeName().equals("optgroup")) { - tb.pop(); - } else { - tb.error(this); - } - } else if (name.equals("option")) { - if (tb.currentElement().nodeName().equals("option")) { - tb.pop(); - } else { - tb.error(this); - } - } else if (name.equals("select")) { - if (!tb.inSelectScope(name)) { - tb.error(this); - return false; - } else { - tb.popStackToClose(name); - tb.resetInsertionMode(); - } - } else { - return anythingElse(t, tb); - } - break; - case EOF: - if (!tb.currentElement().nodeName().equals("html")) { - tb.error(this); - } - break; - default: - return anythingElse(t, tb); - } - return true; - } - - private boolean anythingElse(Token t, HtmlTreeBuilder tb) { - tb.error(this); - return false; - } - }, - InSelectInTable { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isStartTag() - && StringUtil.in(t.asStartTag().name(), "caption", "table", - "tbody", "tfoot", "thead", "tr", "td", "th")) { - tb.error(this); - tb.process(new Token.EndTag("select")); - return tb.process(t); - } else if (t.isEndTag() - && StringUtil.in(t.asEndTag().name(), "caption", "table", - "tbody", "tfoot", "thead", "tr", "td", "th")) { - tb.error(this); - if (tb.inTableScope(t.asEndTag().name())) { - tb.process(new Token.EndTag("select")); - return (tb.process(t)); - } else { - return false; - } - } else { - return tb.process(t, InSelect); - } - } - }, - AfterBody { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - return tb.process(t, InBody); - } else if (t.isComment()) { - tb.insert(t.asComment()); // into html node - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { - if (tb.isFragmentParsing()) { - tb.error(this); - return false; - } else { - tb.transition(AfterAfterBody); - } - } else if (t.isEOF()) { - // chillax! we're done - } else { - tb.error(this); - tb.transition(InBody); - return tb.process(t); - } - return true; - } - }, - InFrameset { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag()) { - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return tb.process(start, InBody); - } else if (name.equals("frameset")) { - tb.insert(start); - } else if (name.equals("frame")) { - tb.insertEmpty(start); - } else if (name.equals("noframes")) { - return tb.process(start, InHead); - } else { - tb.error(this); - return false; - } - } else if (t.isEndTag() && t.asEndTag().name().equals("frameset")) { - if (tb.currentElement().nodeName().equals("html")) { // frag - tb.error(this); - return false; - } else { - tb.pop(); - if (!tb.isFragmentParsing() - && !tb.currentElement().nodeName() - .equals("frameset")) { - tb.transition(AfterFrameset); - } - } - } else if (t.isEOF()) { - if (!tb.currentElement().nodeName().equals("html")) { - tb.error(this); - return true; - } - } else { - tb.error(this); - return false; - } - return true; - } - }, - AfterFrameset { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (isWhitespace(t)) { - tb.insert(t.asCharacter()); - } else if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype()) { - tb.error(this); - return false; - } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { - return tb.process(t, InBody); - } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { - tb.transition(AfterAfterFrameset); - } else if (t.isStartTag() - && t.asStartTag().name().equals("noframes")) { - return tb.process(t, InHead); - } else if (t.isEOF()) { - // cool your heels, we're complete - } else { - tb.error(this); - return false; - } - return true; - } - }, - AfterAfterBody { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) - || (t.isStartTag() && t.asStartTag().name().equals("html"))) { - return tb.process(t, InBody); - } else if (t.isEOF()) { - // nice work chuck - } else { - tb.error(this); - tb.transition(InBody); - return tb.process(t); - } - return true; - } - }, - AfterAfterFrameset { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isComment()) { - tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) - || (t.isStartTag() && t.asStartTag().name().equals("html"))) { - return tb.process(t, InBody); - } else if (t.isEOF()) { - // nice work chuck - } else if (t.isStartTag() - && t.asStartTag().name().equals("noframes")) { - return tb.process(t, InHead); - } else { - tb.error(this); - return false; - } - return true; - } - }, - ForeignContent { - @Override - boolean process(Token t, HtmlTreeBuilder tb) { - return true; - // todo: implement. Also; how do we get here? - } - }; - - private static String nullString = String.valueOf('\u0000'); - - abstract boolean process(Token t, HtmlTreeBuilder tb); - - private static boolean isWhitespace(Token t) { - if (t.isCharacter()) { - String data = t.asCharacter().getData(); - // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " - for (int i = 0; i < data.length(); i++) { - char c = data.charAt(i); - if (!StringUtil.isWhitespace(c)) { - return false; - } - } - return true; - } - return false; - } - - private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) { - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.Rcdata); - tb.markInsertionMode(); - tb.transition(Text); - } - - private static void handleRawtext(Token.StartTag startTag, - HtmlTreeBuilder tb) { - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.Rawtext); - tb.markInsertionMode(); - tb.transition(Text); - } -} diff --git a/server/src/org/jsoup/parser/ParseError.java b/server/src/org/jsoup/parser/ParseError.java deleted file mode 100644 index eb3c240a59..0000000000 --- a/server/src/org/jsoup/parser/ParseError.java +++ /dev/null @@ -1,43 +0,0 @@ -package org.jsoup.parser; - -/** - * A Parse Error records an error in the input HTML that occurs in either the - * tokenisation or the tree building phase. - */ -public class ParseError { - private int pos; - private String errorMsg; - - ParseError(int pos, String errorMsg) { - this.pos = pos; - this.errorMsg = errorMsg; - } - - ParseError(int pos, String errorFormat, Object... args) { - errorMsg = String.format(errorFormat, args); - this.pos = pos; - } - - /** - * Retrieve the error message. - * - * @return the error message. - */ - public String getErrorMessage() { - return errorMsg; - } - - /** - * Retrieves the offset of the error. - * - * @return error offset within input - */ - public int getPosition() { - return pos; - } - - @Override - public String toString() { - return pos + ": " + errorMsg; - } -} diff --git a/server/src/org/jsoup/parser/ParseErrorList.java b/server/src/org/jsoup/parser/ParseErrorList.java deleted file mode 100644 index 773dfcae24..0000000000 --- a/server/src/org/jsoup/parser/ParseErrorList.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.jsoup.parser; - -import java.util.ArrayList; - -/** - * A container for ParseErrors. - * - * @author Jonathan Hedley - */ -class ParseErrorList extends ArrayList { - private static final int INITIAL_CAPACITY = 16; - private final int maxSize; - - ParseErrorList(int initialCapacity, int maxSize) { - super(initialCapacity); - this.maxSize = maxSize; - } - - boolean canAddError() { - return size() < maxSize; - } - - int getMaxSize() { - return maxSize; - } - - static ParseErrorList noTracking() { - return new ParseErrorList(0, 0); - } - - static ParseErrorList tracking(int maxSize) { - return new ParseErrorList(INITIAL_CAPACITY, maxSize); - } -} diff --git a/server/src/org/jsoup/parser/Parser.java b/server/src/org/jsoup/parser/Parser.java deleted file mode 100644 index a1f6fd5184..0000000000 --- a/server/src/org/jsoup/parser/Parser.java +++ /dev/null @@ -1,198 +0,0 @@ -package org.jsoup.parser; - -import java.util.List; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; - -/** - * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use - * one of the more convenient parse methods in {@link org.jsoup.Jsoup}. - */ -public class Parser { - private static final int DEFAULT_MAX_ERRORS = 0; // by default, error - // tracking is disabled. - - private TreeBuilder treeBuilder; - private int maxErrors = DEFAULT_MAX_ERRORS; - private ParseErrorList errors; - - /** - * Create a new Parser, using the specified TreeBuilder - * - * @param treeBuilder - * TreeBuilder to use to parse input into Documents. - */ - public Parser(TreeBuilder treeBuilder) { - this.treeBuilder = treeBuilder; - } - - public Document parseInput(String html, String baseUri) { - errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) - : ParseErrorList.noTracking(); - Document doc = treeBuilder.parse(html, baseUri, errors); - return doc; - } - - // gets & sets - /** - * Get the TreeBuilder currently in use. - * - * @return current TreeBuilder. - */ - public TreeBuilder getTreeBuilder() { - return treeBuilder; - } - - /** - * Update the TreeBuilder used when parsing content. - * - * @param treeBuilder - * current TreeBuilder - * @return this, for chaining - */ - public Parser setTreeBuilder(TreeBuilder treeBuilder) { - this.treeBuilder = treeBuilder; - return this; - } - - /** - * Check if parse error tracking is enabled. - * - * @return current track error state. - */ - public boolean isTrackErrors() { - return maxErrors > 0; - } - - /** - * Enable or disable parse error tracking for the next parse. - * - * @param maxErrors - * the maximum number of errors to track. Set to 0 to disable. - * @return this, for chaining - */ - public Parser setTrackErrors(int maxErrors) { - this.maxErrors = maxErrors; - return this; - } - - /** - * Retrieve the parse errors, if any, from the last parse. - * - * @return list of parse errors, up to the size of the maximum errors - * tracked. - */ - public List getErrors() { - return errors; - } - - // static parse functions below - /** - * Parse HTML into a Document. - * - * @param html - * HTML to parse - * @param baseUri - * base URI of document (i.e. original fetch location), for - * resolving relative URLs. - * - * @return parsed Document - */ - public static Document parse(String html, String baseUri) { - TreeBuilder treeBuilder = new HtmlTreeBuilder(); - return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking()); - } - - /** - * Parse a fragment of HTML into a list of nodes. The context element, if - * supplied, supplies parsing context. - * - * @param fragmentHtml - * the fragment of HTML to parse - * @param context - * (optional) the element that this HTML fragment is being parsed - * for (i.e. for inner HTML). This provides stack context (for - * implicit element creation). - * @param baseUri - * base URI of document (i.e. original fetch location), for - * resolving relative URLs. - * - * @return list of nodes parsed from the input HTML. Note that the context - * element, if supplied, is not modified. - */ - public static List parseFragment(String fragmentHtml, - Element context, String baseUri) { - HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); - return treeBuilder.parseFragment(fragmentHtml, context, baseUri, - ParseErrorList.noTracking()); - } - - /** - * Parse a fragment of HTML into the {@code body} of a Document. - * - * @param bodyHtml - * fragment of HTML - * @param baseUri - * base URI of document (i.e. original fetch location), for - * resolving relative URLs. - * - * @return Document, with empty head, and HTML parsed into body - */ - public static Document parseBodyFragment(String bodyHtml, String baseUri) { - Document doc = Document.createShell(baseUri); - Element body = doc.body(); - List nodeList = parseFragment(bodyHtml, body, baseUri); - Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node - // list gets - // modified - // when - // re-parented - for (Node node : nodes) { - body.appendChild(node); - } - return doc; - } - - /** - * @param bodyHtml - * HTML to parse - * @param baseUri - * baseUri base URI of document (i.e. original fetch location), - * for resolving relative URLs. - * - * @return parsed Document - * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} - * instead. - */ - @Deprecated - public static Document parseBodyFragmentRelaxed(String bodyHtml, - String baseUri) { - return parse(bodyHtml, baseUri); - } - - // builders - - /** - * Create a new HTML parser. This parser treats input as HTML5, and enforces - * the creation of a normalised document, based on a knowledge of the - * semantics of the incoming tags. - * - * @return a new HTML parser. - */ - public static Parser htmlParser() { - return new Parser(new HtmlTreeBuilder()); - } - - /** - * Create a new XML parser. This parser assumes no knowledge of the incoming - * tags and does not treat it as HTML, rather creates a simple tree directly - * from the input. - * - * @return a new simple XML parser. - */ - public static Parser xmlParser() { - return new Parser(new XmlTreeBuilder()); - } -} diff --git a/server/src/org/jsoup/parser/Tag.java b/server/src/org/jsoup/parser/Tag.java deleted file mode 100644 index c43f27aff3..0000000000 --- a/server/src/org/jsoup/parser/Tag.java +++ /dev/null @@ -1,298 +0,0 @@ -package org.jsoup.parser; - -import java.util.HashMap; -import java.util.Map; - -import org.jsoup.helper.Validate; - -/** - * HTML Tag capabilities. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Tag { - private static final Map tags = new HashMap(); // map - // of - // known - // tags - - private String tagName; - private boolean isBlock = true; // block or inline - private boolean formatAsBlock = true; // should be formatted as a block - private boolean canContainBlock = true; // Can this tag hold block level - // tags? - private boolean canContainInline = true; // only pcdata if not - private boolean empty = false; // can hold nothing; e.g. img - private boolean selfClosing = false; // can self close (). used for - // unknown tags that self close, - // without forcing them as empty. - private boolean preserveWhitespace = false; // for pre, textarea, script etc - - private Tag(String tagName) { - this.tagName = tagName.toLowerCase(); - } - - /** - * Get this tag's name. - * - * @return the tag's name - */ - public String getName() { - return tagName; - } - - /** - * Get a Tag by name. If not previously defined (unknown), returns a new - * generic tag, that can do anything. - *

- * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not - * registered and will only .equals(). - * - * @param tagName - * Name of tag, e.g. "p". Case insensitive. - * @return The tag, either defined or new generic. - */ - public static Tag valueOf(String tagName) { - Validate.notNull(tagName); - tagName = tagName.trim().toLowerCase(); - Validate.notEmpty(tagName); - - synchronized (tags) { - Tag tag = tags.get(tagName); - if (tag == null) { - // not defined: create default; go anywhere, do anything! (incl - // be inside a

) - tag = new Tag(tagName); - tag.isBlock = false; - tag.canContainBlock = true; - } - return tag; - } - } - - /** - * Gets if this is a block tag. - * - * @return if block tag - */ - public boolean isBlock() { - return isBlock; - } - - /** - * Gets if this tag should be formatted as a block (or as inline) - * - * @return if should be formatted as block or inline - */ - public boolean formatAsBlock() { - return formatAsBlock; - } - - /** - * Gets if this tag can contain block tags. - * - * @return if tag can contain block tags - */ - public boolean canContainBlock() { - return canContainBlock; - } - - /** - * Gets if this tag is an inline tag. - * - * @return if this tag is an inline tag. - */ - public boolean isInline() { - return !isBlock; - } - - /** - * Gets if this tag is a data only tag. - * - * @return if this tag is a data only tag - */ - public boolean isData() { - return !canContainInline && !isEmpty(); - } - - /** - * Get if this is an empty tag - * - * @return if this is an empty tag - */ - public boolean isEmpty() { - return empty; - } - - /** - * Get if this tag is self closing. - * - * @return if this tag should be output as self closing. - */ - public boolean isSelfClosing() { - return empty || selfClosing; - } - - /** - * Get if this is a pre-defined tag, or was auto created on parsing. - * - * @return if a known tag - */ - public boolean isKnownTag() { - return tags.containsKey(tagName); - } - - /** - * Check if this tagname is a known tag. - * - * @param tagName - * name of tag - * @return if known HTML tag - */ - public static boolean isKnownTag(String tagName) { - return tags.containsKey(tagName); - } - - /** - * Get if this tag should preserve whitespace within child text nodes. - * - * @return if preserve whitepace - */ - public boolean preserveWhitespace() { - return preserveWhitespace; - } - - Tag setSelfClosing() { - selfClosing = true; - return this; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (!(o instanceof Tag)) { - return false; - } - - Tag tag = (Tag) o; - - if (canContainBlock != tag.canContainBlock) { - return false; - } - if (canContainInline != tag.canContainInline) { - return false; - } - if (empty != tag.empty) { - return false; - } - if (formatAsBlock != tag.formatAsBlock) { - return false; - } - if (isBlock != tag.isBlock) { - return false; - } - if (preserveWhitespace != tag.preserveWhitespace) { - return false; - } - if (selfClosing != tag.selfClosing) { - return false; - } - if (!tagName.equals(tag.tagName)) { - return false; - } - - return true; - } - - @Override - public int hashCode() { - int result = tagName.hashCode(); - result = 31 * result + (isBlock ? 1 : 0); - result = 31 * result + (formatAsBlock ? 1 : 0); - result = 31 * result + (canContainBlock ? 1 : 0); - result = 31 * result + (canContainInline ? 1 : 0); - result = 31 * result + (empty ? 1 : 0); - result = 31 * result + (selfClosing ? 1 : 0); - result = 31 * result + (preserveWhitespace ? 1 : 0); - return result; - } - - @Override - public String toString() { - return tagName; - } - - // internal static initialisers: - // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other - // sources - private static final String[] blockTags = { "html", "head", "body", - "frameset", "script", "noscript", "style", "meta", "link", "title", - "frame", "noframes", "section", "nav", "aside", "hgroup", "header", - "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", - "pre", "div", "blockquote", "hr", "address", "figure", - "figcaption", "form", "fieldset", "ins", "del", "dl", "dt", "dd", - "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", - "col", "tr", "th", "td", "video", "audio", "canvas", "details", - "menu", "plaintext" }; - private static final String[] inlineTags = { "object", "base", "font", - "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", - "samp", "kbd", "var", "cite", "abbr", "time", "acronym", "mark", - "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q", "sub", - "sup", "bdo", "iframe", "embed", "span", "input", "select", - "textarea", "label", "button", "optgroup", "option", "legend", - "datalist", "keygen", "output", "progress", "meter", "area", - "param", "source", "track", "summary", "command", "device" }; - private static final String[] emptyTags = { "meta", "link", "base", - "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", - "col", "command", "device" }; - private static final String[] formatAsInlineTags = { "title", "a", "p", - "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", - "td", "script", "style" }; - private static final String[] preserveWhitespaceTags = { "pre", - "plaintext", "title" }; - - static { - // creates - for (String tagName : blockTags) { - Tag tag = new Tag(tagName); - register(tag); - } - for (String tagName : inlineTags) { - Tag tag = new Tag(tagName); - tag.isBlock = false; - tag.canContainBlock = false; - tag.formatAsBlock = false; - register(tag); - } - - // mods: - for (String tagName : emptyTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.canContainBlock = false; - tag.canContainInline = false; - tag.empty = true; - } - - for (String tagName : formatAsInlineTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.formatAsBlock = false; - } - - for (String tagName : preserveWhitespaceTags) { - Tag tag = tags.get(tagName); - Validate.notNull(tag); - tag.preserveWhitespace = true; - } - } - - private static Tag register(Tag tag) { - synchronized (tags) { - tags.put(tag.tagName, tag); - } - return tag; - } -} diff --git a/server/src/org/jsoup/parser/Token.java b/server/src/org/jsoup/parser/Token.java deleted file mode 100644 index e465eb74e3..0000000000 --- a/server/src/org/jsoup/parser/Token.java +++ /dev/null @@ -1,253 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; - -/** - * Parse tokens for the Tokeniser. - */ -abstract class Token { - TokenType type; - - private Token() { - } - - String tokenType() { - return this.getClass().getSimpleName(); - } - - static class Doctype extends Token { - final StringBuilder name = new StringBuilder(); - final StringBuilder publicIdentifier = new StringBuilder(); - final StringBuilder systemIdentifier = new StringBuilder(); - boolean forceQuirks = false; - - Doctype() { - type = TokenType.Doctype; - } - - String getName() { - return name.toString(); - } - - String getPublicIdentifier() { - return publicIdentifier.toString(); - } - - public String getSystemIdentifier() { - return systemIdentifier.toString(); - } - - public boolean isForceQuirks() { - return forceQuirks; - } - } - - static abstract class Tag extends Token { - protected String tagName; - private String pendingAttributeName; - private String pendingAttributeValue; - - boolean selfClosing = false; - Attributes attributes = new Attributes(); // todo: allow nodes to not - // have attributes - - void newAttribute() { - if (pendingAttributeName != null) { - if (pendingAttributeValue == null) { - pendingAttributeValue = ""; - } - Attribute attribute = new Attribute(pendingAttributeName, - pendingAttributeValue); - attributes.put(attribute); - } - pendingAttributeName = null; - pendingAttributeValue = null; - } - - void finaliseTag() { - // finalises for emit - if (pendingAttributeName != null) { - // todo: check if attribute name exists; if so, drop and error - newAttribute(); - } - } - - String name() { - Validate.isFalse(tagName.length() == 0); - return tagName; - } - - Tag name(String name) { - tagName = name; - return this; - } - - boolean isSelfClosing() { - return selfClosing; - } - - @SuppressWarnings({ "TypeMayBeWeakened" }) - Attributes getAttributes() { - return attributes; - } - - // these appenders are rarely hit in not null state-- caused by null - // chars. - void appendTagName(String append) { - tagName = tagName == null ? append : tagName.concat(append); - } - - void appendTagName(char append) { - appendTagName(String.valueOf(append)); - } - - void appendAttributeName(String append) { - pendingAttributeName = pendingAttributeName == null ? append - : pendingAttributeName.concat(append); - } - - void appendAttributeName(char append) { - appendAttributeName(String.valueOf(append)); - } - - void appendAttributeValue(String append) { - pendingAttributeValue = pendingAttributeValue == null ? append - : pendingAttributeValue.concat(append); - } - - void appendAttributeValue(char append) { - appendAttributeValue(String.valueOf(append)); - } - } - - static class StartTag extends Tag { - StartTag() { - super(); - type = TokenType.StartTag; - } - - StartTag(String name) { - this(); - tagName = name; - } - - StartTag(String name, Attributes attributes) { - this(); - tagName = name; - this.attributes = attributes; - } - - @Override - public String toString() { - return "<" + name() + " " + attributes.toString() + ">"; - } - } - - static class EndTag extends Tag { - EndTag() { - super(); - type = TokenType.EndTag; - } - - EndTag(String name) { - this(); - tagName = name; - } - - @Override - public String toString() { - return ""; - } - } - - static class Comment extends Token { - final StringBuilder data = new StringBuilder(); - - Comment() { - type = TokenType.Comment; - } - - String getData() { - return data.toString(); - } - - @Override - public String toString() { - return ""; - } - } - - static class Character extends Token { - private final String data; - - Character(String data) { - type = TokenType.Character; - this.data = data; - } - - String getData() { - return data; - } - - @Override - public String toString() { - return getData(); - } - } - - static class EOF extends Token { - EOF() { - type = Token.TokenType.EOF; - } - } - - boolean isDoctype() { - return type == TokenType.Doctype; - } - - Doctype asDoctype() { - return (Doctype) this; - } - - boolean isStartTag() { - return type == TokenType.StartTag; - } - - StartTag asStartTag() { - return (StartTag) this; - } - - boolean isEndTag() { - return type == TokenType.EndTag; - } - - EndTag asEndTag() { - return (EndTag) this; - } - - boolean isComment() { - return type == TokenType.Comment; - } - - Comment asComment() { - return (Comment) this; - } - - boolean isCharacter() { - return type == TokenType.Character; - } - - Character asCharacter() { - return (Character) this; - } - - boolean isEOF() { - return type == TokenType.EOF; - } - - enum TokenType { - Doctype, StartTag, EndTag, Comment, Character, EOF - } -} diff --git a/server/src/org/jsoup/parser/TokenQueue.java b/server/src/org/jsoup/parser/TokenQueue.java deleted file mode 100644 index 3e7127e640..0000000000 --- a/server/src/org/jsoup/parser/TokenQueue.java +++ /dev/null @@ -1,473 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; - -/** - * A character queue with parsing helpers. - * - * @author Jonathan Hedley - */ -public class TokenQueue { - private String queue; - private int pos = 0; - - private static final char ESC = '\\'; // escape char for chomp balanced. - - /** - * Create a new TokenQueue. - * - * @param data - * string of data to back queue. - */ - public TokenQueue(String data) { - Validate.notNull(data); - queue = data; - } - - /** - * Is the queue empty? - * - * @return true if no data left in queue. - */ - public boolean isEmpty() { - return remainingLength() == 0; - } - - private int remainingLength() { - return queue.length() - pos; - } - - /** - * Retrieves but does not remove the first character from the queue. - * - * @return First character, or 0 if empty. - */ - public char peek() { - return isEmpty() ? 0 : queue.charAt(pos); - } - - /** - * Add a character to the start of the queue (will be the next character - * retrieved). - * - * @param c - * character to add - */ - public void addFirst(Character c) { - addFirst(c.toString()); - } - - /** - * Add a string to the start of the queue. - * - * @param seq - * string to add. - */ - public void addFirst(String seq) { - // not very performant, but an edge case - queue = seq + queue.substring(pos); - pos = 0; - } - - /** - * Tests if the next characters on the queue match the sequence. Case - * insensitive. - * - * @param seq - * String to check queue for. - * @return true if the next characters match. - */ - public boolean matches(String seq) { - return queue.regionMatches(true, pos, seq, 0, seq.length()); - } - - /** - * Case sensitive match test. - * - * @param seq - * string to case sensitively check for - * @return true if matched, false if not - */ - public boolean matchesCS(String seq) { - return queue.startsWith(seq, pos); - } - - /** - * Tests if the next characters match any of the sequences. Case - * insensitive. - * - * @param seq - * list of strings to case insensitively check for - * @return true of any matched, false if none did - */ - public boolean matchesAny(String... seq) { - for (String s : seq) { - if (matches(s)) { - return true; - } - } - return false; - } - - public boolean matchesAny(char... seq) { - if (isEmpty()) { - return false; - } - - for (char c : seq) { - if (queue.charAt(pos) == c) { - return true; - } - } - return false; - } - - public boolean matchesStartTag() { - // micro opt for matching "= 2 && queue.charAt(pos) == '<' && Character - .isLetter(queue.charAt(pos + 1))); - } - - /** - * Tests if the queue matches the sequence (as with match), and if they do, - * removes the matched string from the queue. - * - * @param seq - * String to search for, and if found, remove from queue. - * @return true if found and removed, false if not found. - */ - public boolean matchChomp(String seq) { - if (matches(seq)) { - pos += seq.length(); - return true; - } else { - return false; - } - } - - /** - * Tests if queue starts with a whitespace character. - * - * @return if starts with whitespace - */ - public boolean matchesWhitespace() { - return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos)); - } - - /** - * Test if the queue matches a word character (letter or digit). - * - * @return if matches a word character - */ - public boolean matchesWord() { - return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); - } - - /** - * Drops the next character off the queue. - */ - public void advance() { - if (!isEmpty()) { - pos++; - } - } - - /** - * Consume one character off queue. - * - * @return first character on queue. - */ - public char consume() { - return queue.charAt(pos++); - } - - /** - * Consumes the supplied sequence of the queue. If the queue does not start - * with the supplied sequence, will throw an illegal state exception -- but - * you should be running match() against that condition. - *

- * Case insensitive. - * - * @param seq - * sequence to remove from head of queue. - */ - public void consume(String seq) { - if (!matches(seq)) { - throw new IllegalStateException( - "Queue did not match expected sequence"); - } - int len = seq.length(); - if (len > remainingLength()) { - throw new IllegalStateException( - "Queue not long enough to consume sequence"); - } - - pos += len; - } - - /** - * Pulls a string off the queue, up to but exclusive of the match sequence, - * or to the queue running out. - * - * @param seq - * String to end on (and not include in return, but leave on - * queue). Case sensitive. - * @return The matched data consumed from queue. - */ - public String consumeTo(String seq) { - int offset = queue.indexOf(seq, pos); - if (offset != -1) { - String consumed = queue.substring(pos, offset); - pos += consumed.length(); - return consumed; - } else { - return remainder(); - } - } - - public String consumeToIgnoreCase(String seq) { - int start = pos; - String first = seq.substring(0, 1); - boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if - // first - // is - // not - // cased, - // use - // index - // of - while (!isEmpty()) { - if (matches(seq)) { - break; - } - - if (canScan) { - int skip = queue.indexOf(first, pos) - pos; - if (skip == 0) { - pos++; - } else if (skip < 0) { - pos = queue.length(); - } else { - pos += skip; - } - } else { - pos++; - } - } - - String data = queue.substring(start, pos); - return data; - } - - /** - * Consumes to the first sequence provided, or to the end of the queue. - * Leaves the terminator on the queue. - * - * @param seq - * any number of terminators to consume to. Case - * insensitive. - * @return consumed string - */ - // todo: method name. not good that consumeTo cares for case, and consume to - // any doesn't. And the only use for this - // is is a case sensitive time... - public String consumeToAny(String... seq) { - int start = pos; - while (!isEmpty() && !matchesAny(seq)) { - pos++; - } - - String data = queue.substring(start, pos); - return data; - } - - /** - * Pulls a string off the queue (like consumeTo), and then pulls off the - * matched string (but does not return it). - *

- * If the queue runs out of characters before finding the seq, will return - * as much as it can (and queue will go isEmpty() == true). - * - * @param seq - * String to match up to, and not include in return, and to pull - * off queue. Case sensitive. - * @return Data matched from queue. - */ - public String chompTo(String seq) { - String data = consumeTo(seq); - matchChomp(seq); - return data; - } - - public String chompToIgnoreCase(String seq) { - String data = consumeToIgnoreCase(seq); // case insensitive scan - matchChomp(seq); - return data; - } - - /** - * Pulls a balanced string off the queue. E.g. if queue is - * "(one (two) three) four", (,) will return "one (two) three", and leave - * " four" on the queue. Unbalanced openers and closers can be escaped (with - * \). Those escapes will be left in the returned string, which is suitable - * for regexes (where we need to preserve the escape), but unsuitable for - * contains text strings; use unescape for that. - * - * @param open - * opener - * @param close - * closer - * @return data matched from the queue - */ - public String chompBalanced(char open, char close) { - StringBuilder accum = new StringBuilder(); - int depth = 0; - char last = 0; - - do { - if (isEmpty()) { - break; - } - Character c = consume(); - if (last == 0 || last != ESC) { - if (c.equals(open)) { - depth++; - } else if (c.equals(close)) { - depth--; - } - } - - if (depth > 0 && last != 0) { - accum.append(c); // don't include the outer match pair in the - // return - } - last = c; - } while (depth > 0); - return accum.toString(); - } - - /** - * Unescaped a \ escaped string. - * - * @param in - * backslash escaped string - * @return unescaped string - */ - public static String unescape(String in) { - StringBuilder out = new StringBuilder(); - char last = 0; - for (char c : in.toCharArray()) { - if (c == ESC) { - if (last != 0 && last == ESC) { - out.append(c); - } - } else { - out.append(c); - } - last = c; - } - return out.toString(); - } - - /** - * Pulls the next run of whitespace characters of the queue. - */ - public boolean consumeWhitespace() { - boolean seen = false; - while (matchesWhitespace()) { - pos++; - seen = true; - } - return seen; - } - - /** - * Retrieves the next run of word type (letter or digit) off the queue. - * - * @return String of word characters from queue, or empty string if none. - */ - public String consumeWord() { - int start = pos; - while (matchesWord()) { - pos++; - } - return queue.substring(start, pos); - } - - /** - * Consume an tag name off the queue (word or :, _, -) - * - * @return tag name - */ - public String consumeTagName() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) { - pos++; - } - - return queue.substring(start, pos); - } - - /** - * Consume a CSS element selector (tag name, but | instead of : for - * namespaces, to not conflict with :pseudo selects). - * - * @return tag name - */ - public String consumeElementSelector() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) { - pos++; - } - - return queue.substring(start, pos); - } - - /** - * Consume a CSS identifier (ID or class) off the queue (letter, digit, -, - * _) http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier - * - * @return identifier - */ - public String consumeCssIdentifier() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) { - pos++; - } - - return queue.substring(start, pos); - } - - /** - * Consume an attribute key off the queue (letter, digit, -, _, :") - * - * @return attribute key - */ - public String consumeAttributeKey() { - int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) { - pos++; - } - - return queue.substring(start, pos); - } - - /** - * Consume and return whatever is left on the queue. - * - * @return remained of queue. - */ - public String remainder() { - StringBuilder accum = new StringBuilder(); - while (!isEmpty()) { - accum.append(consume()); - } - return accum.toString(); - } - - @Override - public String toString() { - return queue.substring(pos); - } -} diff --git a/server/src/org/jsoup/parser/Tokeniser.java b/server/src/org/jsoup/parser/Tokeniser.java deleted file mode 100644 index f46c962281..0000000000 --- a/server/src/org/jsoup/parser/Tokeniser.java +++ /dev/null @@ -1,264 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Entities; - -/** - * Readers the input stream into tokens. - */ -class Tokeniser { - static final char replacementChar = '\uFFFD'; // replaces null character - - private CharacterReader reader; // html input - private ParseErrorList errors; // errors found while tokenising - - private TokeniserState state = TokeniserState.Data; // current tokenisation - // state - private Token emitPending; // the token we are about to emit on next read - private boolean isEmitPending = false; - private StringBuilder charBuffer = new StringBuilder(); // buffers - // characters to - // output as one - // token - StringBuilder dataBuffer; // buffers data looking for - - Token.Tag tagPending; // tag we are building up - Token.Doctype doctypePending; // doctype building up - Token.Comment commentPending; // comment building up - private Token.StartTag lastStartTag; // the last start tag emitted, to test - // appropriate end tag - private boolean selfClosingFlagAcknowledged = true; - - Tokeniser(CharacterReader reader, ParseErrorList errors) { - this.reader = reader; - this.errors = errors; - } - - Token read() { - if (!selfClosingFlagAcknowledged) { - error("Self closing flag not acknowledged"); - selfClosingFlagAcknowledged = true; - } - - while (!isEmitPending) { - state.read(this, reader); - } - - // if emit is pending, a non-character token was found: return any chars - // in buffer, and leave token for next read: - if (charBuffer.length() > 0) { - String str = charBuffer.toString(); - charBuffer.delete(0, charBuffer.length()); - return new Token.Character(str); - } else { - isEmitPending = false; - return emitPending; - } - } - - void emit(Token token) { - Validate.isFalse(isEmitPending, "There is an unread token pending!"); - - emitPending = token; - isEmitPending = true; - - if (token.type == Token.TokenType.StartTag) { - Token.StartTag startTag = (Token.StartTag) token; - lastStartTag = startTag; - if (startTag.selfClosing) { - selfClosingFlagAcknowledged = false; - } - } else if (token.type == Token.TokenType.EndTag) { - Token.EndTag endTag = (Token.EndTag) token; - if (endTag.attributes.size() > 0) { - error("Attributes incorrectly present on end tag"); - } - } - } - - void emit(String str) { - // buffer strings up until last string token found, to emit only one - // token for a run of character refs etc. - // does not set isEmitPending; read checks that - charBuffer.append(str); - } - - void emit(char c) { - charBuffer.append(c); - } - - TokeniserState getState() { - return state; - } - - void transition(TokeniserState state) { - this.state = state; - } - - void advanceTransition(TokeniserState state) { - reader.advance(); - this.state = state; - } - - void acknowledgeSelfClosingFlag() { - selfClosingFlagAcknowledged = true; - } - - Character consumeCharacterReference(Character additionalAllowedCharacter, - boolean inAttribute) { - if (reader.isEmpty()) { - return null; - } - if (additionalAllowedCharacter != null - && additionalAllowedCharacter == reader.current()) { - return null; - } - if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) { - return null; - } - - reader.mark(); - if (reader.matchConsume("#")) { // numbered - boolean isHexMode = reader.matchConsumeIgnoreCase("X"); - String numRef = isHexMode ? reader.consumeHexSequence() : reader - .consumeDigitSequence(); - if (numRef.length() == 0) { // didn't match anything - characterReferenceError("numeric reference with no numerals"); - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) { - characterReferenceError("missing semicolon"); // missing semi - } - int charval = -1; - try { - int base = isHexMode ? 16 : 10; - charval = Integer.valueOf(numRef, base); - } catch (NumberFormatException e) { - } // skip - if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) - || charval > 0x10FFFF) { - characterReferenceError("character outside of valid range"); - return replacementChar; - } else { - // todo: implement number replacement table - // todo: check for extra illegal unicode points as parse errors - return (char) charval; - } - } else { // named - // get as many letters as possible, and look for matching entities. - // unconsume backwards till a match is found - String nameRef = reader.consumeLetterThenDigitSequence(); - String origNameRef = new String(nameRef); // for error reporting. - // nameRef gets chomped - // looking for matches - boolean looksLegit = reader.matches(';'); - boolean found = false; - while (nameRef.length() > 0 && !found) { - if (Entities.isNamedEntity(nameRef)) { - found = true; - } else { - nameRef = nameRef.substring(0, nameRef.length() - 1); - reader.unconsume(); - } - } - if (!found) { - if (looksLegit) { - characterReferenceError(String.format( - "invalid named referenece '%s'", origNameRef)); - } - reader.rewindToMark(); - return null; - } - if (inAttribute - && (reader.matchesLetter() || reader.matchesDigit() || reader - .matchesAny('=', '-', '_'))) { - // don't want that to match - reader.rewindToMark(); - return null; - } - if (!reader.matchConsume(";")) { - characterReferenceError("missing semicolon"); // missing semi - } - return Entities.getCharacterByName(nameRef); - } - } - - Token.Tag createTagPending(boolean start) { - tagPending = start ? new Token.StartTag() : new Token.EndTag(); - return tagPending; - } - - void emitTagPending() { - tagPending.finaliseTag(); - emit(tagPending); - } - - void createCommentPending() { - commentPending = new Token.Comment(); - } - - void emitCommentPending() { - emit(commentPending); - } - - void createDoctypePending() { - doctypePending = new Token.Doctype(); - } - - void emitDoctypePending() { - emit(doctypePending); - } - - void createTempBuffer() { - dataBuffer = new StringBuilder(); - } - - boolean isAppropriateEndTagToken() { - if (lastStartTag == null) { - return false; - } - return tagPending.tagName.equals(lastStartTag.tagName); - } - - String appropriateEndTagName() { - return lastStartTag.tagName; - } - - void error(TokeniserState state) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), - "Unexpected character '%s' in input state [%s]", reader - .current(), state)); - } - } - - void eofError(TokeniserState state) { - if (errors.canAddError()) { - errors.add(new ParseError( - reader.pos(), - "Unexpectedly reached end of file (EOF) in input state [%s]", - state)); - } - } - - private void characterReferenceError(String message) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), - "Invalid character reference: %s", message)); - } - } - - private void error(String errorMsg) { - if (errors.canAddError()) { - errors.add(new ParseError(reader.pos(), errorMsg)); - } - } - - boolean currentNodeInHtmlNS() { - // todo: implement namespaces correctly - return true; - // Element currentNode = currentNode(); - // return currentNode != null && currentNode.namespace().equals("HTML"); - } -} diff --git a/server/src/org/jsoup/parser/TokeniserState.java b/server/src/org/jsoup/parser/TokeniserState.java deleted file mode 100644 index 7f7315d769..0000000000 --- a/server/src/org/jsoup/parser/TokeniserState.java +++ /dev/null @@ -1,1870 +0,0 @@ -package org.jsoup.parser; - -/** - * States and transition activations for the Tokeniser. - */ -enum TokeniserState { - Data { - // in data state, gather characters until a character reference or tag - // is found - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '&': - t.advanceTransition(CharacterReferenceInData); - break; - case '<': - t.advanceTransition(TagOpen); - break; - case nullChar: - t.error(this); // NOT replacement character (oddly?) - t.emit(r.consume()); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('&', '<', nullChar); - t.emit(data); - break; - } - } - }, - CharacterReferenceInData { - // from & in data - @Override - void read(Tokeniser t, CharacterReader r) { - Character c = t.consumeCharacterReference(null, false); - if (c == null) { - t.emit('&'); - } else { - t.emit(c); - } - t.transition(Data); - } - }, - Rcdata { - // / handles data in title, textarea etc - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '&': - t.advanceTransition(CharacterReferenceInRcdata); - break; - case '<': - t.advanceTransition(RcdataLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('&', '<', nullChar); - t.emit(data); - break; - } - } - }, - CharacterReferenceInRcdata { - @Override - void read(Tokeniser t, CharacterReader r) { - Character c = t.consumeCharacterReference(null, false); - if (c == null) { - t.emit('&'); - } else { - t.emit(c); - } - t.transition(Rcdata); - } - }, - Rawtext { - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '<': - t.advanceTransition(RawtextLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('<', nullChar); - t.emit(data); - break; - } - } - }, - ScriptData { - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '<': - t.advanceTransition(ScriptDataLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('<', nullChar); - t.emit(data); - break; - } - } - }, - PLAINTEXT { - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeTo(nullChar); - t.emit(data); - break; - } - } - }, - TagOpen { - // from < in data - @Override - void read(Tokeniser t, CharacterReader r) { - switch (r.current()) { - case '!': - t.advanceTransition(MarkupDeclarationOpen); - break; - case '/': - t.advanceTransition(EndTagOpen); - break; - case '?': - t.advanceTransition(BogusComment); - break; - default: - if (r.matchesLetter()) { - t.createTagPending(true); - t.transition(TagName); - } else { - t.error(this); - t.emit('<'); // char that got us here - t.transition(Data); - } - break; - } - } - }, - EndTagOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.emit("')) { - t.error(this); - t.advanceTransition(Data); - } else { - t.error(this); - t.advanceTransition(BogusComment); - } - } - }, - TagName { - // from < or ', - nullChar).toLowerCase(); - t.tagPending.appendTagName(tagName); - - switch (r.consume()) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: // replacement - t.tagPending.appendTagName(replacementStr); - break; - case eof: // should emit pending tag? - t.eofError(this); - t.transition(Data); - // no default, as covered with above consumeToAny - } - } - }, - RcdataLessthanSign { - // from < in rcdata - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matches('/')) { - t.createTempBuffer(); - t.advanceTransition(RCDATAEndTagOpen); - } else if (r.matchesLetter() - && !r.containsIgnoreCase("), so rather than - // consuming to EOF; break out here - t.tagPending = new Token.EndTag(t.appropriateEndTagName()); - t.emitTagPending(); - r.unconsume(); // undo "<" - t.transition(Data); - } else { - t.emit("<"); - t.transition(Rcdata); - } - } - }, - RCDATAEndTagOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.tagPending.appendTagName(Character.toLowerCase(r.current())); - t.dataBuffer.append(Character.toLowerCase(r.current())); - t.advanceTransition(RCDATAEndTagName); - } else { - t.emit("': - if (t.isAppropriateEndTagToken()) { - t.emitTagPending(); - t.transition(Data); - } else { - anythingElse(t, r); - } - break; - default: - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - } - } else { - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - } - } else { - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("': - t.emit(c); - t.transition(ScriptData); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataEscaped); - break; - default: - t.emit(c); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedLessthanSign { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTempBuffer(); - t.dataBuffer.append(Character.toLowerCase(r.current())); - t.emit("<" + r.current()); - t.advanceTransition(ScriptDataDoubleEscapeStart); - } else if (r.matches('/')) { - t.createTempBuffer(); - t.advanceTransition(ScriptDataEscapedEndTagOpen); - } else { - t.emit('<'); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataEscapedEndTagOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createTagPending(false); - t.tagPending.appendTagName(Character.toLowerCase(r.current())); - t.dataBuffer.append(r.current()); - t.advanceTransition(ScriptDataEscapedEndTagName); - } else { - t.emit("': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - break; - } - } else { - anythingElse(t, r); - } - } - - private void anythingElse(Tokeniser t, CharacterReader r) { - t.emit("': - if (t.dataBuffer.toString().equals("script")) { - t.transition(ScriptDataDoubleEscaped); - } else { - t.transition(ScriptDataEscaped); - } - t.emit(c); - break; - default: - r.unconsume(); - t.transition(ScriptDataEscaped); - } - } - }, - ScriptDataDoubleEscaped { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.current(); - switch (c) { - case '-': - t.emit(c); - t.advanceTransition(ScriptDataDoubleEscapedDash); - break; - case '<': - t.emit(c); - t.advanceTransition(ScriptDataDoubleEscapedLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - String data = r.consumeToAny('-', '<', nullChar); - t.emit(data); - } - } - }, - ScriptDataDoubleEscapedDash { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - t.transition(ScriptDataDoubleEscapedDashDash); - break; - case '<': - t.emit(c); - t.transition(ScriptDataDoubleEscapedLessthanSign); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataDoubleEscaped); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.emit(c); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapedDashDash { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.emit(c); - break; - case '<': - t.emit(c); - t.transition(ScriptDataDoubleEscapedLessthanSign); - break; - case '>': - t.emit(c); - t.transition(ScriptData); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataDoubleEscaped); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.emit(c); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapedLessthanSign { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matches('/')) { - t.emit('/'); - t.createTempBuffer(); - t.advanceTransition(ScriptDataDoubleEscapeEnd); - } else { - t.transition(ScriptDataDoubleEscaped); - } - } - }, - ScriptDataDoubleEscapeEnd { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.dataBuffer.append(name.toLowerCase()); - t.emit(name); - return; - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - case '/': - case '>': - if (t.dataBuffer.toString().equals("script")) { - t.transition(ScriptDataEscaped); - } else { - t.transition(ScriptDataDoubleEscaped); - } - t.emit(c); - break; - default: - r.unconsume(); - t.transition(ScriptDataDoubleEscaped); - } - } - }, - BeforeAttributeName { - // from tagname ': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - case '=': - t.error(this); - t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); - t.transition(AttributeName); - break; - default: // A-Z, anything else - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - } - } - }, - AttributeName { - // from before attribute name - @Override - void read(Tokeniser t, CharacterReader r) { - String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', - nullChar, '"', '\'', '<'); - t.tagPending.appendAttributeName(name.toLowerCase()); - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(AfterAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '=': - t.transition(BeforeAttributeValue); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeName(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - t.error(this); - t.tagPending.appendAttributeName(c); - // no default, as covered in consumeToAny - } - } - }, - AfterAttributeName { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - // ignore - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '=': - t.transition(BeforeAttributeValue); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeName(replacementChar); - t.transition(AttributeName); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - t.error(this); - t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); - t.transition(AttributeName); - break; - default: // A-Z, anything else - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - } - } - }, - BeforeAttributeValue { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - // ignore - break; - case '"': - t.transition(AttributeValue_doubleQuoted); - break; - case '&': - r.unconsume(); - t.transition(AttributeValue_unquoted); - break; - case '\'': - t.transition(AttributeValue_singleQuoted); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - t.transition(AttributeValue_unquoted); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '>': - t.error(this); - t.emitTagPending(); - t.transition(Data); - break; - case '<': - case '=': - case '`': - t.error(this); - t.tagPending.appendAttributeValue(c); - t.transition(AttributeValue_unquoted); - break; - default: - r.unconsume(); - t.transition(AttributeValue_unquoted); - } - } - }, - AttributeValue_doubleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('"', '&', nullChar); - if (value.length() > 0) { - t.tagPending.appendAttributeValue(value); - } - - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterAttributeValue_quoted); - break; - case '&': - Character ref = t.consumeCharacterReference('"', true); - if (ref != null) { - t.tagPending.appendAttributeValue(ref); - } else { - t.tagPending.appendAttributeValue('&'); - } - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - // no default, handled in consume to any above - } - } - }, - AttributeValue_singleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('\'', '&', nullChar); - if (value.length() > 0) { - t.tagPending.appendAttributeValue(value); - } - - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterAttributeValue_quoted); - break; - case '&': - Character ref = t.consumeCharacterReference('\'', true); - if (ref != null) { - t.tagPending.appendAttributeValue(ref); - } else { - t.tagPending.appendAttributeValue('&'); - } - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - // no default, handled in consume to any above - } - } - }, - AttributeValue_unquoted { - @Override - void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', - nullChar, '"', '\'', '<', '=', '`'); - if (value.length() > 0) { - t.tagPending.appendAttributeValue(value); - } - - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '&': - Character ref = t.consumeCharacterReference('>', true); - if (ref != null) { - t.tagPending.appendAttributeValue(ref); - } else { - t.tagPending.appendAttributeValue('&'); - } - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - case '=': - case '`': - t.error(this); - t.tagPending.appendAttributeValue(c); - break; - // no default, handled in consume to any above - } - - } - }, - // CharacterReferenceInAttributeValue state handled inline - AfterAttributeValue_quoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.error(this); - r.unconsume(); - t.transition(BeforeAttributeName); - } - - } - }, - SelfClosingStartTag { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.tagPending.selfClosing = true; - t.emitTagPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BeforeAttributeName); - } - } - }, - BogusComment { - @Override - void read(Tokeniser t, CharacterReader r) { - // todo: handle bogus comment starting from eof. when does that - // trigger? - // rewind to capture character that lead us here - r.unconsume(); - Token.Comment comment = new Token.Comment(); - comment.data.append(r.consumeTo('>')); - // todo: replace nullChar with replaceChar - t.emit(comment); - t.advanceTransition(Data); - } - }, - MarkupDeclarationOpen { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchConsume("--")) { - t.createCommentPending(); - t.transition(CommentStart); - } else if (r.matchConsumeIgnoreCase("DOCTYPE")) { - t.transition(Doctype); - } else if (r.matchConsume("[CDATA[")) { - // todo: should actually check current namepspace, and only - // non-html allows cdata. until namespace - // is implemented properly, keep handling as cdata - // } else if (!t.currentNodeInHtmlNS() && - // r.matchConsume("[CDATA[")) { - t.transition(CdataSection); - } else { - t.error(this); - t.advanceTransition(BogusComment); // advance so this character - // gets in bogus comment - // data's rewind - } - } - }, - CommentStart { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentStartDash); - break; - case nullChar: - t.error(this); - t.commentPending.data.append(replacementChar); - t.transition(Comment); - break; - case '>': - t.error(this); - t.emitCommentPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(c); - t.transition(Comment); - } - } - }, - CommentStartDash { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentStartDash); - break; - case nullChar: - t.error(this); - t.commentPending.data.append(replacementChar); - t.transition(Comment); - break; - case '>': - t.error(this); - t.emitCommentPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(c); - t.transition(Comment); - } - } - }, - Comment { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.current(); - switch (c) { - case '-': - t.advanceTransition(CommentEndDash); - break; - case nullChar: - t.error(this); - r.advance(); - t.commentPending.data.append(replacementChar); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(r.consumeToAny('-', nullChar)); - } - } - }, - CommentEndDash { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.transition(CommentEnd); - break; - case nullChar: - t.error(this); - t.commentPending.data.append('-').append(replacementChar); - t.transition(Comment); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append('-').append(c); - t.transition(Comment); - } - } - }, - CommentEnd { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.emitCommentPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.commentPending.data.append("--").append(replacementChar); - t.transition(Comment); - break; - case '!': - t.error(this); - t.transition(CommentEndBang); - break; - case '-': - t.error(this); - t.commentPending.data.append('-'); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.error(this); - t.commentPending.data.append("--").append(c); - t.transition(Comment); - } - } - }, - CommentEndBang { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '-': - t.commentPending.data.append("--!"); - t.transition(CommentEndDash); - break; - case '>': - t.emitCommentPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.commentPending.data.append("--!").append(replacementChar); - t.transition(Comment); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append("--!").append(c); - t.transition(Comment); - } - } - }, - Doctype { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypeName); - break; - case eof: - t.eofError(this); - t.createDoctypePending(); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BeforeDoctypeName); - } - } - }, - BeforeDoctypeName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - t.createDoctypePending(); - t.transition(DoctypeName); - return; - } - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; // ignore whitespace - case nullChar: - t.error(this); - t.doctypePending.name.append(replacementChar); - t.transition(DoctypeName); - break; - case eof: - t.eofError(this); - t.createDoctypePending(); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.createDoctypePending(); - t.doctypePending.name.append(c); - t.transition(DoctypeName); - } - } - }, - DoctypeName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.matchesLetter()) { - String name = r.consumeLetterSequence(); - t.doctypePending.name.append(name.toLowerCase()); - return; - } - char c = r.consume(); - switch (c) { - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(AfterDoctypeName); - break; - case nullChar: - t.error(this); - t.doctypePending.name.append(replacementChar); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.name.append(c); - } - } - }, - AfterDoctypeName { - @Override - void read(Tokeniser t, CharacterReader r) { - if (r.isEmpty()) { - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - return; - } - if (r.matchesAny('\t', '\n', '\f', ' ')) { - r.advance(); // ignore whitespace - } else if (r.matches('>')) { - t.emitDoctypePending(); - t.advanceTransition(Data); - } else if (r.matchConsumeIgnoreCase("PUBLIC")) { - t.transition(AfterDoctypePublicKeyword); - } else if (r.matchConsumeIgnoreCase("SYSTEM")) { - t.transition(AfterDoctypeSystemKeyword); - } else { - t.error(this); - t.doctypePending.forceQuirks = true; - t.advanceTransition(BogusDoctype); - } - - } - }, - AfterDoctypePublicKeyword { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypePublicIdentifier); - break; - case '"': - t.error(this); - // set public id to empty string - t.transition(DoctypePublicIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // set public id to empty string - t.transition(DoctypePublicIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - BeforeDoctypePublicIdentifier { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '"': - // set public id to empty string - t.transition(DoctypePublicIdentifier_doubleQuoted); - break; - case '\'': - // set public id to empty string - t.transition(DoctypePublicIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - DoctypePublicIdentifier_doubleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterDoctypePublicIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.publicIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.publicIdentifier.append(c); - } - } - }, - DoctypePublicIdentifier_singleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterDoctypePublicIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.publicIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.publicIdentifier.append(c); - } - } - }, - AfterDoctypePublicIdentifier { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BetweenDoctypePublicAndSystemIdentifiers); - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - BetweenDoctypePublicAndSystemIdentifiers { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - AfterDoctypeSystemKeyword { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypeSystemIdentifier); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - } - } - }, - BeforeDoctypeSystemIdentifier { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '"': - // set system id to empty string - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - // set public id to empty string - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); - } - } - }, - DoctypeSystemIdentifier_doubleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '"': - t.transition(AfterDoctypeSystemIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.systemIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.systemIdentifier.append(c); - } - } - }, - DoctypeSystemIdentifier_singleQuoted { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\'': - t.transition(AfterDoctypeSystemIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.systemIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.systemIdentifier.append(c); - } - } - }, - AfterDoctypeSystemIdentifier { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BogusDoctype); - // NOT force quirks - } - } - }, - BogusDoctype { - @Override - void read(Tokeniser t, CharacterReader r) { - char c = r.consume(); - switch (c) { - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.emitDoctypePending(); - t.transition(Data); - break; - default: - // ignore char - break; - } - } - }, - CdataSection { - @Override - void read(Tokeniser t, CharacterReader r) { - String data = r.consumeTo("]]>"); - t.emit(data); - r.matchConsume("]]>"); - t.transition(Data); - } - }; - - abstract void read(Tokeniser t, CharacterReader r); - - private static final char nullChar = '\u0000'; - private static final char replacementChar = Tokeniser.replacementChar; - private static final String replacementStr = String - .valueOf(Tokeniser.replacementChar); - private static final char eof = CharacterReader.EOF; -} diff --git a/server/src/org/jsoup/parser/TreeBuilder.java b/server/src/org/jsoup/parser/TreeBuilder.java deleted file mode 100644 index 5e2dbebc66..0000000000 --- a/server/src/org/jsoup/parser/TreeBuilder.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.jsoup.parser; - -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -/** - * @author Jonathan Hedley - */ -abstract class TreeBuilder { - CharacterReader reader; - Tokeniser tokeniser; - protected Document doc; // current doc we are building into - protected DescendableLinkedList stack; // the stack of open - // elements - protected String baseUri; // current base uri, for creating new elements - protected Token currentToken; // currentToken is used only for error - // tracking. - protected ParseErrorList errors; // null when not tracking errors - - protected void initialiseParse(String input, String baseUri, - ParseErrorList errors) { - Validate.notNull(input, "String input must not be null"); - Validate.notNull(baseUri, "BaseURI must not be null"); - - doc = new Document(baseUri); - reader = new CharacterReader(input); - this.errors = errors; - tokeniser = new Tokeniser(reader, errors); - stack = new DescendableLinkedList(); - this.baseUri = baseUri; - } - - Document parse(String input, String baseUri) { - return parse(input, baseUri, ParseErrorList.noTracking()); - } - - Document parse(String input, String baseUri, ParseErrorList errors) { - initialiseParse(input, baseUri, errors); - runParser(); - return doc; - } - - protected void runParser() { - while (true) { - Token token = tokeniser.read(); - process(token); - - if (token.type == Token.TokenType.EOF) { - break; - } - } - } - - protected abstract boolean process(Token token); - - protected Element currentElement() { - return stack.getLast(); - } -} diff --git a/server/src/org/jsoup/parser/XmlTreeBuilder.java b/server/src/org/jsoup/parser/XmlTreeBuilder.java deleted file mode 100644 index c2a3635b3d..0000000000 --- a/server/src/org/jsoup/parser/XmlTreeBuilder.java +++ /dev/null @@ -1,121 +0,0 @@ -package org.jsoup.parser; - -import java.util.Iterator; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Comment; -import org.jsoup.nodes.DocumentType; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; - -/** - * @author Jonathan Hedley - */ -public class XmlTreeBuilder extends TreeBuilder { - @Override - protected void initialiseParse(String input, String baseUri, - ParseErrorList errors) { - super.initialiseParse(input, baseUri, errors); - stack.add(doc); // place the document onto the stack. differs from - // HtmlTreeBuilder (not on stack) - } - - @Override - protected boolean process(Token token) { - // start tag, end tag, doctype, comment, character, eof - switch (token.type) { - case StartTag: - insert(token.asStartTag()); - break; - case EndTag: - popStackToClose(token.asEndTag()); - break; - case Comment: - insert(token.asComment()); - break; - case Character: - insert(token.asCharacter()); - break; - case Doctype: - insert(token.asDoctype()); - break; - case EOF: // could put some normalisation here if desired - break; - default: - Validate.fail("Unexpected token type: " + token.type); - } - return true; - } - - private void insertNode(Node node) { - currentElement().appendChild(node); - } - - Element insert(Token.StartTag startTag) { - Tag tag = Tag.valueOf(startTag.name()); - // todo: wonder if for xml parsing, should treat all tags as unknown? - // because it's not html. - Element el = new Element(tag, baseUri, startTag.attributes); - insertNode(el); - if (startTag.isSelfClosing()) { - tokeniser.acknowledgeSelfClosingFlag(); - if (!tag.isKnownTag()) { - tag.setSelfClosing(); - } - } else { - stack.add(el); - } - return el; - } - - void insert(Token.Comment commentToken) { - Comment comment = new Comment(commentToken.getData(), baseUri); - insertNode(comment); - } - - void insert(Token.Character characterToken) { - Node node = new TextNode(characterToken.getData(), baseUri); - insertNode(node); - } - - void insert(Token.Doctype d) { - DocumentType doctypeNode = new DocumentType(d.getName(), - d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri); - insertNode(doctypeNode); - } - - /** - * If the stack contains an element with this tag's name, pop up the stack - * to remove the first occurrence. If not found, skips. - * - * @param endTag - */ - private void popStackToClose(Token.EndTag endTag) { - String elName = endTag.name(); - Element firstFound = null; - - Iterator it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next.nodeName().equals(elName)) { - firstFound = next; - break; - } - } - if (firstFound == null) { - return; // not found, skip - } - - it = stack.descendingIterator(); - while (it.hasNext()) { - Element next = it.next(); - if (next == firstFound) { - it.remove(); - break; - } else { - it.remove(); - } - } - } -} diff --git a/server/src/org/jsoup/parser/package-info.java b/server/src/org/jsoup/parser/package-info.java deleted file mode 100644 index c6c3d9a029..0000000000 --- a/server/src/org/jsoup/parser/package-info.java +++ /dev/null @@ -1,5 +0,0 @@ -/** - Contains the HTML parser, tag specifications, and HTML tokeniser. - */ -package org.jsoup.parser; - diff --git a/server/src/org/jsoup/safety/Cleaner.java b/server/src/org/jsoup/safety/Cleaner.java deleted file mode 100644 index 046efbbaa8..0000000000 --- a/server/src/org/jsoup/safety/Cleaner.java +++ /dev/null @@ -1,161 +0,0 @@ -package org.jsoup.safety; - -import java.util.List; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; -import org.jsoup.parser.Tag; - -/** - * The whitelist based HTML cleaner. Use to ensure that end-user provided HTML - * contains only the elements and attributes that you are expecting; no junk, - * and no cross-site scripting attacks! - *

- * The HTML cleaner parses the input as HTML and then runs it through a - * white-list, so the output HTML can only contain HTML that is allowed by the - * whitelist. - *

- * It is assumed that the input HTML is a body fragment; the clean methods only - * pull from the source's body, and the canned white-lists only allow body - * contained tags. - *

- * Rather than interacting directly with a Cleaner object, generally see the - * {@code clean} methods in {@link org.jsoup.Jsoup}. - */ -public class Cleaner { - private Whitelist whitelist; - - /** - * Create a new cleaner, that sanitizes documents using the supplied - * whitelist. - * - * @param whitelist - * white-list to clean with - */ - public Cleaner(Whitelist whitelist) { - Validate.notNull(whitelist); - this.whitelist = whitelist; - } - - /** - * Creates a new, clean document, from the original dirty document, - * containing only elements allowed by the whitelist. The original document - * is not modified. Only elements from the dirt document's body - * are used. - * - * @param dirtyDocument - * Untrusted base document to clean. - * @return cleaned document. - */ - public Document clean(Document dirtyDocument) { - Validate.notNull(dirtyDocument); - - Document clean = Document.createShell(dirtyDocument.baseUri()); - copySafeNodes(dirtyDocument.body(), clean.body()); - - return clean; - } - - /** - * Determines if the input document is valid, against the whitelist. It is - * considered valid if all the tags and attributes in the input HTML are - * allowed by the whitelist. - *

- * This method can be used as a validator for user input forms. An invalid - * document will still be cleaned successfully using the - * {@link #clean(Document)} document. If using as a validator, it is - * recommended to still clean the document to ensure enforced attributes are - * set correctly, and that the output is tidied. - * - * @param dirtyDocument - * document to test - * @return true if no tags or attributes need to be removed; false if they - * do - */ - public boolean isValid(Document dirtyDocument) { - Validate.notNull(dirtyDocument); - - Document clean = Document.createShell(dirtyDocument.baseUri()); - int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); - return numDiscarded == 0; - } - - /** - * Iterates the input and copies trusted nodes (tags, attributes, text) into - * the destination. - * - * @param source - * source of HTML - * @param dest - * destination element to copy into - * @return number of discarded elements (that were considered unsafe) - */ - private int copySafeNodes(Element source, Element dest) { - List sourceChildren = source.childNodes(); - int numDiscarded = 0; - - for (Node sourceChild : sourceChildren) { - if (sourceChild instanceof Element) { - Element sourceEl = (Element) sourceChild; - - if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone - // and copy safe - // attrs - ElementMeta meta = createSafeElement(sourceEl); - Element destChild = meta.el; - dest.appendChild(destChild); - - numDiscarded += meta.numAttribsDiscarded; - numDiscarded += copySafeNodes(sourceEl, destChild); // recurs - } else { // not a safe tag, but it may have children (els or - // text) that are, so recurse - numDiscarded++; - numDiscarded += copySafeNodes(sourceEl, dest); - } - } else if (sourceChild instanceof TextNode) { - TextNode sourceText = (TextNode) sourceChild; - TextNode destText = new TextNode(sourceText.getWholeText(), - sourceChild.baseUri()); - dest.appendChild(destText); - } // else, we don't care about comments, xml proc instructions, etc - } - return numDiscarded; - } - - private ElementMeta createSafeElement(Element sourceEl) { - String sourceTag = sourceEl.tagName(); - Attributes destAttrs = new Attributes(); - Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), - destAttrs); - int numDiscarded = 0; - - Attributes sourceAttrs = sourceEl.attributes(); - for (Attribute sourceAttr : sourceAttrs) { - if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) { - destAttrs.put(sourceAttr); - } else { - numDiscarded++; - } - } - Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag); - destAttrs.addAll(enforcedAttrs); - - return new ElementMeta(dest, numDiscarded); - } - - private static class ElementMeta { - Element el; - int numAttribsDiscarded; - - ElementMeta(Element el, int numAttribsDiscarded) { - this.el = el; - this.numAttribsDiscarded = numAttribsDiscarded; - } - } - -} diff --git a/server/src/org/jsoup/safety/Whitelist.java b/server/src/org/jsoup/safety/Whitelist.java deleted file mode 100644 index b86cb5c6cf..0000000000 --- a/server/src/org/jsoup/safety/Whitelist.java +++ /dev/null @@ -1,509 +0,0 @@ -package org.jsoup.safety; - -/* - Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/rgrove/sanitize/, which inspired - this whitelist configuration, and the initial defaults. - */ - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; -import org.jsoup.nodes.Element; - -/** - * Whitelists define what HTML (elements and attributes) to allow through the - * cleaner. Everything else is removed. - *

- * Start with one of the defaults: - *

    - *
  • {@link #none} - *
  • {@link #simpleText} - *
  • {@link #basic} - *
  • {@link #basicWithImages} - *
  • {@link #relaxed} - *
- *

- * If you need to allow more through (please be careful!), tweak a base - * whitelist with: - *

    - *
  • {@link #addTags} - *
  • {@link #addAttributes} - *
  • {@link #addEnforcedAttribute} - *
  • {@link #addProtocols} - *
- *

- * The cleaner and these whitelists assume that you want to clean a - * body fragment of HTML (to add user supplied HTML into a - * templated page), and not to clean a full HTML document. If the latter is the - * case, either wrap the document HTML around the cleaned body HTML, or create a - * whitelist that allows html and head elements as - * appropriate. - *

- * If you are going to extend a whitelist, please be very careful. Make sure you - * understand what attributes may lead to XSS attack vectors. URL attributes are - * particularly vulnerable and require careful validation. See - * http://ha.ckers.org/xss.html for some XSS attack examples. - * - * @author Jonathan Hedley - */ -public class Whitelist { - private Set tagNames; // tags allowed, lower case. e.g. [p, br, - // span] - private Map> attributes; // tag -> attribute[]. - // allowed attributes - // [href] for a tag. - private Map> enforcedAttributes; // always - // set - // these - // attribute - // values - private Map>> protocols; // allowed - // URL - // protocols - // for - // attributes - private boolean preserveRelativeLinks; // option to preserve relative links - - /** - * This whitelist allows only text nodes: all HTML will be stripped. - * - * @return whitelist - */ - public static Whitelist none() { - return new Whitelist(); - } - - /** - * This whitelist allows only simple text formatting: - * b, em, i, strong, u. All other HTML (tags and attributes) - * will be removed. - * - * @return whitelist - */ - public static Whitelist simpleText() { - return new Whitelist().addTags("b", "em", "i", "strong", "u"); - } - - /** - * This whitelist allows a fuller range of text nodes: - * a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, - ol, p, pre, q, small, strike, strong, sub, sup, u, ul, and - * appropriate attributes. - *

- * Links (a elements) can point to - * http, https, ftp, mailto, and have an enforced - * rel=nofollow attribute. - *

- * Does not allow images. - * - * @return whitelist - */ - public static Whitelist basic() { - return new Whitelist() - .addTags("a", "b", "blockquote", "br", "cite", "code", "dd", - "dl", "dt", "em", "i", "li", "ol", "p", "pre", "q", - "small", "strike", "strong", "sub", "sup", "u", "ul") - - .addAttributes("a", "href").addAttributes("blockquote", "cite") - .addAttributes("q", "cite") - - .addProtocols("a", "href", "ftp", "http", "https", "mailto") - .addProtocols("blockquote", "cite", "http", "https") - .addProtocols("cite", "cite", "http", "https") - - .addEnforcedAttribute("a", "rel", "nofollow"); - - } - - /** - * This whitelist allows the same text tags as {@link #basic}, and also - * allows img tags, with appropriate attributes, with - * src pointing to http or https. - * - * @return whitelist - */ - public static Whitelist basicWithImages() { - return basic() - .addTags("img") - .addAttributes("img", "align", "alt", "height", "src", "title", - "width").addProtocols("img", "src", "http", "https"); - } - - /** - * This whitelist allows a full range of text and structural body HTML: - * a, b, blockquote, br, caption, cite, - code, col, colgroup, dd, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, strike, strong, sub, - sup, table, tbody, td, tfoot, th, thead, tr, u, ul - *

- * Links do not have an enforced rel=nofollow attribute, but - * you can add that if desired. - * - * @return whitelist - */ - public static Whitelist relaxed() { - return new Whitelist() - .addTags("a", "b", "blockquote", "br", "caption", "cite", - "code", "col", "colgroup", "dd", "div", "dl", "dt", - "em", "h1", "h2", "h3", "h4", "h5", "h6", "i", "img", - "li", "ol", "p", "pre", "q", "small", "strike", - "strong", "sub", "sup", "table", "tbody", "td", - "tfoot", "th", "thead", "tr", "u", "ul") - - .addAttributes("a", "href", "title") - .addAttributes("blockquote", "cite") - .addAttributes("col", "span", "width") - .addAttributes("colgroup", "span", "width") - .addAttributes("img", "align", "alt", "height", "src", "title", - "width") - .addAttributes("ol", "start", "type") - .addAttributes("q", "cite") - .addAttributes("table", "summary", "width") - .addAttributes("td", "abbr", "axis", "colspan", "rowspan", - "width") - .addAttributes("th", "abbr", "axis", "colspan", "rowspan", - "scope", "width").addAttributes("ul", "type") - - .addProtocols("a", "href", "ftp", "http", "https", "mailto") - .addProtocols("blockquote", "cite", "http", "https") - .addProtocols("img", "src", "http", "https") - .addProtocols("q", "cite", "http", "https"); - } - - /** - * Create a new, empty whitelist. Generally it will be better to start with - * a default prepared whitelist instead. - * - * @see #basic() - * @see #basicWithImages() - * @see #simpleText() - * @see #relaxed() - */ - public Whitelist() { - tagNames = new HashSet(); - attributes = new HashMap>(); - enforcedAttributes = new HashMap>(); - protocols = new HashMap>>(); - preserveRelativeLinks = false; - } - - /** - * Add a list of allowed elements to a whitelist. (If a tag is not allowed, - * it will be removed from the HTML.) - * - * @param tags - * tag names to allow - * @return this (for chaining) - */ - public Whitelist addTags(String... tags) { - Validate.notNull(tags); - - for (String tagName : tags) { - Validate.notEmpty(tagName); - tagNames.add(TagName.valueOf(tagName)); - } - return this; - } - - /** - * Add a list of allowed attributes to a tag. (If an attribute is not - * allowed on an element, it will be removed.) - *

- * E.g.: addAttributes("a", "href", "class") allows - * href and class attributes on a - * tags. - *

- * To make an attribute valid for all tags, use the pseudo tag - * :all, e.g. addAttributes(":all", "class"). - * - * @param tag - * The tag the attributes are for. The tag will be added to the - * allowed tag list if necessary. - * @param keys - * List of valid attributes for the tag - * @return this (for chaining) - */ - public Whitelist addAttributes(String tag, String... keys) { - Validate.notEmpty(tag); - Validate.notNull(keys); - Validate.isTrue(keys.length > 0, "No attributes supplied."); - - TagName tagName = TagName.valueOf(tag); - if (!tagNames.contains(tagName)) { - tagNames.add(tagName); - } - Set attributeSet = new HashSet(); - for (String key : keys) { - Validate.notEmpty(key); - attributeSet.add(AttributeKey.valueOf(key)); - } - if (attributes.containsKey(tagName)) { - Set currentSet = attributes.get(tagName); - currentSet.addAll(attributeSet); - } else { - attributes.put(tagName, attributeSet); - } - return this; - } - - /** - * Add an enforced attribute to a tag. An enforced attribute will always be - * added to the element. If the element already has the attribute set, it - * will be overridden. - *

- * E.g.: addEnforcedAttribute("a", "rel", "nofollow") will make - * all a tags output as - * <a href="..." rel="nofollow"> - * - * @param tag - * The tag the enforced attribute is for. The tag will be added - * to the allowed tag list if necessary. - * @param key - * The attribute key - * @param value - * The enforced attribute value - * @return this (for chaining) - */ - public Whitelist addEnforcedAttribute(String tag, String key, String value) { - Validate.notEmpty(tag); - Validate.notEmpty(key); - Validate.notEmpty(value); - - TagName tagName = TagName.valueOf(tag); - if (!tagNames.contains(tagName)) { - tagNames.add(tagName); - } - AttributeKey attrKey = AttributeKey.valueOf(key); - AttributeValue attrVal = AttributeValue.valueOf(value); - - if (enforcedAttributes.containsKey(tagName)) { - enforcedAttributes.get(tagName).put(attrKey, attrVal); - } else { - Map attrMap = new HashMap(); - attrMap.put(attrKey, attrVal); - enforcedAttributes.put(tagName, attrMap); - } - return this; - } - - /** - * Configure this Whitelist to preserve relative links in an element's URL - * attribute, or convert them to absolute links. By default, this is - * false: URLs will be made absolute (e.g. start with an allowed - * protocol, like e.g. {@code http://}. - *

- * Note that when handling relative links, the input document must have an - * appropriate {@code base URI} set when parsing, so that the link's - * protocol can be confirmed. Regardless of the setting of the - * {@code preserve relative - * links} option, the link must be resolvable against the base URI to an - * allowed protocol; otherwise the attribute will be removed. - * - * @param preserve - * {@code true} to allow relative links, {@code false} (default) - * to deny - * @return this Whitelist, for chaining. - * @see #addProtocols - */ - public Whitelist preserveRelativeLinks(boolean preserve) { - preserveRelativeLinks = preserve; - return this; - } - - /** - * Add allowed URL protocols for an element's URL attribute. This restricts - * the possible values of the attribute to URLs with the defined protocol. - *

- * E.g.: addProtocols("a", "href", "ftp", "http", "https") - * - * @param tag - * Tag the URL protocol is for - * @param key - * Attribute key - * @param protocols - * List of valid protocols - * @return this, for chaining - */ - public Whitelist addProtocols(String tag, String key, String... protocols) { - Validate.notEmpty(tag); - Validate.notEmpty(key); - Validate.notNull(protocols); - - TagName tagName = TagName.valueOf(tag); - AttributeKey attrKey = AttributeKey.valueOf(key); - Map> attrMap; - Set protSet; - - if (this.protocols.containsKey(tagName)) { - attrMap = this.protocols.get(tagName); - } else { - attrMap = new HashMap>(); - this.protocols.put(tagName, attrMap); - } - if (attrMap.containsKey(attrKey)) { - protSet = attrMap.get(attrKey); - } else { - protSet = new HashSet(); - attrMap.put(attrKey, protSet); - } - for (String protocol : protocols) { - Validate.notEmpty(protocol); - Protocol prot = Protocol.valueOf(protocol); - protSet.add(prot); - } - return this; - } - - boolean isSafeTag(String tag) { - return tagNames.contains(TagName.valueOf(tag)); - } - - boolean isSafeAttribute(String tagName, Element el, Attribute attr) { - TagName tag = TagName.valueOf(tagName); - AttributeKey key = AttributeKey.valueOf(attr.getKey()); - - if (attributes.containsKey(tag)) { - if (attributes.get(tag).contains(key)) { - if (protocols.containsKey(tag)) { - Map> attrProts = protocols - .get(tag); - // ok if not defined protocol; otherwise test - return !attrProts.containsKey(key) - || testValidProtocol(el, attr, attrProts.get(key)); - } else { // attribute found, no protocols defined, so OK - return true; - } - } - } - // no attributes defined for tag, try :all tag - return !tagName.equals(":all") && isSafeAttribute(":all", el, attr); - } - - private boolean testValidProtocol(Element el, Attribute attr, - Set protocols) { - // try to resolve relative urls to abs, and optionally update the - // attribute so output html has abs. - // rels without a baseuri get removed - String value = el.absUrl(attr.getKey()); - if (value.length() == 0) { - value = attr.getValue(); // if it could not be made abs, run as-is - // to allow custom unknown protocols - } - if (!preserveRelativeLinks) { - attr.setValue(value); - } - - for (Protocol protocol : protocols) { - String prot = protocol.toString() + ":"; - if (value.toLowerCase().startsWith(prot)) { - return true; - } - } - return false; - } - - Attributes getEnforcedAttributes(String tagName) { - Attributes attrs = new Attributes(); - TagName tag = TagName.valueOf(tagName); - if (enforcedAttributes.containsKey(tag)) { - Map keyVals = enforcedAttributes - .get(tag); - for (Map.Entry entry : keyVals - .entrySet()) { - attrs.put(entry.getKey().toString(), entry.getValue() - .toString()); - } - } - return attrs; - } - - // named types for config. All just hold strings, but here for my sanity. - - static class TagName extends TypedValue { - TagName(String value) { - super(value); - } - - static TagName valueOf(String value) { - return new TagName(value); - } - } - - static class AttributeKey extends TypedValue { - AttributeKey(String value) { - super(value); - } - - static AttributeKey valueOf(String value) { - return new AttributeKey(value); - } - } - - static class AttributeValue extends TypedValue { - AttributeValue(String value) { - super(value); - } - - static AttributeValue valueOf(String value) { - return new AttributeValue(value); - } - } - - static class Protocol extends TypedValue { - Protocol(String value) { - super(value); - } - - static Protocol valueOf(String value) { - return new Protocol(value); - } - } - - abstract static class TypedValue { - private String value; - - TypedValue(String value) { - Validate.notNull(value); - this.value = value; - } - - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + ((value == null) ? 0 : value.hashCode()); - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - TypedValue other = (TypedValue) obj; - if (value == null) { - if (other.value != null) { - return false; - } - } else if (!value.equals(other.value)) { - return false; - } - return true; - } - - @Override - public String toString() { - return value; - } - } -} diff --git a/server/src/org/jsoup/safety/package-info.java b/server/src/org/jsoup/safety/package-info.java deleted file mode 100644 index acbff6665f..0000000000 --- a/server/src/org/jsoup/safety/package-info.java +++ /dev/null @@ -1,5 +0,0 @@ -/** - Contains the jsoup HTML cleaner, and whitelist definitions. - */ -package org.jsoup.safety; - diff --git a/server/src/org/jsoup/select/Collector.java b/server/src/org/jsoup/select/Collector.java deleted file mode 100644 index 20554e8653..0000000000 --- a/server/src/org/jsoup/select/Collector.java +++ /dev/null @@ -1,58 +0,0 @@ -package org.jsoup.select; - -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; - -/** - * Collects a list of elements that match the supplied criteria. - * - * @author Jonathan Hedley - */ -public class Collector { - - private Collector() { - } - - /** - * Build a list of elements, by visiting root and every descendant of root, - * and testing it against the evaluator. - * - * @param eval - * Evaluator to test elements against - * @param root - * root of tree to descend - * @return list of matches; empty if none - */ - public static Elements collect(Evaluator eval, Element root) { - Elements elements = new Elements(); - new NodeTraversor(new Accumulator(root, elements, eval)).traverse(root); - return elements; - } - - private static class Accumulator implements NodeVisitor { - private final Element root; - private final Elements elements; - private final Evaluator eval; - - Accumulator(Element root, Elements elements, Evaluator eval) { - this.root = root; - this.elements = elements; - this.eval = eval; - } - - @Override - public void head(Node node, int depth) { - if (node instanceof Element) { - Element el = (Element) node; - if (eval.matches(root, el)) { - elements.add(el); - } - } - } - - @Override - public void tail(Node node, int depth) { - // void - } - } -} diff --git a/server/src/org/jsoup/select/CombiningEvaluator.java b/server/src/org/jsoup/select/CombiningEvaluator.java deleted file mode 100644 index c3f9a8af2e..0000000000 --- a/server/src/org/jsoup/select/CombiningEvaluator.java +++ /dev/null @@ -1,102 +0,0 @@ -package org.jsoup.select; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; - -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.Element; - -/** - * Base combining (and, or) evaluator. - */ -abstract class CombiningEvaluator extends Evaluator { - final List evaluators; - - CombiningEvaluator() { - super(); - evaluators = new ArrayList(); - } - - CombiningEvaluator(Collection evaluators) { - this(); - this.evaluators.addAll(evaluators); - } - - Evaluator rightMostEvaluator() { - return evaluators.size() > 0 ? evaluators.get(evaluators.size() - 1) - : null; - } - - void replaceRightMostEvaluator(Evaluator replacement) { - evaluators.set(evaluators.size() - 1, replacement); - } - - static final class And extends CombiningEvaluator { - And(Collection evaluators) { - super(evaluators); - } - - And(Evaluator... evaluators) { - this(Arrays.asList(evaluators)); - } - - @Override - public boolean matches(Element root, Element node) { - for (Evaluator s : evaluators) { - if (!s.matches(root, node)) { - return false; - } - } - return true; - } - - @Override - public String toString() { - return StringUtil.join(evaluators, " "); - } - } - - static final class Or extends CombiningEvaluator { - /** - * Create a new Or evaluator. The initial evaluators are ANDed together - * and used as the first clause of the OR. - * - * @param evaluators - * initial OR clause (these are wrapped into an AND - * evaluator). - */ - Or(Collection evaluators) { - super(); - if (evaluators.size() > 1) { - this.evaluators.add(new And(evaluators)); - } else { - this.evaluators.addAll(evaluators); - } - } - - Or() { - super(); - } - - public void add(Evaluator e) { - evaluators.add(e); - } - - @Override - public boolean matches(Element root, Element node) { - for (Evaluator s : evaluators) { - if (s.matches(root, node)) { - return true; - } - } - return false; - } - - @Override - public String toString() { - return String.format(":or%s", evaluators); - } - } -} diff --git a/server/src/org/jsoup/select/Elements.java b/server/src/org/jsoup/select/Elements.java deleted file mode 100644 index cddea67d96..0000000000 --- a/server/src/org/jsoup/select/Elements.java +++ /dev/null @@ -1,704 +0,0 @@ -package org.jsoup.select; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.ListIterator; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; - -/** - * A list of {@link Element Elements}, with methods that act on every element in - * the list. - *

- * To get an Elements object, use the {@link Element#select(String)} method. - * - * @author Jonathan Hedley, jonathan@hedley.net - */ -public class Elements implements List, Cloneable { - private List contents; - - public Elements() { - contents = new ArrayList(); - } - - public Elements(int initialCapacity) { - contents = new ArrayList(initialCapacity); - } - - public Elements(Collection elements) { - contents = new ArrayList(elements); - } - - public Elements(List elements) { - contents = elements; - } - - public Elements(Element... elements) { - this(Arrays.asList(elements)); - } - - @Override - public Elements clone() { - List elements = new ArrayList(); - - for (Element e : contents) { - elements.add(e.clone()); - } - - return new Elements(elements); - } - - // attribute methods - /** - * Get an attribute value from the first matched element that has the - * attribute. - * - * @param attributeKey - * The attribute key. - * @return The attribute value from the first matched element that has the - * attribute.. If no elements were matched (isEmpty() == true), or - * if the no elements have the attribute, returns empty string. - * @see #hasAttr(String) - */ - public String attr(String attributeKey) { - for (Element element : contents) { - if (element.hasAttr(attributeKey)) { - return element.attr(attributeKey); - } - } - return ""; - } - - /** - * Checks if any of the matched elements have this attribute set. - * - * @param attributeKey - * attribute key - * @return true if any of the elements have the attribute; false if none do. - */ - public boolean hasAttr(String attributeKey) { - for (Element element : contents) { - if (element.hasAttr(attributeKey)) { - return true; - } - } - return false; - } - - /** - * Set an attribute on all matched elements. - * - * @param attributeKey - * attribute key - * @param attributeValue - * attribute value - * @return this - */ - public Elements attr(String attributeKey, String attributeValue) { - for (Element element : contents) { - element.attr(attributeKey, attributeValue); - } - return this; - } - - /** - * Remove an attribute from every matched element. - * - * @param attributeKey - * The attribute to remove. - * @return this (for chaining) - */ - public Elements removeAttr(String attributeKey) { - for (Element element : contents) { - element.removeAttr(attributeKey); - } - return this; - } - - /** - * Add the class name to every matched element's {@code class} attribute. - * - * @param className - * class name to add - * @return this - */ - public Elements addClass(String className) { - for (Element element : contents) { - element.addClass(className); - } - return this; - } - - /** - * Remove the class name from every matched element's {@code class} - * attribute, if present. - * - * @param className - * class name to remove - * @return this - */ - public Elements removeClass(String className) { - for (Element element : contents) { - element.removeClass(className); - } - return this; - } - - /** - * Toggle the class name on every matched element's {@code class} attribute. - * - * @param className - * class name to add if missing, or remove if present, from every - * element. - * @return this - */ - public Elements toggleClass(String className) { - for (Element element : contents) { - element.toggleClass(className); - } - return this; - } - - /** - * Determine if any of the matched elements have this class name set in - * their {@code class} attribute. - * - * @param className - * class name to check for - * @return true if any do, false if none do - */ - public boolean hasClass(String className) { - for (Element element : contents) { - if (element.hasClass(className)) { - return true; - } - } - return false; - } - - /** - * Get the form element's value of the first matched element. - * - * @return The form element's value, or empty if not set. - * @see Element#val() - */ - public String val() { - if (size() > 0) { - return first().val(); - } else { - return ""; - } - } - - /** - * Set the form element's value in each of the matched elements. - * - * @param value - * The value to set into each matched element - * @return this (for chaining) - */ - public Elements val(String value) { - for (Element element : contents) { - element.val(value); - } - return this; - } - - /** - * Get the combined text of all the matched elements. - *

- * Note that it is possible to get repeats if the matched elements contain - * both parent elements and their own children, as the Element.text() method - * returns the combined text of a parent and all its children. - * - * @return string of all text: unescaped and no HTML. - * @see Element#text() - */ - public String text() { - StringBuilder sb = new StringBuilder(); - for (Element element : contents) { - if (sb.length() != 0) { - sb.append(" "); - } - sb.append(element.text()); - } - return sb.toString(); - } - - public boolean hasText() { - for (Element element : contents) { - if (element.hasText()) { - return true; - } - } - return false; - } - - /** - * Get the combined inner HTML of all matched elements. - * - * @return string of all element's inner HTML. - * @see #text() - * @see #outerHtml() - */ - public String html() { - StringBuilder sb = new StringBuilder(); - for (Element element : contents) { - if (sb.length() != 0) { - sb.append("\n"); - } - sb.append(element.html()); - } - return sb.toString(); - } - - /** - * Get the combined outer HTML of all matched elements. - * - * @return string of all element's outer HTML. - * @see #text() - * @see #html() - */ - public String outerHtml() { - StringBuilder sb = new StringBuilder(); - for (Element element : contents) { - if (sb.length() != 0) { - sb.append("\n"); - } - sb.append(element.outerHtml()); - } - return sb.toString(); - } - - /** - * Get the combined outer HTML of all matched elements. Alias of - * {@link #outerHtml()}. - * - * @return string of all element's outer HTML. - * @see #text() - * @see #html() - */ - @Override - public String toString() { - return outerHtml(); - } - - /** - * Update the tag name of each matched element. For example, to change each - * {@code } to a {@code }, do {@code doc.select("i").tagName("em");} - * - * @param tagName - * the new tag name - * @return this, for chaining - * @see Element#tagName(String) - */ - public Elements tagName(String tagName) { - for (Element element : contents) { - element.tagName(tagName); - } - return this; - } - - /** - * Set the inner HTML of each matched element. - * - * @param html - * HTML to parse and set into each matched element. - * @return this, for chaining - * @see Element#html(String) - */ - public Elements html(String html) { - for (Element element : contents) { - element.html(html); - } - return this; - } - - /** - * Add the supplied HTML to the start of each matched element's inner HTML. - * - * @param html - * HTML to add inside each element, before the existing HTML - * @return this, for chaining - * @see Element#prepend(String) - */ - public Elements prepend(String html) { - for (Element element : contents) { - element.prepend(html); - } - return this; - } - - /** - * Add the supplied HTML to the end of each matched element's inner HTML. - * - * @param html - * HTML to add inside each element, after the existing HTML - * @return this, for chaining - * @see Element#append(String) - */ - public Elements append(String html) { - for (Element element : contents) { - element.append(html); - } - return this; - } - - /** - * Insert the supplied HTML before each matched element's outer HTML. - * - * @param html - * HTML to insert before each element - * @return this, for chaining - * @see Element#before(String) - */ - public Elements before(String html) { - for (Element element : contents) { - element.before(html); - } - return this; - } - - /** - * Insert the supplied HTML after each matched element's outer HTML. - * - * @param html - * HTML to insert after each element - * @return this, for chaining - * @see Element#after(String) - */ - public Elements after(String html) { - for (Element element : contents) { - element.after(html); - } - return this; - } - - /** - * Wrap the supplied HTML around each matched elements. For example, with - * HTML {@code

This is Jsoup

}, - * doc.select("b").wrap("<i></i>"); becomes - * {@code

This is jsoup

} - * - * @param html - * HTML to wrap around each element, e.g. - * {@code
}. Can be arbitrarily deep. - * @return this (for chaining) - * @see Element#wrap - */ - public Elements wrap(String html) { - Validate.notEmpty(html); - for (Element element : contents) { - element.wrap(html); - } - return this; - } - - /** - * Removes the matched elements from the DOM, and moves their children up - * into their parents. This has the effect of dropping the elements but - * keeping their children. - *

- * This is useful for e.g removing unwanted formatting elements but keeping - * their contents. - *

- * E.g. with HTML: - * {@code

One Two
}
- * {@code doc.select("font").unwrap();}
- * HTML = {@code
One Two
} - * - * @return this (for chaining) - * @see Node#unwrap - */ - public Elements unwrap() { - for (Element element : contents) { - element.unwrap(); - } - return this; - } - - /** - * Empty (remove all child nodes from) each matched element. This is similar - * to setting the inner HTML of each element to nothing. - *

- * E.g. HTML: {@code

Hello there

now

}
- * doc.select("p").empty();
- * HTML = {@code
- *

- *

- *
} - * - * @return this, for chaining - * @see Element#empty() - * @see #remove() - */ - public Elements empty() { - for (Element element : contents) { - element.empty(); - } - return this; - } - - /** - * Remove each matched element from the DOM. This is similar to setting the - * outer HTML of each element to nothing. - *

- * E.g. HTML: {@code

Hello

there

}
- * doc.select("p").remove();
- * HTML = {@code
} - *

- * Note that this method should not be used to clean user-submitted HTML; - * rather, use {@link org.jsoup.safety.Cleaner} to clean HTML. - * - * @return this, for chaining - * @see Element#empty() - * @see #empty() - */ - public Elements remove() { - for (Element element : contents) { - element.remove(); - } - return this; - } - - // filters - - /** - * Find matching elements within this element list. - * - * @param query - * A {@link Selector} query - * @return the filtered list of elements, or an empty list if none match. - */ - public Elements select(String query) { - return Selector.select(query, this); - } - - /** - * Remove elements from this list that match the {@link Selector} query. - *

- * E.g. HTML: {@code

Two
}
- * Elements divs = doc.select("div").not("#logo");
- * Result: {@code divs: [
Two
]} - *

- * - * @param query - * the selector query whose results should be removed from these - * elements - * @return a new elements list that contains only the filtered results - */ - public Elements not(String query) { - Elements out = Selector.select(query, this); - return Selector.filterOut(this, out); - } - - /** - * Get the nth matched element as an Elements object. - *

- * See also {@link #get(int)} to retrieve an Element. - * - * @param index - * the (zero-based) index of the element in the list to retain - * @return Elements containing only the specified element, or, if that - * element did not exist, an empty list. - */ - public Elements eq(int index) { - return contents.size() > index ? new Elements(get(index)) - : new Elements(); - } - - /** - * Test if any of the matched elements match the supplied query. - * - * @param query - * A selector - * @return true if at least one element in the list matches the query. - */ - public boolean is(String query) { - Elements children = select(query); - return !children.isEmpty(); - } - - /** - * Get all of the parents and ancestor elements of the matched elements. - * - * @return all of the parents and ancestor elements of the matched elements - */ - public Elements parents() { - HashSet combo = new LinkedHashSet(); - for (Element e : contents) { - combo.addAll(e.parents()); - } - return new Elements(combo); - } - - // list-like methods - /** - * Get the first matched element. - * - * @return The first matched element, or null if contents is - * empty; - */ - public Element first() { - return contents.isEmpty() ? null : contents.get(0); - } - - /** - * Get the last matched element. - * - * @return The last matched element, or null if contents is - * empty. - */ - public Element last() { - return contents.isEmpty() ? null : contents.get(contents.size() - 1); - } - - /** - * Perform a depth-first traversal on each of the selected elements. - * - * @param nodeVisitor - * the visitor callbacks to perform on each node - * @return this, for chaining - */ - public Elements traverse(NodeVisitor nodeVisitor) { - Validate.notNull(nodeVisitor); - NodeTraversor traversor = new NodeTraversor(nodeVisitor); - for (Element el : contents) { - traversor.traverse(el); - } - return this; - } - - // implements List delegates: - @Override - public int size() { - return contents.size(); - } - - @Override - public boolean isEmpty() { - return contents.isEmpty(); - } - - @Override - public boolean contains(Object o) { - return contents.contains(o); - } - - @Override - public Iterator iterator() { - return contents.iterator(); - } - - @Override - public Object[] toArray() { - return contents.toArray(); - } - - @Override - public T[] toArray(T[] a) { - return contents.toArray(a); - } - - @Override - public boolean add(Element element) { - return contents.add(element); - } - - @Override - public boolean remove(Object o) { - return contents.remove(o); - } - - @Override - public boolean containsAll(Collection c) { - return contents.containsAll(c); - } - - @Override - public boolean addAll(Collection c) { - return contents.addAll(c); - } - - @Override - public boolean addAll(int index, Collection c) { - return contents.addAll(index, c); - } - - @Override - public boolean removeAll(Collection c) { - return contents.removeAll(c); - } - - @Override - public boolean retainAll(Collection c) { - return contents.retainAll(c); - } - - @Override - public void clear() { - contents.clear(); - } - - @Override - public boolean equals(Object o) { - return contents.equals(o); - } - - @Override - public int hashCode() { - return contents.hashCode(); - } - - @Override - public Element get(int index) { - return contents.get(index); - } - - @Override - public Element set(int index, Element element) { - return contents.set(index, element); - } - - @Override - public void add(int index, Element element) { - contents.add(index, element); - } - - @Override - public Element remove(int index) { - return contents.remove(index); - } - - @Override - public int indexOf(Object o) { - return contents.indexOf(o); - } - - @Override - public int lastIndexOf(Object o) { - return contents.lastIndexOf(o); - } - - @Override - public ListIterator listIterator() { - return contents.listIterator(); - } - - @Override - public ListIterator listIterator(int index) { - return contents.listIterator(index); - } - - @Override - public List subList(int fromIndex, int toIndex) { - return contents.subList(fromIndex, toIndex); - } -} diff --git a/server/src/org/jsoup/select/Evaluator.java b/server/src/org/jsoup/select/Evaluator.java deleted file mode 100644 index 5dd4c91616..0000000000 --- a/server/src/org/jsoup/select/Evaluator.java +++ /dev/null @@ -1,474 +0,0 @@ -package org.jsoup.select; - -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Element; - -/** - * Evaluates that an element matches the selector. - */ -public abstract class Evaluator { - protected Evaluator() { - } - - /** - * Test if the element meets the evaluator's requirements. - * - * @param root - * UI of the matching subtree - * @param element - * tested element - */ - public abstract boolean matches(Element root, Element element); - - /** - * Evaluator for tag name - */ - public static final class Tag extends Evaluator { - private String tagName; - - public Tag(String tagName) { - this.tagName = tagName; - } - - @Override - public boolean matches(Element root, Element element) { - return (element.tagName().equals(tagName)); - } - - @Override - public String toString() { - return String.format("%s", tagName); - } - } - - /** - * Evaluator for element id - */ - public static final class Id extends Evaluator { - private String id; - - public Id(String id) { - this.id = id; - } - - @Override - public boolean matches(Element root, Element element) { - return (id.equals(element.id())); - } - - @Override - public String toString() { - return String.format("#%s", id); - } - - } - - /** - * Evaluator for element class - */ - public static final class Class extends Evaluator { - private String className; - - public Class(String className) { - this.className = className; - } - - @Override - public boolean matches(Element root, Element element) { - return (element.hasClass(className)); - } - - @Override - public String toString() { - return String.format(".%s", className); - } - - } - - /** - * Evaluator for attribute name matching - */ - public static final class Attribute extends Evaluator { - private String key; - - public Attribute(String key) { - this.key = key; - } - - @Override - public boolean matches(Element root, Element element) { - return element.hasAttr(key); - } - - @Override - public String toString() { - return String.format("[%s]", key); - } - - } - - /** - * Evaluator for attribute name prefix matching - */ - public static final class AttributeStarting extends Evaluator { - private String keyPrefix; - - public AttributeStarting(String keyPrefix) { - this.keyPrefix = keyPrefix; - } - - @Override - public boolean matches(Element root, Element element) { - List values = element.attributes() - .asList(); - for (org.jsoup.nodes.Attribute attribute : values) { - if (attribute.getKey().startsWith(keyPrefix)) { - return true; - } - } - return false; - } - - @Override - public String toString() { - return String.format("[^%s]", keyPrefix); - } - - } - - /** - * Evaluator for attribute name/value matching - */ - public static final class AttributeWithValue extends AttributeKeyPair { - public AttributeWithValue(String key, String value) { - super(key, value); - } - - @Override - public boolean matches(Element root, Element element) { - return element.hasAttr(key) - && value.equalsIgnoreCase(element.attr(key)); - } - - @Override - public String toString() { - return String.format("[%s=%s]", key, value); - } - - } - - /** - * Evaluator for attribute name != value matching - */ - public static final class AttributeWithValueNot extends AttributeKeyPair { - public AttributeWithValueNot(String key, String value) { - super(key, value); - } - - @Override - public boolean matches(Element root, Element element) { - return !value.equalsIgnoreCase(element.attr(key)); - } - - @Override - public String toString() { - return String.format("[%s!=%s]", key, value); - } - - } - - /** - * Evaluator for attribute name/value matching (value prefix) - */ - public static final class AttributeWithValueStarting extends - AttributeKeyPair { - public AttributeWithValueStarting(String key, String value) { - super(key, value); - } - - @Override - public boolean matches(Element root, Element element) { - return element.hasAttr(key) - && element.attr(key).toLowerCase().startsWith(value); // value - // is - // lower - // case - // already - } - - @Override - public String toString() { - return String.format("[%s^=%s]", key, value); - } - - } - - /** - * Evaluator for attribute name/value matching (value ending) - */ - public static final class AttributeWithValueEnding extends AttributeKeyPair { - public AttributeWithValueEnding(String key, String value) { - super(key, value); - } - - @Override - public boolean matches(Element root, Element element) { - return element.hasAttr(key) - && element.attr(key).toLowerCase().endsWith(value); // value - // is - // lower - // case - } - - @Override - public String toString() { - return String.format("[%s$=%s]", key, value); - } - - } - - /** - * Evaluator for attribute name/value matching (value containing) - */ - public static final class AttributeWithValueContaining extends - AttributeKeyPair { - public AttributeWithValueContaining(String key, String value) { - super(key, value); - } - - @Override - public boolean matches(Element root, Element element) { - return element.hasAttr(key) - && element.attr(key).toLowerCase().contains(value); // value - // is - // lower - // case - } - - @Override - public String toString() { - return String.format("[%s*=%s]", key, value); - } - - } - - /** - * Evaluator for attribute name/value matching (value regex matching) - */ - public static final class AttributeWithValueMatching extends Evaluator { - String key; - Pattern pattern; - - public AttributeWithValueMatching(String key, Pattern pattern) { - this.key = key.trim().toLowerCase(); - this.pattern = pattern; - } - - @Override - public boolean matches(Element root, Element element) { - return element.hasAttr(key) - && pattern.matcher(element.attr(key)).find(); - } - - @Override - public String toString() { - return String.format("[%s~=%s]", key, pattern.toString()); - } - - } - - /** - * Abstract evaluator for attribute name/value matching - */ - public abstract static class AttributeKeyPair extends Evaluator { - String key; - String value; - - public AttributeKeyPair(String key, String value) { - Validate.notEmpty(key); - Validate.notEmpty(value); - - this.key = key.trim().toLowerCase(); - this.value = value.trim().toLowerCase(); - } - } - - /** - * Evaluator for any / all element matching - */ - public static final class AllElements extends Evaluator { - - @Override - public boolean matches(Element root, Element element) { - return true; - } - - @Override - public String toString() { - return "*"; - } - } - - /** - * Evaluator for matching by sibling index number (e < idx) - */ - public static final class IndexLessThan extends IndexEvaluator { - public IndexLessThan(int index) { - super(index); - } - - @Override - public boolean matches(Element root, Element element) { - return element.elementSiblingIndex() < index; - } - - @Override - public String toString() { - return String.format(":lt(%d)", index); - } - - } - - /** - * Evaluator for matching by sibling index number (e > idx) - */ - public static final class IndexGreaterThan extends IndexEvaluator { - public IndexGreaterThan(int index) { - super(index); - } - - @Override - public boolean matches(Element root, Element element) { - return element.elementSiblingIndex() > index; - } - - @Override - public String toString() { - return String.format(":gt(%d)", index); - } - - } - - /** - * Evaluator for matching by sibling index number (e = idx) - */ - public static final class IndexEquals extends IndexEvaluator { - public IndexEquals(int index) { - super(index); - } - - @Override - public boolean matches(Element root, Element element) { - return element.elementSiblingIndex() == index; - } - - @Override - public String toString() { - return String.format(":eq(%d)", index); - } - - } - - /** - * Abstract evaluator for sibling index matching - * - * @author ant - */ - public abstract static class IndexEvaluator extends Evaluator { - int index; - - public IndexEvaluator(int index) { - this.index = index; - } - } - - /** - * Evaluator for matching Element (and its descendants) text - */ - public static final class ContainsText extends Evaluator { - private String searchText; - - public ContainsText(String searchText) { - this.searchText = searchText.toLowerCase(); - } - - @Override - public boolean matches(Element root, Element element) { - return (element.text().toLowerCase().contains(searchText)); - } - - @Override - public String toString() { - return String.format(":contains(%s", searchText); - } - } - - /** - * Evaluator for matching Element's own text - */ - public static final class ContainsOwnText extends Evaluator { - private String searchText; - - public ContainsOwnText(String searchText) { - this.searchText = searchText.toLowerCase(); - } - - @Override - public boolean matches(Element root, Element element) { - return (element.ownText().toLowerCase().contains(searchText)); - } - - @Override - public String toString() { - return String.format(":containsOwn(%s", searchText); - } - } - - /** - * Evaluator for matching Element (and its descendants) text with regex - */ - public static final class Matches extends Evaluator { - private Pattern pattern; - - public Matches(Pattern pattern) { - this.pattern = pattern; - } - - @Override - public boolean matches(Element root, Element element) { - Matcher m = pattern.matcher(element.text()); - return m.find(); - } - - @Override - public String toString() { - return String.format(":matches(%s", pattern); - } - } - - /** - * Evaluator for matching Element's own text with regex - */ - public static final class MatchesOwn extends Evaluator { - private Pattern pattern; - - public MatchesOwn(Pattern pattern) { - this.pattern = pattern; - } - - @Override - public boolean matches(Element root, Element element) { - Matcher m = pattern.matcher(element.ownText()); - return m.find(); - } - - @Override - public String toString() { - return String.format(":matchesOwn(%s", pattern); - } - } -} diff --git a/server/src/org/jsoup/select/NodeTraversor.java b/server/src/org/jsoup/select/NodeTraversor.java deleted file mode 100644 index f94a7762fc..0000000000 --- a/server/src/org/jsoup/select/NodeTraversor.java +++ /dev/null @@ -1,55 +0,0 @@ -package org.jsoup.select; - -import org.jsoup.nodes.Node; - -/** - * Depth-first node traversor. Use to iterate through all nodes under and - * including the specified root node. - *

- * This implementation does not use recursion, so a deep DOM does not risk - * blowing the stack. - */ -public class NodeTraversor { - private NodeVisitor visitor; - - /** - * Create a new traversor. - * - * @param visitor - * a class implementing the {@link NodeVisitor} interface, to be - * called when visiting each node. - */ - public NodeTraversor(NodeVisitor visitor) { - this.visitor = visitor; - } - - /** - * Start a depth-first traverse of the root and all of its descendants. - * - * @param root - * the root node point to traverse. - */ - public void traverse(Node root) { - Node node = root; - int depth = 0; - - while (node != null) { - visitor.head(node, depth); - if (node.childNodes().size() > 0) { - node = node.childNode(0); - depth++; - } else { - while (node.nextSibling() == null && depth > 0) { - visitor.tail(node, depth); - node = node.parent(); - depth--; - } - visitor.tail(node, depth); - if (node == root) { - break; - } - node = node.nextSibling(); - } - } - } -} diff --git a/server/src/org/jsoup/select/NodeVisitor.java b/server/src/org/jsoup/select/NodeVisitor.java deleted file mode 100644 index 9e827d6c55..0000000000 --- a/server/src/org/jsoup/select/NodeVisitor.java +++ /dev/null @@ -1,39 +0,0 @@ -package org.jsoup.select; - -import org.jsoup.nodes.Node; - -/** - * Node visitor interface. Provide an implementing class to - * {@link NodeTraversor} to iterate through nodes. - *

- * This interface provides two methods, {@code head} and {@code tail}. The head - * method is called when the node is first seen, and the tail method when all of - * the node's children have been visited. As an example, head can be used to - * create a start tag for a node, and tail to create the end tag. - */ -public interface NodeVisitor { - /** - * Callback for when a node is first visited. - * - * @param node - * the node being visited. - * @param depth - * the depth of the node, relative to the root node. E.g., the - * root node has depth 0, and a child node of that will have - * depth 1. - */ - public void head(Node node, int depth); - - /** - * Callback for when a node is last visited, after all of its descendants - * have been visited. - * - * @param node - * the node being visited. - * @param depth - * the depth of the node, relative to the root node. E.g., the - * root node has depth 0, and a child node of that will have - * depth 1. - */ - public void tail(Node node, int depth); -} diff --git a/server/src/org/jsoup/select/QueryParser.java b/server/src/org/jsoup/select/QueryParser.java deleted file mode 100644 index 7a04899d82..0000000000 --- a/server/src/org/jsoup/select/QueryParser.java +++ /dev/null @@ -1,334 +0,0 @@ -package org.jsoup.select; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Pattern; - -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.parser.TokenQueue; - -/** - * Parses a CSS selector into an Evaluator tree. - */ -class QueryParser { - private final static String[] combinators = { ",", ">", "+", "~", " " }; - - private TokenQueue tq; - private String query; - private List evals = new ArrayList(); - - /** - * Create a new QueryParser. - * - * @param query - * CSS query - */ - private QueryParser(String query) { - this.query = query; - tq = new TokenQueue(query); - } - - /** - * Parse a CSS query into an Evaluator. - * - * @param query - * CSS query - * @return Evaluator - */ - public static Evaluator parse(String query) { - QueryParser p = new QueryParser(query); - return p.parse(); - } - - /** - * Parse the query - * - * @return Evaluator - */ - Evaluator parse() { - tq.consumeWhitespace(); - - if (tq.matchesAny(combinators)) { // if starts with a combinator, use - // root as elements - evals.add(new StructuralEvaluator.Root()); - combinator(tq.consume()); - } else { - findElements(); - } - - while (!tq.isEmpty()) { - // hierarchy and extras - boolean seenWhite = tq.consumeWhitespace(); - - if (tq.matchesAny(combinators)) { - combinator(tq.consume()); - } else if (seenWhite) { - combinator(' '); - } else { // E.class, E#id, E[attr] etc. AND - findElements(); // take next el, #. etc off queue - } - } - - if (evals.size() == 1) { - return evals.get(0); - } - - return new CombiningEvaluator.And(evals); - } - - private void combinator(char combinator) { - tq.consumeWhitespace(); - String subQuery = consumeSubQuery(); // support multi > childs - - Evaluator rootEval; // the new topmost evaluator - Evaluator currentEval; // the evaluator the new eval will be combined - // to. could be root, or rightmost or. - Evaluator newEval = parse(subQuery); // the evaluator to add into target - // evaluator - boolean replaceRightMost = false; - - if (evals.size() == 1) { - rootEval = currentEval = evals.get(0); - // make sure OR (,) has precedence: - if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') { - currentEval = ((CombiningEvaluator.Or) currentEval) - .rightMostEvaluator(); - replaceRightMost = true; - } - } else { - rootEval = currentEval = new CombiningEvaluator.And(evals); - } - evals.clear(); - - // for most combinators: change the current eval into an AND of the - // current eval and the new eval - if (combinator == '>') { - currentEval = new CombiningEvaluator.And(newEval, - new StructuralEvaluator.ImmediateParent(currentEval)); - } else if (combinator == ' ') { - currentEval = new CombiningEvaluator.And(newEval, - new StructuralEvaluator.Parent(currentEval)); - } else if (combinator == '+') { - currentEval = new CombiningEvaluator.And(newEval, - new StructuralEvaluator.ImmediatePreviousSibling( - currentEval)); - } else if (combinator == '~') { - currentEval = new CombiningEvaluator.And(newEval, - new StructuralEvaluator.PreviousSibling(currentEval)); - } else if (combinator == ',') { // group or. - CombiningEvaluator.Or or; - if (currentEval instanceof CombiningEvaluator.Or) { - or = (CombiningEvaluator.Or) currentEval; - or.add(newEval); - } else { - or = new CombiningEvaluator.Or(); - or.add(currentEval); - or.add(newEval); - } - currentEval = or; - } else { - throw new Selector.SelectorParseException("Unknown combinator: " - + combinator); - } - - if (replaceRightMost) { - ((CombiningEvaluator.Or) rootEval) - .replaceRightMostEvaluator(currentEval); - } else { - rootEval = currentEval; - } - evals.add(rootEval); - } - - private String consumeSubQuery() { - StringBuilder sq = new StringBuilder(); - while (!tq.isEmpty()) { - if (tq.matches("(")) { - sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); - } else if (tq.matches("[")) { - sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); - } else if (tq.matchesAny(combinators)) { - break; - } else { - sq.append(tq.consume()); - } - } - return sq.toString(); - } - - private void findElements() { - if (tq.matchChomp("#")) { - byId(); - } else if (tq.matchChomp(".")) { - byClass(); - } else if (tq.matchesWord()) { - byTag(); - } else if (tq.matches("[")) { - byAttribute(); - } else if (tq.matchChomp("*")) { - allElements(); - } else if (tq.matchChomp(":lt(")) { - indexLessThan(); - } else if (tq.matchChomp(":gt(")) { - indexGreaterThan(); - } else if (tq.matchChomp(":eq(")) { - indexEquals(); - } else if (tq.matches(":has(")) { - has(); - } else if (tq.matches(":contains(")) { - contains(false); - } else if (tq.matches(":containsOwn(")) { - contains(true); - } else if (tq.matches(":matches(")) { - matches(false); - } else if (tq.matches(":matchesOwn(")) { - matches(true); - } else if (tq.matches(":not(")) { - not(); - } else { - throw new Selector.SelectorParseException( - "Could not parse query '%s': unexpected token at '%s'", - query, tq.remainder()); - } - - } - - private void byId() { - String id = tq.consumeCssIdentifier(); - Validate.notEmpty(id); - evals.add(new Evaluator.Id(id)); - } - - private void byClass() { - String className = tq.consumeCssIdentifier(); - Validate.notEmpty(className); - evals.add(new Evaluator.Class(className.trim().toLowerCase())); - } - - private void byTag() { - String tagName = tq.consumeElementSelector(); - Validate.notEmpty(tagName); - - // namespaces: if element name is "abc:def", selector must be "abc|def", - // so flip: - if (tagName.contains("|")) { - tagName = tagName.replace("|", ":"); - } - - evals.add(new Evaluator.Tag(tagName.trim().toLowerCase())); - } - - private void byAttribute() { - TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content - // queue - String key = cq.consumeToAny("=", "!=", "^=", "$=", "*=", "~="); // eq, - // not, - // start, - // end, - // contain, - // match, - // (no - // val) - Validate.notEmpty(key); - cq.consumeWhitespace(); - - if (cq.isEmpty()) { - if (key.startsWith("^")) { - evals.add(new Evaluator.AttributeStarting(key.substring(1))); - } else { - evals.add(new Evaluator.Attribute(key)); - } - } else { - if (cq.matchChomp("=")) { - evals.add(new Evaluator.AttributeWithValue(key, cq.remainder())); - } else if (cq.matchChomp("!=")) { - evals.add(new Evaluator.AttributeWithValueNot(key, cq - .remainder())); - } else if (cq.matchChomp("^=")) { - evals.add(new Evaluator.AttributeWithValueStarting(key, cq - .remainder())); - } else if (cq.matchChomp("$=")) { - evals.add(new Evaluator.AttributeWithValueEnding(key, cq - .remainder())); - } else if (cq.matchChomp("*=")) { - evals.add(new Evaluator.AttributeWithValueContaining(key, cq - .remainder())); - } else if (cq.matchChomp("~=")) { - evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern - .compile(cq.remainder()))); - } else { - throw new Selector.SelectorParseException( - "Could not parse attribute query '%s': unexpected token at '%s'", - query, cq.remainder()); - } - } - } - - private void allElements() { - evals.add(new Evaluator.AllElements()); - } - - // pseudo selectors :lt, :gt, :eq - private void indexLessThan() { - evals.add(new Evaluator.IndexLessThan(consumeIndex())); - } - - private void indexGreaterThan() { - evals.add(new Evaluator.IndexGreaterThan(consumeIndex())); - } - - private void indexEquals() { - evals.add(new Evaluator.IndexEquals(consumeIndex())); - } - - private int consumeIndex() { - String indexS = tq.chompTo(")").trim(); - Validate.isTrue(StringUtil.isNumeric(indexS), "Index must be numeric"); - return Integer.parseInt(indexS); - } - - // pseudo selector :has(el) - private void has() { - tq.consume(":has"); - String subQuery = tq.chompBalanced('(', ')'); - Validate.notEmpty(subQuery, ":has(el) subselect must not be empty"); - evals.add(new StructuralEvaluator.Has(parse(subQuery))); - } - - // pseudo selector :contains(text), containsOwn(text) - private void contains(boolean own) { - tq.consume(own ? ":containsOwn" : ":contains"); - String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); - Validate.notEmpty(searchText, ":contains(text) query must not be empty"); - if (own) { - evals.add(new Evaluator.ContainsOwnText(searchText)); - } else { - evals.add(new Evaluator.ContainsText(searchText)); - } - } - - // :matches(regex), matchesOwn(regex) - private void matches(boolean own) { - tq.consume(own ? ":matchesOwn" : ":matches"); - String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex - // bits will be escaped - Validate.notEmpty(regex, ":matches(regex) query must not be empty"); - - if (own) { - evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex))); - } else { - evals.add(new Evaluator.Matches(Pattern.compile(regex))); - } - } - - // :not(selector) - private void not() { - tq.consume(":not"); - String subQuery = tq.chompBalanced('(', ')'); - Validate.notEmpty(subQuery, - ":not(selector) subselect must not be empty"); - - evals.add(new StructuralEvaluator.Not(parse(subQuery))); - } -} diff --git a/server/src/org/jsoup/select/Selector.java b/server/src/org/jsoup/select/Selector.java deleted file mode 100644 index d5ea6f2dc9..0000000000 --- a/server/src/org/jsoup/select/Selector.java +++ /dev/null @@ -1,278 +0,0 @@ -package org.jsoup.select; - -import java.util.Collection; -import java.util.LinkedHashSet; - -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Element; - -/** - * CSS-like element selector, that finds elements matching a query. - *

- *

Selector syntax

- * A selector is a chain of simple selectors, separated by combinators. - * Selectors are case insensitive (including against elements, attributes, and - * attribute values). - *

- * The universal selector (*) is implicit when no element selector is supplied - * (i.e. {@code *.header} and {@code .header} is equivalent). - *

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
PatternMatchesExample
*any element*
tagelements with the given tag namediv
ns|Eelements of type E in the namespace nsfb|name finds <fb:name> elements
#idelements with attribute ID of "id"div#wrap, #logo
.classelements with a class name of "class"div.left, .result
[attr]elements with an attribute named "attr" (with any value)a[href], [title]
[^attrPrefix]elements with an attribute name starting with "attrPrefix". Use to find - * elements with HTML5 datasets[^data-], div[^data-]
[attr=val]elements with an attribute named "attr", and value equal to "val"img[width=500], a[rel=nofollow]
[attr^=valPrefix]elements with an attribute named "attr", and value starting with - * "valPrefix"a[href^=http:]
[attr$=valSuffix]elements with an attribute named "attr", and value ending with - * "valSuffix"img[src$=.png]
[attr*=valContaining]elements with an attribute named "attr", and value containing - * "valContaining"a[href*=/search/]
[attr~=regex]elements with an attribute named "attr", and value matching the regular - * expressionimg[src~=(?i)\\.(png|jpe?g)]
The above may be combined in any orderdiv.header[title]
- * - *

Combinators

E Fan F element descended from an E elementdiv a, .logo h1
E > Fan F direct child of Eol > li
E + Fan F element immediately preceded by sibling Eli + li, div.head + div
E ~ Fan F element preceded by sibling Eh1 ~ p
E, F, Gall matching elements E, F, or Ga[href], div, h3
- * - *

Pseudo selectors

:lt(n)elements whose sibling index is less than ntd:lt(3) finds the first 2 cells of each row
:gt(n)elements whose sibling index is greater than ntd:gt(1) finds cells after skipping the first two
:eq(n)elements whose sibling index is equal to ntd:eq(0) finds the first cell of each row
:has(selector)elements that contains at least one element matching the - * selectordiv:has(p) finds divs that contain p elements
:not(selector)elements that do not match the selector. See also - * {@link Elements#not(String)}div:not(.logo) finds all divs that do not have the "logo" - * class.
- * div:not(:has(div)) finds divs that do not contain divs.
:contains(text)elements that contains the specified text. The search is case - * insensitive. The text may appear in the found element, or any of its - * descendants.p:contains(jsoup) finds p elements containing the text - * "jsoup".
:matches(regex)elements whose text matches the specified regular expression. The text - * may appear in the found element, or any of its descendants.td:matches(\\d+) finds table cells containing digits. - * div:matches((?i)login) finds divs containing the text, case - * insensitively.
:containsOwn(text)elements that directly contains the specified text. The search is case - * insensitive. The text must appear in the found element, not any of its - * descendants.p:containsOwn(jsoup) finds p elements with own text "jsoup". - *
:matchesOwn(regex)elements whose own text matches the specified regular expression. The - * text must appear in the found element, not any of its descendants.td:matchesOwn(\\d+) finds table cells directly containing - * digits. div:matchesOwn((?i)login) finds divs containing the - * text, case insensitively.
The above may be combined in any order and with other selectors.light:contains(name):eq(0)
- * - * @author Jonathan Hedley, jonathan@hedley.net - * @see Element#select(String) - */ -public class Selector { - private final Evaluator evaluator; - private final Element root; - - private Selector(String query, Element root) { - Validate.notNull(query); - query = query.trim(); - Validate.notEmpty(query); - Validate.notNull(root); - - evaluator = QueryParser.parse(query); - - this.root = root; - } - - /** - * Find elements matching selector. - * - * @param query - * CSS selector - * @param root - * root element to descend into - * @return matching elements, empty if not - */ - public static Elements select(String query, Element root) { - return new Selector(query, root).select(); - } - - /** - * Find elements matching selector. - * - * @param query - * CSS selector - * @param roots - * root elements to descend into - * @return matching elements, empty if not - */ - public static Elements select(String query, Iterable roots) { - Validate.notEmpty(query); - Validate.notNull(roots); - LinkedHashSet elements = new LinkedHashSet(); - - for (Element root : roots) { - elements.addAll(select(query, root)); - } - return new Elements(elements); - } - - private Elements select() { - return Collector.collect(evaluator, root); - } - - // exclude set. package open so that Elements can implement .not() selector. - static Elements filterOut(Collection elements, - Collection outs) { - Elements output = new Elements(); - for (Element el : elements) { - boolean found = false; - for (Element out : outs) { - if (el.equals(out)) { - found = true; - break; - } - } - if (!found) { - output.add(el); - } - } - return output; - } - - public static class SelectorParseException extends IllegalStateException { - public SelectorParseException(String msg, Object... params) { - super(String.format(msg, params)); - } - } -} diff --git a/server/src/org/jsoup/select/StructuralEvaluator.java b/server/src/org/jsoup/select/StructuralEvaluator.java deleted file mode 100644 index dea2413fb8..0000000000 --- a/server/src/org/jsoup/select/StructuralEvaluator.java +++ /dev/null @@ -1,152 +0,0 @@ -package org.jsoup.select; - -import org.jsoup.nodes.Element; - -/** - * Base structural evaluator. - */ -abstract class StructuralEvaluator extends Evaluator { - Evaluator evaluator; - - static class Root extends Evaluator { - @Override - public boolean matches(Element root, Element element) { - return root == element; - } - } - - static class Has extends StructuralEvaluator { - public Has(Evaluator evaluator) { - this.evaluator = evaluator; - } - - @Override - public boolean matches(Element root, Element element) { - for (Element e : element.getAllElements()) { - if (e != element && evaluator.matches(root, e)) { - return true; - } - } - return false; - } - - @Override - public String toString() { - return String.format(":has(%s)", evaluator); - } - } - - static class Not extends StructuralEvaluator { - public Not(Evaluator evaluator) { - this.evaluator = evaluator; - } - - @Override - public boolean matches(Element root, Element node) { - return !evaluator.matches(root, node); - } - - @Override - public String toString() { - return String.format(":not%s", evaluator); - } - } - - static class Parent extends StructuralEvaluator { - public Parent(Evaluator evaluator) { - this.evaluator = evaluator; - } - - @Override - public boolean matches(Element root, Element element) { - if (root == element) { - return false; - } - - Element parent = element.parent(); - while (parent != root) { - if (evaluator.matches(root, parent)) { - return true; - } - parent = parent.parent(); - } - return false; - } - - @Override - public String toString() { - return String.format(":parent%s", evaluator); - } - } - - static class ImmediateParent extends StructuralEvaluator { - public ImmediateParent(Evaluator evaluator) { - this.evaluator = evaluator; - } - - @Override - public boolean matches(Element root, Element element) { - if (root == element) { - return false; - } - - Element parent = element.parent(); - return parent != null && evaluator.matches(root, parent); - } - - @Override - public String toString() { - return String.format(":ImmediateParent%s", evaluator); - } - } - - static class PreviousSibling extends StructuralEvaluator { - public PreviousSibling(Evaluator evaluator) { - this.evaluator = evaluator; - } - - @Override - public boolean matches(Element root, Element element) { - if (root == element) { - return false; - } - - Element prev = element.previousElementSibling(); - - while (prev != null) { - if (evaluator.matches(root, prev)) { - return true; - } - - prev = prev.previousElementSibling(); - } - return false; - } - - @Override - public String toString() { - return String.format(":prev*%s", evaluator); - } - } - - static class ImmediatePreviousSibling extends StructuralEvaluator { - public ImmediatePreviousSibling(Evaluator evaluator) { - this.evaluator = evaluator; - } - - @Override - public boolean matches(Element root, Element element) { - if (root == element) { - return false; - } - - Element prev = element.previousElementSibling(); - return prev != null && evaluator.matches(root, prev); - } - - @Override - public String toString() { - return String.format(":prev%s", evaluator); - } - } -} diff --git a/server/src/org/jsoup/select/package-info.java b/server/src/org/jsoup/select/package-info.java deleted file mode 100644 index a6e6a2fa0f..0000000000 --- a/server/src/org/jsoup/select/package-info.java +++ /dev/null @@ -1,4 +0,0 @@ -/** - Packages to support the CSS-style element selector. - */ -package org.jsoup.select; \ No newline at end of file -- 2.39.5