diff options
Diffstat (limited to 'server/src/org/jsoup')
46 files changed, 6158 insertions, 3929 deletions
diff --git a/server/src/org/jsoup/Connection.java b/server/src/org/jsoup/Connection.java index 564eeb89b7..1d9879bfb3 100644 --- a/server/src/org/jsoup/Connection.java +++ b/server/src/org/jsoup/Connection.java @@ -1,24 +1,29 @@ package org.jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.parser.Parser; - +import java.io.IOException; import java.net.URL; -import java.util.Map; import java.util.Collection; -import java.io.IOException; +import java.util.Map; + +import org.jsoup.nodes.Document; +import org.jsoup.parser.Parser; /** - * A Connection provides a convenient interface to fetch content from the web, and parse them into Documents. + * A Connection provides a convenient interface to fetch content from the web, + * and parse them into Documents. * <p> - * To get a new Connection, use {@link org.jsoup.Jsoup#connect(String)}. Connections contain {@link Connection.Request} - * and {@link Connection.Response} objects. The request objects are reusable as prototype requests. + * To get a new Connection, use {@link org.jsoup.Jsoup#connect(String)}. + * Connections contain {@link Connection.Request} and + * {@link Connection.Response} objects. The request objects are reusable as + * prototype requests. * <p> - * Request configuration can be made using either the shortcut methods in Connection (e.g. {@link #userAgent(String)}), - * or by methods in the Connection.Request object directly. All request configuration must be made before the request - * is executed. + * Request configuration can be made using either the shortcut methods in + * Connection (e.g. {@link #userAgent(String)}), or by methods in the + * Connection.Request object directly. All request configuration must be made + * before the request is executed. * <p> - * The Connection interface is <b>currently in beta</b> and subject to change. Comments, suggestions, and bug reports are welcome. + * The Connection interface is <b>currently in beta</b> and subject to change. 
+ * Comments, suggestions, and bug reports are welcome. */ public interface Connection { @@ -31,102 +36,140 @@ public interface Connection { /** * Set the request URL to fetch. The protocol must be HTTP or HTTPS. - * @param url URL to connect to + * + * @param url + * URL to connect to * @return this Connection, for chaining */ public Connection url(URL url); /** * Set the request URL to fetch. The protocol must be HTTP or HTTPS. - * @param url URL to connect to + * + * @param url + * URL to connect to * @return this Connection, for chaining */ public Connection url(String url); /** * Set the request user-agent header. - * @param userAgent user-agent to use + * + * @param userAgent + * user-agent to use * @return this Connection, for chaining */ public Connection userAgent(String userAgent); /** - * Set the request timeouts (connect and read). If a timeout occurs, an IOException will be thrown. The default - * timeout is 3 seconds (3000 millis). A timeout of zero is treated as an infinite timeout. - * @param millis number of milliseconds (thousandths of a second) before timing out connects or reads. + * Set the request timeouts (connect and read). If a timeout occurs, an + * IOException will be thrown. The default timeout is 3 seconds (3000 + * millis). A timeout of zero is treated as an infinite timeout. + * + * @param millis + * number of milliseconds (thousandths of a second) before timing + * out connects or reads. * @return this Connection, for chaining */ public Connection timeout(int millis); /** * Set the request referrer (aka "referer") header. - * @param referrer referrer to use + * + * @param referrer + * referrer to use * @return this Connection, for chaining */ public Connection referrer(String referrer); /** - * Configures the connection to (not) follow server redirects. By default this is <b>true</b>. - * @param followRedirects true if server redirects should be followed. + * Configures the connection to (not) follow server redirects. 
By default + * this is <b>true</b>. + * + * @param followRedirects + * true if server redirects should be followed. * @return this Connection, for chaining */ public Connection followRedirects(boolean followRedirects); /** * Set the request method to use, GET or POST. Default is GET. - * @param method HTTP request method + * + * @param method + * HTTP request method * @return this Connection, for chaining */ public Connection method(Method method); /** - * Configures the connection to not throw exceptions when a HTTP error occurs. (4xx - 5xx, e.g. 404 or 500). By - * default this is <b>false</b>; an IOException is thrown if an error is encountered. If set to <b>true</b>, the - * response is populated with the error body, and the status message will reflect the error. - * @param ignoreHttpErrors - false (default) if HTTP errors should be ignored. + * Configures the connection to not throw exceptions when a HTTP error + * occurs. (4xx - 5xx, e.g. 404 or 500). By default this is <b>false</b>; an + * IOException is thrown if an error is encountered. If set to <b>true</b>, + * the response is populated with the error body, and the status message + * will reflect the error. + * + * @param ignoreHttpErrors + * - false (default) if HTTP errors should be ignored. * @return this Connection, for chaining */ public Connection ignoreHttpErrors(boolean ignoreHttpErrors); /** - * Ignore the document's Content-Type when parsing the response. By default this is <b>false</b>, an unrecognised - * content-type will cause an IOException to be thrown. (This is to prevent producing garbage by attempting to parse - * a JPEG binary image, for example.) Set to true to force a parse attempt regardless of content type. - * @param ignoreContentType set to true if you would like the content type ignored on parsing the response into a - * Document. + * Ignore the document's Content-Type when parsing the response. 
By default + * this is <b>false</b>, an unrecognised content-type will cause an + * IOException to be thrown. (This is to prevent producing garbage by + * attempting to parse a JPEG binary image, for example.) Set to true to + * force a parse attempt regardless of content type. + * + * @param ignoreContentType + * set to true if you would like the content type ignored on + * parsing the response into a Document. * @return this Connection, for chaining */ public Connection ignoreContentType(boolean ignoreContentType); /** - * Add a request data parameter. Request parameters are sent in the request query string for GETs, and in the request - * body for POSTs. A request may have multiple values of the same name. - * @param key data key - * @param value data value + * Add a request data parameter. Request parameters are sent in the request + * query string for GETs, and in the request body for POSTs. A request may + * have multiple values of the same name. + * + * @param key + * data key + * @param value + * data value * @return this Connection, for chaining */ public Connection data(String key, String value); /** * Adds all of the supplied data to the request data parameters - * @param data map of data parameters + * + * @param data + * map of data parameters * @return this Connection, for chaining */ public Connection data(Map<String, String> data); /** - * Add a number of request data parameters. Multiple parameters may be set at once, e.g.: - * <code>.data("name", "jsoup", "language", "Java", "language", "English");</code> creates a query string like: + * Add a number of request data parameters. Multiple parameters may be set + * at once, e.g.: + * <code>.data("name", "jsoup", "language", "Java", "language", "English");</code> + * creates a query string like: * <code>?name=jsoup&language=Java&language=English</code> - * @param keyvals a set of key value pairs. + * + * @param keyvals + * a set of key value pairs. 
* @return this Connection, for chaining */ public Connection data(String... keyvals); /** * Set a request header. - * @param name header name - * @param value header value + * + * @param name + * header name + * @param value + * header value * @return this Connection, for chaining * @see org.jsoup.Connection.Request#headers() */ @@ -134,111 +177,141 @@ public interface Connection { /** * Set a cookie to be sent in the request. - * @param name name of cookie - * @param value value of cookie + * + * @param name + * name of cookie + * @param value + * value of cookie * @return this Connection, for chaining */ public Connection cookie(String name, String value); /** * Adds each of the supplied cookies to the request. - * @param cookies map of cookie name -> value pairs + * + * @param cookies + * map of cookie name -> value pairs * @return this Connection, for chaining */ public Connection cookies(Map<String, String> cookies); /** - * Provide an alternate parser to use when parsing the response to a Document. - * @param parser alternate parser + * Provide an alternate parser to use when parsing the response to a + * Document. + * + * @param parser + * alternate parser * @return this Connection, for chaining */ public Connection parser(Parser parser); /** * Execute the request as a GET, and parse the result. + * * @return parsed Document - * @throws IOException on error + * @throws IOException + * on error */ public Document get() throws IOException; /** * Execute the request as a POST, and parse the result. + * * @return parsed Document - * @throws IOException on error + * @throws IOException + * on error */ public Document post() throws IOException; /** * Execute the request. 
+ * * @return a response object - * @throws IOException on error + * @throws IOException + * on error */ public Response execute() throws IOException; /** * Get the request object associated with this connection + * * @return request */ public Request request(); /** * Set the connection's request - * @param request new request object + * + * @param request + * new request object * @return this Connection, for chaining */ public Connection request(Request request); /** * Get the response, once the request has been executed + * * @return response */ public Response response(); /** * Set the connection's response - * @param response new response + * + * @param response + * new response * @return this Connection, for chaining */ public Connection response(Response response); - /** * Common methods for Requests and Responses - * @param <T> Type of Base, either Request or Response + * + * @param <T> + * Type of Base, either Request or Response */ interface Base<T extends Base> { /** * Get the URL + * * @return URL */ public URL url(); /** * Set the URL - * @param url new URL + * + * @param url + * new URL * @return this, for chaining */ public T url(URL url); /** * Get the request method + * * @return method */ public Method method(); /** * Set the request method - * @param method new method + * + * @param method + * new method * @return this, for chaining */ public T method(Method method); /** - * Get the value of a header. This is a simplified header model, where a header may only have one value. + * Get the value of a header. This is a simplified header model, where a + * header may only have one value. * <p> * Header names are case insensitive. - * @param name name of header (case insensitive) + * + * @param name + * name of header (case insensitive) * @return value of header, or null if not set. * @see #hasHeader(String) * @see #cookie(String) @@ -246,29 +319,38 @@ public interface Connection { public String header(String name); /** - * Set a header. 
This method will overwrite any existing header with the same case insensitive name. - * @param name Name of header - * @param value Value of header + * Set a header. This method will overwrite any existing header with the + * same case insensitive name. + * + * @param name + * Name of header + * @param value + * Value of header * @return this, for chaining */ public T header(String name, String value); /** * Check if a header is present - * @param name name of header (case insensitive) + * + * @param name + * name of header (case insensitive) * @return if the header is present in this request/response */ public boolean hasHeader(String name); /** * Remove a header by name - * @param name name of header to remove (case insensitive) + * + * @param name + * name of header to remove (case insensitive) * @return this, for chaining */ public T removeHeader(String name); /** * Retrieve all of the request/response headers as a map + * * @return headers */ public Map<String, String> headers(); @@ -276,37 +358,48 @@ public interface Connection { /** * Get a cookie value by name from this request/response. * <p> - * Response objects have a simplified cookie model. Each cookie set in the response is added to the response - * object's cookie key=value map. The cookie's path, domain, and expiry date are ignored. - * @param name name of cookie to retrieve. + * Response objects have a simplified cookie model. Each cookie set in + * the response is added to the response object's cookie key=value map. + * The cookie's path, domain, and expiry date are ignored. + * + * @param name + * name of cookie to retrieve. * @return value of cookie, or null if not set */ public String cookie(String name); /** * Set a cookie in this request/response. 
- * @param name name of cookie - * @param value value of cookie + * + * @param name + * name of cookie + * @param value + * value of cookie * @return this, for chaining */ public T cookie(String name, String value); /** * Check if a cookie is present - * @param name name of cookie + * + * @param name + * name of cookie * @return if the cookie is present in this request/response */ public boolean hasCookie(String name); /** * Remove a cookie by name - * @param name name of cookie to remove + * + * @param name + * name of cookie to remove * @return this, for chaining */ public T removeCookie(String name); /** * Retrieve all of the request/response cookies as a map + * * @return cookies */ public Map<String, String> cookies(); @@ -319,79 +412,99 @@ public interface Connection { public interface Request extends Base<Request> { /** * Get the request timeout, in milliseconds. + * * @return the timeout in milliseconds. */ public int timeout(); /** * Update the request timeout. - * @param millis timeout, in milliseconds + * + * @param millis + * timeout, in milliseconds * @return this Request, for chaining */ public Request timeout(int millis); /** * Get the current followRedirects configuration. + * * @return true if followRedirects is enabled. */ public boolean followRedirects(); /** - * Configures the request to (not) follow server redirects. By default this is <b>true</b>. - * - * @param followRedirects true if server redirects should be followed. + * Configures the request to (not) follow server redirects. By default + * this is <b>true</b>. + * + * @param followRedirects + * true if server redirects should be followed. * @return this Request, for chaining */ public Request followRedirects(boolean followRedirects); /** * Get the current ignoreHttpErrors configuration. - * @return true if errors will be ignored; false (default) if HTTP errors will cause an IOException to be thrown. 
+ * + * @return true if errors will be ignored; false (default) if HTTP + * errors will cause an IOException to be thrown. */ public boolean ignoreHttpErrors(); - /** - * Configures the request to ignore HTTP errors in the response. - * @param ignoreHttpErrors set to true to ignore HTTP errors. + /** + * Configures the request to ignore HTTP errors in the response. + * + * @param ignoreHttpErrors + * set to true to ignore HTTP errors. * @return this Request, for chaining - */ + */ public Request ignoreHttpErrors(boolean ignoreHttpErrors); /** * Get the current ignoreContentType configuration. - * @return true if invalid content-types will be ignored; false (default) if they will cause an IOException to be thrown. + * + * @return true if invalid content-types will be ignored; false + * (default) if they will cause an IOException to be thrown. */ public boolean ignoreContentType(); /** - * Configures the request to ignore the Content-Type of the response. - * @param ignoreContentType set to true to ignore the content type. + * Configures the request to ignore the Content-Type of the response. + * + * @param ignoreContentType + * set to true to ignore the content type. * @return this Request, for chaining - */ + */ public Request ignoreContentType(boolean ignoreContentType); /** * Add a data parameter to the request - * @param keyval data to add. + * + * @param keyval + * data to add. * @return this Request, for chaining */ public Request data(KeyVal keyval); /** * Get all of the request's data parameters + * * @return collection of keyvals */ public Collection<KeyVal> data(); /** * Specify the parser to use when parsing the document. - * @param parser parser to use. + * + * @param parser + * parser to use. * @return this Request, for chaining */ public Request parser(Parser parser); /** * Get the current parser to use when parsing the document. 
+ * * @return current Parser */ public Parser parser(); @@ -401,46 +514,54 @@ public interface Connection { * Represents a HTTP response. */ public interface Response extends Base<Response> { - - /** + + /** * Get the status code of the response. + * * @return status code */ public int statusCode(); /** * Get the status message of the response. + * * @return status message */ public String statusMessage(); /** * Get the character set name of the response. + * * @return character set name */ public String charset(); /** * Get the response content type (e.g. "text/html"); + * * @return the response content type */ public String contentType(); /** * Parse the body of the response as a Document. + * * @return a parsed Document - * @throws IOException on error + * @throws IOException + * on error */ public Document parse() throws IOException; /** * Get the body of the response as a plain string. + * * @return body */ public String body(); /** * Get the body of the response as an array of bytes. 
+ * * @return body bytes */ public byte[] bodyAsBytes(); @@ -453,29 +574,34 @@ public interface Connection { /** * Update the key of a keyval - * @param key new key + * + * @param key + * new key * @return this KeyVal, for chaining */ public KeyVal key(String key); /** * Get the key of a keyval + * * @return the key */ public String key(); /** * Update the value of a keyval - * @param value the new value + * + * @param value + * the new value * @return this KeyVal, for chaining */ public KeyVal value(String value); /** * Get the value of a keyval + * * @return the value */ public String value(); } } - diff --git a/server/src/org/jsoup/Jsoup.java b/server/src/org/jsoup/Jsoup.java index 8c6afcee36..b5429d9410 100644 --- a/server/src/org/jsoup/Jsoup.java +++ b/server/src/org/jsoup/Jsoup.java @@ -1,178 +1,233 @@ package org.jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.parser.Parser; -import org.jsoup.safety.Cleaner; -import org.jsoup.safety.Whitelist; -import org.jsoup.helper.DataUtil; -import org.jsoup.helper.HttpConnection; - import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URL; -/** - The core public access point to the jsoup functionality. +import org.jsoup.helper.DataUtil; +import org.jsoup.helper.HttpConnection; +import org.jsoup.nodes.Document; +import org.jsoup.parser.Parser; +import org.jsoup.safety.Cleaner; +import org.jsoup.safety.Whitelist; - @author Jonathan Hedley */ +/** + * The core public access point to the jsoup functionality. + * + * @author Jonathan Hedley + */ public class Jsoup { - private Jsoup() {} + private Jsoup() { + } /** - Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. - - @param html HTML to parse - @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur - before the HTML declares a {@code <base href>} tag. - @return sane HTML + * Parse HTML into a Document. 
The parser will make a sensible, balanced + * document tree out of any HTML. + * + * @param html + * HTML to parse + * @param baseUri + * The URL where the HTML was retrieved from. Used to resolve + * relative URLs to absolute URLs, that occur before the HTML + * declares a {@code <base href>} tag. + * @return sane HTML */ public static Document parse(String html, String baseUri) { return Parser.parse(html, baseUri); } /** - Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML - (non-HTML) parser. - - @param html HTML to parse - @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur - before the HTML declares a {@code <base href>} tag. - @param parser alternate {@link Parser#xmlParser() parser} to use. - @return sane HTML + * Parse HTML into a Document, using the provided Parser. You can provide an + * alternate parser, such as a simple XML (non-HTML) parser. + * + * @param html + * HTML to parse + * @param baseUri + * The URL where the HTML was retrieved from. Used to resolve + * relative URLs to absolute URLs, that occur before the HTML + * declares a {@code <base href>} tag. + * @param parser + * alternate {@link Parser#xmlParser() parser} to use. + * @return sane HTML */ public static Document parse(String html, String baseUri, Parser parser) { return parser.parseInput(html, baseUri); } /** - Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a - {@code <base href>} tag. - - @param html HTML to parse - @return sane HTML - - @see #parse(String, String) + * Parse HTML into a Document. As no base URI is specified, absolute URL + * detection relies on the HTML including a {@code <base href>} tag. 
+ * + * @param html + * HTML to parse + * @return sane HTML + * @see #parse(String, String) */ public static Document parse(String html) { return Parser.parse(html, ""); } /** - * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page. + * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML + * page. * <p> * Use examples: * <ul> - * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> - * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post(); + * <li> + * <code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code> + * </li> + * <li> + * <code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post(); * </ul> - * @param url URL to connect to. The protocol must be {@code http} or {@code https}. - * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. + * + * @param url + * URL to connect to. The protocol must be {@code http} or + * {@code https}. + * @return the connection. You can add data, cookies, and headers; set the + * user-agent, referrer, method; and then execute. */ public static Connection connect(String url) { return HttpConnection.connect(url); } /** - Parse the contents of a file as HTML. - - @param in file to load HTML from - @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if - present, or fall back to {@code UTF-8} (which is often safe to do). - @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. - @return sane HTML - - @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + * Parse the contents of a file as HTML. 
+ * + * @param in + * file to load HTML from + * @param charsetName + * (optional) character set of file contents. Set to {@code null} + * to determine from {@code http-equiv} meta tag, if present, or + * fall back to {@code UTF-8} (which is often safe to do). + * @param baseUri + * The URL where the HTML was retrieved from, to resolve relative + * links against. + * @return sane HTML + * @throws IOException + * if the file could not be found, or read, or if the + * charsetName is invalid. */ - public static Document parse(File in, String charsetName, String baseUri) throws IOException { + public static Document parse(File in, String charsetName, String baseUri) + throws IOException { return DataUtil.load(in, charsetName, baseUri); } /** - Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. - - @param in file to load HTML from - @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if - present, or fall back to {@code UTF-8} (which is often safe to do). - @return sane HTML - - @throws IOException if the file could not be found, or read, or if the charsetName is invalid. - @see #parse(File, String, String) + * Parse the contents of a file as HTML. The location of the file is used as + * the base URI to qualify relative URLs. + * + * @param in + * file to load HTML from + * @param charsetName + * (optional) character set of file contents. Set to {@code null} + * to determine from {@code http-equiv} meta tag, if present, or + * fall back to {@code UTF-8} (which is often safe to do). + * @return sane HTML + * @throws IOException + * if the file could not be found, or read, or if the + * charsetName is invalid. 
+ * @see #parse(File, String, String) */ - public static Document parse(File in, String charsetName) throws IOException { + public static Document parse(File in, String charsetName) + throws IOException { return DataUtil.load(in, charsetName, in.getAbsolutePath()); } - /** - Read an input stream, and parse it to a Document. - - @param in input stream to read. Make sure to close it after parsing. - @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if - present, or fall back to {@code UTF-8} (which is often safe to do). - @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. - @return sane HTML - - @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + /** + * Read an input stream, and parse it to a Document. + * + * @param in + * input stream to read. Make sure to close it after parsing. + * @param charsetName + * (optional) character set of file contents. Set to {@code null} + * to determine from {@code http-equiv} meta tag, if present, or + * fall back to {@code UTF-8} (which is often safe to do). + * @param baseUri + * The URL where the HTML was retrieved from, to resolve relative + * links against. + * @return sane HTML + * @throws IOException + * if the file could not be found, or read, or if the + * charsetName is invalid. */ - public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException { + public static Document parse(InputStream in, String charsetName, + String baseUri) throws IOException { return DataUtil.load(in, charsetName, baseUri); } /** - Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML - (non-HTML) parser. - - @param in input stream to read. Make sure to close it after parsing. - @param charsetName (optional) character set of file contents. 
Set to {@code null} to determine from {@code http-equiv} meta tag, if - present, or fall back to {@code UTF-8} (which is often safe to do). - @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. - @param parser alternate {@link Parser#xmlParser() parser} to use. - @return sane HTML - - @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + * Read an input stream, and parse it to a Document. You can provide an + * alternate parser, such as a simple XML (non-HTML) parser. + * + * @param in + * input stream to read. Make sure to close it after parsing. + * @param charsetName + * (optional) character set of file contents. Set to {@code null} + * to determine from {@code http-equiv} meta tag, if present, or + * fall back to {@code UTF-8} (which is often safe to do). + * @param baseUri + * The URL where the HTML was retrieved from, to resolve relative + * links against. + * @param parser + * alternate {@link Parser#xmlParser() parser} to use. + * @return sane HTML + * @throws IOException + * if the file could not be found, or read, or if the + * charsetName is invalid. */ - public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException { + public static Document parse(InputStream in, String charsetName, + String baseUri, Parser parser) throws IOException { return DataUtil.load(in, charsetName, baseUri, parser); } /** - Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. - - @param bodyHtml body HTML fragment - @param baseUri URL to resolve relative URLs against. - @return sane HTML document - - @see Document#body() + * Parse a fragment of HTML, with the assumption that it forms the + * {@code body} of the HTML. + * + * @param bodyHtml + * body HTML fragment + * @param baseUri + * URL to resolve relative URLs against. 
+ * @return sane HTML document + * @see Document#body() */ public static Document parseBodyFragment(String bodyHtml, String baseUri) { return Parser.parseBodyFragment(bodyHtml, baseUri); } /** - Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. - - @param bodyHtml body HTML fragment - @return sane HTML document - - @see Document#body() + * Parse a fragment of HTML, with the assumption that it forms the + * {@code body} of the HTML. + * + * @param bodyHtml + * body HTML fragment + * @return sane HTML document + * @see Document#body() */ public static Document parseBodyFragment(String bodyHtml) { return Parser.parseBodyFragment(bodyHtml, ""); } /** - Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. - <p> - The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. - - @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. - @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. - @return The parsed HTML. - - @throws IOException If the final server response != 200 OK (redirects are followed), or if there's an error reading - the response stream. - - @see #connect(String) + * Fetch a URL, and parse it as HTML. Provided for compatibility; in most + * cases use {@link #connect(String)} instead. + * <p> + * The encoding character set is determined by the content-type header or + * http-equiv meta tag, or falls back to {@code UTF-8}. + * + * @param url + * URL to fetch (with a GET). The protocol must be {@code http} + * or {@code https}. + * @param timeoutMillis + * Connection and read timeout, in milliseconds. If exceeded, + * IOException is thrown. + * @return The parsed HTML. 
+ * @throws IOException + * If the final server response != 200 OK (redirects are + * followed), or if there's an error reading the response + * stream. + * @see #connect(String) */ public static Document parse(URL url, int timeoutMillis) throws IOException { Connection con = HttpConnection.connect(url); @@ -181,17 +236,20 @@ public class Jsoup { } /** - Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted - tags and attributes. - - @param bodyHtml input untrusted HTML - @param baseUri URL to resolve relative URLs against - @param whitelist white-list of permitted HTML elements - @return safe HTML - - @see Cleaner#clean(Document) + * Get safe HTML from untrusted input HTML, by parsing input HTML and + * filtering it through a white-list of permitted tags and attributes. + * + * @param bodyHtml + * input untrusted HTML + * @param baseUri + * URL to resolve relative URLs against + * @param whitelist + * white-list of permitted HTML elements + * @return safe HTML + * @see Cleaner#clean(Document) */ - public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) { + public static String clean(String bodyHtml, String baseUri, + Whitelist whitelist) { Document dirty = parseBodyFragment(bodyHtml, baseUri); Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); @@ -199,31 +257,37 @@ public class Jsoup { } /** - Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted - tags and attributes. - - @param bodyHtml input untrusted HTML - @param whitelist white-list of permitted HTML elements - @return safe HTML - - @see Cleaner#clean(Document) + * Get safe HTML from untrusted input HTML, by parsing input HTML and + * filtering it through a white-list of permitted tags and attributes. 
+ * + * @param bodyHtml + * input untrusted HTML + * @param whitelist + * white-list of permitted HTML elements + * @return safe HTML + * @see Cleaner#clean(Document) */ public static String clean(String bodyHtml, Whitelist whitelist) { return clean(bodyHtml, "", whitelist); } /** - Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. The input HTML should - still be run through the cleaner to set up enforced attributes, and to tidy the output. - @param bodyHtml HTML to test - @param whitelist whitelist to test against - @return true if no tags or attributes were removed; false otherwise - @see #clean(String, org.jsoup.safety.Whitelist) + * Test if the input HTML has only tags and attributes allowed by the + * Whitelist. Useful for form validation. The input HTML should still be run + * through the cleaner to set up enforced attributes, and to tidy the + * output. + * + * @param bodyHtml + * HTML to test + * @param whitelist + * whitelist to test against + * @return true if no tags or attributes were removed; false otherwise + * @see #clean(String, org.jsoup.safety.Whitelist) */ public static boolean isValid(String bodyHtml, Whitelist whitelist) { Document dirty = parseBodyFragment(bodyHtml, ""); Cleaner cleaner = new Cleaner(whitelist); return cleaner.isValid(dirty); } - + } diff --git a/server/src/org/jsoup/examples/HtmlToPlainText.java b/server/src/org/jsoup/examples/HtmlToPlainText.java index 8f563e9608..53e485be34 100644 --- a/server/src/org/jsoup/examples/HtmlToPlainText.java +++ b/server/src/org/jsoup/examples/HtmlToPlainText.java @@ -1,5 +1,7 @@ package org.jsoup.examples; +import java.io.IOException; + import org.jsoup.Jsoup; import org.jsoup.helper.StringUtil; import org.jsoup.helper.Validate; @@ -10,15 +12,15 @@ import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; -import java.io.IOException; - /** - * HTML to plain-text. 
This example program demonstrates the use of jsoup to convert HTML input to lightly-formatted - * plain-text. That is divergent from the general goal of jsoup's .text() methods, which is to get clean data from a - * scrape. + * HTML to plain-text. This example program demonstrates the use of jsoup to + * convert HTML input to lightly-formatted plain-text. That is divergent from + * the general goal of jsoup's .text() methods, which is to get clean data from + * a scrape. * <p/> - * Note that this is a fairly simplistic formatter -- for real world use you'll want to embrace and extend. - * + * Note that this is a fairly simplistic formatter -- for real world use you'll + * want to embrace and extend. + * * @author Jonathan Hedley, jonathan@hedley.net */ public class HtmlToPlainText { @@ -36,13 +38,16 @@ public class HtmlToPlainText { /** * Format an Element to plain-text - * @param element the root element to format + * + * @param element + * the root element to format * @return formatted text */ public String getPlainText(Element element) { FormattingVisitor formatter = new FormattingVisitor(); NodeTraversor traversor = new NodeTraversor(formatter); - traversor.traverse(element); // walk the DOM, and call .head() and .tail() for each node + traversor.traverse(element); // walk the DOM, and call .head() and + // .tail() for each node return formatter.toString(); } @@ -51,44 +56,57 @@ public class HtmlToPlainText { private class FormattingVisitor implements NodeVisitor { private static final int maxWidth = 80; private int width = 0; - private StringBuilder accum = new StringBuilder(); // holds the accumulated text + private StringBuilder accum = new StringBuilder(); // holds the + // accumulated text // hit when the node is first seen + @Override public void head(Node node, int depth) { String name = node.nodeName(); - if (node instanceof TextNode) - append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. 
- else if (name.equals("li")) + if (node instanceof TextNode) { + append(((TextNode) node).text()); // TextNodes carry all + // user-readable text in the + // DOM. + } else if (name.equals("li")) { append("\n * "); + } } // hit when all of the node's children (if any) have been visited + @Override public void tail(Node node, int depth) { String name = node.nodeName(); - if (name.equals("br")) + if (name.equals("br")) { append("\n"); - else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5")) + } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5")) { append("\n\n"); - else if (name.equals("a")) + } else if (name.equals("a")) { append(String.format(" <%s>", node.absUrl("href"))); + } } // appends text to the string builder with a simple word wrap method private void append(String text) { - if (text.startsWith("\n")) - width = 0; // reset counter if starts with a newline. only from formats above, not in natural text - if (text.equals(" ") && - (accum.length() == 0 || StringUtil.in(accum.substring(accum.length() - 1), " ", "\n"))) + if (text.startsWith("\n")) { + width = 0; // reset counter if starts with a newline. 
only from + // formats above, not in natural text + } + if (text.equals(" ") + && (accum.length() == 0 || StringUtil.in( + accum.substring(accum.length() - 1), " ", "\n"))) { return; // don't accumulate long runs of empty spaces + } if (text.length() + width > maxWidth) { // won't fit, needs to wrap String words[] = text.split("\\s+"); for (int i = 0; i < words.length; i++) { String word = words[i]; boolean last = i == words.length - 1; - if (!last) // insert a space if not the last word + if (!last) { word = word + " "; - if (word.length() + width > maxWidth) { // wrap and reset counter + } + if (word.length() + width > maxWidth) { // wrap and reset + // counter accum.append("\n").append(word); width = word.length(); } else { @@ -102,6 +120,7 @@ public class HtmlToPlainText { } } + @Override public String toString() { return accum.toString(); } diff --git a/server/src/org/jsoup/examples/ListLinks.java b/server/src/org/jsoup/examples/ListLinks.java index 64b29ba107..d57a488435 100644 --- a/server/src/org/jsoup/examples/ListLinks.java +++ b/server/src/org/jsoup/examples/ListLinks.java @@ -1,13 +1,13 @@ package org.jsoup.examples; +import java.io.IOException; + import org.jsoup.Jsoup; import org.jsoup.helper.Validate; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; -import java.io.IOException; - /** * Example program to list links from a URL. 
*/ @@ -24,22 +24,25 @@ public class ListLinks { print("\nMedia: (%d)", media.size()); for (Element src : media) { - if (src.tagName().equals("img")) - print(" * %s: <%s> %sx%s (%s)", - src.tagName(), src.attr("abs:src"), src.attr("width"), src.attr("height"), - trim(src.attr("alt"), 20)); - else + if (src.tagName().equals("img")) { + print(" * %s: <%s> %sx%s (%s)", src.tagName(), + src.attr("abs:src"), src.attr("width"), + src.attr("height"), trim(src.attr("alt"), 20)); + } else { print(" * %s: <%s>", src.tagName(), src.attr("abs:src")); + } } print("\nImports: (%d)", imports.size()); for (Element link : imports) { - print(" * %s <%s> (%s)", link.tagName(),link.attr("abs:href"), link.attr("rel")); + print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), + link.attr("rel")); } print("\nLinks: (%d)", links.size()); for (Element link : links) { - print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); + print(" * a: <%s> (%s)", link.attr("abs:href"), + trim(link.text(), 35)); } } @@ -48,9 +51,10 @@ public class ListLinks { } private static String trim(String s, int width) { - if (s.length() > width) - return s.substring(0, width-1) + "."; - else + if (s.length() > width) { + return s.substring(0, width - 1) + "."; + } else { return s; + } } } diff --git a/server/src/org/jsoup/helper/DataUtil.java b/server/src/org/jsoup/helper/DataUtil.java index 9adfe42153..26b85ea7dc 100644 --- a/server/src/org/jsoup/helper/DataUtil.java +++ b/server/src/org/jsoup/helper/DataUtil.java @@ -1,102 +1,147 @@ package org.jsoup.helper; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.parser.Parser; - -import java.io.*; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.jsoup.nodes.Document; 
+import org.jsoup.nodes.Element; +import org.jsoup.parser.Parser; + /** * Internal static utilities for handling data. - * + * */ public class DataUtil { - private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)"); - static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset + private static final Pattern charsetPattern = Pattern + .compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)"); + static final String defaultCharset = "UTF-8"; // used if not found in header + // or meta charset private static final int bufferSize = 0x20000; // ~130K. - private DataUtil() {} + private DataUtil() { + } /** * Loads a file to a Document. - * @param in file to load - * @param charsetName character set of input - * @param baseUri base URI of document, to resolve relative links against + * + * @param in + * file to load + * @param charsetName + * character set of input + * @param baseUri + * base URI of document, to resolve relative links against * @return Document - * @throws IOException on IO error + * @throws IOException + * on IO error */ - public static Document load(File in, String charsetName, String baseUri) throws IOException { + public static Document load(File in, String charsetName, String baseUri) + throws IOException { FileInputStream inStream = null; try { inStream = new FileInputStream(in); ByteBuffer byteData = readToByteBuffer(inStream); - return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser()); + return parseByteData(byteData, charsetName, baseUri, + Parser.htmlParser()); } finally { - if (inStream != null) + if (inStream != null) { inStream.close(); + } } } /** * Parses a Document from an input steam. - * @param in input stream to parse. You will need to close it. - * @param charsetName character set of input - * @param baseUri base URI of document, to resolve relative links against + * + * @param in + * input stream to parse. You will need to close it. 
+ * @param charsetName + * character set of input + * @param baseUri + * base URI of document, to resolve relative links against * @return Document - * @throws IOException on IO error + * @throws IOException + * on IO error */ - public static Document load(InputStream in, String charsetName, String baseUri) throws IOException { + public static Document load(InputStream in, String charsetName, + String baseUri) throws IOException { ByteBuffer byteData = readToByteBuffer(in); - return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser()); + return parseByteData(byteData, charsetName, baseUri, + Parser.htmlParser()); } /** * Parses a Document from an input steam, using the provided Parser. - * @param in input stream to parse. You will need to close it. - * @param charsetName character set of input - * @param baseUri base URI of document, to resolve relative links against - * @param parser alternate {@link Parser#xmlParser() parser} to use. + * + * @param in + * input stream to parse. You will need to close it. + * @param charsetName + * character set of input + * @param baseUri + * base URI of document, to resolve relative links against + * @param parser + * alternate {@link Parser#xmlParser() parser} to use. * @return Document - * @throws IOException on IO error + * @throws IOException + * on IO error */ - public static Document load(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException { + public static Document load(InputStream in, String charsetName, + String baseUri, Parser parser) throws IOException { ByteBuffer byteData = readToByteBuffer(in); return parseByteData(byteData, charsetName, baseUri, parser); } - // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support - // switching the chartset midstream when a meta http-equiv tag defines the charset. 
- static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) { + // reads bytes first into a buffer, then decodes with the appropriate + // charset. done this way to support + // switching the chartset midstream when a meta http-equiv tag defines the + // charset. + static Document parseByteData(ByteBuffer byteData, String charsetName, + String baseUri, Parser parser) { String docData; Document doc = null; if (charsetName == null) { // determine from meta. safe parse as UTF-8 - // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> - docData = Charset.forName(defaultCharset).decode(byteData).toString(); + // look for <meta http-equiv="Content-Type" + // content="text/html;charset=gb2312"> or HTML5 <meta + // charset="gb2312"> + docData = Charset.forName(defaultCharset).decode(byteData) + .toString(); doc = parser.parseInput(docData, baseUri); - Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first(); + Element meta = doc.select( + "meta[http-equiv=content-type], meta[charset]").first(); if (meta != null) { // if not found, will keep utf-8 as best attempt - String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset"); - if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) { // need to re-decode + String foundCharset = meta.hasAttr("http-equiv") ? 
getCharsetFromContentType(meta + .attr("content")) : meta.attr("charset"); + if (foundCharset != null && foundCharset.length() != 0 + && !foundCharset.equals(defaultCharset)) { // need to + // re-decode charsetName = foundCharset; byteData.rewind(); - docData = Charset.forName(foundCharset).decode(byteData).toString(); + docData = Charset.forName(foundCharset).decode(byteData) + .toString(); doc = null; } } } else { // specified by content type header (or by user on file load) - Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); + Validate.notEmpty( + charsetName, + "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); docData = Charset.forName(charsetName).decode(byteData).toString(); } if (doc == null) { - // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present - // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight + // there are times where there is a spurious byte-order-mark at the + // start of the text. Shouldn't be present + // in utf-8. 
If after decoding, there is a BOM, strip it; otherwise + // will cause the parser to go straight // into head mode - if (docData.charAt(0) == 65279) + if (docData.charAt(0) == 65279) { docData = docData.substring(1); + } doc = parser.parseInput(docData, baseUri); doc.outputSettings().charset(charsetName); @@ -108,9 +153,11 @@ public class DataUtil { byte[] buffer = new byte[bufferSize]; ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize); int read; - while(true) { - read = inStream.read(buffer); - if (read == -1) break; + while (true) { + read = inStream.read(buffer); + if (read == -1) { + break; + } outStream.write(buffer, 0, read); } ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray()); @@ -119,17 +166,21 @@ public class DataUtil { /** * Parse out a charset from a content type header. - * @param contentType e.g. "text/html; charset=EUC-JP" - * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased. + * + * @param contentType + * e.g. "text/html; charset=EUC-JP" + * @return "EUC-JP", or null if not found. Charset is trimmed and + * uppercased. */ static String getCharsetFromContentType(String contentType) { - if (contentType == null) return null; + if (contentType == null) { + return null; + } Matcher m = charsetPattern.matcher(contentType); if (m.find()) { return m.group(1).trim().toUpperCase(); } return null; } - - + } diff --git a/server/src/org/jsoup/helper/DescendableLinkedList.java b/server/src/org/jsoup/helper/DescendableLinkedList.java index 28ca1971eb..97595c34e6 100644 --- a/server/src/org/jsoup/helper/DescendableLinkedList.java +++ b/server/src/org/jsoup/helper/DescendableLinkedList.java @@ -5,7 +5,8 @@ import java.util.LinkedList; import java.util.ListIterator; /** - * Provides a descending iterator and other 1.6 methods to allow support on the 1.5 JRE. + * Provides a descending iterator and other 1.6 methods to allow support on the + * 1.5 JRE. 
*/ public class DescendableLinkedList<E> extends LinkedList<E> { @@ -18,32 +19,43 @@ public class DescendableLinkedList<E> extends LinkedList<E> { /** * Add a new element to the start of the list. - * @param e element to add + * + * @param e + * element to add */ + @Override public void push(E e) { addFirst(e); } /** * Look at the last element, if there is one. + * * @return the last element, or null */ + @Override public E peekLast() { return size() == 0 ? null : getLast(); } /** * Remove and return the last element, if there is one + * * @return the last element, or null */ + @Override public E pollLast() { return size() == 0 ? null : removeLast(); } /** - * Get an iterator that starts and the end of the list and works towards the start. - * @return an iterator that starts and the end of the list and works towards the start. + * Get an iterator that starts and the end of the list and works towards the + * start. + * + * @return an iterator that starts and the end of the list and works towards + * the start. */ + @Override public Iterator<E> descendingIterator() { return new DescendingIterator<E>(size()); } @@ -58,16 +70,20 @@ public class DescendableLinkedList<E> extends LinkedList<E> { /** * Check if there is another element on the list. + * * @return if another element */ + @Override public boolean hasNext() { return iter.hasPrevious(); } /** * Get the next element. + * * @return the next element. */ + @Override public E next() { return iter.previous(); } @@ -75,6 +91,7 @@ public class DescendableLinkedList<E> extends LinkedList<E> { /** * Remove the current element. 
*/ + @Override public void remove() { iter.remove(); } diff --git a/server/src/org/jsoup/helper/HttpConnection.java b/server/src/org/jsoup/helper/HttpConnection.java index 06200a2547..a48f8972c2 100644 --- a/server/src/org/jsoup/helper/HttpConnection.java +++ b/server/src/org/jsoup/helper/HttpConnection.java @@ -1,23 +1,32 @@ package org.jsoup.helper; -import org.jsoup.Connection; -import org.jsoup.nodes.Document; -import org.jsoup.parser.Parser; -import org.jsoup.parser.TokenQueue; - -import java.io.*; +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.OutputStreamWriter; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import java.nio.ByteBuffer; import java.nio.charset.Charset; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; import java.util.zip.GZIPInputStream; +import org.jsoup.Connection; +import org.jsoup.nodes.Document; +import org.jsoup.parser.Parser; +import org.jsoup.parser.TokenQueue; + /** * Implementation of {@link Connection}. 
- * @see org.jsoup.Jsoup#connect(String) + * + * @see org.jsoup.Jsoup#connect(String) */ public class HttpConnection implements Connection { public static Connection connect(String url) { @@ -35,16 +44,18 @@ public class HttpConnection implements Connection { private Connection.Request req; private Connection.Response res; - private HttpConnection() { + private HttpConnection() { req = new Request(); res = new Response(); } + @Override public Connection url(URL url) { req.url(url); return this; } + @Override public Connection url(String url) { Validate.notEmpty(url, "Must supply a valid URL"); try { @@ -55,48 +66,57 @@ public class HttpConnection implements Connection { return this; } + @Override public Connection userAgent(String userAgent) { Validate.notNull(userAgent, "User agent must not be null"); req.header("User-Agent", userAgent); return this; } + @Override public Connection timeout(int millis) { req.timeout(millis); return this; } + @Override public Connection followRedirects(boolean followRedirects) { req.followRedirects(followRedirects); return this; } + @Override public Connection referrer(String referrer) { Validate.notNull(referrer, "Referrer must not be null"); req.header("Referer", referrer); return this; } + @Override public Connection method(Method method) { req.method(method); return this; } + @Override public Connection ignoreHttpErrors(boolean ignoreHttpErrors) { - req.ignoreHttpErrors(ignoreHttpErrors); - return this; - } + req.ignoreHttpErrors(ignoreHttpErrors); + return this; + } + @Override public Connection ignoreContentType(boolean ignoreContentType) { req.ignoreContentType(ignoreContentType); return this; } + @Override public Connection data(String key, String value) { req.data(KeyVal.create(key, value)); return this; } + @Override public Connection data(Map<String, String> data) { Validate.notNull(data, "Data map must not be null"); for (Map.Entry<String, String> entry : data.entrySet()) { @@ -105,12 +125,14 @@ public class 
HttpConnection implements Connection { return this; } + @Override public Connection data(String... keyvals) { Validate.notNull(keyvals, "Data key value pairs must not be null"); - Validate.isTrue(keyvals.length %2 == 0, "Must supply an even number of key value pairs"); + Validate.isTrue(keyvals.length % 2 == 0, + "Must supply an even number of key value pairs"); for (int i = 0; i < keyvals.length; i += 2) { String key = keyvals[i]; - String value = keyvals[i+1]; + String value = keyvals[i + 1]; Validate.notEmpty(key, "Data key must not be empty"); Validate.notNull(value, "Data value must not be null"); req.data(KeyVal.create(key, value)); @@ -118,16 +140,19 @@ public class HttpConnection implements Connection { return this; } + @Override public Connection header(String name, String value) { req.header(name, value); return this; } + @Override public Connection cookie(String name, String value) { req.cookie(name, value); return this; } + @Override public Connection cookies(Map<String, String> cookies) { Validate.notNull(cookies, "Cookie map must not be null"); for (Map.Entry<String, String> entry : cookies.entrySet()) { @@ -136,48 +161,57 @@ public class HttpConnection implements Connection { return this; } + @Override public Connection parser(Parser parser) { req.parser(parser); return this; } + @Override public Document get() throws IOException { req.method(Method.GET); execute(); return res.parse(); } + @Override public Document post() throws IOException { req.method(Method.POST); execute(); return res.parse(); } + @Override public Connection.Response execute() throws IOException { res = Response.execute(req); return res; } + @Override public Connection.Request request() { return req; } + @Override public Connection request(Connection.Request request) { req = request; return this; } + @Override public Connection.Response response() { return res; } + @Override public Connection response(Connection.Response response) { res = response; return this; } - 
@SuppressWarnings({"unchecked"}) - private static abstract class Base<T extends Connection.Base> implements Connection.Base<T> { + @SuppressWarnings({ "unchecked" }) + private static abstract class Base<T extends Connection.Base> implements + Connection.Base<T> { URL url; Method method; Map<String, String> headers; @@ -188,66 +222,83 @@ public class HttpConnection implements Connection { cookies = new LinkedHashMap<String, String>(); } + @Override public URL url() { return url; } + @Override public T url(URL url) { Validate.notNull(url, "URL must not be null"); this.url = url; return (T) this; } + @Override public Method method() { return method; } + @Override public T method(Method method) { Validate.notNull(method, "Method must not be null"); this.method = method; return (T) this; } + @Override public String header(String name) { Validate.notNull(name, "Header name must not be null"); return getHeaderCaseInsensitive(name); } + @Override public T header(String name, String value) { Validate.notEmpty(name, "Header name must not be empty"); Validate.notNull(value, "Header value must not be null"); - removeHeader(name); // ensures we don't get an "accept-encoding" and a "Accept-Encoding" + removeHeader(name); // ensures we don't get an "accept-encoding" and + // a "Accept-Encoding" headers.put(name, value); return (T) this; } + @Override public boolean hasHeader(String name) { Validate.notEmpty(name, "Header name must not be empty"); return getHeaderCaseInsensitive(name) != null; } + @Override public T removeHeader(String name) { Validate.notEmpty(name, "Header name must not be empty"); - Map.Entry<String, String> entry = scanHeaders(name); // remove is case insensitive too - if (entry != null) + Map.Entry<String, String> entry = scanHeaders(name); // remove is + // case + // insensitive + // too + if (entry != null) { headers.remove(entry.getKey()); // ensures correct case + } return (T) this; } + @Override public Map<String, String> headers() { return headers; } 
private String getHeaderCaseInsensitive(String name) { Validate.notNull(name, "Header name must not be null"); - // quick evals for common case of title case, lower case, then scan for mixed + // quick evals for common case of title case, lower case, then scan + // for mixed String value = headers.get(name); - if (value == null) + if (value == null) { value = headers.get(name.toLowerCase()); + } if (value == null) { Map.Entry<String, String> entry = scanHeaders(name); - if (entry != null) + if (entry != null) { value = entry.getValue(); + } } return value; } @@ -255,17 +306,20 @@ public class HttpConnection implements Connection { private Map.Entry<String, String> scanHeaders(String name) { String lc = name.toLowerCase(); for (Map.Entry<String, String> entry : headers.entrySet()) { - if (entry.getKey().toLowerCase().equals(lc)) + if (entry.getKey().toLowerCase().equals(lc)) { return entry; + } } return null; } + @Override public String cookie(String name) { Validate.notNull(name, "Cookie name must not be null"); return cookies.get(name); } + @Override public T cookie(String name, String value) { Validate.notEmpty(name, "Cookie name must not be empty"); Validate.notNull(value, "Cookie value must not be null"); @@ -273,23 +327,27 @@ public class HttpConnection implements Connection { return (T) this; } + @Override public boolean hasCookie(String name) { Validate.notEmpty("Cookie name must not be empty"); return cookies.containsKey(name); } + @Override public T removeCookie(String name) { Validate.notEmpty("Cookie name must not be empty"); cookies.remove(name); return (T) this; } + @Override public Map<String, String> cookies() { return cookies; } } - public static class Request extends Base<Connection.Request> implements Connection.Request { + public static class Request extends Base<Connection.Request> implements + Connection.Request { private int timeoutMilliseconds; private boolean followRedirects; private Collection<Connection.KeyVal> data; @@ -297,7 +355,7 @@ 
public class HttpConnection implements Connection { private boolean ignoreContentType = false; private Parser parser; - private Request() { + private Request() { timeoutMilliseconds = 3000; followRedirects = true; data = new ArrayList<Connection.KeyVal>(); @@ -306,64 +364,78 @@ public class HttpConnection implements Connection { parser = Parser.htmlParser(); } + @Override public int timeout() { return timeoutMilliseconds; } + @Override public Request timeout(int millis) { - Validate.isTrue(millis >= 0, "Timeout milliseconds must be 0 (infinite) or greater"); + Validate.isTrue(millis >= 0, + "Timeout milliseconds must be 0 (infinite) or greater"); timeoutMilliseconds = millis; return this; } + @Override public boolean followRedirects() { return followRedirects; } + @Override public Connection.Request followRedirects(boolean followRedirects) { this.followRedirects = followRedirects; return this; } + @Override public boolean ignoreHttpErrors() { return ignoreHttpErrors; } + @Override public Connection.Request ignoreHttpErrors(boolean ignoreHttpErrors) { this.ignoreHttpErrors = ignoreHttpErrors; return this; } + @Override public boolean ignoreContentType() { return ignoreContentType; } + @Override public Connection.Request ignoreContentType(boolean ignoreContentType) { this.ignoreContentType = ignoreContentType; return this; } + @Override public Request data(Connection.KeyVal keyval) { Validate.notNull(keyval, "Key val must not be null"); data.add(keyval); return this; } + @Override public Collection<Connection.KeyVal> data() { return data; } - + + @Override public Request parser(Parser parser) { this.parser = parser; return this; } - + + @Override public Parser parser() { return parser; } } - public static class Response extends Base<Connection.Response> implements Connection.Response { + public static class Response extends Base<Connection.Response> implements + Connection.Response { private static final int MAX_REDIRECTS = 20; private int statusCode; private String 
statusMessage; @@ -382,44 +454,65 @@ public class HttpConnection implements Connection { super(); if (previousResponse != null) { numRedirects = previousResponse.numRedirects + 1; - if (numRedirects >= MAX_REDIRECTS) - throw new IOException(String.format("Too many redirects occurred trying to load URL %s", previousResponse.url())); + if (numRedirects >= MAX_REDIRECTS) { + throw new IOException( + String.format( + "Too many redirects occurred trying to load URL %s", + previousResponse.url())); + } } } - + static Response execute(Connection.Request req) throws IOException { return execute(req, null); } - static Response execute(Connection.Request req, Response previousResponse) throws IOException { + static Response execute(Connection.Request req, + Response previousResponse) throws IOException { Validate.notNull(req, "Request must not be null"); String protocol = req.url().getProtocol(); - Validate - .isTrue(protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported"); + Validate.isTrue( + protocol.equals("http") || protocol.equals("https"), + "Only http & https protocols supported"); // set up the request for execution - if (req.method() == Connection.Method.GET && req.data().size() > 0) + if (req.method() == Connection.Method.GET && req.data().size() > 0) { serialiseRequestUrl(req); // appends query string + } HttpURLConnection conn = createConnection(req); conn.connect(); - if (req.method() == Connection.Method.POST) - writePost(req.data(), conn.getOutputStream()); + if (req.method() == Connection.Method.POST) { + writePost(req.data(), conn.getOutputStream()); + } int status = conn.getResponseCode(); boolean needsRedirect = false; if (status != HttpURLConnection.HTTP_OK) { - if (status == HttpURLConnection.HTTP_MOVED_TEMP || status == HttpURLConnection.HTTP_MOVED_PERM || status == HttpURLConnection.HTTP_SEE_OTHER) + if (status == HttpURLConnection.HTTP_MOVED_TEMP + || status == HttpURLConnection.HTTP_MOVED_PERM + || status == 
HttpURLConnection.HTTP_SEE_OTHER) { needsRedirect = true; - else if (!req.ignoreHttpErrors()) - throw new IOException(status + " error loading URL " + req.url().toString()); + } else if (!req.ignoreHttpErrors()) { + throw new IOException(status + " error loading URL " + + req.url().toString()); + } } Response res = new Response(previousResponse); res.setupFromConnection(conn, previousResponse); if (needsRedirect && req.followRedirects()) { - req.method(Method.GET); // always redirect with a get. any data param from original req are dropped. + req.method(Method.GET); // always redirect with a get. any data + // param from original req are dropped. req.data().clear(); req.url(new URL(req.url(), res.header("Location"))); - for (Map.Entry<String, String> cookie : res.cookies.entrySet()) { // add response cookies to request (for e.g. login posts) + for (Map.Entry<String, String> cookie : res.cookies.entrySet()) { // add + // response + // cookies + // to + // request + // (for + // e.g. + // login + // posts) req.cookie(cookie.getKey(), cookie.getValue()); } return execute(req, res); @@ -429,77 +522,120 @@ public class HttpConnection implements Connection { InputStream bodyStream = null; InputStream dataStream = null; try { - dataStream = conn.getErrorStream() != null ? conn.getErrorStream() : conn.getInputStream(); - bodyStream = res.hasHeader("Content-Encoding") && res.header("Content-Encoding").equalsIgnoreCase("gzip") ? - new BufferedInputStream(new GZIPInputStream(dataStream)) : - new BufferedInputStream(dataStream); - + dataStream = conn.getErrorStream() != null ? conn + .getErrorStream() : conn.getInputStream(); + bodyStream = res.hasHeader("Content-Encoding") + && res.header("Content-Encoding").equalsIgnoreCase( + "gzip") ? 
new BufferedInputStream( + new GZIPInputStream(dataStream)) + : new BufferedInputStream(dataStream); + res.byteData = DataUtil.readToByteBuffer(bodyStream); - res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it + res.charset = DataUtil + .getCharsetFromContentType(res.contentType); // may be + // null, + // readInputStream + // deals + // with it } finally { - if (bodyStream != null) bodyStream.close(); - if (dataStream != null) dataStream.close(); + if (bodyStream != null) { + bodyStream.close(); + } + if (dataStream != null) { + dataStream.close(); + } } res.executed = true; return res; } + @Override public int statusCode() { return statusCode; } + @Override public String statusMessage() { return statusMessage; } + @Override public String charset() { return charset; } + @Override public String contentType() { return contentType; } + @Override public Document parse() throws IOException { - Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response"); - if (!req.ignoreContentType() && (contentType == null || !(contentType.startsWith("text/") || contentType.startsWith("application/xml") || contentType.startsWith("application/xhtml+xml")))) - throw new IOException(String.format("Unhandled content type \"%s\" on URL %s. Must be text/*, application/xml, or application/xhtml+xml", - contentType, url.toString())); - Document doc = DataUtil.parseByteData(byteData, charset, url.toExternalForm(), req.parser()); + Validate.isTrue( + executed, + "Request must be executed (with .execute(), .get(), or .post() before parsing response"); + if (!req.ignoreContentType() + && (contentType == null || !(contentType + .startsWith("text/") + || contentType.startsWith("application/xml") || contentType + .startsWith("application/xhtml+xml")))) { + throw new IOException( + String.format( + "Unhandled content type \"%s\" on URL %s. 
Must be text/*, application/xml, or application/xhtml+xml", + contentType, url.toString())); + } + Document doc = DataUtil.parseByteData(byteData, charset, + url.toExternalForm(), req.parser()); byteData.rewind(); - charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly + charset = doc.outputSettings().charset().name(); // update charset + // from meta-equiv, + // possibly return doc; } + @Override public String body() { - Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body"); - // charset gets set from header on execute, and from meta-equiv on parse. parse may not have happened yet + Validate.isTrue( + executed, + "Request must be executed (with .execute(), .get(), or .post() before getting response body"); + // charset gets set from header on execute, and from meta-equiv on + // parse. parse may not have happened yet String body; - if (charset == null) - body = Charset.forName(DataUtil.defaultCharset).decode(byteData).toString(); - else + if (charset == null) { + body = Charset.forName(DataUtil.defaultCharset) + .decode(byteData).toString(); + } else { body = Charset.forName(charset).decode(byteData).toString(); + } byteData.rewind(); return body; } + @Override public byte[] bodyAsBytes() { - Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body"); + Validate.isTrue( + executed, + "Request must be executed (with .execute(), .get(), or .post() before getting response body"); return byteData.array(); } // set up connection defaults, and details from request - private static HttpURLConnection createConnection(Connection.Request req) throws IOException { - HttpURLConnection conn = (HttpURLConnection) req.url().openConnection(); + private static HttpURLConnection createConnection(Connection.Request req) + throws IOException { + HttpURLConnection conn = (HttpURLConnection) req.url() + .openConnection(); 
conn.setRequestMethod(req.method().name()); - conn.setInstanceFollowRedirects(false); // don't rely on native redirection support + conn.setInstanceFollowRedirects(false); // don't rely on native + // redirection support conn.setConnectTimeout(req.timeout()); conn.setReadTimeout(req.timeout()); - if (req.method() == Method.POST) + if (req.method() == Method.POST) { conn.setDoOutput(true); - if (req.cookies().size() > 0) + } + if (req.cookies().size() > 0) { conn.addRequestProperty("Cookie", getRequestCookieString(req)); + } for (Map.Entry<String, String> header : req.headers().entrySet()) { conn.addRequestProperty(header.getKey(), header.getValue()); } @@ -507,7 +643,8 @@ public class HttpConnection implements Connection { } // set up url, method, header, cookies - private void setupFromConnection(HttpURLConnection conn, Connection.Response previousResponse) throws IOException { + private void setupFromConnection(HttpURLConnection conn, + Connection.Response previousResponse) throws IOException { method = Connection.Method.valueOf(conn.getRequestMethod()); url = conn.getURL(); statusCode = conn.getResponseCode(); @@ -517,11 +654,14 @@ public class HttpConnection implements Connection { Map<String, List<String>> resHeaders = conn.getHeaderFields(); processResponseHeaders(resHeaders); - // if from a redirect, map previous response cookies into this response + // if from a redirect, map previous response cookies into this + // response if (previousResponse != null) { - for (Map.Entry<String, String> prevCookie : previousResponse.cookies().entrySet()) { - if (!hasCookie(prevCookie.getKey())) + for (Map.Entry<String, String> prevCookie : previousResponse + .cookies().entrySet()) { + if (!hasCookie(prevCookie.getKey())) { cookie(prevCookie.getKey(), prevCookie.getValue()); + } } } } @@ -529,86 +669,98 @@ public class HttpConnection implements Connection { void processResponseHeaders(Map<String, List<String>> resHeaders) { for (Map.Entry<String, List<String>> entry : 
resHeaders.entrySet()) { String name = entry.getKey(); - if (name == null) + if (name == null) { continue; // http/1.1 line + } List<String> values = entry.getValue(); if (name.equalsIgnoreCase("Set-Cookie")) { for (String value : values) { - if (value == null) + if (value == null) { continue; + } TokenQueue cd = new TokenQueue(value); String cookieName = cd.chompTo("=").trim(); String cookieVal = cd.consumeTo(";").trim(); - if (cookieVal == null) + if (cookieVal == null) { cookieVal = ""; + } // ignores path, date, domain, secure et al. req'd? // name not blank, value not null - if (cookieName != null && cookieName.length() > 0) + if (cookieName != null && cookieName.length() > 0) { cookie(cookieName, cookieVal); + } } } else { // only take the first instance of each header - if (!values.isEmpty()) + if (!values.isEmpty()) { header(name, values.get(0)); + } } } } - private static void writePost(Collection<Connection.KeyVal> data, OutputStream outputStream) throws IOException { - OutputStreamWriter w = new OutputStreamWriter(outputStream, DataUtil.defaultCharset); + private static void writePost(Collection<Connection.KeyVal> data, + OutputStream outputStream) throws IOException { + OutputStreamWriter w = new OutputStreamWriter(outputStream, + DataUtil.defaultCharset); boolean first = true; for (Connection.KeyVal keyVal : data) { - if (!first) + if (!first) { w.append('&'); - else + } else { first = false; - + } + w.write(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)); w.write('='); - w.write(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset)); + w.write(URLEncoder.encode(keyVal.value(), + DataUtil.defaultCharset)); } w.close(); } - + private static String getRequestCookieString(Connection.Request req) { StringBuilder sb = new StringBuilder(); boolean first = true; for (Map.Entry<String, String> cookie : req.cookies().entrySet()) { - if (!first) + if (!first) { sb.append("; "); - else + } else { first = false; - 
sb.append(cookie.getKey()).append('=').append(cookie.getValue()); - // todo: spec says only ascii, no escaping / encoding defined. validate on set? or escape somehow here? + } + sb.append(cookie.getKey()).append('=') + .append(cookie.getValue()); + // todo: spec says only ascii, no escaping / encoding defined. + // validate on set? or escape somehow here? } return sb.toString(); } // for get url reqs, serialise the data map into the url - private static void serialiseRequestUrl(Connection.Request req) throws IOException { + private static void serialiseRequestUrl(Connection.Request req) + throws IOException { URL in = req.url(); StringBuilder url = new StringBuilder(); boolean first = true; // reconstitute the query, ready for appends - url - .append(in.getProtocol()) - .append("://") - .append(in.getAuthority()) // includes host, port - .append(in.getPath()) - .append("?"); + url.append(in.getProtocol()).append("://") + .append(in.getAuthority()) // includes host, port + .append(in.getPath()).append("?"); if (in.getQuery() != null) { url.append(in.getQuery()); first = false; } for (Connection.KeyVal keyVal : req.data()) { - if (!first) + if (!first) { url.append('&'); - else + } else { first = false; - url - .append(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)) - .append('=') - .append(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset)); + } + url.append( + URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset)) + .append('=') + .append(URLEncoder.encode(keyVal.value(), + DataUtil.defaultCharset)); } req.url(new URL(url.toString())); req.data().clear(); // moved into url as get params @@ -630,22 +782,26 @@ public class HttpConnection implements Connection { this.value = value; } + @Override public KeyVal key(String key) { Validate.notEmpty(key, "Data key must not be empty"); this.key = key; return this; } + @Override public String key() { return key; } + @Override public KeyVal value(String value) { Validate.notNull(value, "Data value must 
not be null"); this.value = value; return this; } + @Override public String value() { return value; } @@ -653,6 +809,6 @@ public class HttpConnection implements Connection { @Override public String toString() { return key + "=" + value; - } + } } } diff --git a/server/src/org/jsoup/helper/StringUtil.java b/server/src/org/jsoup/helper/StringUtil.java index 071a92c7a5..5a3d19b0aa 100644 --- a/server/src/org/jsoup/helper/StringUtil.java +++ b/server/src/org/jsoup/helper/StringUtil.java @@ -8,12 +8,16 @@ import java.util.Iterator; */ public final class StringUtil { // memoised padding up to 10 - private static final String[] padding = {"", " ", " ", " ", " ", " ", " ", " ", " ", " ", " "}; + private static final String[] padding = { "", " ", " ", " ", " ", + " ", " ", " ", " ", " ", " " }; /** * Join a collection of strings by a seperator - * @param strings collection of string objects - * @param sep string to place between strings + * + * @param strings + * collection of string objects + * @param sep + * string to place between strings * @return joined string */ public static String join(Collection strings, String sep) { @@ -22,17 +26,22 @@ public final class StringUtil { /** * Join a collection of strings by a seperator - * @param strings iterator of string objects - * @param sep string to place between strings + * + * @param strings + * iterator of string objects + * @param sep + * string to place between strings * @return joined string */ public static String join(Iterator strings, String sep) { - if (!strings.hasNext()) + if (!strings.hasNext()) { return ""; + } String start = strings.next().toString(); - if (!strings.hasNext()) // only one, avoid builder + if (!strings.hasNext()) { return start; + } StringBuilder sb = new StringBuilder(64).append(start); while (strings.hasNext()) { @@ -44,62 +53,79 @@ public final class StringUtil { /** * Returns space padding - * @param width amount of padding desired + * + * @param width + * amount of padding desired * @return 
string of spaces * width */ public static String padding(int width) { - if (width < 0) + if (width < 0) { throw new IllegalArgumentException("width must be > 0"); + } - if (width < padding.length) + if (width < padding.length) { return padding[width]; + } char[] out = new char[width]; - for (int i = 0; i < width; i++) + for (int i = 0; i < width; i++) { out[i] = ' '; + } return String.valueOf(out); } /** - * Tests if a string is blank: null, emtpy, or only whitespace (" ", \r\n, \t, etc) - * @param string string to test + * Tests if a string is blank: null, emtpy, or only whitespace (" ", \r\n, + * \t, etc) + * + * @param string + * string to test * @return if string is blank */ public static boolean isBlank(String string) { - if (string == null || string.length() == 0) + if (string == null || string.length() == 0) { return true; + } int l = string.length(); for (int i = 0; i < l; i++) { - if (!StringUtil.isWhitespace(string.codePointAt(i))) + if (!StringUtil.isWhitespace(string.codePointAt(i))) { return false; + } } return true; } /** * Tests if a string is numeric, i.e. contains only digit characters - * @param string string to test - * @return true if only digit chars, false if empty or null or contains non-digit chrs + * + * @param string + * string to test + * @return true if only digit chars, false if empty or null or contains + * non-digit chrs */ public static boolean isNumeric(String string) { - if (string == null || string.length() == 0) + if (string == null || string.length() == 0) { return false; + } int l = string.length(); for (int i = 0; i < l; i++) { - if (!Character.isDigit(string.codePointAt(i))) + if (!Character.isDigit(string.codePointAt(i))) { return false; + } } return true; } /** * Tests if a code point is "whitespace" as defined in the HTML spec. 
- * @param c code point to test + * + * @param c + * code point to test * @return true if code point is whitespace, false otherwise */ - public static boolean isWhitespace(int c){ + public static boolean isWhitespace(int c) { return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'; } @@ -117,12 +143,12 @@ public final class StringUtil { modified = true; continue; } - if (c != ' ') + if (c != ' ') { modified = true; + } sb.append(' '); lastWasWhite = true; - } - else { + } else { sb.appendCodePoint(c); lastWasWhite = false; } @@ -132,8 +158,9 @@ public final class StringUtil { public static boolean in(String needle, String... haystack) { for (String hay : haystack) { - if (hay.equals(needle)) - return true; + if (hay.equals(needle)) { + return true; + } } return false; } diff --git a/server/src/org/jsoup/helper/Validate.java b/server/src/org/jsoup/helper/Validate.java index 814bcc3a40..e9fe04f87b 100644 --- a/server/src/org/jsoup/helper/Validate.java +++ b/server/src/org/jsoup/helper/Validate.java @@ -4,69 +4,93 @@ package org.jsoup.helper; * Simple validation methods. 
Designed for jsoup internal use */ public final class Validate { - - private Validate() {} + + private Validate() { + } /** * Validates that the object is not null - * @param obj object to test + * + * @param obj + * object to test */ public static void notNull(Object obj) { - if (obj == null) + if (obj == null) { throw new IllegalArgumentException("Object must not be null"); + } } /** * Validates that the object is not null - * @param obj object to test - * @param msg message to output if validation fails + * + * @param obj + * object to test + * @param msg + * message to output if validation fails */ public static void notNull(Object obj, String msg) { - if (obj == null) + if (obj == null) { throw new IllegalArgumentException(msg); + } } /** * Validates that the value is true - * @param val object to test + * + * @param val + * object to test */ public static void isTrue(boolean val) { - if (!val) + if (!val) { throw new IllegalArgumentException("Must be true"); + } } /** * Validates that the value is true - * @param val object to test - * @param msg message to output if validation fails + * + * @param val + * object to test + * @param msg + * message to output if validation fails */ public static void isTrue(boolean val, String msg) { - if (!val) + if (!val) { throw new IllegalArgumentException(msg); + } } /** * Validates that the value is false - * @param val object to test + * + * @param val + * object to test */ public static void isFalse(boolean val) { - if (val) + if (val) { throw new IllegalArgumentException("Must be false"); + } } /** * Validates that the value is false - * @param val object to test - * @param msg message to output if validation fails + * + * @param val + * object to test + * @param msg + * message to output if validation fails */ public static void isFalse(boolean val, String msg) { - if (val) + if (val) { throw new IllegalArgumentException(msg); + } } /** * Validates that the array contains no null elements - * @param objects the array 
to test + * + * @param objects + * the array to test */ public static void noNullElements(Object[] objects) { noNullElements(objects, "Array must not contain any null objects"); @@ -74,37 +98,51 @@ public final class Validate { /** * Validates that the array contains no null elements - * @param objects the array to test - * @param msg message to output if validation fails + * + * @param objects + * the array to test + * @param msg + * message to output if validation fails */ public static void noNullElements(Object[] objects, String msg) { - for (Object obj : objects) - if (obj == null) + for (Object obj : objects) { + if (obj == null) { throw new IllegalArgumentException(msg); + } + } } /** * Validates that the string is not empty - * @param string the string to test + * + * @param string + * the string to test */ public static void notEmpty(String string) { - if (string == null || string.length() == 0) + if (string == null || string.length() == 0) { throw new IllegalArgumentException("String must not be empty"); + } } /** * Validates that the string is not empty - * @param string the string to test - * @param msg message to output if validation fails + * + * @param string + * the string to test + * @param msg + * message to output if validation fails */ public static void notEmpty(String string, String msg) { - if (string == null || string.length() == 0) + if (string == null || string.length() == 0) { throw new IllegalArgumentException(msg); + } } /** - Cause a failure. - @param msg message to output. + * Cause a failure. + * + * @param msg + * message to output. 
*/ public static void fail(String msg) { throw new IllegalArgumentException(msg); diff --git a/server/src/org/jsoup/nodes/Attribute.java b/server/src/org/jsoup/nodes/Attribute.java index 02eb29db83..5f27b4fcc4 100644 --- a/server/src/org/jsoup/nodes/Attribute.java +++ b/server/src/org/jsoup/nodes/Attribute.java @@ -1,21 +1,26 @@ package org.jsoup.nodes; -import org.jsoup.helper.Validate; - import java.util.Map; -/** - A single key + value attribute. Keys are trimmed and normalised to lower-case. +import org.jsoup.helper.Validate; - @author Jonathan Hedley, jonathan@hedley.net */ -public class Attribute implements Map.Entry<String, String>, Cloneable { +/** + * A single key + value attribute. Keys are trimmed and normalised to + * lower-case. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ +public class Attribute implements Map.Entry<String, String>, Cloneable { private String key; private String value; /** * Create a new attribute from unencoded (raw) key and value. - * @param key attribute key - * @param value attribute value + * + * @param key + * attribute key + * @param value + * attribute value * @see #createFromEncoded */ public Attribute(String key, String value) { @@ -26,16 +31,20 @@ public class Attribute implements Map.Entry<String, String>, Cloneable { } /** - Get the attribute key. - @return the attribute key + * Get the attribute key. + * + * @return the attribute key */ + @Override public String getKey() { return key; } /** - Set the attribute key. Gets normalised as per the constructor method. - @param key the new key; must not be null + * Set the attribute key. Gets normalised as per the constructor method. + * + * @param key + * the new key; must not be null */ public void setKey(String key) { Validate.notEmpty(key); @@ -43,17 +52,22 @@ public class Attribute implements Map.Entry<String, String>, Cloneable { } /** - Get the attribute value. - @return the attribute value + * Get the attribute value. 
+ * + * @return the attribute value */ + @Override public String getValue() { return value; } /** - Set the attribute value. - @param value the new attribute value; must not be null + * Set the attribute value. + * + * @param value + * the new attribute value; must not be null */ + @Override public String setValue(String value) { Validate.notNull(value); String old = this.value; @@ -62,53 +76,73 @@ public class Attribute implements Map.Entry<String, String>, Cloneable { } /** - Get the HTML representation of this attribute; e.g. {@code href="index.html"}. - @return HTML + * Get the HTML representation of this attribute; e.g. + * {@code href="index.html"}. + * + * @return HTML */ public String html() { - return key + "=\"" + Entities.escape(value, (new Document("")).outputSettings()) + "\""; + return key + "=\"" + + Entities.escape(value, (new Document("")).outputSettings()) + + "\""; } - + protected void html(StringBuilder accum, Document.OutputSettings out) { - accum - .append(key) - .append("=\"") - .append(Entities.escape(value, out)) - .append("\""); + accum.append(key).append("=\"").append(Entities.escape(value, out)) + .append("\""); } /** - Get the string representation of this attribute, implemented as {@link #html()}. - @return string + * Get the string representation of this attribute, implemented as + * {@link #html()}. + * + * @return string */ + @Override public String toString() { return html(); } /** - * Create a new Attribute from an unencoded key and a HTML attribute encoded value. - * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars. - * @param encodedValue HTML attribute encoded value + * Create a new Attribute from an unencoded key and a HTML attribute encoded + * value. + * + * @param unencodedKey + * assumes the key is not encoded, as can be only run of simple + * \w chars. 
+ * @param encodedValue + * HTML attribute encoded value * @return attribute */ - public static Attribute createFromEncoded(String unencodedKey, String encodedValue) { + public static Attribute createFromEncoded(String unencodedKey, + String encodedValue) { String value = Entities.unescape(encodedValue, true); return new Attribute(unencodedKey, value); } protected boolean isDataAttribute() { - return key.startsWith(Attributes.dataPrefix) && key.length() > Attributes.dataPrefix.length(); + return key.startsWith(Attributes.dataPrefix) + && key.length() > Attributes.dataPrefix.length(); } @Override public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof Attribute)) return false; + if (this == o) { + return true; + } + if (!(o instanceof Attribute)) { + return false; + } Attribute attribute = (Attribute) o; - if (key != null ? !key.equals(attribute.key) : attribute.key != null) return false; - if (value != null ? !value.equals(attribute.value) : attribute.value != null) return false; + if (key != null ? !key.equals(attribute.key) : attribute.key != null) { + return false; + } + if (value != null ? 
!value.equals(attribute.value) + : attribute.value != null) { + return false; + } return true; } @@ -123,7 +157,9 @@ public class Attribute implements Map.Entry<String, String>, Cloneable { @Override public Attribute clone() { try { - return (Attribute) super.clone(); // only fields are immutable strings key and value, so no more deep copy required + return (Attribute) super.clone(); // only fields are immutable + // strings key and value, so no + // more deep copy required } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } diff --git a/server/src/org/jsoup/nodes/Attributes.java b/server/src/org/jsoup/nodes/Attributes.java index 9436750fc9..8757d1bf97 100644 --- a/server/src/org/jsoup/nodes/Attributes.java +++ b/server/src/org/jsoup/nodes/Attributes.java @@ -1,46 +1,63 @@ package org.jsoup.nodes; -import org.jsoup.helper.Validate; +import java.util.AbstractMap; +import java.util.AbstractSet; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; -import java.util.*; +import org.jsoup.helper.Validate; /** * The attributes of an Element. * <p/> - * Attributes are treated as a map: there can be only one value associated with an attribute key. + * Attributes are treated as a map: there can be only one value associated with + * an attribute key. * <p/> - * Attribute key and value comparisons are done case insensitively, and keys are normalised to - * lower-case. + * Attribute key and value comparisons are done case insensitively, and keys are + * normalised to lower-case. * * @author Jonathan Hedley, jonathan@hedley.net */ public class Attributes implements Iterable<Attribute>, Cloneable { protected static final String dataPrefix = "data-"; - + private LinkedHashMap<String, Attribute> attributes = null; + // linked hash map to preserve insertion order. 
- // null be default as so many elements have no attributes -- saves a good chunk of memory + // null be default as so many elements have no attributes -- saves a good + // chunk of memory /** - Get an attribute value by key. - @param key the attribute key - @return the attribute value if set; or empty string if not set. - @see #hasKey(String) + * Get an attribute value by key. + * + * @param key + * the attribute key + * @return the attribute value if set; or empty string if not set. + * @see #hasKey(String) */ public String get(String key) { Validate.notEmpty(key); - if (attributes == null) + if (attributes == null) { return ""; + } Attribute attr = attributes.get(key.toLowerCase()); return attr != null ? attr.getValue() : ""; } /** - Set a new attribute, or replace an existing one by key. - @param key attribute key - @param value attribute value + * Set a new attribute, or replace an existing one by key. + * + * @param key + * attribute key + * @param value + * attribute value */ public void put(String key, String value) { Attribute attr = new Attribute(key, value); @@ -48,70 +65,88 @@ public class Attributes implements Iterable<Attribute>, Cloneable { } /** - Set a new attribute, or replace an existing one by key. - @param attribute attribute + * Set a new attribute, or replace an existing one by key. + * + * @param attribute + * attribute */ public void put(Attribute attribute) { Validate.notNull(attribute); - if (attributes == null) - attributes = new LinkedHashMap<String, Attribute>(2); + if (attributes == null) { + attributes = new LinkedHashMap<String, Attribute>(2); + } attributes.put(attribute.getKey(), attribute); } /** - Remove an attribute by key. - @param key attribute key to remove + * Remove an attribute by key. 
+ * + * @param key + * attribute key to remove */ public void remove(String key) { Validate.notEmpty(key); - if (attributes == null) + if (attributes == null) { return; + } attributes.remove(key.toLowerCase()); } /** - Tests if these attributes contain an attribute with this key. - @param key key to check for - @return true if key exists, false otherwise + * Tests if these attributes contain an attribute with this key. + * + * @param key + * key to check for + * @return true if key exists, false otherwise */ public boolean hasKey(String key) { return attributes != null && attributes.containsKey(key.toLowerCase()); } /** - Get the number of attributes in this set. - @return size + * Get the number of attributes in this set. + * + * @return size */ public int size() { - if (attributes == null) + if (attributes == null) { return 0; + } return attributes.size(); } /** - Add all the attributes from the incoming set to this set. - @param incoming attributes to add to these attributes. + * Add all the attributes from the incoming set to this set. + * + * @param incoming + * attributes to add to these attributes. */ public void addAll(Attributes incoming) { - if (incoming.size() == 0) + if (incoming.size() == 0) { return; - if (attributes == null) + } + if (attributes == null) { attributes = new LinkedHashMap<String, Attribute>(incoming.size()); + } attributes.putAll(incoming.attributes); } - + + @Override public Iterator<Attribute> iterator() { return asList().iterator(); } /** - Get the attributes as a List, for iteration. Do not modify the keys of the attributes via this view, as changes - to keys will not be recognised in the containing set. - @return an view of the attributes as a List. + * Get the attributes as a List, for iteration. Do not modify the keys of + * the attributes via this view, as changes to keys will not be recognised + * in the containing set. + * + * @return an view of the attributes as a List. 
*/ public List<Attribute> asList() { - if (attributes == null) + if (attributes == null) { return Collections.emptyList(); + } List<Attribute> list = new ArrayList<Attribute>(attributes.size()); for (Map.Entry<String, Attribute> entry : attributes.entrySet()) { @@ -121,8 +156,9 @@ public class Attributes implements Iterable<Attribute>, Cloneable { } /** - * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys - * starting with {@code data-}. + * Retrieves a filtered view of attributes that are HTML5 custom data + * attributes; that is, attributes with keys starting with {@code data-}. + * * @return map of custom data attributes. */ public Map<String, String> dataset() { @@ -130,42 +166,54 @@ public class Attributes implements Iterable<Attribute>, Cloneable { } /** - Get the HTML representation of these attributes. - @return HTML + * Get the HTML representation of these attributes. + * + * @return HTML */ public String html() { StringBuilder accum = new StringBuilder(); - html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used + html(accum, (new Document("")).outputSettings()); // output settings a + // bit funky, but this + // html() seldom used return accum.toString(); } - + void html(StringBuilder accum, Document.OutputSettings out) { - if (attributes == null) + if (attributes == null) { return; - + } + for (Map.Entry<String, Attribute> entry : attributes.entrySet()) { Attribute attribute = entry.getValue(); accum.append(" "); attribute.html(accum, out); } } - + + @Override public String toString() { return html(); } - + @Override public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof Attributes)) return false; - + if (this == o) { + return true; + } + if (!(o instanceof Attributes)) { + return false; + } + Attributes that = (Attributes) o; - - if (attributes != null ? 
!attributes.equals(that.attributes) : that.attributes != null) return false; - + + if (attributes != null ? !attributes.equals(that.attributes) + : that.attributes != null) { + return false; + } + return true; } - + @Override public int hashCode() { return attributes != null ? attributes.hashCode() : 0; @@ -173,8 +221,9 @@ public class Attributes implements Iterable<Attribute>, Cloneable { @Override public Attributes clone() { - if (attributes == null) + if (attributes == null) { return new Attributes(); + } Attributes clone; try { @@ -182,19 +231,23 @@ public class Attributes implements Iterable<Attribute>, Cloneable { } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } - clone.attributes = new LinkedHashMap<String, Attribute>(attributes.size()); - for (Attribute attribute: this) + clone.attributes = new LinkedHashMap<String, Attribute>( + attributes.size()); + for (Attribute attribute : this) { clone.attributes.put(attribute.getKey(), attribute.clone()); + } return clone; } private class Dataset extends AbstractMap<String, String> { private Dataset() { - if (attributes == null) + if (attributes == null) { attributes = new LinkedHashMap<String, Attribute>(2); + } } + @Override public Set<Entry<String, String>> entrySet() { return new EntrySet(); } @@ -202,41 +255,54 @@ public class Attributes implements Iterable<Attribute>, Cloneable { @Override public String put(String key, String value) { String dataKey = dataKey(key); - String oldValue = hasKey(dataKey) ? attributes.get(dataKey).getValue() : null; + String oldValue = hasKey(dataKey) ? 
attributes.get(dataKey) + .getValue() : null; Attribute attr = new Attribute(dataKey, value); attributes.put(dataKey, attr); return oldValue; } private class EntrySet extends AbstractSet<Map.Entry<String, String>> { + @Override public Iterator<Map.Entry<String, String>> iterator() { return new DatasetIterator(); } + @Override public int size() { int count = 0; Iterator iter = new DatasetIterator(); - while (iter.hasNext()) + while (iter.hasNext()) { count++; + } return count; } } - private class DatasetIterator implements Iterator<Map.Entry<String, String>> { - private Iterator<Attribute> attrIter = attributes.values().iterator(); + private class DatasetIterator implements + Iterator<Map.Entry<String, String>> { + private Iterator<Attribute> attrIter = attributes.values() + .iterator(); private Attribute attr; + + @Override public boolean hasNext() { while (attrIter.hasNext()) { attr = attrIter.next(); - if (attr.isDataAttribute()) return true; + if (attr.isDataAttribute()) { + return true; + } } return false; } + @Override public Entry<String, String> next() { - return new Attribute(attr.getKey().substring(dataPrefix.length()), attr.getValue()); + return new Attribute(attr.getKey().substring( + dataPrefix.length()), attr.getValue()); } + @Override public void remove() { attributes.remove(attr.getKey()); } diff --git a/server/src/org/jsoup/nodes/Comment.java b/server/src/org/jsoup/nodes/Comment.java index 37fd4368fa..6abe0e3066 100644 --- a/server/src/org/jsoup/nodes/Comment.java +++ b/server/src/org/jsoup/nodes/Comment.java @@ -1,45 +1,55 @@ package org.jsoup.nodes; /** - A comment node. - - @author Jonathan Hedley, jonathan@hedley.net */ + * A comment node. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ public class Comment extends Node { private static final String COMMENT_KEY = "comment"; /** - Create a new comment node. - @param data The contents of the comment - @param baseUri base URI + * Create a new comment node. 
+ * + * @param data + * The contents of the comment + * @param baseUri + * base URI */ public Comment(String data, String baseUri) { super(baseUri); attributes.put(COMMENT_KEY, data); } + @Override public String nodeName() { return "#comment"; } /** - Get the contents of the comment. - @return comment content + * Get the contents of the comment. + * + * @return comment content */ public String getData() { return attributes.get(COMMENT_KEY); } - void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { - if (out.prettyPrint()) + @Override + void outerHtmlHead(StringBuilder accum, int depth, + Document.OutputSettings out) { + if (out.prettyPrint()) { indent(accum, depth, out); - accum - .append("<!--") - .append(getData()) - .append("-->"); + } + accum.append("<!--").append(getData()).append("-->"); } - void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} + @Override + void outerHtmlTail(StringBuilder accum, int depth, + Document.OutputSettings out) { + } + @Override public String toString() { return outerHtml(); } diff --git a/server/src/org/jsoup/nodes/DataNode.java b/server/src/org/jsoup/nodes/DataNode.java index a64f56f0a4..cc377a4cc8 100644 --- a/server/src/org/jsoup/nodes/DataNode.java +++ b/server/src/org/jsoup/nodes/DataNode.java @@ -1,29 +1,37 @@ package org.jsoup.nodes; /** - A data node, for contents of style, script tags etc, where contents should not show in text(). - - @author Jonathan Hedley, jonathan@hedley.net */ -public class DataNode extends Node{ + * A data node, for contents of style, script tags etc, where contents should + * not show in text(). + * + * @author Jonathan Hedley, jonathan@hedley.net + */ +public class DataNode extends Node { private static final String DATA_KEY = "data"; /** - Create a new DataNode. - @param data data contents - @param baseUri base URI + * Create a new DataNode. 
+ * + * @param data + * data contents + * @param baseUri + * base URI */ public DataNode(String data, String baseUri) { super(baseUri); attributes.put(DATA_KEY, data); } + @Override public String nodeName() { return "#data"; } /** - Get the data contents of this node. Will be unescaped and with original new lines, space etc. - @return data + * Get the data contents of this node. Will be unescaped and with original + * new lines, space etc. + * + * @return data */ public String getWholeData() { return attributes.get(DATA_KEY); @@ -31,7 +39,9 @@ public class DataNode extends Node{ /** * Set the data contents of this node. - * @param data unencoded data + * + * @param data + * unencoded data * @return this node, for chaining */ public DataNode setWholeData(String data) { @@ -39,21 +49,31 @@ public class DataNode extends Node{ return this; } - void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { - accum.append(getWholeData()); // data is not escaped in return from data nodes, so " in script, style is plain + @Override + void outerHtmlHead(StringBuilder accum, int depth, + Document.OutputSettings out) { + accum.append(getWholeData()); // data is not escaped in return from data + // nodes, so " in script, style is plain } - void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} + @Override + void outerHtmlTail(StringBuilder accum, int depth, + Document.OutputSettings out) { + } + @Override public String toString() { return outerHtml(); } /** - Create a new DataNode from HTML encoded data. - @param encodedData encoded data - @param baseUri bass URI - @return new DataNode + * Create a new DataNode from HTML encoded data. 
+ * + * @param encodedData + * encoded data + * @param baseUri + * bass URI + * @return new DataNode */ public static DataNode createFromEncoded(String encodedData, String baseUri) { String data = Entities.unescape(encodedData); diff --git a/server/src/org/jsoup/nodes/Document.java b/server/src/org/jsoup/nodes/Document.java index adb371ce14..f1c4595faa 100644 --- a/server/src/org/jsoup/nodes/Document.java +++ b/server/src/org/jsoup/nodes/Document.java @@ -1,36 +1,42 @@ package org.jsoup.nodes; -import org.jsoup.helper.Validate; -import org.jsoup.parser.Tag; -import org.jsoup.select.Elements; - import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.ArrayList; import java.util.List; -/** - A HTML Document. +import org.jsoup.helper.Validate; +import org.jsoup.parser.Tag; +import org.jsoup.select.Elements; - @author Jonathan Hedley, jonathan@hedley.net */ +/** + * A HTML Document. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ public class Document extends Element { private OutputSettings outputSettings = new OutputSettings(); private QuirksMode quirksMode = QuirksMode.noQuirks; /** - Create a new, empty Document. - @param baseUri base URI of document - @see org.jsoup.Jsoup#parse - @see #createShell + * Create a new, empty Document. + * + * @param baseUri + * base URI of document + * @see org.jsoup.Jsoup#parse + * @see #createShell */ public Document(String baseUri) { super(Tag.valueOf("#root"), baseUri); } /** - Create a valid, empty shell of a document, suitable for adding more elements to. - @param baseUri baseUri of document - @return document with html, head, and body elements. + * Create a valid, empty shell of a document, suitable for adding more + * elements to. + * + * @param baseUri + * baseUri of document + * @return document with html, head, and body elements. 
*/ static public Document createShell(String baseUri) { Validate.notNull(baseUri); @@ -44,24 +50,27 @@ public class Document extends Element { } /** - Accessor to the document's {@code head} element. - @return {@code head} + * Accessor to the document's {@code head} element. + * + * @return {@code head} */ public Element head() { return findFirstElementByTagName("head", this); } /** - Accessor to the document's {@code body} element. - @return {@code body} + * Accessor to the document's {@code body} element. + * + * @return {@code body} */ public Element body() { return findFirstElementByTagName("body", this); } /** - Get the string contents of the document's {@code title} element. - @return Trimmed title, or empty string if none set. + * Get the string contents of the document's {@code title} element. + * + * @return Trimmed title, or empty string if none set. */ public String title() { Element titleEl = getElementsByTag("title").first(); @@ -69,9 +78,11 @@ public class Document extends Element { } /** - Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if - not present - @param title string to set as title + * Set the document's {@code title} element. Updates the existing element, + * or adds {@code title} to {@code head} if not present + * + * @param title + * string to set as title */ public void title(String title) { Validate.notNull(title); @@ -84,29 +95,38 @@ public class Document extends Element { } /** - Create a new Element, with this document's base uri. Does not make the new element a child of this document. - @param tagName element tag name (e.g. {@code a}) - @return new element + * Create a new Element, with this document's base uri. Does not make the + * new element a child of this document. + * + * @param tagName + * element tag name (e.g. 
{@code a}) + * @return new element */ public Element createElement(String tagName) { - return new Element(Tag.valueOf(tagName), this.baseUri()); + return new Element(Tag.valueOf(tagName), baseUri()); } /** - Normalise the document. This happens after the parse phase so generally does not need to be called. - Moves any text content that is not in the body element into the body. - @return this document after normalisation + * Normalise the document. This happens after the parse phase so generally + * does not need to be called. Moves any text content that is not in the + * body element into the body. + * + * @return this document after normalisation */ public Document normalise() { Element htmlEl = findFirstElementByTagName("html", this); - if (htmlEl == null) + if (htmlEl == null) { htmlEl = appendElement("html"); - if (head() == null) + } + if (head() == null) { htmlEl.prependElement("head"); - if (body() == null) + } + if (body() == null) { htmlEl.appendElement("body"); + } - // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care + // pull text nodes out of root, html, and head els, and push into body. + // non-text nodes are already taken care // of. do in inverse order to maintain text order. normaliseTextNodes(head()); normaliseTextNodes(htmlEl); @@ -114,22 +134,23 @@ public class Document extends Element { normaliseStructure("head", htmlEl); normaliseStructure("body", htmlEl); - + return this; } // does not recurse. 
private void normaliseTextNodes(Element element) { List<Node> toMove = new ArrayList<Node>(); - for (Node node: element.childNodes) { + for (Node node : element.childNodes) { if (node instanceof TextNode) { TextNode tn = (TextNode) node; - if (!tn.isBlank()) + if (!tn.isBlank()) { toMove.add(tn); + } } } - for (int i = toMove.size()-1; i >= 0; i--) { + for (int i = toMove.size() - 1; i >= 0; i--) { Node node = toMove.get(i); element.removeChild(node); body().prependChild(new TextNode(" ", "")); @@ -137,37 +158,42 @@ public class Document extends Element { } } - // merge multiple <head> or <body> contents into one, delete the remainder, and ensure they are owned by <html> + // merge multiple <head> or <body> contents into one, delete the remainder, + // and ensure they are owned by <html> private void normaliseStructure(String tag, Element htmlEl) { - Elements elements = this.getElementsByTag(tag); - Element master = elements.first(); // will always be available as created above if not existent + Elements elements = getElementsByTag(tag); + Element master = elements.first(); // will always be available as + // created above if not existent if (elements.size() > 1) { // dupes, move contents to master List<Node> toMove = new ArrayList<Node>(); for (int i = 1; i < elements.size(); i++) { Node dupe = elements.get(i); - for (Node node : dupe.childNodes) + for (Node node : dupe.childNodes) { toMove.add(node); + } dupe.remove(); } - for (Node dupe : toMove) + for (Node dupe : toMove) { master.appendChild(dupe); + } } // ensure parented by <html> if (!master.parent().equals(htmlEl)) { - htmlEl.appendChild(master); // includes remove() + htmlEl.appendChild(master); // includes remove() } } // fast method to get first by tag name, used for html, head, body finders private Element findFirstElementByTagName(String tag, Node node) { - if (node.nodeName().equals(tag)) + if (node.nodeName().equals(tag)) { return (Element) node; - else { - for (Node child: node.childNodes) { + } 
else { + for (Node child : node.childNodes) { Element found = findFirstElementByTagName(tag, child); - if (found != null) + if (found != null) { return found; + } } } return null; @@ -179,9 +205,12 @@ public class Document extends Element { } /** - Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. - @param text unencoded text - @return this document + * Set the text of the {@code body} of this document. Any existing nodes + * within the body will be cleared. + * + * @param text + * unencoded text + * @return this document */ @Override public Element text(String text) { @@ -197,12 +226,13 @@ public class Document extends Element { @Override public Document clone() { Document clone = (Document) super.clone(); - clone.outputSettings = this.outputSettings.clone(); + clone.outputSettings = outputSettings.clone(); return clone; } /** - * A Document's output settings control the form of the text() and html() methods. + * A Document's output settings control the form of the text() and html() + * methods. */ public static class OutputSettings implements Cloneable { private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; @@ -211,14 +241,18 @@ public class Document extends Element { private boolean prettyPrint = true; private int indentAmount = 1; - public OutputSettings() {} + public OutputSettings() { + } /** - * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML - * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>, - * which uses the complete set of HTML named entities. + * Get the document's current HTML escape mode: <code>base</code>, which + * provides a limited set of named HTML entities and escapes other + * characters as numbered entities for maximum compatibility; or + * <code>extended</code>, which uses the complete set of HTML named + * entities. 
* <p> * The default escape mode is <code>base</code>. + * * @return the document's current escape mode */ public Entities.EscapeMode escapeMode() { @@ -227,7 +261,9 @@ public class Document extends Element { /** * Set the document's escape mode - * @param escapeMode the new escape mode to use + * + * @param escapeMode + * the new escape mode to use * @return the document's output settings, for chaining */ public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { @@ -236,11 +272,14 @@ public class Document extends Element { } /** - * Get the document's current output charset, which is used to control which characters are escaped when - * generating HTML (via the <code>html()</code> methods), and which are kept intact. + * Get the document's current output charset, which is used to control + * which characters are escaped when generating HTML (via the + * <code>html()</code> methods), and which are kept intact. * <p> - * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the - * input charset. Otherwise, it defaults to UTF-8. + * Where possible (when parsing from a URL or File), the document's + * output charset is automatically set to the input charset. Otherwise, + * it defaults to UTF-8. + * * @return the document's current charset. */ public Charset charset() { @@ -249,7 +288,9 @@ public class Document extends Element { /** * Update the document's output charset. - * @param charset the new charset to use. + * + * @param charset + * the new charset to use. * @return the document's output settings, for chaining */ public OutputSettings charset(Charset charset) { @@ -261,7 +302,9 @@ public class Document extends Element { /** * Update the document's output charset. - * @param charset the new charset (by name) to use. + * + * @param charset + * the new charset (by name) to use. 
* @return the document's output settings, for chaining */ public OutputSettings charset(String charset) { @@ -274,8 +317,10 @@ public class Document extends Element { } /** - * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format - * the output, and the output will generally look like the input. + * Get if pretty printing is enabled. Default is true. If disabled, the + * HTML output methods will not re-format the output, and the output + * will generally look like the input. + * * @return if pretty printing is enabled. */ public boolean prettyPrint() { @@ -284,7 +329,9 @@ public class Document extends Element { /** * Enable or disable pretty printing. - * @param pretty new pretty print setting + * + * @param pretty + * new pretty print setting * @return this, for chaining */ public OutputSettings prettyPrint(boolean pretty) { @@ -294,6 +341,7 @@ public class Document extends Element { /** * Get the current tag indent amount, used when pretty printing. + * * @return the current indent amount */ public int indentAmount() { @@ -302,7 +350,10 @@ public class Document extends Element { /** * Set the indent amount for pretty printing - * @param indentAmount number of spaces to use for indenting each level. Must be >= 0. + * + * @param indentAmount + * number of spaces to use for indenting each level. Must be + * >= 0. * @return this, for chaining */ public OutputSettings indentAmount(int indentAmount) { @@ -321,13 +372,15 @@ public class Document extends Element { } clone.charset(charset.name()); // new charset and charset encoder clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); - // indentAmount, prettyPrint are primitives so object.clone() will handle + // indentAmount, prettyPrint are primitives so object.clone() will + // handle return clone; } } /** * Get the document's current output settings. + * * @return the document's current output settings. 
*/ public OutputSettings outputSettings() { @@ -347,4 +400,3 @@ public class Document extends Element { return this; } } - diff --git a/server/src/org/jsoup/nodes/DocumentType.java b/server/src/org/jsoup/nodes/DocumentType.java index f8c79f0d18..13ff78dc8b 100644 --- a/server/src/org/jsoup/nodes/DocumentType.java +++ b/server/src/org/jsoup/nodes/DocumentType.java @@ -11,12 +11,18 @@ public class DocumentType extends Node { /** * Create a new doctype element. - * @param name the doctype's name - * @param publicId the doctype's public ID - * @param systemId the doctype's system ID - * @param baseUri the doctype's base URI + * + * @param name + * the doctype's name + * @param publicId + * the doctype's public ID + * @param systemId + * the doctype's system ID + * @param baseUri + * the doctype's base URI */ - public DocumentType(String name, String publicId, String systemId, String baseUri) { + public DocumentType(String name, String publicId, String systemId, + String baseUri) { super(baseUri); Validate.notEmpty(name); @@ -31,16 +37,20 @@ public class DocumentType extends Node { } @Override - void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { + void outerHtmlHead(StringBuilder accum, int depth, + Document.OutputSettings out) { accum.append("<!DOCTYPE ").append(attr("name")); - if (!StringUtil.isBlank(attr("publicId"))) + if (!StringUtil.isBlank(attr("publicId"))) { accum.append(" PUBLIC \"").append(attr("publicId")).append("\""); - if (!StringUtil.isBlank(attr("systemId"))) + } + if (!StringUtil.isBlank(attr("systemId"))) { accum.append(" \"").append(attr("systemId")).append("\""); + } accum.append('>'); } @Override - void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) { + void outerHtmlTail(StringBuilder accum, int depth, + Document.OutputSettings out) { } } diff --git a/server/src/org/jsoup/nodes/Element.java b/server/src/org/jsoup/nodes/Element.java index 5c1894c934..ff9e68b962 100644 --- 
a/server/src/org/jsoup/nodes/Element.java +++ b/server/src/org/jsoup/nodes/Element.java @@ -1,5 +1,15 @@ package org.jsoup.nodes; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + import org.jsoup.helper.StringUtil; import org.jsoup.helper.Validate; import org.jsoup.parser.Parser; @@ -9,44 +19,46 @@ import org.jsoup.select.Elements; import org.jsoup.select.Evaluator; import org.jsoup.select.Selector; -import java.util.*; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - /** - * A HTML element consists of a tag name, attributes, and child nodes (including text nodes and - * other elements). + * A HTML element consists of a tag name, attributes, and child nodes (including + * text nodes and other elements). * - * From an Element, you can extract data, traverse the node graph, and manipulate the HTML. + * From an Element, you can extract data, traverse the node graph, and + * manipulate the HTML. * * @author Jonathan Hedley, jonathan@hedley.net */ public class Element extends Node { private Tag tag; private Set<String> classNames; - + /** * Create a new, standalone Element. (Standalone in that is has no parent.) * - * @param tag tag of this element - * @param baseUri the base URI - * @param attributes initial attributes + * @param tag + * tag of this element + * @param baseUri + * the base URI + * @param attributes + * initial attributes * @see #appendChild(Node) * @see #appendElement(String) */ public Element(Tag tag, String baseUri, Attributes attributes) { super(baseUri, attributes); - - Validate.notNull(tag); + + Validate.notNull(tag); this.tag = tag; } - + /** * Create a new Element from a tag and a base URI. * - * @param tag element tag - * @param baseUri the base URI of this element. 
It is acceptable for the base URI to be an empty - * string, but not null. + * @param tag + * element tag + * @param baseUri + * the base URI of this element. It is acceptable for the base + * URI to be an empty string, but not null. * @see Tag#valueOf(String) */ public Element(Tag tag, String baseUri) { @@ -68,10 +80,11 @@ public class Element extends Node { } /** - * Change the tag of this element. For example, convert a {@code <span>} to a {@code <div>} with - * {@code el.tagName("div");}. - * - * @param tagName new tag name for this element + * Change the tag of this element. For example, convert a {@code <span>} to + * a {@code <div>} with {@code el.tagName("div");}. + * + * @param tagName + * new tag name for this element * @return this element, for chaining */ public Element tagName(String tagName) { @@ -88,10 +101,10 @@ public class Element extends Node { public Tag tag() { return tag; } - + /** - * Test if this element is a block-level element. (E.g. {@code <div> == true} or an inline element - * {@code <p> == false}). + * Test if this element is a block-level element. (E.g. + * {@code <div> == true} or an inline element {@code <p> == false}). * * @return true if block, false if not (and thus inline) */ @@ -110,27 +123,32 @@ public class Element extends Node { } /** - * Set an attribute value on this element. If this element already has an attribute with the - * key, its value is updated; otherwise, a new attribute is added. + * Set an attribute value on this element. If this element already has an + * attribute with the key, its value is updated; otherwise, a new attribute + * is added. * * @return this element */ + @Override public Element attr(String attributeKey, String attributeValue) { super.attr(attributeKey, attributeValue); return this; } /** - * Get this element's HTML5 custom data attributes. Each attribute in the element that has a key - * starting with "data-" is included the dataset. + * Get this element's HTML5 custom data attributes. 
Each attribute in the + * element that has a key starting with "data-" is included the dataset. * <p> - * E.g., the element {@code <div data-package="jsoup" data-language="Java" class="group">...} has the dataset - * {@code package=jsoup, language=java}. + * E.g., the element + * {@code <div data-package="jsoup" data-language="Java" class="group">...} + * has the dataset {@code package=jsoup, language=java}. * <p> - * This map is a filtered view of the element's attribute map. Changes to one map (add, remove, update) are reflected - * in the other map. + * This map is a filtered view of the element's attribute map. Changes to + * one map (add, remove, update) are reflected in the other map. * <p> - * You can find elements that have data attributes using the {@code [^data-]} attribute key prefix selector. + * You can find elements that have data attributes using the + * {@code [^data-]} attribute key prefix selector. + * * @return a map of {@code key=value} custom data attributes. */ public Map<String, String> dataset() { @@ -144,6 +162,7 @@ public class Element extends Node { /** * Get this element's parent and ancestors, up to the document root. + * * @return this element's stack of parents, closest first. */ public Elements parents() { @@ -163,10 +182,12 @@ public class Element extends Node { /** * Get a child element of this element, by its 0-based index number. * <p/> - * Note that an element can have both mixed Nodes and Elements as children. This method inspects - * a filtered list of children that are elements, and the index is based on that filtered list. + * Note that an element can have both mixed Nodes and Elements as children. + * This method inspects a filtered list of children that are elements, and + * the index is based on that filtered list. * - * @param index the index number of the element to retrieve + * @param index + * the index number of the element to retrieve * @return the child element, if it exists, or {@code null} if absent. 
* @see #childNode(int) */ @@ -177,140 +198,165 @@ public class Element extends Node { /** * Get this element's child elements. * <p/> - * This is effectively a filter on {@link #childNodes()} to get Element nodes. - * @return child elements. If this element has no children, returns an - * empty list. + * This is effectively a filter on {@link #childNodes()} to get Element + * nodes. + * + * @return child elements. If this element has no children, returns an empty + * list. * @see #childNodes() */ public Elements children() { - // create on the fly rather than maintaining two lists. if gets slow, memoize, and mark dirty on change + // create on the fly rather than maintaining two lists. if gets slow, + // memoize, and mark dirty on change List<Element> elements = new ArrayList<Element>(); for (Node node : childNodes) { - if (node instanceof Element) + if (node instanceof Element) { elements.add((Element) node); + } } return new Elements(elements); } /** - * Get this element's child text nodes. The list is unmodifiable but the text nodes may be manipulated. + * Get this element's child text nodes. The list is unmodifiable but the + * text nodes may be manipulated. * <p/> * This is effectively a filter on {@link #childNodes()} to get Text nodes. + * * @return child text nodes. If this element has no text nodes, returns an - * empty list. - * <p/> - * For example, with the input HTML: {@code <p>One <span>Two</span> Three <br> Four</p>} with the {@code p} element selected: - * <ul> - * <li>{@code p.text()} = {@code "One Two Three Four"}</li> - * <li>{@code p.ownText()} = {@code "One Three Four"}</li> - * <li>{@code p.children()} = {@code Elements[<span>, <br>]}</li> - * <li>{@code p.childNodes()} = {@code List<Node>["One ", <span>, " Three ", <br>, " Four"]}</li> - * <li>{@code p.textNodes()} = {@code List<TextNode>["One ", " Three ", " Four"]}</li> - * </ul> + * empty list. 
+ * <p/> + * For example, with the input HTML: + * {@code <p>One <span>Two</span> Three <br> Four</p>} with the + * {@code p} element selected: <ul> + * <li>{@code p.text()} = {@code "One Two Three Four"}</li> + * <li>{@code p.ownText()} = {@code "One Three Four"}</li> + * <li>{@code p.children()} = {@code Elements[<span>, <br> + * ]}</li> + * <li>{@code p.childNodes()} = {@code List<Node>["One ", <span>, " Three ", + * <br> + * , " Four"]}</li> + * <li>{@code p.textNodes()} = + * {@code List<TextNode>["One ", " Three ", " Four"]}</li> + * </ul> */ public List<TextNode> textNodes() { List<TextNode> textNodes = new ArrayList<TextNode>(); for (Node node : childNodes) { - if (node instanceof TextNode) + if (node instanceof TextNode) { textNodes.add((TextNode) node); + } } return Collections.unmodifiableList(textNodes); } /** - * Get this element's child data nodes. The list is unmodifiable but the data nodes may be manipulated. + * Get this element's child data nodes. The list is unmodifiable but the + * data nodes may be manipulated. * <p/> * This is effectively a filter on {@link #childNodes()} to get Data nodes. + * * @return child data nodes. If this element has no data nodes, returns an - * empty list. + * empty list. * @see #data() */ public List<DataNode> dataNodes() { List<DataNode> dataNodes = new ArrayList<DataNode>(); for (Node node : childNodes) { - if (node instanceof DataNode) + if (node instanceof DataNode) { dataNodes.add((DataNode) node); + } } return Collections.unmodifiableList(dataNodes); } /** - * Find elements that match the {@link Selector} CSS query, with this element as the starting context. Matched elements - * may include this element, or any of its children. + * Find elements that match the {@link Selector} CSS query, with this + * element as the starting context. Matched elements may include this + * element, or any of its children. 
* <p/> - * This method is generally more powerful to use than the DOM-type {@code getElementBy*} methods, because - * multiple filters can be combined, e.g.: + * This method is generally more powerful to use than the DOM-type + * {@code getElementBy*} methods, because multiple filters can be combined, + * e.g.: * <ul> - * <li>{@code el.select("a[href]")} - finds links ({@code a} tags with {@code href} attributes) - * <li>{@code el.select("a[href*=example.com]")} - finds links pointing to example.com (loosely) + * <li>{@code el.select("a[href]")} - finds links ({@code a} tags with + * {@code href} attributes) + * <li>{@code el.select("a[href*=example.com]")} - finds links pointing to + * example.com (loosely) * </ul> * <p/> * See the query syntax documentation in {@link org.jsoup.select.Selector}. - * - * @param cssQuery a {@link Selector} CSS-like query + * + * @param cssQuery + * a {@link Selector} CSS-like query * @return elements that match the query (empty if none match) * @see org.jsoup.select.Selector */ public Elements select(String cssQuery) { return Selector.select(cssQuery, this); } - + /** * Add a node child node to this element. * - * @param child node to add. Must not already have a parent. + * @param child + * node to add. Must not already have a parent. * @return this element, so that you can add more child nodes or elements. */ public Element appendChild(Node child) { Validate.notNull(child); - + addChildren(child); return this; } - + /** * Add a node to the start of this element's children. * - * @param child node to add. Must not already have a parent. + * @param child + * node to add. Must not already have a parent. * @return this element, so that you can add more child nodes or elements. */ public Element prependChild(Node child) { Validate.notNull(child); - + addChildren(0, child); return this; } - + /** * Create a new element by tag name, and add it as the last child. * - * @param tagName the name of the tag (e.g. {@code div}). 
+ * @param tagName + * the name of the tag (e.g. {@code div}). * @return the new element, to allow you to add content to it, e.g.: - * {@code parent.appendElement("h1").attr("id", "header").text("Welcome");} + * {@code parent.appendElement("h1").attr("id", "header").text("Welcome");} */ public Element appendElement(String tagName) { Element child = new Element(Tag.valueOf(tagName), baseUri()); appendChild(child); return child; } - + /** * Create a new element by tag name, and add it as the first child. * - * @param tagName the name of the tag (e.g. {@code div}). + * @param tagName + * the name of the tag (e.g. {@code div}). * @return the new element, to allow you to add content to it, e.g.: - * {@code parent.prependElement("h1").attr("id", "header").text("Welcome");} + * {@code parent.prependElement("h1").attr("id", "header").text("Welcome");} */ public Element prependElement(String tagName) { Element child = new Element(Tag.valueOf(tagName), baseUri()); prependChild(child); return child; } - + /** * Create and append a new TextNode to this element. * - * @param text the unencoded text to add + * @param text + * the unencoded text to add * @return this element */ public Element appendText(String text) { @@ -318,11 +364,12 @@ public class Element extends Node { appendChild(node); return this; } - + /** * Create and prepend a new TextNode to this element. * - * @param text the unencoded text to add + * @param text + * the unencoded text to add * @return this element */ public Element prependText(String text) { @@ -330,10 +377,13 @@ public class Element extends Node { prependChild(node); return this; } - + /** - * Add inner HTML to this element. The supplied HTML will be parsed, and each node appended to the end of the children. - * @param html HTML to add inside this element, after the existing HTML + * Add inner HTML to this element. The supplied HTML will be parsed, and + * each node appended to the end of the children. 
+ * + * @param html + * HTML to add inside this element, after the existing HTML * @return this element * @see #html(String) */ @@ -344,25 +394,30 @@ public class Element extends Node { addChildren(nodes.toArray(new Node[nodes.size()])); return this; } - + /** - * Add inner HTML into this element. The supplied HTML will be parsed, and each node prepended to the start of the element's children. - * @param html HTML to add inside this element, before the existing HTML + * Add inner HTML into this element. The supplied HTML will be parsed, and + * each node prepended to the start of the element's children. + * + * @param html + * HTML to add inside this element, before the existing HTML * @return this element * @see #html(String) */ public Element prepend(String html) { Validate.notNull(html); - + List<Node> nodes = Parser.parseFragment(html, this, baseUri()); addChildren(0, nodes.toArray(new Node[nodes.size()])); return this; } /** - * Insert the specified HTML into the DOM before this element (i.e. as a preceding sibling). - * - * @param html HTML to add before this element + * Insert the specified HTML into the DOM before this element (i.e. as a + * preceding sibling). + * + * @param html + * HTML to add before this element * @return this element, for chaining * @see #after(String) */ @@ -372,8 +427,11 @@ public class Element extends Node { } /** - * Insert the specified node into the DOM before this node (i.e. as a preceding sibling). - * @param node to add before this element + * Insert the specified node into the DOM before this node (i.e. as a + * preceding sibling). + * + * @param node + * to add before this element * @return this Element, for chaining * @see #after(Node) */ @@ -383,9 +441,11 @@ public class Element extends Node { } /** - * Insert the specified HTML into the DOM after this element (i.e. as a following sibling). - * - * @param html HTML to add after this element + * Insert the specified HTML into the DOM after this element (i.e. 
as a + * following sibling). + * + * @param html + * HTML to add after this element * @return this element, for chaining * @see #before(String) */ @@ -395,8 +455,11 @@ public class Element extends Node { } /** - * Insert the specified node into the DOM after this node (i.e. as a following sibling). - * @param node to add after this element + * Insert the specified node into the DOM after this node (i.e. as a + * following sibling). + * + * @param node + * to add after this element * @return this element, for chaining * @see #before(Node) */ @@ -407,6 +470,7 @@ public class Element extends Node { /** * Remove all of the element's child nodes. Any attributes are left as-is. + * * @return this element */ public Element empty() { @@ -416,8 +480,10 @@ public class Element extends Node { /** * Wrap the supplied HTML around this element. - * - * @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. + * + * @param html + * HTML to wrap around this element, e.g. + * {@code <div class="head"></div>}. Can be arbitrarily deep. * @return this element, for chaining. */ @Override @@ -426,94 +492,118 @@ public class Element extends Node { } /** - * Get sibling elements. If the element has no sibling elements, returns an empty list. An element is not a sibling - * of itself, so will not be included in the returned list. + * Get sibling elements. If the element has no sibling elements, returns an + * empty list. An element is not a sibling of itself, so will not be + * included in the returned list. 
+ * * @return sibling elements */ public Elements siblingElements() { - if (parentNode == null) + if (parentNode == null) { return new Elements(0); + } List<Element> elements = parent().children(); Elements siblings = new Elements(elements.size() - 1); - for (Element el: elements) - if (el != this) + for (Element el : elements) { + if (el != this) { siblings.add(el); + } + } return siblings; } /** - * Gets the next sibling element of this element. E.g., if a {@code div} contains two {@code p}s, - * the {@code nextElementSibling} of the first {@code p} is the second {@code p}. + * Gets the next sibling element of this element. E.g., if a {@code div} + * contains two {@code p}s, the {@code nextElementSibling} of the first + * {@code p} is the second {@code p}. * <p/> - * This is similar to {@link #nextSibling()}, but specifically finds only Elements + * This is similar to {@link #nextSibling()}, but specifically finds only + * Elements + * * @return the next element, or null if there is no next element * @see #previousElementSibling() */ public Element nextElementSibling() { - if (parentNode == null) return null; + if (parentNode == null) { + return null; + } List<Element> siblings = parent().children(); Integer index = indexInList(this, siblings); Validate.notNull(index); - if (siblings.size() > index+1) - return siblings.get(index+1); - else + if (siblings.size() > index + 1) { + return siblings.get(index + 1); + } else { return null; + } } /** * Gets the previous element sibling of this element. 
+ * * @return the previous element, or null if there is no previous element * @see #nextElementSibling() */ public Element previousElementSibling() { - if (parentNode == null) return null; + if (parentNode == null) { + return null; + } List<Element> siblings = parent().children(); Integer index = indexInList(this, siblings); Validate.notNull(index); - if (index > 0) - return siblings.get(index-1); - else + if (index > 0) { + return siblings.get(index - 1); + } else { return null; + } } /** * Gets the first element sibling of this element. - * @return the first sibling that is an element (aka the parent's first element child) + * + * @return the first sibling that is an element (aka the parent's first + * element child) */ public Element firstElementSibling() { // todo: should firstSibling() exclude this? List<Element> siblings = parent().children(); return siblings.size() > 1 ? siblings.get(0) : null; } - + /** - * Get the list index of this element in its element sibling list. I.e. if this is the first element - * sibling, returns 0. + * Get the list index of this element in its element sibling list. I.e. if + * this is the first element sibling, returns 0. + * * @return position in element sibling list */ public Integer elementSiblingIndex() { - if (parent() == null) return 0; - return indexInList(this, parent().children()); + if (parent() == null) { + return 0; + } + return indexInList(this, parent().children()); } /** * Gets the last element sibling of this element - * @return the last sibling that is an element (aka the parent's last element child) + * + * @return the last sibling that is an element (aka the parent's last + * element child) */ public Element lastElementSibling() { List<Element> siblings = parent().children(); return siblings.size() > 1 ? 
siblings.get(siblings.size() - 1) : null; } - - private static <E extends Element> Integer indexInList(Element search, List<E> elements) { + + private static <E extends Element> Integer indexInList(Element search, + List<E> elements) { Validate.notNull(search); Validate.notNull(elements); for (int i = 0; i < elements.size(); i++) { E element = elements.get(i); - if (element.equals(search)) + if (element.equals(search)) { return i; + } } return null; } @@ -521,9 +611,13 @@ public class Element extends Node { // DOM type methods /** - * Finds elements, including and recursively under this element, with the specified tag name. - * @param tagName The tag name to search for (case insensitively). - * @return a matching unmodifiable list of elements. Will be empty if this element and none of its children match. + * Finds elements, including and recursively under this element, with the + * specified tag name. + * + * @param tagName + * The tag name to search for (case insensitively). + * @return a matching unmodifiable list of elements. Will be empty if this + * element and none of its children match. */ public Elements getElementsByTag(String tagName) { Validate.notEmpty(tagName); @@ -535,29 +629,37 @@ public class Element extends Node { /** * Find an element by ID, including or under this element. * <p> - * Note that this finds the first matching ID, starting with this element. If you search down from a different - * starting point, it is possible to find a different element by ID. For unique element by ID within a Document, - * use {@link Document#getElementById(String)} - * @param id The ID to search for. - * @return The first matching element by ID, starting with this element, or null if none found. + * Note that this finds the first matching ID, starting with this element. + * If you search down from a different starting point, it is possible to + * find a different element by ID. 
For unique element by ID within a + * Document, use {@link Document#getElementById(String)} + * + * @param id + * The ID to search for. + * @return The first matching element by ID, starting with this element, or + * null if none found. */ public Element getElementById(String id) { Validate.notEmpty(id); - + Elements elements = Collector.collect(new Evaluator.Id(id), this); - if (elements.size() > 0) + if (elements.size() > 0) { return elements.get(0); - else + } else { return null; + } } /** - * Find elements that have this class, including or under this element. Case insensitive. + * Find elements that have this class, including or under this element. Case + * insensitive. * <p> - * Elements can have multiple classes (e.g. {@code <div class="header round first">}. This method - * checks each class, so you can find the above with {@code el.getElementsByClass("header");}. + * Elements can have multiple classes (e.g. + * {@code <div class="header round first">}. This method checks each class, + * so you can find the above with {@code el.getElementsByClass("header");}. * - * @param className the name of the class to search for. + * @param className + * the name of the class to search for. * @return elements with the supplied class name, empty if none * @see #hasClass(String) * @see #classNames() @@ -570,8 +672,9 @@ public class Element extends Node { /** * Find elements that have a named attribute set. Case insensitive. - * - * @param key name of the attribute, e.g. {@code href} + * + * @param key + * name of the attribute, e.g. {@code href} * @return elements that have this attribute, empty if none */ public Elements getElementsByAttribute(String key) { @@ -582,88 +685,129 @@ public class Element extends Node { } /** - * Find elements that have an attribute name starting with the supplied prefix. Use {@code data-} to find elements - * that have HTML5 datasets. - * @param keyPrefix name prefix of the attribute e.g. 
{@code data-} - * @return elements that have attribute names that start with with the prefix, empty if none. + * Find elements that have an attribute name starting with the supplied + * prefix. Use {@code data-} to find elements that have HTML5 datasets. + * + * @param keyPrefix + * name prefix of the attribute e.g. {@code data-} + * @return elements that have attribute names that start with with the + * prefix, empty if none. */ public Elements getElementsByAttributeStarting(String keyPrefix) { Validate.notEmpty(keyPrefix); keyPrefix = keyPrefix.trim().toLowerCase(); - return Collector.collect(new Evaluator.AttributeStarting(keyPrefix), this); + return Collector.collect(new Evaluator.AttributeStarting(keyPrefix), + this); } /** - * Find elements that have an attribute with the specific value. Case insensitive. + * Find elements that have an attribute with the specific value. Case + * insensitive. * - * @param key name of the attribute - * @param value value of the attribute + * @param key + * name of the attribute + * @param value + * value of the attribute * @return elements that have this attribute with this value, empty if none */ public Elements getElementsByAttributeValue(String key, String value) { - return Collector.collect(new Evaluator.AttributeWithValue(key, value), this); + return Collector.collect(new Evaluator.AttributeWithValue(key, value), + this); } /** - * Find elements that either do not have this attribute, or have it with a different value. Case insensitive. + * Find elements that either do not have this attribute, or have it with a + * different value. Case insensitive. 
* - * @param key name of the attribute - * @param value value of the attribute + * @param key + * name of the attribute + * @param value + * value of the attribute * @return elements that do not have a matching attribute */ public Elements getElementsByAttributeValueNot(String key, String value) { - return Collector.collect(new Evaluator.AttributeWithValueNot(key, value), this); + return Collector.collect( + new Evaluator.AttributeWithValueNot(key, value), this); } /** - * Find elements that have attributes that start with the value prefix. Case insensitive. + * Find elements that have attributes that start with the value prefix. Case + * insensitive. * - * @param key name of the attribute - * @param valuePrefix start of attribute value + * @param key + * name of the attribute + * @param valuePrefix + * start of attribute value * @return elements that have attributes that start with the value prefix */ - public Elements getElementsByAttributeValueStarting(String key, String valuePrefix) { - return Collector.collect(new Evaluator.AttributeWithValueStarting(key, valuePrefix), this); + public Elements getElementsByAttributeValueStarting(String key, + String valuePrefix) { + return Collector.collect(new Evaluator.AttributeWithValueStarting(key, + valuePrefix), this); } /** - * Find elements that have attributes that end with the value suffix. Case insensitive. + * Find elements that have attributes that end with the value suffix. Case + * insensitive. 
* - * @param key name of the attribute - * @param valueSuffix end of the attribute value + * @param key + * name of the attribute + * @param valueSuffix + * end of the attribute value * @return elements that have attributes that end with the value suffix */ - public Elements getElementsByAttributeValueEnding(String key, String valueSuffix) { - return Collector.collect(new Evaluator.AttributeWithValueEnding(key, valueSuffix), this); + public Elements getElementsByAttributeValueEnding(String key, + String valueSuffix) { + return Collector.collect(new Evaluator.AttributeWithValueEnding(key, + valueSuffix), this); } /** - * Find elements that have attributes whose value contains the match string. Case insensitive. + * Find elements that have attributes whose value contains the match string. + * Case insensitive. * - * @param key name of the attribute - * @param match substring of value to search for + * @param key + * name of the attribute + * @param match + * substring of value to search for * @return elements that have attributes containing this text */ - public Elements getElementsByAttributeValueContaining(String key, String match) { - return Collector.collect(new Evaluator.AttributeWithValueContaining(key, match), this); + public Elements getElementsByAttributeValueContaining(String key, + String match) { + return Collector.collect(new Evaluator.AttributeWithValueContaining( + key, match), this); } - + /** - * Find elements that have attributes whose values match the supplied regular expression. - * @param key name of the attribute - * @param pattern compiled regular expression to match against attribute values + * Find elements that have attributes whose values match the supplied + * regular expression. 
+ * + * @param key + * name of the attribute + * @param pattern + * compiled regular expression to match against attribute values * @return elements that have attributes matching this regular expression */ - public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) { - return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this); - + public Elements getElementsByAttributeValueMatching(String key, + Pattern pattern) { + return Collector.collect(new Evaluator.AttributeWithValueMatching(key, + pattern), this); + } - + /** - * Find elements that have attributes whose values match the supplied regular expression. - * @param key name of the attribute - * @param regex regular expression to match against attribute values. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options. + * Find elements that have attributes whose values match the supplied + * regular expression. + * + * @param key + * name of the attribute + * @param regex + * regular expression to match against attribute values. You can + * use <a href= + * "http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded" + * >embedded flags</a> (such as (?i) and (?m) to control regex + * options. * @return elements that have attributes matching this regular expression */ public Elements getElementsByAttributeValueMatching(String key, String regex) { @@ -671,73 +815,94 @@ public class Element extends Node { try { pattern = Pattern.compile(regex); } catch (PatternSyntaxException e) { - throw new IllegalArgumentException("Pattern syntax error: " + regex, e); + throw new IllegalArgumentException( + "Pattern syntax error: " + regex, e); } return getElementsByAttributeValueMatching(key, pattern); } - + /** * Find elements whose sibling index is less than the supplied index. 
- * @param index 0-based index + * + * @param index + * 0-based index * @return elements less than index */ public Elements getElementsByIndexLessThan(int index) { return Collector.collect(new Evaluator.IndexLessThan(index), this); } - + /** * Find elements whose sibling index is greater than the supplied index. - * @param index 0-based index + * + * @param index + * 0-based index * @return elements greater than index */ public Elements getElementsByIndexGreaterThan(int index) { return Collector.collect(new Evaluator.IndexGreaterThan(index), this); } - + /** * Find elements whose sibling index is equal to the supplied index. - * @param index 0-based index + * + * @param index + * 0-based index * @return elements equal to index */ public Elements getElementsByIndexEquals(int index) { return Collector.collect(new Evaluator.IndexEquals(index), this); } - + /** - * Find elements that contain the specified string. The search is case insensitive. The text may appear directly - * in the element, or in any of its descendants. - * @param searchText to look for in the element's text + * Find elements that contain the specified string. The search is case + * insensitive. The text may appear directly in the element, or in any of + * its descendants. + * + * @param searchText + * to look for in the element's text * @return elements that contain the string, case insensitive. * @see Element#text() */ public Elements getElementsContainingText(String searchText) { return Collector.collect(new Evaluator.ContainsText(searchText), this); } - + /** - * Find elements that directly contain the specified string. The search is case insensitive. The text must appear directly - * in the element, not in any of its descendants. - * @param searchText to look for in the element's own text + * Find elements that directly contain the specified string. The search is + * case insensitive. The text must appear directly in the element, not in + * any of its descendants. 
+ * + * @param searchText + * to look for in the element's own text * @return elements that contain the string, case insensitive. * @see Element#ownText() */ public Elements getElementsContainingOwnText(String searchText) { - return Collector.collect(new Evaluator.ContainsOwnText(searchText), this); + return Collector.collect(new Evaluator.ContainsOwnText(searchText), + this); } - + /** * Find elements whose text matches the supplied regular expression. - * @param pattern regular expression to match text against + * + * @param pattern + * regular expression to match text against * @return elements matching the supplied regular expression. * @see Element#text() */ public Elements getElementsMatchingText(Pattern pattern) { return Collector.collect(new Evaluator.Matches(pattern), this); } - + /** * Find elements whose text matches the supplied regular expression. - * @param regex regular expression to match text against. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options. + * + * @param regex + * regular expression to match text against. You can use <a href= + * "http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded" + * >embedded flags</a> (such as (?i) and (?m) to control regex + * options. * @return elements matching the supplied regular expression. * @see Element#text() */ @@ -746,24 +911,32 @@ public class Element extends Node { try { pattern = Pattern.compile(regex); } catch (PatternSyntaxException e) { - throw new IllegalArgumentException("Pattern syntax error: " + regex, e); + throw new IllegalArgumentException( + "Pattern syntax error: " + regex, e); } return getElementsMatchingText(pattern); } - + /** * Find elements whose own text matches the supplied regular expression. 
- * @param pattern regular expression to match text against + * + * @param pattern + * regular expression to match text against * @return elements matching the supplied regular expression. * @see Element#ownText() */ public Elements getElementsMatchingOwnText(Pattern pattern) { return Collector.collect(new Evaluator.MatchesOwn(pattern), this); } - + /** * Find elements whose text matches the supplied regular expression. - * @param regex regular expression to match text against. You can use <a href="http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded">embedded flags</a> (such as (?i) and (?m) to control regex options. + * + * @param regex + * regular expression to match text against. You can use <a href= + * "http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded" + * >embedded flags</a> (such as (?i) and (?m) to control regex + * options. * @return elements matching the supplied regular expression. * @see Element#ownText() */ @@ -772,13 +945,15 @@ public class Element extends Node { try { pattern = Pattern.compile(regex); } catch (PatternSyntaxException e) { - throw new IllegalArgumentException("Pattern syntax error: " + regex, e); + throw new IllegalArgumentException( + "Pattern syntax error: " + regex, e); } return getElementsMatchingOwnText(pattern); } - + /** - * Find all elements under this element (including self, and children of children). + * Find all elements under this element (including self, and children of + * children). * * @return all elements */ @@ -789,8 +964,9 @@ public class Element extends Node { /** * Gets the combined text of this element and all its children. * <p> - * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, {@code p.text()} returns {@code "Hello there now!"} - * + * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, + * {@code p.text()} returns {@code "Hello there now!"} + * * @return unencoded text, or empty string if none. 
* @see #ownText() * @see #textNodes() @@ -803,27 +979,32 @@ public class Element extends Node { private void text(StringBuilder accum) { appendWhitespaceIfBr(this, accum); - + for (Node child : childNodes) { if (child instanceof TextNode) { TextNode textNode = (TextNode) child; appendNormalisedText(accum, textNode); } else if (child instanceof Element) { Element element = (Element) child; - if (accum.length() > 0 && element.isBlock() && !TextNode.lastCharIsWhitespace(accum)) + if (accum.length() > 0 && element.isBlock() + && !TextNode.lastCharIsWhitespace(accum)) { accum.append(" "); + } element.text(accum); } } } /** - * Gets the text owned by this element only; does not get the combined text of all children. + * Gets the text owned by this element only; does not get the combined text + * of all children. * <p> - * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, {@code p.ownText()} returns {@code "Hello now!"}, - * whereas {@code p.text()} returns {@code "Hello there now!"}. - * Note that the text within the {@code b} element is not returned, as it is not a direct child of the {@code p} element. - * + * For example, given HTML {@code <p>Hello <b>there</b> now!</p>}, + * {@code p.ownText()} returns {@code "Hello now!"}, whereas + * {@code p.text()} returns {@code "Hello there now!"}. Note that the text + * within the {@code b} element is not returned, as it is not a direct child + * of the {@code p} element. + * * @return unencoded text, or empty string if none. 
* @see #text() * @see #textNodes() @@ -850,24 +1031,32 @@ public class Element extends Node { if (!preserveWhitespace()) { text = TextNode.normaliseWhitespace(text); - if (TextNode.lastCharIsWhitespace(accum)) + if (TextNode.lastCharIsWhitespace(accum)) { text = TextNode.stripLeadingWhitespace(text); + } } accum.append(text); } - private static void appendWhitespaceIfBr(Element element, StringBuilder accum) { - if (element.tag.getName().equals("br") && !TextNode.lastCharIsWhitespace(accum)) + private static void appendWhitespaceIfBr(Element element, + StringBuilder accum) { + if (element.tag.getName().equals("br") + && !TextNode.lastCharIsWhitespace(accum)) { accum.append(" "); + } } boolean preserveWhitespace() { - return tag.preserveWhitespace() || parent() != null && parent().preserveWhitespace(); + return tag.preserveWhitespace() || parent() != null + && parent().preserveWhitespace(); } /** - * Set the text of this element. Any existing contents (text or elements) will be cleared - * @param text unencoded text + * Set the text of this element. Any existing contents (text or elements) + * will be cleared + * + * @param text + * unencoded text * @return this element */ public Element text(String text) { @@ -881,28 +1070,33 @@ public class Element extends Node { } /** - Test if this element has any text content (that is not just whitespace). - @return true if element has non-blank text content. + * Test if this element has any text content (that is not just whitespace). + * + * @return true if element has non-blank text content. */ public boolean hasText() { - for (Node child: childNodes) { + for (Node child : childNodes) { if (child instanceof TextNode) { TextNode textNode = (TextNode) child; - if (!textNode.isBlank()) + if (!textNode.isBlank()) { return true; + } } else if (child instanceof Element) { Element el = (Element) child; - if (el.hasText()) + if (el.hasText()) { return true; + } } } return false; } /** - * Get the combined data of this element. 
Data is e.g. the inside of a {@code script} tag. + * Get the combined data of this element. Data is e.g. the inside of a + * {@code script} tag. + * * @return the data, or empty string if none - * + * * @see #dataNodes() */ public String data() { @@ -919,21 +1113,28 @@ public class Element extends Node { } } return sb.toString(); - } + } /** - * Gets the literal value of this element's "class" attribute, which may include multiple class names, space - * separated. (E.g. on <code><div class="header gray"></code> returns, "<code>header gray</code>") - * @return The literal class attribute, or <b>empty string</b> if no class attribute set. + * Gets the literal value of this element's "class" attribute, which may + * include multiple class names, space separated. (E.g. on + * <code><div class="header gray"></code> returns, " + * <code>header gray</code>") + * + * @return The literal class attribute, or <b>empty string</b> if no class + * attribute set. */ public String className() { return attr("class"); } /** - * Get all of the element's class names. E.g. on element {@code <div class="header gray"}>}, - * returns a set of two elements {@code "header", "gray"}. Note that modifications to this set are not pushed to - * the backing {@code class} attribute; use the {@link #classNames(java.util.Set)} method to persist them. + * Get all of the element's class names. E.g. on element + * {@code <div class="header gray"}>}, returns a set of two elements + * {@code "header", "gray"}. Note that modifications to this set are not + * pushed to the backing {@code class} attribute; use the + * {@link #classNames(java.util.Set)} method to persist them. + * * @return set of classnames, empty if no class attribute */ public Set<String> classNames() { @@ -945,9 +1146,11 @@ public class Element extends Node { } /** - Set the element's {@code class} attribute to the supplied class names. 
- @param classNames set of classes - @return this element, for chaining + * Set the element's {@code class} attribute to the supplied class names. + * + * @param classNames + * set of classes + * @return this element, for chaining */ public Element classNames(Set<String> classNames) { Validate.notNull(classNames); @@ -957,22 +1160,27 @@ public class Element extends Node { /** * Tests if this element has a class. Case insensitive. - * @param className name of class to check for + * + * @param className + * name of class to check for * @return true if it does, false if not */ public boolean hasClass(String className) { Set<String> classNames = classNames(); for (String name : classNames) { - if (className.equalsIgnoreCase(name)) + if (className.equalsIgnoreCase(name)) { return true; + } } return false; } /** - Add a class name to this element's {@code class} attribute. - @param className class name to add - @return this element + * Add a class name to this element's {@code class} attribute. + * + * @param className + * class name to add + * @return this element */ public Element addClass(String className) { Validate.notNull(className); @@ -985,9 +1193,11 @@ public class Element extends Node { } /** - Remove a class name from this element's {@code class} attribute. - @param className class name to remove - @return this element + * Remove a class name from this element's {@code class} attribute. + * + * @param className + * class name to remove + * @return this element */ public Element removeClass(String className) { Validate.notNull(className); @@ -1000,90 +1210,114 @@ public class Element extends Node { } /** - Toggle a class name on this element's {@code class} attribute: if present, remove it; otherwise add it. - @param className class name to toggle - @return this element + * Toggle a class name on this element's {@code class} attribute: if + * present, remove it; otherwise add it. 
+ * + * @param className + * class name to toggle + * @return this element */ public Element toggleClass(String className) { Validate.notNull(className); Set<String> classes = classNames(); - if (classes.contains(className)) + if (classes.contains(className)) { classes.remove(className); - else + } else { classes.add(className); + } classNames(classes); return this; } - + /** * Get the value of a form element (input, textarea, etc). + * * @return the value of the form element, or empty string if not set. */ public String val() { - if (tagName().equals("textarea")) + if (tagName().equals("textarea")) { return text(); - else + } else { return attr("value"); + } } - + /** * Set the value of a form element (input, textarea, etc). - * @param value value to set + * + * @param value + * value to set * @return this element (for chaining) */ public Element val(String value) { - if (tagName().equals("textarea")) + if (tagName().equals("textarea")) { text(value); - else + } else { attr("value", value); + } return this; } - void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { - if (accum.length() > 0 && out.prettyPrint() && (tag.formatAsBlock() || (parent() != null && parent().tag().formatAsBlock()))) + @Override + void outerHtmlHead(StringBuilder accum, int depth, + Document.OutputSettings out) { + if (accum.length() > 0 + && out.prettyPrint() + && (tag.formatAsBlock() || (parent() != null && parent().tag() + .formatAsBlock()))) { indent(accum, depth, out); - accum - .append("<") - .append(tagName()); + } + accum.append("<").append(tagName()); attributes.html(accum, out); - if (childNodes.isEmpty() && tag.isSelfClosing()) + if (childNodes.isEmpty() && tag.isSelfClosing()) { accum.append(" />"); - else + } else { accum.append(">"); + } } - void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) { + @Override + void outerHtmlTail(StringBuilder accum, int depth, + Document.OutputSettings out) { if (!(childNodes.isEmpty() && 
tag.isSelfClosing())) { - if (out.prettyPrint() && !childNodes.isEmpty() && tag.formatAsBlock()) + if (out.prettyPrint() && !childNodes.isEmpty() + && tag.formatAsBlock()) { indent(accum, depth, out); + } accum.append("</").append(tagName()).append(">"); } } /** - * Retrieves the element's inner HTML. E.g. on a {@code <div>} with one empty {@code <p>}, would return - * {@code <p></p>}. (Whereas {@link #outerHtml()} would return {@code <div><p></p></div>}.) + * Retrieves the element's inner HTML. E.g. on a {@code <div>} with one + * empty {@code <p>}, would return {@code <p></p>}. (Whereas + * {@link #outerHtml()} would return {@code <div> + * <p></p> + * </div>}.) * * @return String of HTML. * @see #outerHtml() */ public String html() { StringBuilder accum = new StringBuilder(); - html(accum); + html(accum); return accum.toString().trim(); } private void html(StringBuilder accum) { - for (Node node : childNodes) + for (Node node : childNodes) { node.outerHtml(accum); + } } - + /** * Set this element's inner HTML. Clears the existing HTML first. 
- * @param html HTML to parse and set into this element + * + * @param html + * HTML to parse and set into this element * @return this element * @see #append(String) */ @@ -1093,6 +1327,7 @@ public class Element extends Node { return this; } + @Override public String toString() { return outerHtml(); } @@ -1113,7 +1348,8 @@ public class Element extends Node { @Override public Element clone() { Element clone = (Element) super.clone(); - clone.classNames(); // creates linked set of class names from class attribute + clone.classNames(); // creates linked set of class names from class + // attribute return clone; } } diff --git a/server/src/org/jsoup/nodes/Entities.java b/server/src/org/jsoup/nodes/Entities.java index 0ae83e1fc0..24b50d7344 100644 --- a/server/src/org/jsoup/nodes/Entities.java +++ b/server/src/org/jsoup/nodes/Entities.java @@ -3,18 +3,24 @@ package org.jsoup.nodes; import java.io.IOException; import java.io.InputStream; import java.nio.charset.CharsetEncoder; -import java.util.*; +import java.util.HashMap; +import java.util.Map; +import java.util.MissingResourceException; +import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * HTML entities, and escape routines. - * Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C HTML - * named character references</a>. + * HTML entities, and escape routines. Source: <a href= + * "http://www.w3.org/TR/html5/named-character-references.html#named-character-references" + * >W3C HTML named character references</a>. */ public class Entities { public enum EscapeMode { - /** Restricted entities suitable for XHTML output: lt, gt, amp, apos, and quot only. */ + /** + * Restricted entities suitable for XHTML output: lt, gt, amp, apos, and + * quot only. + */ xhtml(xhtmlByVal), /** Default HTML output entities. 
*/ base(baseByVal), @@ -36,21 +42,26 @@ public class Entities { private static final Map<Character, String> xhtmlByVal; private static final Map<Character, String> baseByVal; private static final Map<Character, String> fullByVal; - private static final Pattern unescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); - private static final Pattern strictUnescapePattern = Pattern.compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); + private static final Pattern unescapePattern = Pattern + .compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);?"); + private static final Pattern strictUnescapePattern = Pattern + .compile("&(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]+\\d*);"); - private Entities() {} + private Entities() { + } /** * Check if the input is a known named entity - * @param name the possible entity name (e.g. "lt" or "amp" + * + * @param name + * the possible entity name (e.g. "lt" or "amp" * @return true if a known named entity */ public static boolean isNamedEntity(String name) { return full.containsKey(name); } - /** +/** * Get the Character value of the named entity * @param name named entity (e.g. "lt" or "amp") * @return the Character value of the named entity (e.g. 
'<' or '&') @@ -58,23 +69,25 @@ public class Entities { public static Character getCharacterByName(String name) { return full.get(name); } - + static String escape(String string, Document.OutputSettings out) { return escape(string, out.encoder(), out.escapeMode()); } - static String escape(String string, CharsetEncoder encoder, EscapeMode escapeMode) { + static String escape(String string, CharsetEncoder encoder, + EscapeMode escapeMode) { StringBuilder accum = new StringBuilder(string.length() * 2); Map<Character, String> map = escapeMode.getMap(); for (int pos = 0; pos < string.length(); pos++) { Character c = string.charAt(pos); - if (map.containsKey(c)) + if (map.containsKey(c)) { accum.append('&').append(map.get(c)).append(';'); - else if (encoder.canEncode(c)) + } else if (encoder.canEncode(c)) { accum.append(c.charValue()); - else + } else { accum.append("&#").append((int) c).append(';'); + } } return accum.toString(); @@ -86,39 +99,53 @@ public class Entities { /** * Unescape the input string. + * * @param string - * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) + * @param strict + * if "strict" (that is, requires trailing ';' char, otherwise + * that's optional) * @return */ static String unescape(String string, boolean strict) { // todo: change this method to use Tokeniser.consumeCharacterReference - if (!string.contains("&")) + if (!string.contains("&")) { return string; + } - Matcher m = strict? strictUnescapePattern.matcher(string) : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? - StringBuffer accum = new StringBuffer(string.length()); // pity matcher can't use stringbuilder, avoid syncs - // todo: replace m.appendReplacement with own impl, so StringBuilder and quoteReplacement not required + Matcher m = strict ? strictUnescapePattern.matcher(string) + : unescapePattern.matcher(string); // &(#(x|X)?([0-9a-fA-F]+)|[a-zA-Z]\\d*);? 
+ StringBuffer accum = new StringBuffer(string.length()); // pity matcher + // can't use + // stringbuilder, + // avoid syncs + // todo: replace m.appendReplacement with own impl, so StringBuilder and + // quoteReplacement not required while (m.find()) { int charval = -1; String num = m.group(3); if (num != null) { try { - int base = m.group(2) != null ? 16 : 10; // 2 is hex indicator + int base = m.group(2) != null ? 16 : 10; // 2 is hex + // indicator charval = Integer.valueOf(num, base); } catch (NumberFormatException e) { } // skip } else { String name = m.group(1); - if (full.containsKey(name)) + if (full.containsKey(name)) { charval = full.get(name); + } } if (charval != -1 || charval > 0xFFFF) { // out of range String c = Character.toString((char) charval); m.appendReplacement(accum, Matcher.quoteReplacement(c)); } else { - m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace with original string + m.appendReplacement(accum, Matcher.quoteReplacement(m.group(0))); // replace + // with + // original + // string } } m.appendTail(accum); @@ -126,22 +153,23 @@ public class Entities { } // xhtml has restricted entities - private static final Object[][] xhtmlArray = { - {"quot", 0x00022}, - {"amp", 0x00026}, - {"apos", 0x00027}, - {"lt", 0x0003C}, - {"gt", 0x0003E} - }; + private static final Object[][] xhtmlArray = { { "quot", 0x00022 }, + { "amp", 0x00026 }, { "apos", 0x00027 }, { "lt", 0x0003C }, + { "gt", 0x0003E } }; static { xhtmlByVal = new HashMap<Character, String>(); - baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most common / default - full = loadEntities("entities-full.properties"); // extended and overblown. + baseByVal = toCharacterKey(loadEntities("entities-base.properties")); // most + // common + // / + // default + full = loadEntities("entities-full.properties"); // extended and + // overblown. 
fullByVal = toCharacterKey(full); for (Object[] entity : xhtmlArray) { - Character c = Character.valueOf((char) ((Integer) entity[1]).intValue()); + Character c = Character.valueOf((char) ((Integer) entity[1]) + .intValue()); xhtmlByVal.put(c, ((String) entity[0])); } } @@ -154,27 +182,32 @@ public class Entities { properties.load(in); in.close(); } catch (IOException e) { - throw new MissingResourceException("Error loading entities resource: " + e.getMessage(), "Entities", filename); + throw new MissingResourceException( + "Error loading entities resource: " + e.getMessage(), + "Entities", filename); } - for (Map.Entry entry: properties.entrySet()) { - Character val = Character.valueOf((char) Integer.parseInt((String) entry.getValue(), 16)); + for (Map.Entry entry : properties.entrySet()) { + Character val = Character.valueOf((char) Integer.parseInt( + (String) entry.getValue(), 16)); String name = (String) entry.getKey(); entities.put(name, val); } return entities; } - private static Map<Character, String> toCharacterKey(Map<String, Character> inMap) { + private static Map<Character, String> toCharacterKey( + Map<String, Character> inMap) { Map<Character, String> outMap = new HashMap<Character, String>(); - for (Map.Entry<String, Character> entry: inMap.entrySet()) { + for (Map.Entry<String, Character> entry : inMap.entrySet()) { Character character = entry.getValue(); String name = entry.getKey(); if (outMap.containsKey(character)) { // dupe, prefer the lower case version - if (name.toLowerCase().equals(name)) + if (name.toLowerCase().equals(name)) { outMap.put(character, name); + } } else { outMap.put(character, name); } diff --git a/server/src/org/jsoup/nodes/Node.java b/server/src/org/jsoup/nodes/Node.java index eb2b40ee73..72b8dcbd47 100644 --- a/server/src/org/jsoup/nodes/Node.java +++ b/server/src/org/jsoup/nodes/Node.java @@ -1,21 +1,23 @@ package org.jsoup.nodes; -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import 
org.jsoup.parser.Parser; -import org.jsoup.select.NodeTraversor; -import org.jsoup.select.NodeVisitor; - import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.List; -/** - The base, abstract Node model. Elements, Documents, Comments etc are all Node instances. +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; +import org.jsoup.parser.Parser; +import org.jsoup.select.NodeTraversor; +import org.jsoup.select.NodeVisitor; - @author Jonathan Hedley, jonathan@hedley.net */ +/** + * The base, abstract Node model. Elements, Documents, Comments etc are all Node + * instances. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ public abstract class Node implements Cloneable { Node parentNode; List<Node> childNodes; @@ -24,14 +26,17 @@ public abstract class Node implements Cloneable { int siblingIndex; /** - Create a new Node. - @param baseUri base URI - @param attributes attributes (not null, but may be empty) + * Create a new Node. + * + * @param baseUri + * base URI + * @param attributes + * attributes (not null, but may be empty) */ protected Node(String baseUri, Attributes attributes) { Validate.notNull(baseUri); Validate.notNull(attributes); - + childNodes = new ArrayList<Node>(4); this.baseUri = baseUri.trim(); this.attributes = attributes; @@ -42,7 +47,8 @@ public abstract class Node implements Cloneable { } /** - * Default constructor. Doesn't setup base uri, children, or attributes; use with caution. + * Default constructor. Doesn't setup base uri, children, or attributes; use + * with caution. */ protected Node() { childNodes = Collections.emptyList(); @@ -50,18 +56,23 @@ public abstract class Node implements Cloneable { } /** - Get the node name of this node. Use for debugging purposes and not logic switching (for that, use instanceof). - @return node name + * Get the node name of this node. 
Use for debugging purposes and not logic + * switching (for that, use instanceof). + * + * @return node name */ public abstract String nodeName(); /** * Get an attribute's value by its key. * <p/> - * To get an absolute URL from an attribute that may be a relative URL, prefix the key with <code><b>abs</b></code>, - * which is a shortcut to the {@link #absUrl} method. - * E.g.: <blockquote><code>String url = a.attr("abs:href");</code></blockquote> - * @param attributeKey The attribute key. + * To get an absolute URL from an attribute that may be a relative URL, + * prefix the key with <code><b>abs</b></code>, which is a shortcut to the + * {@link #absUrl} method. E.g.: <blockquote> + * <code>String url = a.attr("abs:href");</code></blockquote> + * + * @param attributeKey + * The attribute key. * @return The attribute, or empty string if not present (to avoid nulls). * @see #attributes() * @see #hasAttr(String) @@ -70,25 +81,33 @@ public abstract class Node implements Cloneable { public String attr(String attributeKey) { Validate.notNull(attributeKey); - if (attributes.hasKey(attributeKey)) + if (attributes.hasKey(attributeKey)) { return attributes.get(attributeKey); - else if (attributeKey.toLowerCase().startsWith("abs:")) + } else if (attributeKey.toLowerCase().startsWith("abs:")) { return absUrl(attributeKey.substring("abs:".length())); - else return ""; + } else { + return ""; + } } /** * Get all of the element's attributes. - * @return attributes (which implements iterable, in same order as presented in original HTML). + * + * @return attributes (which implements iterable, in same order as presented + * in original HTML). */ public Attributes attributes() { return attributes; } /** - * Set an attribute (key=value). If the attribute already exists, it is replaced. - * @param attributeKey The attribute key. - * @param attributeValue The attribute value. + * Set an attribute (key=value). If the attribute already exists, it is + * replaced. 
+ * + * @param attributeKey + * The attribute key. + * @param attributeValue + * The attribute value. * @return this (for chaining) */ public Node attr(String attributeKey, String attributeValue) { @@ -98,7 +117,9 @@ public abstract class Node implements Cloneable { /** * Test if this element has an attribute. - * @param attributeKey The attribute key to check. + * + * @param attributeKey + * The attribute key to check. * @return true if the attribute exists, false if not. */ public boolean hasAttr(String attributeKey) { @@ -106,15 +127,18 @@ public abstract class Node implements Cloneable { if (attributeKey.toLowerCase().startsWith("abs:")) { String key = attributeKey.substring("abs:".length()); - if (attributes.hasKey(key) && !absUrl(key).equals("")) + if (attributes.hasKey(key) && !absUrl(key).equals("")) { return true; + } } return attributes.hasKey(attributeKey); } /** * Remove an attribute from this element. - * @param attributeKey The attribute to remove. + * + * @param attributeKey + * The attribute to remove. * @return this (for chaining) */ public Node removeAttr(String attributeKey) { @@ -124,47 +148,56 @@ public abstract class Node implements Cloneable { } /** - Get the base URI of this node. - @return base URI + * Get the base URI of this node. + * + * @return base URI */ public String baseUri() { return baseUri; } /** - Update the base URI of this node and all of its descendants. - @param baseUri base URI to set + * Update the base URI of this node and all of its descendants. + * + * @param baseUri + * base URI to set */ public void setBaseUri(final String baseUri) { Validate.notNull(baseUri); traverse(new NodeVisitor() { + @Override public void head(Node node, int depth) { node.baseUri = baseUri; } + @Override public void tail(Node node, int depth) { } }); } /** - * Get an absolute URL from a URL attribute that may be relative (i.e. an <code><a href></code> or - * <code><img src></code>). 
+ * Get an absolute URL from a URL attribute that may be relative (i.e. an + * <code><a href></code> or <code><img src></code>). * <p/> * E.g.: <code>String absUrl = linkEl.absUrl("href");</code> * <p/> - * If the attribute value is already absolute (i.e. it starts with a protocol, like - * <code>http://</code> or <code>https://</code> etc), and it successfully parses as a URL, the attribute is - * returned directly. Otherwise, it is treated as a URL relative to the element's {@link #baseUri}, and made - * absolute using that. + * If the attribute value is already absolute (i.e. it starts with a + * protocol, like <code>http://</code> or <code>https://</code> etc), and it + * successfully parses as a URL, the attribute is returned directly. + * Otherwise, it is treated as a URL relative to the element's + * {@link #baseUri}, and made absolute using that. * <p/> - * As an alternate, you can use the {@link #attr} method with the <code>abs:</code> prefix, e.g.: + * As an alternate, you can use the {@link #attr} method with the + * <code>abs:</code> prefix, e.g.: * <code>String absUrl = linkEl.attr("abs:href");</code> - * - * @param attributeKey The attribute key - * @return An absolute URL if one could be made, or an empty string (not null) if the attribute was missing or - * could not be made successfully into a URL. + * + * @param attributeKey + * The attribute key + * @return An absolute URL if one could be made, or an empty string (not + * null) if the attribute was missing or could not be made + * successfully into a URL. 
* @see #attr * @see java.net.URL#URL(java.net.URL, String) */ @@ -180,13 +213,16 @@ public abstract class Node implements Cloneable { try { base = new URL(baseUri); } catch (MalformedURLException e) { - // the base is unsuitable, but the attribute may be abs on its own, so try that + // the base is unsuitable, but the attribute may be abs on + // its own, so try that URL abs = new URL(relUrl); return abs.toExternalForm(); } - // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired - if (relUrl.startsWith("?")) + // workaround: java resolves '//path/file + ?foo' to + // '//path/?foo', not '//path/file?foo' as desired + if (relUrl.startsWith("?")) { relUrl = base.getPath() + relUrl; + } URL abs = new URL(base, relUrl); return abs.toExternalForm(); } catch (MalformedURLException e) { @@ -196,50 +232,58 @@ public abstract class Node implements Cloneable { } /** - Get a child node by index - @param index index of child node - @return the child node at this index. + * Get a child node by index + * + * @param index + * index of child node + * @return the child node at this index. */ public Node childNode(int index) { return childNodes.get(index); } /** - Get this node's children. Presented as an unmodifiable list: new children can not be added, but the child nodes - themselves can be manipulated. - @return list of children. If no children, returns an empty list. + * Get this node's children. Presented as an unmodifiable list: new children + * can not be added, but the child nodes themselves can be manipulated. + * + * @return list of children. If no children, returns an empty list. */ public List<Node> childNodes() { return Collections.unmodifiableList(childNodes); } - + protected Node[] childNodesAsArray() { return childNodes.toArray(new Node[childNodes().size()]); } /** - Gets this node's parent node. - @return parent node; or null if no parent. + * Gets this node's parent node. 
+ * + * @return parent node; or null if no parent. */ public Node parent() { return parentNode; } - + /** - * Gets the Document associated with this Node. - * @return the Document associated with this Node, or null if there is no such Document. + * Gets the Document associated with this Node. + * + * @return the Document associated with this Node, or null if there is no + * such Document. */ public Document ownerDocument() { - if (this instanceof Document) + if (this instanceof Document) { return (Document) this; - else if (parentNode == null) + } else if (parentNode == null) { return null; - else + } else { return parentNode.ownerDocument(); + } } - + /** - * Remove (delete) this node from the DOM tree. If this node has children, they are also removed. + * Remove (delete) this node from the DOM tree. If this node has children, + * they are also removed. */ public void remove() { Validate.notNull(parentNode); @@ -247,8 +291,11 @@ public abstract class Node implements Cloneable { } /** - * Insert the specified HTML into the DOM before this node (i.e. as a preceding sibling). - * @param html HTML to add before this node + * Insert the specified HTML into the DOM before this node (i.e. as a + * preceding sibling). + * + * @param html + * HTML to add before this node * @return this node, for chaining * @see #after(String) */ @@ -258,8 +305,11 @@ public abstract class Node implements Cloneable { } /** - * Insert the specified node into the DOM before this node (i.e. as a preceding sibling). - * @param node to add before this node + * Insert the specified node into the DOM before this node (i.e. as a + * preceding sibling). + * + * @param node + * to add before this node * @return this node, for chaining * @see #after(Node) */ @@ -272,19 +322,25 @@ public abstract class Node implements Cloneable { } /** - * Insert the specified HTML into the DOM after this node (i.e. as a following sibling). 
- * @param html HTML to add after this node + * Insert the specified HTML into the DOM after this node (i.e. as a + * following sibling). + * + * @param html + * HTML to add after this node * @return this node, for chaining * @see #before(String) */ public Node after(String html) { - addSiblingHtml(siblingIndex()+1, html); + addSiblingHtml(siblingIndex() + 1, html); return this; } /** - * Insert the specified node into the DOM after this node (i.e. as a following sibling). - * @param node to add after this node + * Insert the specified node into the DOM after this node (i.e. as a + * following sibling). + * + * @param node + * to add after this node * @return this node, for chaining * @see #before(Node) */ @@ -292,7 +348,7 @@ public abstract class Node implements Cloneable { Validate.notNull(node); Validate.notNull(parentNode); - parentNode.addChildren(siblingIndex()+1, node); + parentNode.addChildren(siblingIndex() + 1, node); return this; } @@ -300,31 +356,39 @@ public abstract class Node implements Cloneable { Validate.notNull(html); Validate.notNull(parentNode); - Element context = parent() instanceof Element ? (Element) parent() : null; + Element context = parent() instanceof Element ? (Element) parent() + : null; List<Node> nodes = Parser.parseFragment(html, context, baseUri()); parentNode.addChildren(index, nodes.toArray(new Node[nodes.size()])); } /** - Wrap the supplied HTML around this node. - @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. - @return this node, for chaining. + * Wrap the supplied HTML around this node. + * + * @param html + * HTML to wrap around this element, e.g. + * {@code <div class="head"></div>}. Can be arbitrarily deep. + * @return this node, for chaining. */ public Node wrap(String html) { Validate.notEmpty(html); - Element context = parent() instanceof Element ? 
(Element) parent() : null; - List<Node> wrapChildren = Parser.parseFragment(html, context, baseUri()); + Element context = parent() instanceof Element ? (Element) parent() + : null; + List<Node> wrapChildren = Parser + .parseFragment(html, context, baseUri()); Node wrapNode = wrapChildren.get(0); - if (wrapNode == null || !(wrapNode instanceof Element)) // nothing to wrap with; noop + if (wrapNode == null || !(wrapNode instanceof Element)) { return null; + } Element wrap = (Element) wrapNode; Element deepest = getDeepChild(wrap); parentNode.replaceChild(this, wrap); deepest.addChildren(this); - // remainder (unbalanced wrap, like <div></div><p></p> -- The <p> is remainder + // remainder (unbalanced wrap, like <div></div><p></p> -- The <p> is + // remainder if (wrapChildren.size() > 0) { for (int i = 0; i < wrapChildren.size(); i++) { Node remainder = wrapChildren.get(i); @@ -336,15 +400,19 @@ public abstract class Node implements Cloneable { } /** - * Removes this node from the DOM, and moves its children up into the node's parent. This has the effect of dropping - * the node but keeping its children. + * Removes this node from the DOM, and moves its children up into the node's + * parent. This has the effect of dropping the node but keeping its + * children. * <p/> * For example, with the input html:<br/> * {@code <div>One <span>Two <b>Three</b></span></div>}<br/> - * Calling {@code element.unwrap()} on the {@code span} element will result in the html:<br/> + * Calling {@code element.unwrap()} on the {@code span} element will result + * in the html:<br/> * {@code <div>One Two <b>Three</b></div>}<br/> * and the {@code "Two "} {@link TextNode} being returned. - * @return the first child of this node, after the node has been unwrapped. Null if the node had no children. + * + * @return the first child of this node, after the node has been unwrapped. + * Null if the node had no children. 
* @see #remove() * @see #wrap(String) */ @@ -353,23 +421,26 @@ public abstract class Node implements Cloneable { int index = siblingIndex; Node firstChild = childNodes.size() > 0 ? childNodes.get(0) : null; - parentNode.addChildren(index, this.childNodesAsArray()); - this.remove(); + parentNode.addChildren(index, childNodesAsArray()); + remove(); return firstChild; } private Element getDeepChild(Element el) { List<Element> children = el.children(); - if (children.size() > 0) + if (children.size() > 0) { return getDeepChild(children.get(0)); - else + } else { return el; + } } - + /** * Replace this node in the DOM with the supplied node. - * @param in the node that will will replace the existing node. + * + * @param in + * the node that will will replace the existing node. */ public void replaceWith(Node in) { Validate.notNull(in); @@ -378,17 +449,19 @@ public abstract class Node implements Cloneable { } protected void setParentNode(Node parentNode) { - if (this.parentNode != null) + if (this.parentNode != null) { this.parentNode.removeChild(this); + } this.parentNode = parentNode; } protected void replaceChild(Node out, Node in) { Validate.isTrue(out.parentNode == this); Validate.notNull(in); - if (in.parentNode != null) + if (in.parentNode != null) { in.parentNode.removeChild(in); - + } + Integer index = out.siblingIndex(); childNodes.set(index, in); in.parentNode = this; @@ -405,11 +478,12 @@ public abstract class Node implements Cloneable { } protected void addChildren(Node... children) { - //most used. short circuit addChildren(int), which hits reindex children and array copy - for (Node child: children) { + // most used. 
short circuit addChildren(int), which hits reindex + // children and array copy + for (Node child : children) { reparentChild(child); childNodes.add(child); - child.setSiblingIndex(childNodes.size()-1); + child.setSiblingIndex(childNodes.size() - 1); } } @@ -424,85 +498,100 @@ public abstract class Node implements Cloneable { } private void reparentChild(Node child) { - if (child.parentNode != null) + if (child.parentNode != null) { child.parentNode.removeChild(child); + } child.setParentNode(this); } - + private void reindexChildren() { for (int i = 0; i < childNodes.size(); i++) { childNodes.get(i).setSiblingIndex(i); } } - + /** - Retrieves this node's sibling nodes. Similar to {@link #childNodes() node.parent.childNodes()}, but does not - include this node (a node is not a sibling of itself). - @return node siblings. If the node has no parent, returns an empty list. + * Retrieves this node's sibling nodes. Similar to {@link #childNodes() + * node.parent.childNodes()}, but does not include this node (a node is not + * a sibling of itself). + * + * @return node siblings. If the node has no parent, returns an empty list. */ public List<Node> siblingNodes() { - if (parentNode == null) + if (parentNode == null) { return Collections.emptyList(); + } List<Node> nodes = parentNode.childNodes; List<Node> siblings = new ArrayList<Node>(nodes.size() - 1); - for (Node node: nodes) - if (node != this) + for (Node node : nodes) { + if (node != this) { siblings.add(node); + } + } return siblings; } /** - Get this node's next sibling. - @return next sibling, or null if this is the last sibling + * Get this node's next sibling. 
+ * + * @return next sibling, or null if this is the last sibling */ public Node nextSibling() { - if (parentNode == null) + if (parentNode == null) { return null; // root - + } + List<Node> siblings = parentNode.childNodes; Integer index = siblingIndex(); Validate.notNull(index); - if (siblings.size() > index+1) - return siblings.get(index+1); - else + if (siblings.size() > index + 1) { + return siblings.get(index + 1); + } else { return null; + } } /** - Get this node's previous sibling. - @return the previous sibling, or null if this is the first sibling + * Get this node's previous sibling. + * + * @return the previous sibling, or null if this is the first sibling */ public Node previousSibling() { - if (parentNode == null) + if (parentNode == null) { return null; // root + } List<Node> siblings = parentNode.childNodes; Integer index = siblingIndex(); Validate.notNull(index); - if (index > 0) - return siblings.get(index-1); - else + if (index > 0) { + return siblings.get(index - 1); + } else { return null; + } } /** - * Get the list index of this node in its node sibling list. I.e. if this is the first node - * sibling, returns 0. + * Get the list index of this node in its node sibling list. I.e. if this is + * the first node sibling, returns 0. + * * @return position in node sibling list * @see org.jsoup.nodes.Element#elementSiblingIndex() */ public int siblingIndex() { return siblingIndex; } - + protected void setSiblingIndex(int siblingIndex) { this.siblingIndex = siblingIndex; } /** * Perform a depth-first traversal through this node and its descendants. - * @param nodeVisitor the visitor callbacks to perform on each node + * + * @param nodeVisitor + * the visitor callbacks to perform on each node * @return this node, for chaining */ public Node traverse(NodeVisitor nodeVisitor) { @@ -513,8 +602,9 @@ public abstract class Node implements Cloneable { } /** - Get the outer HTML of this node. - @return HTML + * Get the outer HTML of this node. 
+ * + * @return HTML */ public String outerHtml() { StringBuilder accum = new StringBuilder(128); @@ -523,34 +613,47 @@ public abstract class Node implements Cloneable { } protected void outerHtml(StringBuilder accum) { - new NodeTraversor(new OuterHtmlVisitor(accum, getOutputSettings())).traverse(this); + new NodeTraversor(new OuterHtmlVisitor(accum, getOutputSettings())) + .traverse(this); } - // if this node has no document (or parent), retrieve the default output settings + // if this node has no document (or parent), retrieve the default output + // settings private Document.OutputSettings getOutputSettings() { - return ownerDocument() != null ? ownerDocument().outputSettings() : (new Document("")).outputSettings(); + return ownerDocument() != null ? ownerDocument().outputSettings() + : (new Document("")).outputSettings(); } /** - Get the outer HTML of this node. - @param accum accumulator to place HTML into + * Get the outer HTML of this node. + * + * @param accum + * accumulator to place HTML into */ - abstract void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out); + abstract void outerHtmlHead(StringBuilder accum, int depth, + Document.OutputSettings out); - abstract void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out); + abstract void outerHtmlTail(StringBuilder accum, int depth, + Document.OutputSettings out); + @Override public String toString() { return outerHtml(); } - protected void indent(StringBuilder accum, int depth, Document.OutputSettings out) { - accum.append("\n").append(StringUtil.padding(depth * out.indentAmount())); + protected void indent(StringBuilder accum, int depth, + Document.OutputSettings out) { + accum.append("\n").append( + StringUtil.padding(depth * out.indentAmount())); } @Override public boolean equals(Object o) { - if (this == o) return true; - // todo: have nodes hold a child index, compare against that and parent (not children) + if (this == o) { + return true; + } + // 
todo: have nodes hold a child index, compare against that and parent + // (not children) return false; } @@ -563,11 +666,14 @@ public abstract class Node implements Cloneable { } /** - * Create a stand-alone, deep copy of this node, and all of its children. The cloned node will have no siblings or - * parent node. As a stand-alone object, any changes made to the clone or any of its children will not impact the - * original node. + * Create a stand-alone, deep copy of this node, and all of its children. + * The cloned node will have no siblings or parent node. As a stand-alone + * object, any changes made to the clone or any of its children will not + * impact the original node. * <p> - * The cloned node may be adopted into another Document or node structure using {@link Element#appendChild(Node)}. + * The cloned node may be adopted into another Document or node structure + * using {@link Element#appendChild(Node)}. + * * @return stand-alone cloned node */ @Override @@ -588,8 +694,11 @@ public abstract class Node implements Cloneable { clone.attributes = attributes != null ? attributes.clone() : null; clone.baseUri = baseUri; clone.childNodes = new ArrayList<Node>(childNodes.size()); - for (Node child: childNodes) - clone.childNodes.add(child.doClone(clone)); // clone() creates orphans, doClone() keeps parent + for (Node child : childNodes) { + clone.childNodes.add(child.doClone(clone)); // clone() creates + // orphans, doClone() + // keeps parent + } return clone; } @@ -603,13 +712,16 @@ public abstract class Node implements Cloneable { this.out = out; } + @Override public void head(Node node, int depth) { node.outerHtmlHead(accum, depth, out); } + @Override public void tail(Node node, int depth) { - if (!node.nodeName().equals("#text")) // saves a void hit. 
+ if (!node.nodeName().equals("#text")) { node.outerHtmlTail(accum, depth, out); + } } } } diff --git a/server/src/org/jsoup/nodes/TextNode.java b/server/src/org/jsoup/nodes/TextNode.java index 9fd0feac8f..594e38593e 100644 --- a/server/src/org/jsoup/nodes/TextNode.java +++ b/server/src/org/jsoup/nodes/TextNode.java @@ -4,111 +4,142 @@ import org.jsoup.helper.StringUtil; import org.jsoup.helper.Validate; /** - A text node. - - @author Jonathan Hedley, jonathan@hedley.net */ + * A text node. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ public class TextNode extends Node { /* - TextNode is a node, and so by default comes with attributes and children. The attributes are seldom used, but use - memory, and the child nodes are never used. So we don't have them, and override accessors to attributes to create - them as needed on the fly. + * TextNode is a node, and so by default comes with attributes and children. + * The attributes are seldom used, but use memory, and the child nodes are + * never used. So we don't have them, and override accessors to attributes + * to create them as needed on the fly. */ private static final String TEXT_KEY = "text"; String text; /** - Create a new TextNode representing the supplied (unencoded) text). - - @param text raw text - @param baseUri base uri - @see #createFromEncoded(String, String) + * Create a new TextNode representing the supplied (unencoded) text). + * + * @param text + * raw text + * @param baseUri + * base uri + * @see #createFromEncoded(String, String) */ public TextNode(String text, String baseUri) { this.baseUri = baseUri; this.text = text; } + @Override public String nodeName() { return "#text"; } - + /** * Get the text content of this text node. + * * @return Unencoded, normalised text. * @see TextNode#getWholeText() */ public String text() { return normaliseWhitespace(getWholeText()); } - + /** * Set the text content of this text node. 
- * @param text unencoded text + * + * @param text + * unencoded text * @return this, for chaining */ public TextNode text(String text) { this.text = text; - if (attributes != null) + if (attributes != null) { attributes.put(TEXT_KEY, text); + } return this; } /** - Get the (unencoded) text of this text node, including any newlines and spaces present in the original. - @return text + * Get the (unencoded) text of this text node, including any newlines and + * spaces present in the original. + * + * @return text */ public String getWholeText() { return attributes == null ? text : attributes.get(TEXT_KEY); } /** - Test if this text node is blank -- that is, empty or only whitespace (including newlines). - @return true if this document is empty or only whitespace, false if it contains any text content. + * Test if this text node is blank -- that is, empty or only whitespace + * (including newlines). + * + * @return true if this document is empty or only whitespace, false if it + * contains any text content. */ public boolean isBlank() { return StringUtil.isBlank(getWholeText()); } /** - * Split this text node into two nodes at the specified string offset. After splitting, this node will contain the - * original text up to the offset, and will have a new text node sibling containing the text after the offset. - * @param offset string offset point to split node at. + * Split this text node into two nodes at the specified string offset. After + * splitting, this node will contain the original text up to the offset, and + * will have a new text node sibling containing the text after the offset. + * + * @param offset + * string offset point to split node at. * @return the newly created text node containing the text after the offset. 
*/ public TextNode splitText(int offset) { Validate.isTrue(offset >= 0, "Split offset must be not be negative"); - Validate.isTrue(offset < text.length(), "Split offset must not be greater than current text length"); + Validate.isTrue(offset < text.length(), + "Split offset must not be greater than current text length"); String head = getWholeText().substring(0, offset); String tail = getWholeText().substring(offset); text(head); - TextNode tailNode = new TextNode(tail, this.baseUri()); - if (parent() != null) - parent().addChildren(siblingIndex()+1, tailNode); + TextNode tailNode = new TextNode(tail, baseUri()); + if (parent() != null) { + parent().addChildren(siblingIndex() + 1, tailNode); + } return tailNode; } - void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { + @Override + void outerHtmlHead(StringBuilder accum, int depth, + Document.OutputSettings out) { String html = Entities.escape(getWholeText(), out); - if (out.prettyPrint() && parent() instanceof Element && !((Element) parent()).preserveWhitespace()) { + if (out.prettyPrint() && parent() instanceof Element + && !((Element) parent()).preserveWhitespace()) { html = normaliseWhitespace(html); } - if (out.prettyPrint() && siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) + if (out.prettyPrint() && siblingIndex() == 0 + && parentNode instanceof Element + && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) { indent(accum, depth, out); + } accum.append(html); } - void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} + @Override + void outerHtmlTail(StringBuilder accum, int depth, + Document.OutputSettings out) { + } + @Override public String toString() { return outerHtml(); } /** * Create a new TextNode from HTML encoded (aka escaped) data. - * @param encodedText Text containing encoded HTML (e.g. &lt;) + * + * @param encodedText + * Text containing encoded HTML (e.g. 
&lt;) * @return TextNode containing unencoded data (e.g. <) */ public static TextNode createFromEncoded(String encodedText, String baseUri) { diff --git a/server/src/org/jsoup/nodes/XmlDeclaration.java b/server/src/org/jsoup/nodes/XmlDeclaration.java index 80d4a0152f..ce6ac678a5 100644 --- a/server/src/org/jsoup/nodes/XmlDeclaration.java +++ b/server/src/org/jsoup/nodes/XmlDeclaration.java @@ -1,47 +1,60 @@ package org.jsoup.nodes; /** - An XML Declaration. - - @author Jonathan Hedley, jonathan@hedley.net */ + * An XML Declaration. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ public class XmlDeclaration extends Node { private static final String DECL_KEY = "declaration"; - private final boolean isProcessingInstruction; // <! if true, <? if false, declaration (and last data char should be ?) + private final boolean isProcessingInstruction; // <! if true, <? if false, + // declaration (and last data + // char should be ?) /** - Create a new XML declaration - @param data data - @param baseUri base uri - @param isProcessingInstruction is processing instruction + * Create a new XML declaration + * + * @param data + * data + * @param baseUri + * base uri + * @param isProcessingInstruction + * is processing instruction */ - public XmlDeclaration(String data, String baseUri, boolean isProcessingInstruction) { + public XmlDeclaration(String data, String baseUri, + boolean isProcessingInstruction) { super(baseUri); attributes.put(DECL_KEY, data); this.isProcessingInstruction = isProcessingInstruction; } + @Override public String nodeName() { return "#declaration"; } /** - Get the unencoded XML declaration. - @return XML declaration + * Get the unencoded XML declaration. + * + * @return XML declaration */ public String getWholeDeclaration() { return attributes.get(DECL_KEY); } - void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) { - accum - .append("<") - .append(isProcessingInstruction ? "!" 
: "?") - .append(getWholeDeclaration()) - .append(">"); + @Override + void outerHtmlHead(StringBuilder accum, int depth, + Document.OutputSettings out) { + accum.append("<").append(isProcessingInstruction ? "!" : "?") + .append(getWholeDeclaration()).append(">"); } - void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {} + @Override + void outerHtmlTail(StringBuilder accum, int depth, + Document.OutputSettings out) { + } + @Override public String toString() { return outerHtml(); } diff --git a/server/src/org/jsoup/parser/CharacterReader.java b/server/src/org/jsoup/parser/CharacterReader.java index b549a571a0..30fbca07f1 100644 --- a/server/src/org/jsoup/parser/CharacterReader.java +++ b/server/src/org/jsoup/parser/CharacterReader.java @@ -3,7 +3,7 @@ package org.jsoup.parser; import org.jsoup.helper.Validate; /** - CharacterReader consumes tokens off a string. To replace the old TokenQueue. + * CharacterReader consumes tokens off a string. To replace the old TokenQueue. 
*/ class CharacterReader { static final char EOF = (char) -1; @@ -15,10 +15,11 @@ class CharacterReader { CharacterReader(String input) { Validate.notNull(input); - input = input.replaceAll("\r\n?", "\n"); // normalise carriage returns to newlines + input = input.replaceAll("\r\n?", "\n"); // normalise carriage returns + // to newlines this.input = input; - this.length = input.length(); + length = input.length(); } int pos() { @@ -87,8 +88,9 @@ class CharacterReader { OUTER: while (!isEmpty()) { char c = input.charAt(pos); for (char seek : seq) { - if (seek == c) + if (seek == c) { break OUTER; + } } pos++; } @@ -106,10 +108,11 @@ class CharacterReader { int start = pos; while (!isEmpty()) { char c = input.charAt(pos); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { pos++; - else + } else { break; + } } return input.substring(start, pos); @@ -119,17 +122,19 @@ class CharacterReader { int start = pos; while (!isEmpty()) { char c = input.charAt(pos); - if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { pos++; - else + } else { break; + } } while (!isEmpty()) { char c = input.charAt(pos); - if (c >= '0' && c <= '9') + if (c >= '0' && c <= '9') { pos++; - else + } else { break; + } } return input.substring(start, pos); @@ -139,10 +144,12 @@ class CharacterReader { int start = pos; while (!isEmpty()) { char c = input.charAt(pos); - if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) + if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') + || (c >= 'a' && c <= 'f')) { pos++; - else + } else { break; + } } return input.substring(start, pos); } @@ -151,10 +158,11 @@ class CharacterReader { int start = pos; while (!isEmpty()) { char c = input.charAt(pos); - if (c >= '0' && c <= '9') + if (c >= '0' && c <= '9') { pos++; - else + } else { break; + } } return input.substring(start, pos); } @@ -173,27 +181,31 @@ class 
CharacterReader { } boolean matchesAny(char... seq) { - if (isEmpty()) + if (isEmpty()) { return false; + } char c = input.charAt(pos); for (char seek : seq) { - if (seek == c) + if (seek == c) { return true; + } } return false; } boolean matchesLetter() { - if (isEmpty()) + if (isEmpty()) { return false; + } char c = input.charAt(pos); return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } boolean matchesDigit() { - if (isEmpty()) + if (isEmpty()) { return false; + } char c = input.charAt(pos); return (c >= '0' && c <= '9'); } @@ -217,10 +229,12 @@ class CharacterReader { } boolean containsIgnoreCase(String seq) { - // used to check presence of </title>, </style>. only finds consistent case. + // used to check presence of </title>, </style>. only finds consistent + // case. String loScan = seq.toLowerCase(); String hiScan = seq.toUpperCase(); - return (input.indexOf(loScan, pos) > -1) || (input.indexOf(hiScan, pos) > -1); + return (input.indexOf(loScan, pos) > -1) + || (input.indexOf(hiScan, pos) > -1); } @Override diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilder.java b/server/src/org/jsoup/parser/HtmlTreeBuilder.java index 457a4c3249..f09ab8794c 100644 --- a/server/src/org/jsoup/parser/HtmlTreeBuilder.java +++ b/server/src/org/jsoup/parser/HtmlTreeBuilder.java @@ -1,15 +1,20 @@ package org.jsoup.parser; -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.*; - import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import org.jsoup.helper.DescendableLinkedList; +import org.jsoup.helper.StringUtil; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Comment; +import org.jsoup.nodes.DataNode; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; + /** * HTML Tree Builder; creates a DOM from Tokens. 
*/ @@ -21,15 +26,26 @@ class HtmlTreeBuilder extends TreeBuilder { private boolean baseUriSetFromDoc = false; private Element headElement; // the current head element private Element formElement; // the current form element - private Element contextElement; // fragment parse context -- could be null even if fragment parsing - private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active (open) formatting elements - private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars in table to be shifted out + private Element contextElement; // fragment parse context -- could be null + // even if fragment parsing + private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active + // (open) + // formatting + // elements + private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars + // in + // table + // to + // be + // shifted + // out private boolean framesetOk = true; // if ok to go into frameset private boolean fosterInserts = false; // if next inserts should be fostered private boolean fragmentParsing = false; // if parsing a fragment of html - HtmlTreeBuilder() {} + HtmlTreeBuilder() { + } @Override Document parse(String input, String baseUri, ParseErrorList errors) { @@ -37,7 +53,8 @@ class HtmlTreeBuilder extends TreeBuilder { return super.parse(input, baseUri, errors); } - List<Node> parseFragment(String inputFragment, Element context, String baseUri, ParseErrorList errors) { + List<Node> parseFragment(String inputFragment, Element context, + String baseUri, ParseErrorList errors) { // context may be null state = HtmlTreeBuilderState.Initial; initialiseParse(inputFragment, baseUri, errors); @@ -46,42 +63,48 @@ class HtmlTreeBuilder extends TreeBuilder { Element root = null; if (context != null) { - if (context.ownerDocument() != null) // quirks setup: + if (context.ownerDocument() != null) { 
doc.quirksMode(context.ownerDocument().quirksMode()); + } // initialise the tokeniser state: String contextTag = context.tagName(); - if (StringUtil.in(contextTag, "title", "textarea")) + if (StringUtil.in(contextTag, "title", "textarea")) { tokeniser.transition(TokeniserState.Rcdata); - else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp")) + } else if (StringUtil.in(contextTag, "iframe", "noembed", + "noframes", "style", "xmp")) { tokeniser.transition(TokeniserState.Rawtext); - else if (contextTag.equals("script")) + } else if (contextTag.equals("script")) { tokeniser.transition(TokeniserState.ScriptData); - else if (contextTag.equals(("noscript"))) - tokeniser.transition(TokeniserState.Data); // if scripting enabled, rawtext - else if (contextTag.equals("plaintext")) + } else if (contextTag.equals(("noscript"))) { + tokeniser.transition(TokeniserState.Data); // if scripting + // enabled, rawtext + } else if (contextTag.equals("plaintext")) { tokeniser.transition(TokeniserState.Data); - else + } else { tokeniser.transition(TokeniserState.Data); // default + } root = new Element(Tag.valueOf("html"), baseUri); doc.appendChild(root); stack.push(root); resetInsertionMode(); - // todo: setup form element to nearest form on context (up ancestor chain) + // todo: setup form element to nearest form on context (up ancestor + // chain) } runParser(); - if (context != null) + if (context != null) { return root.childNodes(); - else + } else { return doc.childNodes(); + } } @Override protected boolean process(Token token) { currentToken = token; - return this.state.process(token, this); + return state.process(token, this); } boolean process(Token token, HtmlTreeBuilderState state) { @@ -122,14 +145,17 @@ class HtmlTreeBuilder extends TreeBuilder { } void maybeSetBaseUri(Element base) { - if (baseUriSetFromDoc) // only listen to the first <base href> in parse + if (baseUriSetFromDoc) { return; + } String href = base.absUrl("href"); if 
(href.length() != 0) { // ignore <base target> etc baseUri = href; baseUriSetFromDoc = true; - doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants + doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) + // will get updated base, and to update all + // descendants } } @@ -138,20 +164,26 @@ class HtmlTreeBuilder extends TreeBuilder { } void error(HtmlTreeBuilderState state) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Unexpected token [%s] when in state [%s]", currentToken.tokenType(), state)); + if (errors.canAddError()) { + errors.add(new ParseError(reader.pos(), + "Unexpected token [%s] when in state [%s]", currentToken + .tokenType(), state)); + } } Element insert(Token.StartTag startTag) { // handle empty unknown tags - // when the spec expects an empty tag, will directly hit insertEmpty, so won't generate fake end tag. + // when the spec expects an empty tag, will directly hit insertEmpty, so + // won't generate fake end tag. 
if (startTag.isSelfClosing() && !Tag.isKnownTag(startTag.name())) { Element el = insertEmpty(startTag); - process(new Token.EndTag(el.tagName())); // ensure we get out of whatever state we are in + process(new Token.EndTag(el.tagName())); // ensure we get out of + // whatever state we are in return el; } - - Element el = new Element(Tag.valueOf(startTag.name()), baseUri, startTag.attributes); + + Element el = new Element(Tag.valueOf(startTag.name()), baseUri, + startTag.attributes); insert(el); return el; } @@ -173,8 +205,9 @@ class HtmlTreeBuilder extends TreeBuilder { insertNode(el); if (startTag.isSelfClosing()) { tokeniser.acknowledgeSelfClosingFlag(); - if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output + if (!tag.isKnownTag()) { tag.setSelfClosing(); + } } return el; } @@ -187,29 +220,37 @@ class HtmlTreeBuilder extends TreeBuilder { void insert(Token.Character characterToken) { Node node; // characters in script and style go in as datanodes, not text nodes - if (StringUtil.in(currentElement().tagName(), "script", "style")) + if (StringUtil.in(currentElement().tagName(), "script", "style")) { node = new DataNode(characterToken.getData(), baseUri); - else + } else { node = new TextNode(characterToken.getData(), baseUri); - currentElement().appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. + } + currentElement().appendChild(node); // doesn't use insertNode, because + // we don't foster these; and will + // always have a stack. 
} private void insertNode(Node node) { - // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc - if (stack.size() == 0) + // if the stack hasn't been set up yet, elements (doctype, comments) go + // into the doc + if (stack.size() == 0) { doc.appendChild(node); - else if (isFosterInserts()) + } else if (isFosterInserts()) { insertInFosterParent(node); - else + } else { currentElement().appendChild(node); + } } Element pop() { // todo - dev, remove validation check - if (stack.peekLast().nodeName().equals("td") && !state.name().equals("InCell")) + if (stack.peekLast().nodeName().equals("td") + && !state.name().equals("InCell")) { Validate.isFalse(true, "pop td not in cell"); - if (stack.peekLast().nodeName().equals("html")) + } + if (stack.peekLast().nodeName().equals("html")) { Validate.isFalse(true, "popping html!"); + } return stack.pollLast(); } @@ -225,7 +266,8 @@ class HtmlTreeBuilder extends TreeBuilder { return isElementInQueue(stack, el); } - private boolean isElementInQueue(DescendableLinkedList<Element> queue, Element element) { + private boolean isElementInQueue(DescendableLinkedList<Element> queue, + Element element) { Iterator<Element> it = queue.descendingIterator(); while (it.hasNext()) { Element next = it.next(); @@ -313,10 +355,12 @@ class HtmlTreeBuilder extends TreeBuilder { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); - if (StringUtil.in(next.nodeName(), nodeNames) || next.nodeName().equals("html")) + if (StringUtil.in(next.nodeName(), nodeNames) + || next.nodeName().equals("html")) { break; - else + } else { it.remove(); + } } } @@ -335,14 +379,15 @@ class HtmlTreeBuilder extends TreeBuilder { void insertOnStackAfter(Element after, Element in) { int i = stack.lastIndexOf(after); Validate.isTrue(i != -1); - stack.add(i+1, in); + stack.add(i + 1, in); } void replaceOnStack(Element out, Element in) { replaceInQueue(stack, out, in); } - private void 
replaceInQueue(LinkedList<Element> queue, Element out, Element in) { + private void replaceInQueue(LinkedList<Element> queue, Element out, + Element in) { int i = queue.lastIndexOf(out); Validate.isTrue(i != -1); queue.remove(i); @@ -368,7 +413,8 @@ class HtmlTreeBuilder extends TreeBuilder { } else if ("tr".equals(name)) { transition(HtmlTreeBuilderState.InRow); break; - } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) { + } else if ("tbody".equals(name) || "thead".equals(name) + || "tfoot".equals(name)) { transition(HtmlTreeBuilderState.InTableBody); break; } else if ("caption".equals(name)) { @@ -400,28 +446,35 @@ class HtmlTreeBuilder extends TreeBuilder { } // todo: tidy up in specific scope methods - private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { - return inSpecificScope(new String[]{targetName}, baseTypes, extraTypes); + private boolean inSpecificScope(String targetName, String[] baseTypes, + String[] extraTypes) { + return inSpecificScope(new String[] { targetName }, baseTypes, + extraTypes); } - private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) { + private boolean inSpecificScope(String[] targetNames, String[] baseTypes, + String[] extraTypes) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element el = it.next(); String elName = el.nodeName(); - if (StringUtil.in(elName, targetNames)) + if (StringUtil.in(elName, targetNames)) { return true; - if (StringUtil.in(elName, baseTypes)) + } + if (StringUtil.in(elName, baseTypes)) { return false; - if (extraTypes != null && StringUtil.in(elName, extraTypes)) + } + if (extraTypes != null && StringUtil.in(elName, extraTypes)) { return false; + } } Validate.fail("Should not be reachable"); return false; } boolean inScope(String[] targetNames) { - return inSpecificScope(targetNames, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}, 
null); + return inSpecificScope(targetNames, new String[] { "applet", "caption", + "html", "table", "td", "th", "marquee", "object" }, null); } boolean inScope(String targetName) { @@ -429,21 +482,23 @@ class HtmlTreeBuilder extends TreeBuilder { } boolean inScope(String targetName, String[] extras) { - return inSpecificScope(targetName, new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}, extras); + return inSpecificScope(targetName, new String[] { "applet", "caption", + "html", "table", "td", "th", "marquee", "object" }, extras); // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml // todo: in svg namespace: forignOjbect, desc, title } boolean inListItemScope(String targetName) { - return inScope(targetName, new String[]{"ol", "ul"}); + return inScope(targetName, new String[] { "ol", "ul" }); } boolean inButtonScope(String targetName) { - return inScope(targetName, new String[]{"button"}); + return inScope(targetName, new String[] { "button" }); } boolean inTableScope(String targetName) { - return inSpecificScope(targetName, new String[]{"html", "table"}, null); + return inSpecificScope(targetName, new String[] { "html", "table" }, + null); } boolean inSelectScope(String targetName) { @@ -451,10 +506,12 @@ class HtmlTreeBuilder extends TreeBuilder { while (it.hasNext()) { Element el = it.next(); String elName = el.nodeName(); - if (elName.equals(targetName)) + if (elName.equals(targetName)) { return true; - if (!StringUtil.in(elName, "optgroup", "option")) // all elements except + } + if (!StringUtil.in(elName, "optgroup", "option")) { return false; + } } Validate.fail("Should not be reachable"); return false; @@ -497,18 +554,26 @@ class HtmlTreeBuilder extends TreeBuilder { } /** - 11.2.5.2 Closing elements that have implied end tags<p/> - When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a - dt element, an li element, an option element, an optgroup 
element, a p element, an rp element, or an rt element, - the UA must pop the current node off the stack of open elements. - - @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the - process, then the UA must perform the above steps as if that element was not in the above list. + * 11.2.5.2 Closing elements that have implied end tags + * <p/> + * When the steps below require the UA to generate implied end tags, then, + * while the current node is a dd element, a dt element, an li element, an + * option element, an optgroup element, a p element, an rp element, or an rt + * element, the UA must pop the current node off the stack of open elements. + * + * @param excludeTag + * If a step requires the UA to generate implied end tags but + * lists an element to exclude from the process, then the UA must + * perform the above steps as if that element was not in the + * above list. */ void generateImpliedEndTags(String excludeTag) { - while ((excludeTag != null && !currentElement().nodeName().equals(excludeTag)) && - StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) + while ((excludeTag != null && !currentElement().nodeName().equals( + excludeTag)) + && StringUtil.in(currentElement().nodeName(), "dd", "dt", "li", + "option", "optgroup", "p", "rp", "rt")) { pop(); + } } void generateImpliedEndTags() { @@ -519,14 +584,18 @@ class HtmlTreeBuilder extends TreeBuilder { // todo: mathml's mi, mo, mn // todo: svg's foreigObject, desc, title String name = el.nodeName(); - return StringUtil.in(name, "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", - "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd", - "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", - "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", - 
"iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav", - "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script", - "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", - "title", "tr", "ul", "wbr", "xmp"); + return StringUtil.in(name, "address", "applet", "area", "article", + "aside", "base", "basefont", "bgsound", "blockquote", "body", + "br", "button", "caption", "center", "col", "colgroup", + "command", "dd", "details", "dir", "div", "dl", "dt", "embed", + "fieldset", "figcaption", "figure", "footer", "form", "frame", + "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", + "header", "hgroup", "hr", "html", "iframe", "img", "input", + "isindex", "li", "link", "listing", "marquee", "menu", "meta", + "nav", "noembed", "noframes", "noscript", "object", "ol", "p", + "param", "plaintext", "pre", "script", "section", "select", + "style", "summary", "table", "tbody", "td", "textarea", + "tfoot", "th", "thead", "title", "tr", "ul", "wbr", "xmp"); } // active formatting elements @@ -534,12 +603,14 @@ class HtmlTreeBuilder extends TreeBuilder { int numSeen = 0; Iterator<Element> iter = formattingElements.descendingIterator(); while (iter.hasNext()) { - Element el = iter.next(); - if (el == null) // marker + Element el = iter.next(); + if (el == null) { break; + } - if (isSameFormattingElement(in, el)) + if (isSameFormattingElement(in, el)) { numSeen++; + } if (numSeen == 3) { iter.remove(); @@ -550,17 +621,20 @@ class HtmlTreeBuilder extends TreeBuilder { } private boolean isSameFormattingElement(Element a, Element b) { - // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children + // same if: same namespace, tag, and attributes. 
Element.equals only + // checks tag, might in future check children return a.nodeName().equals(b.nodeName()) && - // a.namespace().equals(b.namespace()) && + // a.namespace().equals(b.namespace()) && a.attributes().equals(b.attributes()); // todo: namespaces } void reconstructFormattingElements() { int size = formattingElements.size(); - if (size == 0 || formattingElements.getLast() == null || onStack(formattingElements.getLast())) + if (size == 0 || formattingElements.getLast() == null + || onStack(formattingElements.getLast())) { return; + } Element entry = formattingElements.getLast(); int pos = size - 1; @@ -570,18 +644,24 @@ class HtmlTreeBuilder extends TreeBuilder { skip = true; break; } - entry = formattingElements.get(--pos); // step 5. one earlier than entry - if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack + entry = formattingElements.get(--pos); // step 5. one earlier than + // entry + if (entry == null || onStack(entry)) { break; // jump to 8, else continue back to 4 + } } - while(true) { - if (!skip) // step 7: on later than entry + while (true) { + if (!skip) { entry = formattingElements.get(++pos); - Validate.notNull(entry); // should not occur, as we break at last element + } + Validate.notNull(entry); // should not occur, as we break at last + // element - // 8. create new element from element, 9 insert into current node, onto stack + // 8. create new element from element, 9 insert into current node, + // onto stack skip = false; // can only skip increment from 4. - Element newEl = insert(entry.nodeName()); // todo: avoid fostering here? + Element newEl = insert(entry.nodeName()); // todo: avoid fostering + // here? 
// newEl.namespace(entry.namespace()); // todo: namespaces newEl.attributes().addAll(entry.attributes()); @@ -590,8 +670,9 @@ class HtmlTreeBuilder extends TreeBuilder { formattingElements.remove(pos + 1); // 11 - if (pos == size-1) // if not last entry in list, jump to 7 + if (pos == size - 1) { break; + } } } @@ -599,8 +680,9 @@ class HtmlTreeBuilder extends TreeBuilder { while (!formattingElements.isEmpty()) { Element el = formattingElements.peekLast(); formattingElements.removeLast(); - if (el == null) + if (el == null) { break; + } } } @@ -623,10 +705,11 @@ class HtmlTreeBuilder extends TreeBuilder { Iterator<Element> it = formattingElements.descendingIterator(); while (it.hasNext()) { Element next = it.next(); - if (next == null) // scope marker + if (next == null) { break; - else if (next.nodeName().equals(nodeName)) + } else if (next.nodeName().equals(nodeName)) { return next; + } } return null; } @@ -647,26 +730,25 @@ class HtmlTreeBuilder extends TreeBuilder { if (lastTable.parent() != null) { fosterParent = lastTable.parent(); isLastTableParent = true; - } else + } else { fosterParent = aboveOnStack(lastTable); + } } else { // no table == frag fosterParent = stack.get(0); } if (isLastTableParent) { - Validate.notNull(lastTable); // last table cannot be null by this point. + Validate.notNull(lastTable); // last table cannot be null by this + // point. 
lastTable.before(in); - } - else + } else { fosterParent.appendChild(in); + } } @Override public String toString() { - return "TreeBuilder{" + - "currentToken=" + currentToken + - ", state=" + state + - ", currentElement=" + currentElement() + - '}'; + return "TreeBuilder{" + "currentToken=" + currentToken + ", state=" + + state + ", currentElement=" + currentElement() + '}'; } } diff --git a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java b/server/src/org/jsoup/parser/HtmlTreeBuilderState.java index ceab9faa5a..258d547a49 100644 --- a/server/src/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/server/src/org/jsoup/parser/HtmlTreeBuilderState.java @@ -1,17 +1,24 @@ package org.jsoup.parser; -import org.jsoup.helper.DescendableLinkedList; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.*; - import java.util.Iterator; import java.util.LinkedList; +import org.jsoup.helper.DescendableLinkedList; +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.DocumentType; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; + /** - * The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states. + * The Tree Builder's current state. Each state embodies the processing for the + * state, and transitions to other states. 
*/ enum HtmlTreeBuilderState { Initial { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return true; // ignore whitespace @@ -21,10 +28,13 @@ enum HtmlTreeBuilderState { // todo: parse error check on expected doctypes // todo: quirk state check on doctype ids Token.Doctype d = t.asDoctype(); - DocumentType doctype = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri()); + DocumentType doctype = new DocumentType(d.getName(), + d.getPublicIdentifier(), d.getSystemIdentifier(), + tb.getBaseUri()); tb.getDocument().appendChild(doctype); - if (d.isForceQuirks()) + if (d.isForceQuirks()) { tb.getDocument().quirksMode(Document.QuirksMode.quirks); + } tb.transition(BeforeHtml); } else { // todo: check not iframe srcdoc @@ -35,6 +45,7 @@ enum HtmlTreeBuilderState { } }, BeforeHtml { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isDoctype()) { tb.error(this); @@ -46,7 +57,9 @@ enum HtmlTreeBuilderState { } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { tb.insert(t.asStartTag()); tb.transition(BeforeHead); - } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { + } else if (t.isEndTag() + && (StringUtil.in(t.asEndTag().name(), "head", "body", + "html", "br"))) { return anythingElse(t, tb); } else if (t.isEndTag()) { tb.error(this); @@ -64,6 +77,7 @@ enum HtmlTreeBuilderState { } }, BeforeHead { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return true; @@ -78,7 +92,9 @@ enum HtmlTreeBuilderState { Element head = tb.insert(t.asStartTag()); tb.setHeadElement(head); tb.transition(InHead); - } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { + } else if (t.isEndTag() + && (StringUtil.in(t.asEndTag().name(), "head", "body", + "html", "br"))) { tb.process(new Token.StartTag("head")); return tb.process(t); } else if (t.isEndTag()) { @@ -92,67 
+108,71 @@ enum HtmlTreeBuilderState { } }, InHead { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); return true; } switch (t.type) { - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + return false; + case StartTag: + Token.StartTag start = t.asStartTag(); + String name = start.name(); + if (name.equals("html")) { + return InBody.process(t, tb); + } else if (StringUtil.in(name, "base", "basefont", "bgsound", + "command", "link")) { + Element el = tb.insertEmpty(start); + // jsoup special: update base the frist time it is seen + if (name.equals("base") && el.hasAttr("href")) { + tb.maybeSetBaseUri(el); + } + } else if (name.equals("meta")) { + Element meta = tb.insertEmpty(start); + // todo: charset switches + } else if (name.equals("title")) { + handleRcData(start, tb); + } else if (StringUtil.in(name, "noframes", "style")) { + handleRawtext(start, tb); + } else if (name.equals("noscript")) { + // else if noscript && scripting flag = true: rawtext (jsoup + // doesn't run script, to handle as noscript) + tb.insert(start); + tb.transition(InHeadNoscript); + } else if (name.equals("script")) { + // skips some script rules as won't execute them + tb.insert(start); + tb.tokeniser.transition(TokeniserState.ScriptData); + tb.markInsertionMode(); + tb.transition(Text); + } else if (name.equals("head")) { tb.error(this); return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) { - return InBody.process(t, tb); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) { - Element el = tb.insertEmpty(start); - // jsoup special: update base the frist time it is seen - if (name.equals("base") && el.hasAttr("href")) - tb.maybeSetBaseUri(el); - } else if (name.equals("meta")) { - Element meta = tb.insertEmpty(start); 
- // todo: charset switches - } else if (name.equals("title")) { - handleRcData(start, tb); - } else if (StringUtil.in(name, "noframes", "style")) { - handleRawtext(start, tb); - } else if (name.equals("noscript")) { - // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript) - tb.insert(start); - tb.transition(InHeadNoscript); - } else if (name.equals("script")) { - // skips some script rules as won't execute them - tb.insert(start); - tb.tokeniser.transition(TokeniserState.ScriptData); - tb.markInsertionMode(); - tb.transition(Text); - } else if (name.equals("head")) { - tb.error(this); - return false; - } else { - return anythingElse(t, tb); - } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("head")) { - tb.pop(); - tb.transition(AfterHead); - } else if (StringUtil.in(name, "body", "html", "br")) { - return anythingElse(t, tb); - } else { - tb.error(this); - return false; - } - break; - default: + } else { return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag end = t.asEndTag(); + name = end.name(); + if (name.equals("head")) { + tb.pop(); + tb.transition(AfterHead); + } else if (StringUtil.in(name, "body", "html", "br")) { + return anythingElse(t, tb); + } else { + tb.error(this); + return false; + } + break; + default: + return anythingElse(t, tb); } return true; } @@ -163,6 +183,7 @@ enum HtmlTreeBuilderState { } }, InHeadNoscript { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isDoctype()) { tb.error(this); @@ -171,12 +192,16 @@ enum HtmlTreeBuilderState { } else if (t.isEndTag() && t.asEndTag().name().equals("noscript")) { tb.pop(); tb.transition(InHead); - } else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "basefont", "bgsound", "link", "meta", "noframes", "style"))) { + } else if (isWhitespace(t) + || t.isComment() + || (t.isStartTag() && 
StringUtil.in(t.asStartTag().name(), + "basefont", "bgsound", "link", "meta", "noframes", + "style"))) { return tb.process(t, InHead); } else if (t.isEndTag() && t.asEndTag().name().equals("br")) { return anythingElse(t, tb); - } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), "head", "noscript")) || t.isEndTag()) { + } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), + "head", "noscript")) || t.isEndTag()) { tb.error(this); return false; } else { @@ -192,6 +217,7 @@ enum HtmlTreeBuilderState { } }, AfterHead { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); @@ -211,7 +237,8 @@ enum HtmlTreeBuilderState { } else if (name.equals("frameset")) { tb.insert(startTag); tb.transition(InFrameset); - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) { + } else if (StringUtil.in(name, "base", "basefont", "bgsound", + "link", "meta", "noframes", "script", "style", "title")) { tb.error(this); Element head = tb.getHeadElement(); tb.push(head); @@ -243,519 +270,604 @@ enum HtmlTreeBuilderState { } }, InBody { + @Override boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { - case Character: { - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - // todo confirm that check - tb.error(this); - return false; - } else if (isWhitespace(c)) { - tb.reconstructFormattingElements(); - tb.insert(c); - } else { - tb.reconstructFormattingElements(); - tb.insert(c); - tb.framesetOk(false); - } - break; - } - case Comment: { - tb.insert(t.asComment()); - break; - } - case Doctype: { + case Character: { + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { + // todo confirm that check tb.error(this); return false; + } else if (isWhitespace(c)) { + tb.reconstructFormattingElements(); + tb.insert(c); + } else { + tb.reconstructFormattingElements(); + tb.insert(c); + 
tb.framesetOk(false); } - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) { - tb.error(this); - // merge attributes onto real html - Element html = tb.getStack().getFirst(); - for (Attribute attribute : startTag.getAttributes()) { - if (!html.hasAttr(attribute.getKey())) - html.attributes().put(attribute); - } - } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title")) { - return tb.process(t, InHead); - } else if (name.equals("body")) { - tb.error(this); - LinkedList<Element> stack = tb.getStack(); - if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { - // only in fragment case - return false; // ignore - } else { - tb.framesetOk(false); - Element body = stack.get(1); - for (Attribute attribute : startTag.getAttributes()) { - if (!body.hasAttr(attribute.getKey())) - body.attributes().put(attribute); - } - } - } else if (name.equals("frameset")) { - tb.error(this); - LinkedList<Element> stack = tb.getStack(); - if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName().equals("body"))) { - // only in fragment case - return false; // ignore - } else if (!tb.framesetOk()) { - return false; // ignore frameset - } else { - Element second = stack.get(1); - if (second.parent() != null) - second.remove(); - // pop up to html element - while (stack.size() > 1) - stack.removeLast(); - tb.insert(startTag); - tb.transition(InFrameset); - } - } else if (StringUtil.in(name, - "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", - "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", - "p", "section", "summary", "ul")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { - if (tb.inButtonScope("p")) { - 
tb.process(new Token.EndTag("p")); - } - if (StringUtil.in(tb.currentElement().nodeName(), "h1", "h2", "h3", "h4", "h5", "h6")) { - tb.error(this); - tb.pop(); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "pre", "listing")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - // todo: ignore LF if next token - tb.framesetOk(false); - } else if (name.equals("form")) { - if (tb.getFormElement() != null) { - tb.error(this); - return false; - } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); + break; + } + case Comment: { + tb.insert(t.asComment()); + break; + } + case Doctype: { + tb.error(this); + return false; + } + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("html")) { + tb.error(this); + // merge attributes onto real html + Element html = tb.getStack().getFirst(); + for (Attribute attribute : startTag.getAttributes()) { + if (!html.hasAttr(attribute.getKey())) { + html.attributes().put(attribute); } - Element form = tb.insert(startTag); - tb.setFormElement(form); - } else if (name.equals("li")) { + } + } else if (StringUtil.in(name, "base", "basefont", "bgsound", + "command", "link", "meta", "noframes", "script", + "style", "title")) { + return tb.process(t, InHead); + } else if (name.equals("body")) { + tb.error(this); + LinkedList<Element> stack = tb.getStack(); + if (stack.size() == 1 + || (stack.size() > 2 && !stack.get(1).nodeName() + .equals("body"))) { + // only in fragment case + return false; // ignore + } else { tb.framesetOk(false); - LinkedList<Element> stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (el.nodeName().equals("li")) { - tb.process(new Token.EndTag("li")); - break; + Element body = stack.get(1); + for (Attribute attribute : startTag.getAttributes()) { + if (!body.hasAttr(attribute.getKey())) { + body.attributes().put(attribute); } - if 
(tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) - break; } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insert(startTag); - } else if (StringUtil.in(name, "dd", "dt")) { - tb.framesetOk(false); - LinkedList<Element> stack = tb.getStack(); - for (int i = stack.size() - 1; i > 0; i--) { - Element el = stack.get(i); - if (StringUtil.in(el.nodeName(), "dd", "dt")) { - tb.process(new Token.EndTag(el.nodeName())); - break; - } - if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), "address", "div", "p")) - break; + } + } else if (name.equals("frameset")) { + tb.error(this); + LinkedList<Element> stack = tb.getStack(); + if (stack.size() == 1 + || (stack.size() > 2 && !stack.get(1).nodeName() + .equals("body"))) { + // only in fragment case + return false; // ignore + } else if (!tb.framesetOk()) { + return false; // ignore frameset + } else { + Element second = stack.get(1); + if (second.parent() != null) { + second.remove(); } - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); + // pop up to html element + while (stack.size() > 1) { + stack.removeLast(); } tb.insert(startTag); - } else if (name.equals("plaintext")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); + tb.transition(InFrameset); + } + } else if (StringUtil.in(name, "address", "article", "aside", + "blockquote", "center", "details", "dir", "div", "dl", + "fieldset", "figcaption", "figure", "footer", "header", + "hgroup", "menu", "nav", "ol", "p", "section", + "summary", "ul")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", + "h6")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + if (StringUtil.in(tb.currentElement().nodeName(), "h1", + "h2", "h3", "h4", "h5", "h6")) { + tb.error(this); + tb.pop(); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "pre", 
"listing")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + // todo: ignore LF if next token + tb.framesetOk(false); + } else if (name.equals("form")) { + if (tb.getFormElement() != null) { + tb.error(this); + return false; + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + Element form = tb.insert(startTag); + tb.setFormElement(form); + } else if (name.equals("li")) { + tb.framesetOk(false); + LinkedList<Element> stack = tb.getStack(); + for (int i = stack.size() - 1; i > 0; i--) { + Element el = stack.get(i); + if (el.nodeName().equals("li")) { + tb.process(new Token.EndTag("li")); + break; } - tb.insert(startTag); - tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once in, never gets out - } else if (name.equals("button")) { - if (tb.inButtonScope("button")) { - // close and reprocess - tb.error(this); - tb.process(new Token.EndTag("button")); - tb.process(startTag); - } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); + if (tb.isSpecial(el) + && !StringUtil.in(el.nodeName(), "address", + "div", "p")) { + break; } - } else if (name.equals("a")) { - if (tb.getActiveFormattingElement("a") != null) { - tb.error(this); - tb.process(new Token.EndTag("a")); - - // still on stack? 
- Element remainingA = tb.getFromStack("a"); - if (remainingA != null) { - tb.removeFromActiveFormattingElements(remainingA); - tb.removeFromStack(remainingA); - } + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (StringUtil.in(name, "dd", "dt")) { + tb.framesetOk(false); + LinkedList<Element> stack = tb.getStack(); + for (int i = stack.size() - 1; i > 0; i--) { + Element el = stack.get(i); + if (StringUtil.in(el.nodeName(), "dd", "dt")) { + tb.process(new Token.EndTag(el.nodeName())); + break; } - tb.reconstructFormattingElements(); - Element a = tb.insert(startTag); - tb.pushActiveFormattingElements(a); - } else if (StringUtil.in(name, - "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u")) { - tb.reconstructFormattingElements(); - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (name.equals("nobr")) { - tb.reconstructFormattingElements(); - if (tb.inScope("nobr")) { - tb.error(this); - tb.process(new Token.EndTag("nobr")); - tb.reconstructFormattingElements(); + if (tb.isSpecial(el) + && !StringUtil.in(el.nodeName(), "address", + "div", "p")) { + break; } - Element el = tb.insert(startTag); - tb.pushActiveFormattingElements(el); - } else if (StringUtil.in(name, "applet", "marquee", "object")) { + } + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + } else if (name.equals("plaintext")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once + // in, + // never + // gets + // out + } else if (name.equals("button")) { + if (tb.inButtonScope("button")) { + // close and reprocess + tb.error(this); + tb.process(new Token.EndTag("button")); + tb.process(startTag); + } else { tb.reconstructFormattingElements(); tb.insert(startTag); - tb.insertMarkerToFormattingElements(); tb.framesetOk(false); - } 
else if (name.equals("table")) { - if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); + } + } else if (name.equals("a")) { + if (tb.getActiveFormattingElement("a") != null) { + tb.error(this); + tb.process(new Token.EndTag("a")); + + // still on stack? + Element remainingA = tb.getFromStack("a"); + if (remainingA != null) { + tb.removeFromActiveFormattingElements(remainingA); + tb.removeFromStack(remainingA); } - tb.insert(startTag); - tb.framesetOk(false); - tb.transition(InTable); - } else if (StringUtil.in(name, "area", "br", "embed", "img", "keygen", "wbr")) { - tb.reconstructFormattingElements(); - tb.insertEmpty(startTag); - tb.framesetOk(false); - } else if (name.equals("input")) { + } + tb.reconstructFormattingElements(); + Element a = tb.insert(startTag); + tb.pushActiveFormattingElements(a); + } else if (StringUtil.in(name, "b", "big", "code", "em", + "font", "i", "s", "small", "strike", "strong", "tt", + "u")) { + tb.reconstructFormattingElements(); + Element el = tb.insert(startTag); + tb.pushActiveFormattingElements(el); + } else if (name.equals("nobr")) { + tb.reconstructFormattingElements(); + if (tb.inScope("nobr")) { + tb.error(this); + tb.process(new Token.EndTag("nobr")); tb.reconstructFormattingElements(); - Element el = tb.insertEmpty(startTag); - if (!el.attr("type").equalsIgnoreCase("hidden")) - tb.framesetOk(false); - } else if (StringUtil.in(name, "param", "source", "track")) { - tb.insertEmpty(startTag); - } else if (name.equals("hr")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.insertEmpty(startTag); + } + Element el = tb.insert(startTag); + tb.pushActiveFormattingElements(el); + } else if (StringUtil.in(name, "applet", "marquee", "object")) { + tb.reconstructFormattingElements(); + tb.insert(startTag); + tb.insertMarkerToFormattingElements(); + tb.framesetOk(false); + } else if (name.equals("table")) { + if 
(tb.getDocument().quirksMode() != Document.QuirksMode.quirks + && tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insert(startTag); + tb.framesetOk(false); + tb.transition(InTable); + } else if (StringUtil.in(name, "area", "br", "embed", "img", + "keygen", "wbr")) { + tb.reconstructFormattingElements(); + tb.insertEmpty(startTag); + tb.framesetOk(false); + } else if (name.equals("input")) { + tb.reconstructFormattingElements(); + Element el = tb.insertEmpty(startTag); + if (!el.attr("type").equalsIgnoreCase("hidden")) { tb.framesetOk(false); - } else if (name.equals("image")) { - // we're not supposed to ask. - startTag.name("img"); - return tb.process(startTag); - } else if (name.equals("isindex")) { - // how much do we care about the early 90s? - tb.error(this); - if (tb.getFormElement() != null) - return false; + } + } else if (StringUtil.in(name, "param", "source", "track")) { + tb.insertEmpty(startTag); + } else if (name.equals("hr")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.insertEmpty(startTag); + tb.framesetOk(false); + } else if (name.equals("image")) { + // we're not supposed to ask. + startTag.name("img"); + return tb.process(startTag); + } else if (name.equals("isindex")) { + // how much do we care about the early 90s? + tb.error(this); + if (tb.getFormElement() != null) { + return false; + } - tb.tokeniser.acknowledgeSelfClosingFlag(); - tb.process(new Token.StartTag("form")); - if (startTag.attributes.hasKey("action")) { - Element form = tb.getFormElement(); - form.attr("action", startTag.attributes.get("action")); - } - tb.process(new Token.StartTag("hr")); - tb.process(new Token.StartTag("label")); - // hope you like english. - String prompt = startTag.attributes.hasKey("prompt") ? - startTag.attributes.get("prompt") : - "This is a searchable index. 
Enter search keywords: "; + tb.tokeniser.acknowledgeSelfClosingFlag(); + tb.process(new Token.StartTag("form")); + if (startTag.attributes.hasKey("action")) { + Element form = tb.getFormElement(); + form.attr("action", startTag.attributes.get("action")); + } + tb.process(new Token.StartTag("hr")); + tb.process(new Token.StartTag("label")); + // hope you like english. + String prompt = startTag.attributes.hasKey("prompt") ? startTag.attributes + .get("prompt") + : "This is a searchable index. Enter search keywords: "; - tb.process(new Token.Character(prompt)); + tb.process(new Token.Character(prompt)); - // input - Attributes inputAttribs = new Attributes(); - for (Attribute attr : startTag.attributes) { - if (!StringUtil.in(attr.getKey(), "name", "action", "prompt")) - inputAttribs.put(attr); + // input + Attributes inputAttribs = new Attributes(); + for (Attribute attr : startTag.attributes) { + if (!StringUtil.in(attr.getKey(), "name", "action", + "prompt")) { + inputAttribs.put(attr); } - inputAttribs.put("name", "isindex"); - tb.process(new Token.StartTag("input", inputAttribs)); - tb.process(new Token.EndTag("label")); - tb.process(new Token.StartTag("hr")); - tb.process(new Token.EndTag("form")); - } else if (name.equals("textarea")) { - tb.insert(startTag); - // todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.) 
- tb.tokeniser.transition(TokeniserState.Rcdata); - tb.markInsertionMode(); - tb.framesetOk(false); - tb.transition(Text); - } else if (name.equals("xmp")) { - if (tb.inButtonScope("p")) { - tb.process(new Token.EndTag("p")); - } - tb.reconstructFormattingElements(); - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("iframe")) { - tb.framesetOk(false); - handleRawtext(startTag, tb); - } else if (name.equals("noembed")) { - // also handle noscript if script enabled - handleRawtext(startTag, tb); - } else if (name.equals("select")) { - tb.reconstructFormattingElements(); - tb.insert(startTag); - tb.framesetOk(false); + } + inputAttribs.put("name", "isindex"); + tb.process(new Token.StartTag("input", inputAttribs)); + tb.process(new Token.EndTag("label")); + tb.process(new Token.StartTag("hr")); + tb.process(new Token.EndTag("form")); + } else if (name.equals("textarea")) { + tb.insert(startTag); + // todo: If the next token is a U+000A LINE FEED (LF) + // character token, then ignore that token and move on to + // the next one. (Newlines at the start of textarea elements + // are ignored as an authoring convenience.) 
+ tb.tokeniser.transition(TokeniserState.Rcdata); + tb.markInsertionMode(); + tb.framesetOk(false); + tb.transition(Text); + } else if (name.equals("xmp")) { + if (tb.inButtonScope("p")) { + tb.process(new Token.EndTag("p")); + } + tb.reconstructFormattingElements(); + tb.framesetOk(false); + handleRawtext(startTag, tb); + } else if (name.equals("iframe")) { + tb.framesetOk(false); + handleRawtext(startTag, tb); + } else if (name.equals("noembed")) { + // also handle noscript if script enabled + handleRawtext(startTag, tb); + } else if (name.equals("select")) { + tb.reconstructFormattingElements(); + tb.insert(startTag); + tb.framesetOk(false); - HtmlTreeBuilderState state = tb.state(); - if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell)) - tb.transition(InSelectInTable); - else - tb.transition(InSelect); - } else if (StringUtil.in("optgroup", "option")) { - if (tb.currentElement().nodeName().equals("option")) - tb.process(new Token.EndTag("option")); - tb.reconstructFormattingElements(); - tb.insert(startTag); - } else if (StringUtil.in("rp", "rt")) { - if (tb.inScope("ruby")) { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("ruby")) { - tb.error(this); - tb.popStackToBefore("ruby"); // i.e. 
close up to but not include name - } - tb.insert(startTag); + HtmlTreeBuilderState state = tb.state(); + if (state.equals(InTable) || state.equals(InCaption) + || state.equals(InTableBody) || state.equals(InRow) + || state.equals(InCell)) { + tb.transition(InSelectInTable); + } else { + tb.transition(InSelect); + } + } else if (StringUtil.in("optgroup", "option")) { + if (tb.currentElement().nodeName().equals("option")) { + tb.process(new Token.EndTag("option")); + } + tb.reconstructFormattingElements(); + tb.insert(startTag); + } else if (StringUtil.in("rp", "rt")) { + if (tb.inScope("ruby")) { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals("ruby")) { + tb.error(this); + tb.popStackToBefore("ruby"); // i.e. close up to but + // not include name } - } else if (name.equals("math")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml) - tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (name.equals("svg")) { - tb.reconstructFormattingElements(); - // todo: handle A start tag whose tag name is "svg" (xlink, svg) tb.insert(startTag); - tb.tokeniser.acknowledgeSelfClosingFlag(); - } else if (StringUtil.in(name, - "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr")) { + } + } else if (name.equals("math")) { + tb.reconstructFormattingElements(); + // todo: handle A start tag whose tag name is "math" (i.e. 
+ // foreign, mathml) + tb.insert(startTag); + tb.tokeniser.acknowledgeSelfClosingFlag(); + } else if (name.equals("svg")) { + tb.reconstructFormattingElements(); + // todo: handle A start tag whose tag name is "svg" (xlink, + // svg) + tb.insert(startTag); + tb.tokeniser.acknowledgeSelfClosingFlag(); + } else if (StringUtil.in(name, "caption", "col", "colgroup", + "frame", "head", "tbody", "td", "tfoot", "th", "thead", + "tr")) { + tb.error(this); + return false; + } else { + tb.reconstructFormattingElements(); + tb.insert(startTag); + } + break; + + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (name.equals("body")) { + if (!tb.inScope("body")) { tb.error(this); return false; } else { - tb.reconstructFormattingElements(); - tb.insert(startTag); + // todo: error if stack contains something not dd, dt, + // li, optgroup, option, p, rp, rt, tbody, td, tfoot, + // th, thead, tr, body, html + tb.transition(AfterBody); } - break; - - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("body")) { - if (!tb.inScope("body")) { + } else if (name.equals("html")) { + boolean notIgnored = tb.process(new Token.EndTag("body")); + if (notIgnored) { + return tb.process(endTag); + } + } else if (StringUtil.in(name, "address", "article", "aside", + "blockquote", "button", "center", "details", "dir", + "div", "dl", "fieldset", "figcaption", "figure", + "footer", "header", "hgroup", "listing", "menu", "nav", + "ol", "pre", "section", "summary", "ul")) { + // todo: refactor these lookups + if (!tb.inScope(name)) { + // nothing to close + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html - tb.transition(AfterBody); } - } else if (name.equals("html")) { - boolean 
notIgnored = tb.process(new Token.EndTag("body")); - if (notIgnored) - return tb.process(endTag); - } else if (StringUtil.in(name, - "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", - "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", - "nav", "ol", "pre", "section", "summary", "ul")) { - // todo: refactor these lookups - if (!tb.inScope(name)) { - // nothing to close + tb.popStackToClose(name); + } + } else if (name.equals("form")) { + Element currentForm = tb.getFormElement(); + tb.setFormElement(null); + if (currentForm == null || !tb.inScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); } - } else if (name.equals("form")) { - Element currentForm = tb.getFormElement(); - tb.setFormElement(null); - if (currentForm == null || !tb.inScope(name)) { + // remove currentForm from stack. will shift anything + // under up. + tb.removeFromStack(currentForm); + } + } else if (name.equals("p")) { + if (!tb.inButtonScope(name)) { + tb.error(this); + tb.process(new Token.StartTag(name)); // if no p to + // close, creates + // an empty + // <p></p> + return tb.process(endTag); + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - // remove currentForm from stack. will shift anything under up. 
- tb.removeFromStack(currentForm); } - } else if (name.equals("p")) { - if (!tb.inButtonScope(name)) { + tb.popStackToClose(name); + } + } else if (name.equals("li")) { + if (!tb.inListItemScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - tb.process(new Token.StartTag(name)); // if no p to close, creates an empty <p></p> - return tb.process(endTag); - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); } - } else if (name.equals("li")) { - if (!tb.inListItemScope(name)) { + tb.popStackToClose(name); + } + } else if (StringUtil.in(name, "dd", "dt")) { + if (!tb.inScope(name)) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); } - } else if (StringUtil.in(name, "dd", "dt")) { - if (!tb.inScope(name)) { + tb.popStackToClose(name); + } + } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", + "h6")) { + if (!tb.inScope(new String[] { "h1", "h2", "h3", "h4", + "h5", "h6" })) { + tb.error(this); + return false; + } else { + tb.generateImpliedEndTags(name); + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); - return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); } - } else if (StringUtil.in(name, "h1", "h2", "h3", "h4", "h5", "h6")) { - if (!tb.inScope(new String[]{"h1", "h2", "h3", "h4", "h5", "h6"})) { + tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); + } + } else if (name.equals("sarcasm")) { + // *sigh* + return anyOtherEndTag(t, tb); + } else if (StringUtil.in(name, "a", 
"b", "big", "code", "em", + "font", "i", "nobr", "s", "small", "strike", "strong", + "tt", "u")) { + // Adoption Agency Algorithm. + OUTER: for (int i = 0; i < 8; i++) { + Element formatEl = tb.getActiveFormattingElement(name); + if (formatEl == null) { + return anyOtherEndTag(t, tb); + } else if (!tb.onStack(formatEl)) { + tb.error(this); + tb.removeFromActiveFormattingElements(formatEl); + return true; + } else if (!tb.inScope(formatEl.nodeName())) { tb.error(this); return false; - } else { - tb.generateImpliedEndTags(name); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); + } else if (tb.currentElement() != formatEl) { + tb.error(this); + } + + Element furthestBlock = null; + Element commonAncestor = null; + boolean seenFormattingElement = false; + LinkedList<Element> stack = tb.getStack(); + for (int si = 0; si < stack.size(); si++) { + Element el = stack.get(si); + if (el == formatEl) { + commonAncestor = stack.get(si - 1); + seenFormattingElement = true; + } else if (seenFormattingElement + && tb.isSpecial(el)) { + furthestBlock = el; + break; + } + } + if (furthestBlock == null) { + tb.popStackToClose(formatEl.nodeName()); + tb.removeFromActiveFormattingElements(formatEl); + return true; } - } else if (name.equals("sarcasm")) { - // *sigh* - return anyOtherEndTag(t, tb); - } else if (StringUtil.in(name, - "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u")) { - // Adoption Agency Algorithm. 
- OUTER: - for (int i = 0; i < 8; i++) { - Element formatEl = tb.getActiveFormattingElement(name); - if (formatEl == null) - return anyOtherEndTag(t, tb); - else if (!tb.onStack(formatEl)) { - tb.error(this); - tb.removeFromActiveFormattingElements(formatEl); - return true; - } else if (!tb.inScope(formatEl.nodeName())) { - tb.error(this); - return false; - } else if (tb.currentElement() != formatEl) - tb.error(this); - Element furthestBlock = null; - Element commonAncestor = null; - boolean seenFormattingElement = false; - LinkedList<Element> stack = tb.getStack(); - for (int si = 0; si < stack.size(); si++) { - Element el = stack.get(si); - if (el == formatEl) { - commonAncestor = stack.get(si - 1); - seenFormattingElement = true; - } else if (seenFormattingElement && tb.isSpecial(el)) { - furthestBlock = el; - break; - } + // todo: Let a bookmark note the position of the + // formatting element in the list of active formatting + // elements relative to the elements on either side of + // it in the list. + // does that mean: int pos of format el in list? + Element node = furthestBlock; + Element lastNode = furthestBlock; + INNER: for (int j = 0; j < 3; j++) { + if (tb.onStack(node)) { + node = tb.aboveOnStack(node); } - if (furthestBlock == null) { - tb.popStackToClose(formatEl.nodeName()); - tb.removeFromActiveFormattingElements(formatEl); - return true; + if (!tb.isInActiveFormattingElements(node)) { // note + // no + // bookmark + // check + tb.removeFromStack(node); + continue INNER; + } else if (node == formatEl) { + break INNER; } - // todo: Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list. - // does that mean: int pos of format el in list? 
- Element node = furthestBlock; - Element lastNode = furthestBlock; - INNER: - for (int j = 0; j < 3; j++) { - if (tb.onStack(node)) - node = tb.aboveOnStack(node); - if (!tb.isInActiveFormattingElements(node)) { // note no bookmark check - tb.removeFromStack(node); - continue INNER; - } else if (node == formatEl) - break INNER; + Element replacement = new Element(Tag.valueOf(node + .nodeName()), tb.getBaseUri()); + tb.replaceActiveFormattingElement(node, replacement); + tb.replaceOnStack(node, replacement); + node = replacement; - Element replacement = new Element(Tag.valueOf(node.nodeName()), tb.getBaseUri()); - tb.replaceActiveFormattingElement(node, replacement); - tb.replaceOnStack(node, replacement); - node = replacement; + if (lastNode == furthestBlock) { + // todo: move the aforementioned bookmark to be + // immediately after the new node in the list of + // active formatting elements. + // not getting how this bookmark both straddles + // the element above, but is inbetween here... + } + if (lastNode.parent() != null) { + lastNode.remove(); + } + node.appendChild(lastNode); - if (lastNode == furthestBlock) { - // todo: move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements. - // not getting how this bookmark both straddles the element above, but is inbetween here... 
- } - if (lastNode.parent() != null) - lastNode.remove(); - node.appendChild(lastNode); + lastNode = node; + } - lastNode = node; + if (StringUtil.in(commonAncestor.nodeName(), "table", + "tbody", "tfoot", "thead", "tr")) { + if (lastNode.parent() != null) { + lastNode.remove(); } - - if (StringUtil.in(commonAncestor.nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { - if (lastNode.parent() != null) - lastNode.remove(); - tb.insertInFosterParent(lastNode); - } else { - if (lastNode.parent() != null) - lastNode.remove(); - commonAncestor.appendChild(lastNode); + tb.insertInFosterParent(lastNode); + } else { + if (lastNode.parent() != null) { + lastNode.remove(); } + commonAncestor.appendChild(lastNode); + } - Element adopter = new Element(Tag.valueOf(name), tb.getBaseUri()); - Node[] childNodes = furthestBlock.childNodes().toArray(new Node[furthestBlock.childNodes().size()]); - for (Node childNode : childNodes) { - adopter.appendChild(childNode); // append will reparent. thus the clone to avoid concurrent mod. - } - furthestBlock.appendChild(adopter); - tb.removeFromActiveFormattingElements(formatEl); - // todo: insert the new element into the list of active formatting elements at the position of the aforementioned bookmark. - tb.removeFromStack(formatEl); - tb.insertOnStackAfter(furthestBlock, adopter); + Element adopter = new Element(Tag.valueOf(name), + tb.getBaseUri()); + Node[] childNodes = furthestBlock.childNodes().toArray( + new Node[furthestBlock.childNodes().size()]); + for (Node childNode : childNodes) { + adopter.appendChild(childNode); // append will + // reparent. thus + // the clone to + // avoid concurrent + // mod. 
} - } else if (StringUtil.in(name, "applet", "marquee", "object")) { - if (!tb.inScope("name")) { - if (!tb.inScope(name)) { - tb.error(this); - return false; - } - tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) - tb.error(this); - tb.popStackToClose(name); - tb.clearFormattingElementsToLastMarker(); + furthestBlock.appendChild(adopter); + tb.removeFromActiveFormattingElements(formatEl); + // todo: insert the new element into the list of active + // formatting elements at the position of the + // aforementioned bookmark. + tb.removeFromStack(formatEl); + tb.insertOnStackAfter(furthestBlock, adopter); + } + } else if (StringUtil.in(name, "applet", "marquee", "object")) { + if (!tb.inScope("name")) { + if (!tb.inScope(name)) { + tb.error(this); + return false; } - } else if (name.equals("br")) { - tb.error(this); - tb.process(new Token.StartTag("br")); - return false; - } else { - return anyOtherEndTag(t, tb); + tb.generateImpliedEndTags(); + if (!tb.currentElement().nodeName().equals(name)) { + tb.error(this); + } + tb.popStackToClose(name); + tb.clearFormattingElementsToLastMarker(); } + } else if (name.equals("br")) { + tb.error(this); + tb.process(new Token.StartTag("br")); + return false; + } else { + return anyOtherEndTag(t, tb); + } - break; - case EOF: - // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html - // stop parsing - break; + break; + case EOF: + // todo: error if stack contains something not dd, dt, li, p, + // tbody, td, tfoot, th, thead, tr, body, html + // stop parsing + break; } return true; } @@ -768,8 +880,9 @@ enum HtmlTreeBuilderState { Element node = it.next(); if (node.nodeName().equals(name)) { tb.generateImpliedEndTags(name); - if (!name.equals(tb.currentElement().nodeName())) + if (!name.equals(tb.currentElement().nodeName())) { tb.error(this); + } tb.popStackToClose(name); break; } else { @@ -784,6 +897,7 @@ enum HtmlTreeBuilderState { }, Text { // 
in script, style etc. normally treated as data tags + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isCharacter()) { tb.insert(t.asCharacter()); @@ -794,7 +908,8 @@ enum HtmlTreeBuilderState { tb.transition(tb.originalState()); return tb.process(t); } else if (t.isEndTag()) { - // if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts + // if: An end tag whose tag name is "script" -- scripting + // nesting level, if evaluating scripts tb.pop(); tb.transition(tb.originalState()); } @@ -802,6 +917,7 @@ enum HtmlTreeBuilderState { } }, InTable { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isCharacter()) { tb.newPendingTableCharacters(); @@ -839,21 +955,23 @@ enum HtmlTreeBuilderState { } else if (name.equals("table")) { tb.error(this); boolean processed = tb.process(new Token.EndTag("table")); - if (processed) // only ignored if in fragment + if (processed) { return tb.process(t); + } } else if (StringUtil.in(name, "style", "script")) { return tb.process(t, InHead); } else if (name.equals("input")) { - if (!startTag.attributes.get("type").equalsIgnoreCase("hidden")) { + if (!startTag.attributes.get("type").equalsIgnoreCase( + "hidden")) { return anythingElse(t, tb); } else { tb.insertEmpty(startTag); } } else if (name.equals("form")) { tb.error(this); - if (tb.getFormElement() != null) + if (tb.getFormElement() != null) { return false; - else { + } else { Element form = tb.insertEmpty(startTag); tb.setFormElement(form); } @@ -872,16 +990,18 @@ enum HtmlTreeBuilderState { tb.popStackToClose("table"); } tb.resetInsertionMode(); - } else if (StringUtil.in(name, - "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { + } else if (StringUtil.in(name, "body", "caption", "col", + "colgroup", "html", "tbody", "td", "tfoot", "th", + "thead", "tr")) { tb.error(this); return false; } else { return anythingElse(t, tb); } } else if (t.isEOF()) { - if 
(tb.currentElement().nodeName().equals("html")) + if (tb.currentElement().nodeName().equals("html")) { tb.error(this); + } return true; // stops parsing } return anythingElse(t, tb); @@ -890,7 +1010,8 @@ enum HtmlTreeBuilderState { boolean anythingElse(Token t, HtmlTreeBuilder tb) { tb.error(this); boolean processed = true; - if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { + if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", + "tfoot", "thead", "tr")) { tb.setFosterInserts(true); processed = tb.process(t, InBody); tb.setFosterInserts(false); @@ -901,42 +1022,47 @@ enum HtmlTreeBuilderState { } }, InTableText { + @Override boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { - case Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.getPendingTableCharacters().add(c); - } - break; - default: - if (tb.getPendingTableCharacters().size() > 0) { - for (Token.Character character : tb.getPendingTableCharacters()) { - if (!isWhitespace(character)) { - // InTable anything else section: - tb.error(this); - if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { - tb.setFosterInserts(true); - tb.process(character, InBody); - tb.setFosterInserts(false); - } else { - tb.process(character, InBody); - } - } else - tb.insert(character); + case Character: + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { + tb.error(this); + return false; + } else { + tb.getPendingTableCharacters().add(c); + } + break; + default: + if (tb.getPendingTableCharacters().size() > 0) { + for (Token.Character character : tb + .getPendingTableCharacters()) { + if (!isWhitespace(character)) { + // InTable anything else section: + tb.error(this); + if (StringUtil.in(tb.currentElement().nodeName(), + "table", "tbody", "tfoot", "thead", "tr")) { + tb.setFosterInserts(true); + 
tb.process(character, InBody); + tb.setFosterInserts(false); + } else { + tb.process(character, InBody); + } + } else { + tb.insert(character); } - tb.newPendingTableCharacters(); } - tb.transition(tb.originalState()); - return tb.process(t); + tb.newPendingTableCharacters(); + } + tb.transition(tb.originalState()); + return tb.process(t); } return true; } }, InCaption { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isEndTag() && t.asEndTag().name().equals("caption")) { Token.EndTag endTag = t.asEndTag(); @@ -946,23 +1072,27 @@ enum HtmlTreeBuilderState { return false; } else { tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals("caption")) + if (!tb.currentElement().nodeName().equals("caption")) { tb.error(this); + } tb.popStackToClose("caption"); tb.clearFormattingElementsToLastMarker(); tb.transition(InTable); } - } else if (( - t.isStartTag() && StringUtil.in(t.asStartTag().name(), - "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr") || - t.isEndTag() && t.asEndTag().name().equals("table")) - ) { + } else if ((t.isStartTag() + && StringUtil.in(t.asStartTag().name(), "caption", "col", + "colgroup", "tbody", "td", "tfoot", "th", "thead", + "tr") || t.isEndTag() + && t.asEndTag().name().equals("table"))) { tb.error(this); boolean processed = tb.process(new Token.EndTag("caption")); - if (processed) + if (processed) { return tb.process(t); - } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), - "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { + } + } else if (t.isEndTag() + && StringUtil.in(t.asEndTag().name(), "body", "col", + "colgroup", "html", "tbody", "td", "tfoot", "th", + "thead", "tr")) { tb.error(this); return false; } else { @@ -972,113 +1102,127 @@ enum HtmlTreeBuilderState { } }, InColumnGroup { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); return true; } switch (t.type) { - case 
Comment: - tb.insert(t.asComment()); - break; - case Doctype: - tb.error(this); - break; - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("html")) - return tb.process(t, InBody); - else if (name.equals("col")) - tb.insertEmpty(startTag); - else - return anythingElse(t, tb); - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (name.equals("colgroup")) { - if (tb.currentElement().nodeName().equals("html")) { // frag case - tb.error(this); - return false; - } else { - tb.pop(); - tb.transition(InTable); - } - } else - return anythingElse(t, tb); - break; - case EOF: - if (tb.currentElement().nodeName().equals("html")) - return true; // stop parsing; frag case - else - return anythingElse(t, tb); - default: + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + break; + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("html")) { + return tb.process(t, InBody); + } else if (name.equals("col")) { + tb.insertEmpty(startTag); + } else { + return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (name.equals("colgroup")) { + if (tb.currentElement().nodeName().equals("html")) { // frag + // case + tb.error(this); + return false; + } else { + tb.pop(); + tb.transition(InTable); + } + } else { + return anythingElse(t, tb); + } + break; + case EOF: + if (tb.currentElement().nodeName().equals("html")) { + return true; // stop parsing; frag case + } else { return anythingElse(t, tb); + } + default: + return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { boolean processed = tb.process(new Token.EndTag("colgroup")); - if (processed) // only ignored in frag case + if (processed) { return tb.process(t); + } return true; } }, InTableBody { + @Override boolean process(Token t, 
HtmlTreeBuilder tb) { switch (t.type) { - case StartTag: - Token.StartTag startTag = t.asStartTag(); - String name = startTag.name(); - if (name.equals("tr")) { - tb.clearStackToTableBodyContext(); - tb.insert(startTag); - tb.transition(InRow); - } else if (StringUtil.in(name, "th", "td")) { - tb.error(this); - tb.process(new Token.StartTag("tr")); - return tb.process(startTag); - } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead")) { - return exitTableBody(t, tb); - } else - return anythingElse(t, tb); - break; - case EndTag: - Token.EndTag endTag = t.asEndTag(); - name = endTag.name(); - if (StringUtil.in(name, "tbody", "tfoot", "thead")) { - if (!tb.inTableScope(name)) { - tb.error(this); - return false; - } else { - tb.clearStackToTableBodyContext(); - tb.pop(); - tb.transition(InTable); - } - } else if (name.equals("table")) { - return exitTableBody(t, tb); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) { + case StartTag: + Token.StartTag startTag = t.asStartTag(); + String name = startTag.name(); + if (name.equals("tr")) { + tb.clearStackToTableBodyContext(); + tb.insert(startTag); + tb.transition(InRow); + } else if (StringUtil.in(name, "th", "td")) { + tb.error(this); + tb.process(new Token.StartTag("tr")); + return tb.process(startTag); + } else if (StringUtil.in(name, "caption", "col", "colgroup", + "tbody", "tfoot", "thead")) { + return exitTableBody(t, tb); + } else { + return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag endTag = t.asEndTag(); + name = endTag.name(); + if (StringUtil.in(name, "tbody", "tfoot", "thead")) { + if (!tb.inTableScope(name)) { tb.error(this); return false; - } else - return anythingElse(t, tb); - break; - default: + } else { + tb.clearStackToTableBodyContext(); + tb.pop(); + tb.transition(InTable); + } + } else if (name.equals("table")) { + return exitTableBody(t, tb); + } else if (StringUtil.in(name, "body", "caption", 
"col", + "colgroup", "html", "td", "th", "tr")) { + tb.error(this); + return false; + } else { return anythingElse(t, tb); + } + break; + default: + return anythingElse(t, tb); } return true; } private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { - if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb.inScope("tfoot"))) { + if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb + .inScope("tfoot"))) { // frag case tb.error(this); return false; } tb.clearStackToTableBodyContext(); - tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, tfoot, thead + tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, + // tfoot, + // thead return tb.process(t); } @@ -1087,6 +1231,7 @@ enum HtmlTreeBuilderState { } }, InRow { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isStartTag()) { Token.StartTag startTag = t.asStartTag(); @@ -1097,7 +1242,8 @@ enum HtmlTreeBuilderState { tb.insert(startTag); tb.transition(InCell); tb.insertMarkerToFormattingElements(); - } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) { + } else if (StringUtil.in(name, "caption", "col", "colgroup", + "tbody", "tfoot", "thead", "tr")) { return handleMissingTr(t, tb); } else { return anythingElse(t, tb); @@ -1123,7 +1269,8 @@ enum HtmlTreeBuilderState { } tb.process(new Token.EndTag("tr")); return tb.process(t); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th")) { + } else if (StringUtil.in(name, "body", "caption", "col", + "colgroup", "html", "td", "th")) { tb.error(this); return false; } else { @@ -1141,13 +1288,15 @@ enum HtmlTreeBuilderState { private boolean handleMissingTr(Token t, TreeBuilder tb) { boolean processed = tb.process(new Token.EndTag("tr")); - if (processed) + if (processed) { return tb.process(t); - else + } else { return false; + } } }, InCell { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isEndTag()) 
{ Token.EndTag endTag = t.asEndTag(); @@ -1156,19 +1305,24 @@ enum HtmlTreeBuilderState { if (StringUtil.in(name, "td", "th")) { if (!tb.inTableScope(name)) { tb.error(this); - tb.transition(InRow); // might not be in scope if empty: <td /> and processing fake end tag + tb.transition(InRow); // might not be in scope if empty: + // <td /> and processing fake end + // tag return false; } tb.generateImpliedEndTags(); - if (!tb.currentElement().nodeName().equals(name)) + if (!tb.currentElement().nodeName().equals(name)) { tb.error(this); + } tb.popStackToClose(name); tb.clearFormattingElementsToLastMarker(); tb.transition(InRow); - } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html")) { + } else if (StringUtil.in(name, "body", "caption", "col", + "colgroup", "html")) { tb.error(this); return false; - } else if (StringUtil.in(name, "table", "tbody", "tfoot", "thead", "tr")) { + } else if (StringUtil.in(name, "table", "tbody", "tfoot", + "thead", "tr")) { if (!tb.inTableScope(name)) { tb.error(this); return false; @@ -1178,9 +1332,10 @@ enum HtmlTreeBuilderState { } else { return anythingElse(t, tb); } - } else if (t.isStartTag() && - StringUtil.in(t.asStartTag().name(), - "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) { + } else if (t.isStartTag() + && StringUtil.in(t.asStartTag().name(), "caption", "col", + "colgroup", "tbody", "td", "tfoot", "th", "thead", + "tr")) { if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { tb.error(this); return false; @@ -1198,91 +1353,105 @@ enum HtmlTreeBuilderState { } private void closeCell(HtmlTreeBuilder tb) { - if (tb.inTableScope("td")) + if (tb.inTableScope("td")) { tb.process(new Token.EndTag("td")); - else - tb.process(new Token.EndTag("th")); // only here if th or td in scope + } else { + tb.process(new Token.EndTag("th")); // only here if th or td in + // scope + } } }, InSelect { + @Override boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { - case 
Character: - Token.Character c = t.asCharacter(); - if (c.getData().equals(nullString)) { - tb.error(this); - return false; - } else { - tb.insert(c); - } - break; - case Comment: - tb.insert(t.asComment()); - break; - case Doctype: + case Character: + Token.Character c = t.asCharacter(); + if (c.getData().equals(nullString)) { tb.error(this); return false; - case StartTag: - Token.StartTag start = t.asStartTag(); - String name = start.name(); - if (name.equals("html")) - return tb.process(start, InBody); - else if (name.equals("option")) { + } else { + tb.insert(c); + } + break; + case Comment: + tb.insert(t.asComment()); + break; + case Doctype: + tb.error(this); + return false; + case StartTag: + Token.StartTag start = t.asStartTag(); + String name = start.name(); + if (name.equals("html")) { + return tb.process(start, InBody); + } else if (name.equals("option")) { + tb.process(new Token.EndTag("option")); + tb.insert(start); + } else if (name.equals("optgroup")) { + if (tb.currentElement().nodeName().equals("option")) { tb.process(new Token.EndTag("option")); - tb.insert(start); - } else if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option")) - tb.process(new Token.EndTag("option")); - else if (tb.currentElement().nodeName().equals("optgroup")) - tb.process(new Token.EndTag("optgroup")); - tb.insert(start); - } else if (name.equals("select")) { - tb.error(this); - return tb.process(new Token.EndTag("select")); - } else if (StringUtil.in(name, "input", "keygen", "textarea")) { + } else if (tb.currentElement().nodeName() + .equals("optgroup")) { + tb.process(new Token.EndTag("optgroup")); + } + tb.insert(start); + } else if (name.equals("select")) { + tb.error(this); + return tb.process(new Token.EndTag("select")); + } else if (StringUtil.in(name, "input", "keygen", "textarea")) { + tb.error(this); + if (!tb.inSelectScope("select")) { + return false; // frag + } + tb.process(new Token.EndTag("select")); + return tb.process(start); + 
} else if (name.equals("script")) { + return tb.process(t, InHead); + } else { + return anythingElse(t, tb); + } + break; + case EndTag: + Token.EndTag end = t.asEndTag(); + name = end.name(); + if (name.equals("optgroup")) { + if (tb.currentElement().nodeName().equals("option") + && tb.aboveOnStack(tb.currentElement()) != null + && tb.aboveOnStack(tb.currentElement()).nodeName() + .equals("optgroup")) { + tb.process(new Token.EndTag("option")); + } + if (tb.currentElement().nodeName().equals("optgroup")) { + tb.pop(); + } else { tb.error(this); - if (!tb.inSelectScope("select")) - return false; // frag - tb.process(new Token.EndTag("select")); - return tb.process(start); - } else if (name.equals("script")) { - return tb.process(t, InHead); + } + } else if (name.equals("option")) { + if (tb.currentElement().nodeName().equals("option")) { + tb.pop(); } else { - return anythingElse(t, tb); + tb.error(this); } - break; - case EndTag: - Token.EndTag end = t.asEndTag(); - name = end.name(); - if (name.equals("optgroup")) { - if (tb.currentElement().nodeName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nodeName().equals("optgroup")) - tb.process(new Token.EndTag("option")); - if (tb.currentElement().nodeName().equals("optgroup")) - tb.pop(); - else - tb.error(this); - } else if (name.equals("option")) { - if (tb.currentElement().nodeName().equals("option")) - tb.pop(); - else - tb.error(this); - } else if (name.equals("select")) { - if (!tb.inSelectScope(name)) { - tb.error(this); - return false; - } else { - tb.popStackToClose(name); - tb.resetInsertionMode(); - } - } else - return anythingElse(t, tb); - break; - case EOF: - if (!tb.currentElement().nodeName().equals("html")) + } else if (name.equals("select")) { + if (!tb.inSelectScope(name)) { tb.error(this); - break; - default: + return false; + } else { + tb.popStackToClose(name); + tb.resetInsertionMode(); + } + } else { return anythingElse(t, tb); + } + 
break; + case EOF: + if (!tb.currentElement().nodeName().equals("html")) { + tb.error(this); + } + break; + default: + return anythingElse(t, tb); } return true; } @@ -1293,24 +1462,31 @@ enum HtmlTreeBuilderState { } }, InSelectInTable { + @Override boolean process(Token t, HtmlTreeBuilder tb) { - if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { + if (t.isStartTag() + && StringUtil.in(t.asStartTag().name(), "caption", "table", + "tbody", "tfoot", "thead", "tr", "td", "th")) { tb.error(this); tb.process(new Token.EndTag("select")); return tb.process(t); - } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { + } else if (t.isEndTag() + && StringUtil.in(t.asEndTag().name(), "caption", "table", + "tbody", "tfoot", "thead", "tr", "td", "th")) { tb.error(this); if (tb.inTableScope(t.asEndTag().name())) { tb.process(new Token.EndTag("select")); return (tb.process(t)); - } else + } else { return false; + } } else { return tb.process(t, InSelect); } } }, AfterBody { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return tb.process(t, InBody); @@ -1339,6 +1515,7 @@ enum HtmlTreeBuilderState { } }, InFrameset { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); @@ -1368,7 +1545,9 @@ enum HtmlTreeBuilderState { return false; } else { tb.pop(); - if (!tb.isFragmentParsing() && !tb.currentElement().nodeName().equals("frameset")) { + if (!tb.isFragmentParsing() + && !tb.currentElement().nodeName() + .equals("frameset")) { tb.transition(AfterFrameset); } } @@ -1385,6 +1564,7 @@ enum HtmlTreeBuilderState { } }, AfterFrameset { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); @@ -1397,7 +1577,8 @@ enum HtmlTreeBuilderState { return tb.process(t, InBody); } else if (t.isEndTag() 
&& t.asEndTag().name().equals("html")) { tb.transition(AfterAfterFrameset); - } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { + } else if (t.isStartTag() + && t.asStartTag().name().equals("noframes")) { return tb.process(t, InHead); } else if (t.isEOF()) { // cool your heels, we're complete @@ -1409,10 +1590,12 @@ enum HtmlTreeBuilderState { } }, AfterAfterBody { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isComment()) { tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { + } else if (t.isDoctype() || isWhitespace(t) + || (t.isStartTag() && t.asStartTag().name().equals("html"))) { return tb.process(t, InBody); } else if (t.isEOF()) { // nice work chuck @@ -1425,14 +1608,17 @@ enum HtmlTreeBuilderState { } }, AfterAfterFrameset { + @Override boolean process(Token t, HtmlTreeBuilder tb) { if (t.isComment()) { tb.insert(t.asComment()); - } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { + } else if (t.isDoctype() || isWhitespace(t) + || (t.isStartTag() && t.asStartTag().name().equals("html"))) { return tb.process(t, InBody); } else if (t.isEOF()) { // nice work chuck - } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { + } else if (t.isStartTag() + && t.asStartTag().name().equals("noframes")) { return tb.process(t, InHead); } else { tb.error(this); @@ -1442,6 +1628,7 @@ enum HtmlTreeBuilderState { } }, ForeignContent { + @Override boolean process(Token t, HtmlTreeBuilder tb) { return true; // todo: implement. Also; how do we get here? 
@@ -1458,8 +1645,9 @@ enum HtmlTreeBuilderState { // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " for (int i = 0; i < data.length(); i++) { char c = data.charAt(i); - if (!StringUtil.isWhitespace(c)) + if (!StringUtil.isWhitespace(c)) { return false; + } } return true; } @@ -1473,7 +1661,8 @@ enum HtmlTreeBuilderState { tb.transition(Text); } - private static void handleRawtext(Token.StartTag startTag, HtmlTreeBuilder tb) { + private static void handleRawtext(Token.StartTag startTag, + HtmlTreeBuilder tb) { tb.insert(startTag); tb.tokeniser.transition(TokeniserState.Rawtext); tb.markInsertionMode(); diff --git a/server/src/org/jsoup/parser/ParseError.java b/server/src/org/jsoup/parser/ParseError.java index dfa090051b..eb3c240a59 100644 --- a/server/src/org/jsoup/parser/ParseError.java +++ b/server/src/org/jsoup/parser/ParseError.java @@ -1,7 +1,8 @@ package org.jsoup.parser; /** - * A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase. + * A Parse Error records an error in the input HTML that occurs in either the + * tokenisation or the tree building phase. */ public class ParseError { private int pos; @@ -13,12 +14,13 @@ public class ParseError { } ParseError(int pos, String errorFormat, Object... args) { - this.errorMsg = String.format(errorFormat, args); + errorMsg = String.format(errorFormat, args); this.pos = pos; } /** * Retrieve the error message. + * * @return the error message. */ public String getErrorMessage() { @@ -27,6 +29,7 @@ public class ParseError { /** * Retrieves the offset of the error. 
+ * * @return error offset within input */ public int getPosition() { diff --git a/server/src/org/jsoup/parser/ParseErrorList.java b/server/src/org/jsoup/parser/ParseErrorList.java index 3824ffbc4e..773dfcae24 100644 --- a/server/src/org/jsoup/parser/ParseErrorList.java +++ b/server/src/org/jsoup/parser/ParseErrorList.java @@ -7,15 +7,15 @@ import java.util.ArrayList; * * @author Jonathan Hedley */ -class ParseErrorList extends ArrayList<ParseError>{ +class ParseErrorList extends ArrayList<ParseError> { private static final int INITIAL_CAPACITY = 16; private final int maxSize; - + ParseErrorList(int initialCapacity, int maxSize) { super(initialCapacity); this.maxSize = maxSize; } - + boolean canAddError() { return size() < maxSize; } @@ -27,7 +27,7 @@ class ParseErrorList extends ArrayList<ParseError>{ static ParseErrorList noTracking() { return new ParseErrorList(0, 0); } - + static ParseErrorList tracking(int maxSize) { return new ParseErrorList(INITIAL_CAPACITY, maxSize); } diff --git a/server/src/org/jsoup/parser/Parser.java b/server/src/org/jsoup/parser/Parser.java index 2236219c06..a1f6fd5184 100644 --- a/server/src/org/jsoup/parser/Parser.java +++ b/server/src/org/jsoup/parser/Parser.java @@ -1,32 +1,36 @@ package org.jsoup.parser; +import java.util.List; + import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import java.util.List; - /** - * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods - * in {@link org.jsoup.Jsoup}. + * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use + * one of the more convenient parse methods in {@link org.jsoup.Jsoup}. */ public class Parser { - private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled. - + private static final int DEFAULT_MAX_ERRORS = 0; // by default, error + // tracking is disabled. 
+ private TreeBuilder treeBuilder; private int maxErrors = DEFAULT_MAX_ERRORS; private ParseErrorList errors; /** * Create a new Parser, using the specified TreeBuilder - * @param treeBuilder TreeBuilder to use to parse input into Documents. + * + * @param treeBuilder + * TreeBuilder to use to parse input into Documents. */ public Parser(TreeBuilder treeBuilder) { this.treeBuilder = treeBuilder; } - + public Document parseInput(String html, String baseUri) { - errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); + errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) + : ParseErrorList.noTracking(); Document doc = treeBuilder.parse(html, baseUri, errors); return doc; } @@ -34,6 +38,7 @@ public class Parser { // gets & sets /** * Get the TreeBuilder currently in use. + * * @return current TreeBuilder. */ public TreeBuilder getTreeBuilder() { @@ -42,7 +47,9 @@ public class Parser { /** * Update the TreeBuilder used when parsing content. - * @param treeBuilder current TreeBuilder + * + * @param treeBuilder + * current TreeBuilder * @return this, for chaining */ public Parser setTreeBuilder(TreeBuilder treeBuilder) { @@ -52,6 +59,7 @@ public class Parser { /** * Check if parse error tracking is enabled. + * * @return current track error state. */ public boolean isTrackErrors() { @@ -60,7 +68,9 @@ public class Parser { /** * Enable or disable parse error tracking for the next parse. - * @param maxErrors the maximum number of errors to track. Set to 0 to disable. + * + * @param maxErrors + * the maximum number of errors to track. Set to 0 to disable. * @return this, for chaining */ public Parser setTrackErrors(int maxErrors) { @@ -70,7 +80,9 @@ public class Parser { /** * Retrieve the parse errors, if any, from the last parse. - * @return list of parse errors, up to the size of the maximum errors tracked. + * + * @return list of parse errors, up to the size of the maximum errors + * tracked. 
*/ public List<ParseError> getErrors() { return errors; @@ -79,10 +91,13 @@ public class Parser { // static parse functions below /** * Parse HTML into a Document. - * - * @param html HTML to parse - * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. - * + * + * @param html + * HTML to parse + * @param baseUri + * base URI of document (i.e. original fetch location), for + * resolving relative URLs. + * * @return parsed Document */ public static Document parse(String html, String baseUri) { @@ -91,33 +106,49 @@ public class Parser { } /** - * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. - * - * @param fragmentHtml the fragment of HTML to parse - * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This - * provides stack context (for implicit element creation). - * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. - * - * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. + * Parse a fragment of HTML into a list of nodes. The context element, if + * supplied, supplies parsing context. + * + * @param fragmentHtml + * the fragment of HTML to parse + * @param context + * (optional) the element that this HTML fragment is being parsed + * for (i.e. for inner HTML). This provides stack context (for + * implicit element creation). + * @param baseUri + * base URI of document (i.e. original fetch location), for + * resolving relative URLs. + * + * @return list of nodes parsed from the input HTML. Note that the context + * element, if supplied, is not modified. 
*/ - public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { + public static List<Node> parseFragment(String fragmentHtml, + Element context, String baseUri) { HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); - return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking()); + return treeBuilder.parseFragment(fragmentHtml, context, baseUri, + ParseErrorList.noTracking()); } /** * Parse a fragment of HTML into the {@code body} of a Document. - * - * @param bodyHtml fragment of HTML - * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. - * + * + * @param bodyHtml + * fragment of HTML + * @param baseUri + * base URI of document (i.e. original fetch location), for + * resolving relative URLs. + * * @return Document, with empty head, and HTML parsed into body */ public static Document parseBodyFragment(String bodyHtml, String baseUri) { Document doc = Document.createShell(baseUri); Element body = doc.body(); List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); - Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented + Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node + // list gets + // modified + // when + // re-parented for (Node node : nodes) { body.appendChild(node); } @@ -125,21 +156,29 @@ public class Parser { } /** - * @param bodyHtml HTML to parse - * @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. - * + * @param bodyHtml + * HTML to parse + * @param baseUri + * baseUri base URI of document (i.e. original fetch location), + * for resolving relative URLs. + * * @return parsed Document - * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead. + * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} + * instead. 
*/ - public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) { + @Deprecated + public static Document parseBodyFragmentRelaxed(String bodyHtml, + String baseUri) { return parse(bodyHtml, baseUri); } - + // builders /** - * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, - * based on a knowledge of the semantics of the incoming tags. + * Create a new HTML parser. This parser treats input as HTML5, and enforces + * the creation of a normalised document, based on a knowledge of the + * semantics of the incoming tags. + * * @return a new HTML parser. */ public static Parser htmlParser() { @@ -147,8 +186,10 @@ public class Parser { } /** - * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, - * rather creates a simple tree directly from the input. + * Create a new XML parser. This parser assumes no knowledge of the incoming + * tags and does not treat it as HTML, rather creates a simple tree directly + * from the input. + * * @return a new simple XML parser. */ public static Parser xmlParser() { diff --git a/server/src/org/jsoup/parser/Tag.java b/server/src/org/jsoup/parser/Tag.java index 40b7557b39..c43f27aff3 100644 --- a/server/src/org/jsoup/parser/Tag.java +++ b/server/src/org/jsoup/parser/Tag.java @@ -1,25 +1,31 @@ package org.jsoup.parser; -import org.jsoup.helper.Validate; - import java.util.HashMap; import java.util.Map; +import org.jsoup.helper.Validate; + /** * HTML Tag capabilities. 
- * + * * @author Jonathan Hedley, jonathan@hedley.net */ public class Tag { - private static final Map<String, Tag> tags = new HashMap<String, Tag>(); // map of known tags + private static final Map<String, Tag> tags = new HashMap<String, Tag>(); // map + // of + // known + // tags private String tagName; private boolean isBlock = true; // block or inline private boolean formatAsBlock = true; // should be formatted as a block - private boolean canContainBlock = true; // Can this tag hold block level tags? + private boolean canContainBlock = true; // Can this tag hold block level + // tags? private boolean canContainInline = true; // only pcdata if not private boolean empty = false; // can hold nothing; e.g. img - private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty. + private boolean selfClosing = false; // can self close (<foo />). used for + // unknown tags that self close, + // without forcing them as empty. private boolean preserveWhitespace = false; // for pre, textarea, script etc private Tag(String tagName) { @@ -28,7 +34,7 @@ public class Tag { /** * Get this tag's name. - * + * * @return the tag's name */ public String getName() { @@ -36,11 +42,14 @@ public class Tag { } /** - * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. + * Get a Tag by name. If not previously defined (unknown), returns a new + * generic tag, that can do anything. * <p/> - * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals(). - * - * @param tagName Name of tag, e.g. "p". Case insensitive. + * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not + * registered and will only .equals(). + * + * @param tagName + * Name of tag, e.g. "p". Case insensitive. * @return The tag, either defined or new generic. 
*/ public static Tag valueOf(String tagName) { @@ -51,7 +60,8 @@ public class Tag { synchronized (tags) { Tag tag = tags.get(tagName); if (tag == null) { - // not defined: create default; go anywhere, do anything! (incl be inside a <p>) + // not defined: create default; go anywhere, do anything! (incl + // be inside a <p>) tag = new Tag(tagName); tag.isBlock = false; tag.canContainBlock = true; @@ -62,7 +72,7 @@ public class Tag { /** * Gets if this is a block tag. - * + * * @return if block tag */ public boolean isBlock() { @@ -71,7 +81,7 @@ public class Tag { /** * Gets if this tag should be formatted as a block (or as inline) - * + * * @return if should be formatted as block or inline */ public boolean formatAsBlock() { @@ -80,7 +90,7 @@ public class Tag { /** * Gets if this tag can contain block tags. - * + * * @return if tag can contain block tags */ public boolean canContainBlock() { @@ -89,7 +99,7 @@ public class Tag { /** * Gets if this tag is an inline tag. - * + * * @return if this tag is an inline tag. */ public boolean isInline() { @@ -98,7 +108,7 @@ public class Tag { /** * Gets if this tag is a data only tag. - * + * * @return if this tag is a data only tag */ public boolean isData() { @@ -107,7 +117,7 @@ public class Tag { /** * Get if this is an empty tag - * + * * @return if this is an empty tag */ public boolean isEmpty() { @@ -116,7 +126,7 @@ public class Tag { /** * Get if this tag is self closing. - * + * * @return if this tag should be output as self closing. */ public boolean isSelfClosing() { @@ -125,7 +135,7 @@ public class Tag { /** * Get if this is a pre-defined tag, or was auto created on parsing. - * + * * @return if a known tag */ public boolean isKnownTag() { @@ -134,8 +144,9 @@ public class Tag { /** * Check if this tagname is a known tag. 
- * - * @param tagName name of tag + * + * @param tagName + * name of tag * @return if known HTML tag */ public static boolean isKnownTag(String tagName) { @@ -144,7 +155,7 @@ public class Tag { /** * Get if this tag should preserve whitespace within child text nodes. - * + * * @return if preserve whitepace */ public boolean preserveWhitespace() { @@ -158,19 +169,39 @@ public class Tag { @Override public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof Tag)) return false; + if (this == o) { + return true; + } + if (!(o instanceof Tag)) { + return false; + } Tag tag = (Tag) o; - if (canContainBlock != tag.canContainBlock) return false; - if (canContainInline != tag.canContainInline) return false; - if (empty != tag.empty) return false; - if (formatAsBlock != tag.formatAsBlock) return false; - if (isBlock != tag.isBlock) return false; - if (preserveWhitespace != tag.preserveWhitespace) return false; - if (selfClosing != tag.selfClosing) return false; - if (!tagName.equals(tag.tagName)) return false; + if (canContainBlock != tag.canContainBlock) { + return false; + } + if (canContainInline != tag.canContainInline) { + return false; + } + if (empty != tag.empty) { + return false; + } + if (formatAsBlock != tag.formatAsBlock) { + return false; + } + if (isBlock != tag.isBlock) { + return false; + } + if (preserveWhitespace != tag.preserveWhitespace) { + return false; + } + if (selfClosing != tag.selfClosing) { + return false; + } + if (!tagName.equals(tag.tagName)) { + return false; + } return true; } @@ -188,34 +219,39 @@ public class Tag { return result; } + @Override public String toString() { return tagName; } // internal static initialisers: - // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources - private static final String[] blockTags = { - "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", - "noframes", "section", "nav", "aside", "hgroup", "header", 
"footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", - "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", - "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", - "td", "video", "audio", "canvas", "details", "menu", "plaintext" - }; - private static final String[] inlineTags = { - "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd", - "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q", - "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup", - "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", - "summary", "command", "device" - }; - private static final String[] emptyTags = { - "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", - "device" - }; - private static final String[] formatAsInlineTags = { - "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style" - }; - private static final String[] preserveWhitespaceTags = {"pre", "plaintext", "title"}; + // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other + // sources + private static final String[] blockTags = { "html", "head", "body", + "frameset", "script", "noscript", "style", "meta", "link", "title", + "frame", "noframes", "section", "nav", "aside", "hgroup", "header", + "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", + "pre", "div", "blockquote", "hr", "address", "figure", + "figcaption", "form", "fieldset", "ins", "del", "dl", "dt", "dd", + "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", + "col", "tr", "th", "td", "video", "audio", "canvas", "details", + "menu", "plaintext" }; + private static final String[] inlineTags = { 
"object", "base", "font", + "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", + "samp", "kbd", "var", "cite", "abbr", "time", "acronym", "mark", + "ruby", "rt", "rp", "a", "img", "br", "wbr", "map", "q", "sub", + "sup", "bdo", "iframe", "embed", "span", "input", "select", + "textarea", "label", "button", "optgroup", "option", "legend", + "datalist", "keygen", "output", "progress", "meter", "area", + "param", "source", "track", "summary", "command", "device" }; + private static final String[] emptyTags = { "meta", "link", "base", + "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", + "col", "command", "device" }; + private static final String[] formatAsInlineTags = { "title", "a", "p", + "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", + "td", "script", "style" }; + private static final String[] preserveWhitespaceTags = { "pre", + "plaintext", "title" }; static { // creates diff --git a/server/src/org/jsoup/parser/Token.java b/server/src/org/jsoup/parser/Token.java index 9f4f9e250d..e465eb74e3 100644 --- a/server/src/org/jsoup/parser/Token.java +++ b/server/src/org/jsoup/parser/Token.java @@ -12,7 +12,7 @@ abstract class Token { private Token() { } - + String tokenType() { return this.getClass().getSimpleName(); } @@ -50,13 +50,16 @@ abstract class Token { private String pendingAttributeValue; boolean selfClosing = false; - Attributes attributes = new Attributes(); // todo: allow nodes to not have attributes + Attributes attributes = new Attributes(); // todo: allow nodes to not + // have attributes void newAttribute() { if (pendingAttributeName != null) { - if (pendingAttributeValue == null) + if (pendingAttributeValue == null) { pendingAttributeValue = ""; - Attribute attribute = new Attribute(pendingAttributeName, pendingAttributeValue); + } + Attribute attribute = new Attribute(pendingAttributeName, + pendingAttributeValue); attributes.put(attribute); } pendingAttributeName = null; @@ -85,12 +88,13 @@ abstract class 
Token { return selfClosing; } - @SuppressWarnings({"TypeMayBeWeakened"}) + @SuppressWarnings({ "TypeMayBeWeakened" }) Attributes getAttributes() { return attributes; } - // these appenders are rarely hit in not null state-- caused by null chars. + // these appenders are rarely hit in not null state-- caused by null + // chars. void appendTagName(String append) { tagName = tagName == null ? append : tagName.concat(append); } @@ -100,7 +104,8 @@ abstract class Token { } void appendAttributeName(String append) { - pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append); + pendingAttributeName = pendingAttributeName == null ? append + : pendingAttributeName.concat(append); } void appendAttributeName(char append) { @@ -108,7 +113,8 @@ abstract class Token { } void appendAttributeValue(String append) { - pendingAttributeValue = pendingAttributeValue == null ? append : pendingAttributeValue.concat(append); + pendingAttributeValue = pendingAttributeValue == null ? 
append + : pendingAttributeValue.concat(append); } void appendAttributeValue(char append) { @@ -124,12 +130,12 @@ abstract class Token { StartTag(String name) { this(); - this.tagName = name; + tagName = name; } StartTag(String name, Attributes attributes) { this(); - this.tagName = name; + tagName = name; this.attributes = attributes; } @@ -139,7 +145,7 @@ abstract class Token { } } - static class EndTag extends Tag{ + static class EndTag extends Tag { EndTag() { super(); type = TokenType.EndTag; @@ -147,7 +153,7 @@ abstract class Token { EndTag(String name) { this(); - this.tagName = name; + tagName = name; } @Override @@ -242,11 +248,6 @@ abstract class Token { } enum TokenType { - Doctype, - StartTag, - EndTag, - Comment, - Character, - EOF + Doctype, StartTag, EndTag, Comment, Character, EOF } } diff --git a/server/src/org/jsoup/parser/TokenQueue.java b/server/src/org/jsoup/parser/TokenQueue.java index a2fdfe621a..3e7127e640 100644 --- a/server/src/org/jsoup/parser/TokenQueue.java +++ b/server/src/org/jsoup/parser/TokenQueue.java @@ -5,18 +5,20 @@ import org.jsoup.helper.Validate; /** * A character queue with parsing helpers. - * + * * @author Jonathan Hedley */ public class TokenQueue { private String queue; private int pos = 0; - + private static final char ESC = '\\'; // escape char for chomp balanced. /** - Create a new TokenQueue. - @param data string of data to back queue. + * Create a new TokenQueue. + * + * @param data + * string of data to back queue. */ public TokenQueue(String data) { Validate.notNull(data); @@ -25,18 +27,20 @@ public class TokenQueue { /** * Is the queue empty? + * * @return true if no data left in queue. */ public boolean isEmpty() { return remainingLength() == 0; } - + private int remainingLength() { return queue.length() - pos; } /** * Retrieves but does not remove the first character from the queue. + * * @return First character, or 0 if empty. 
*/ public char peek() { @@ -44,16 +48,21 @@ public class TokenQueue { } /** - Add a character to the start of the queue (will be the next character retrieved). - @param c character to add + * Add a character to the start of the queue (will be the next character + * retrieved). + * + * @param c + * character to add */ public void addFirst(Character c) { addFirst(c.toString()); } /** - Add a string to the start of the queue. - @param seq string to add. + * Add a string to the start of the queue. + * + * @param seq + * string to add. */ public void addFirst(String seq) { // not very performant, but an edge case @@ -62,8 +71,11 @@ public class TokenQueue { } /** - * Tests if the next characters on the queue match the sequence. Case insensitive. - * @param seq String to check queue for. + * Tests if the next characters on the queue match the sequence. Case + * insensitive. + * + * @param seq + * String to check queue for. * @return true if the next characters match. */ public boolean matches(String seq) { @@ -72,47 +84,57 @@ public class TokenQueue { /** * Case sensitive match test. - * @param seq string to case sensitively check for + * + * @param seq + * string to case sensitively check for * @return true if matched, false if not */ public boolean matchesCS(String seq) { return queue.startsWith(seq, pos); } - /** - Tests if the next characters match any of the sequences. Case insensitive. - @param seq list of strings to case insensitively check for - @return true of any matched, false if none did + * Tests if the next characters match any of the sequences. Case + * insensitive. + * + * @param seq + * list of strings to case insensitively check for + * @return true of any matched, false if none did */ public boolean matchesAny(String... seq) { for (String s : seq) { - if (matches(s)) + if (matches(s)) { return true; + } } return false; } public boolean matchesAny(char... 
seq) { - if (isEmpty()) + if (isEmpty()) { return false; + } - for (char c: seq) { - if (queue.charAt(pos) == c) + for (char c : seq) { + if (queue.charAt(pos) == c) { return true; + } } return false; } public boolean matchesStartTag() { // micro opt for matching "<x" - return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character.isLetter(queue.charAt(pos+1))); + return (remainingLength() >= 2 && queue.charAt(pos) == '<' && Character + .isLetter(queue.charAt(pos + 1))); } /** - * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the - * queue. - * @param seq String to search for, and if found, remove from queue. + * Tests if the queue matches the sequence (as with match), and if they do, + * removes the matched string from the queue. + * + * @param seq + * String to search for, and if found, remove from queue. * @return true if found and removed, false if not found. */ public boolean matchChomp(String seq) { @@ -125,16 +147,18 @@ public class TokenQueue { } /** - Tests if queue starts with a whitespace character. - @return if starts with whitespace + * Tests if queue starts with a whitespace character. + * + * @return if starts with whitespace */ public boolean matchesWhitespace() { return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos)); } /** - Test if the queue matches a word character (letter or digit). - @return if matches a word character + * Test if the queue matches a word character (letter or digit). + * + * @return if matches a word character */ public boolean matchesWord() { return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); @@ -144,11 +168,14 @@ public class TokenQueue { * Drops the next character off the queue. */ public void advance() { - if (!isEmpty()) pos++; + if (!isEmpty()) { + pos++; + } } /** * Consume one character off queue. + * * @return first character on queue. 
*/ public char consume() { @@ -156,25 +183,36 @@ public class TokenQueue { } /** - * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will - * throw an illegal state exception -- but you should be running match() against that condition. - <p> - Case insensitive. - * @param seq sequence to remove from head of queue. + * Consumes the supplied sequence of the queue. If the queue does not start + * with the supplied sequence, will throw an illegal state exception -- but + * you should be running match() against that condition. + * <p> + * Case insensitive. + * + * @param seq + * sequence to remove from head of queue. */ public void consume(String seq) { - if (!matches(seq)) - throw new IllegalStateException("Queue did not match expected sequence"); + if (!matches(seq)) { + throw new IllegalStateException( + "Queue did not match expected sequence"); + } int len = seq.length(); - if (len > remainingLength()) - throw new IllegalStateException("Queue not long enough to consume sequence"); - + if (len > remainingLength()) { + throw new IllegalStateException( + "Queue not long enough to consume sequence"); + } + pos += len; } /** - * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. - * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b> + * Pulls a string off the queue, up to but exclusive of the match sequence, + * or to the queue running out. + * + * @param seq + * String to end on (and not include in return, but leave on + * queue). <b>Case sensitive.</b> * @return The matched data consumed from queue. 
*/ public String consumeTo(String seq) { @@ -187,38 +225,52 @@ public class TokenQueue { return remainder(); } } - + public String consumeToIgnoreCase(String seq) { int start = pos; String first = seq.substring(0, 1); - boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of + boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if + // first + // is + // not + // cased, + // use + // index + // of while (!isEmpty()) { - if (matches(seq)) + if (matches(seq)) { break; - + } + if (canScan) { int skip = queue.indexOf(first, pos) - pos; - if (skip == 0) // this char is the skip char, but not match, so force advance of pos + if (skip == 0) { pos++; - else if (skip < 0) // no chance of finding, grab to end + } else if (skip < 0) { pos = queue.length(); - else + } else { pos += skip; - } - else + } + } else { pos++; + } } - String data = queue.substring(start, pos); - return data; + String data = queue.substring(start, pos); + return data; } /** - Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. - @param seq any number of terminators to consume to. <b>Case insensitive.</b> - @return consumed string + * Consumes to the first sequence provided, or to the end of the queue. + * Leaves the terminator on the queue. + * + * @param seq + * any number of terminators to consume to. <b>Case + * insensitive.</b> + * @return consumed string */ - // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this + // todo: method name. not good that consumeTo cares for case, and consume to + // any doesn't. And the only use for this // is is a case sensitive time... public String consumeToAny(String... 
seq) { int start = pos; @@ -226,16 +278,20 @@ public class TokenQueue { pos++; } - String data = queue.substring(start, pos); - return data; + String data = queue.substring(start, pos); + return data; } /** - * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). + * Pulls a string off the queue (like consumeTo), and then pulls off the + * matched string (but does not return it). * <p> - * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go - * isEmpty() == true). - * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b> + * If the queue runs out of characters before finding the seq, will return + * as much as it can (and queue will go isEmpty() == true). + * + * @param seq + * String to match up to, and not include in return, and to pull + * off queue. <b>Case sensitive.</b> * @return Data matched from queue. */ public String chompTo(String seq) { @@ -243,7 +299,7 @@ public class TokenQueue { matchChomp(seq); return data; } - + public String chompToIgnoreCase(String seq) { String data = consumeToIgnoreCase(seq); // case insensitive scan matchChomp(seq); @@ -251,12 +307,17 @@ public class TokenQueue { } /** - * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", - * and leave " four" on the queue. Unbalanced openers and closers can be escaped (with \). Those escapes will be left - * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for + * Pulls a balanced string off the queue. E.g. if queue is + * "(one (two) three) four", (,) will return "one (two) three", and leave + * " four" on the queue. Unbalanced openers and closers can be escaped (with + * \). 
Those escapes will be left in the returned string, which is suitable + * for regexes (where we need to preserve the escape), but unsuitable for * contains text strings; use unescape for that. - * @param open opener - * @param close closer + * + * @param open + * opener + * @param close + * closer * @return data matched from the queue */ public String chompBalanced(char open, char close) { @@ -265,25 +326,32 @@ public class TokenQueue { char last = 0; do { - if (isEmpty()) break; + if (isEmpty()) { + break; + } Character c = consume(); if (last == 0 || last != ESC) { - if (c.equals(open)) + if (c.equals(open)) { depth++; - else if (c.equals(close)) + } else if (c.equals(close)) { depth--; + } } - if (depth > 0 && last != 0) - accum.append(c); // don't include the outer match pair in the return + if (depth > 0 && last != 0) { + accum.append(c); // don't include the outer match pair in the + // return + } last = c; } while (depth > 0); return accum.toString(); } - + /** * Unescaped a \ escaped string. - * @param in backslash escaped string + * + * @param in + * backslash escaped string * @return unescaped string */ public static String unescape(String in) { @@ -291,11 +359,12 @@ public class TokenQueue { char last = 0; for (char c : in.toCharArray()) { if (c == ESC) { - if (last != 0 && last == ESC) + if (last != 0 && last == ESC) { out.append(c); - } - else + } + } else { out.append(c); + } last = c; } return out.toString(); @@ -315,15 +384,17 @@ public class TokenQueue { /** * Retrieves the next run of word type (letter or digit) off the queue. + * * @return String of word characters from queue, or empty string if none. 
*/ public String consumeWord() { int start = pos; - while (matchesWord()) + while (matchesWord()) { pos++; + } return queue.substring(start, pos); } - + /** * Consume an tag name off the queue (word or :, _, -) * @@ -331,53 +402,61 @@ public class TokenQueue { */ public String consumeTagName() { int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) + while (!isEmpty() && (matchesWord() || matchesAny(':', '_', '-'))) { pos++; - + } + return queue.substring(start, pos); } - + /** - * Consume a CSS element selector (tag name, but | instead of : for namespaces, to not conflict with :pseudo selects). + * Consume a CSS element selector (tag name, but | instead of : for + * namespaces, to not conflict with :pseudo selects). * * @return tag name */ public String consumeElementSelector() { int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) + while (!isEmpty() && (matchesWord() || matchesAny('|', '_', '-'))) { pos++; - + } + return queue.substring(start, pos); } /** - Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _) - http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier - @return identifier + * Consume a CSS identifier (ID or class) off the queue (letter, digit, -, + * _) http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier + * + * @return identifier */ public String consumeCssIdentifier() { int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) + while (!isEmpty() && (matchesWord() || matchesAny('-', '_'))) { pos++; + } return queue.substring(start, pos); } /** - Consume an attribute key off the queue (letter, digit, -, _, :") - @return attribute key + * Consume an attribute key off the queue (letter, digit, -, _, :") + * + * @return attribute key */ public String consumeAttributeKey() { int start = pos; - while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) + while (!isEmpty() && (matchesWord() || matchesAny('-', '_', ':'))) { 
pos++; - + } + return queue.substring(start, pos); } /** - Consume and return whatever is left on the queue. - @return remained of queue. + * Consume and return whatever is left on the queue. + * + * @return remained of queue. */ public String remainder() { StringBuilder accum = new StringBuilder(); @@ -386,7 +465,8 @@ public class TokenQueue { } return accum.toString(); } - + + @Override public String toString() { return queue.substring(pos); } diff --git a/server/src/org/jsoup/parser/Tokeniser.java b/server/src/org/jsoup/parser/Tokeniser.java index ce6ee690d6..f46c962281 100644 --- a/server/src/org/jsoup/parser/Tokeniser.java +++ b/server/src/org/jsoup/parser/Tokeniser.java @@ -3,9 +3,6 @@ package org.jsoup.parser; import org.jsoup.helper.Validate; import org.jsoup.nodes.Entities; -import java.util.ArrayList; -import java.util.List; - /** * Readers the input stream into tokens. */ @@ -15,16 +12,21 @@ class Tokeniser { private CharacterReader reader; // html input private ParseErrorList errors; // errors found while tokenising - private TokeniserState state = TokeniserState.Data; // current tokenisation state + private TokeniserState state = TokeniserState.Data; // current tokenisation + // state private Token emitPending; // the token we are about to emit on next read private boolean isEmitPending = false; - private StringBuilder charBuffer = new StringBuilder(); // buffers characters to output as one token + private StringBuilder charBuffer = new StringBuilder(); // buffers + // characters to + // output as one + // token StringBuilder dataBuffer; // buffers data looking for </script> Token.Tag tagPending; // tag we are building up Token.Doctype doctypePending; // doctype building up Token.Comment commentPending; // comment building up - private Token.StartTag lastStartTag; // the last start tag emitted, to test appropriate end tag + private Token.StartTag lastStartTag; // the last start tag emitted, to test + // appropriate end tag private boolean 
selfClosingFlagAcknowledged = true; Tokeniser(CharacterReader reader, ParseErrorList errors) { @@ -38,10 +40,12 @@ class Tokeniser { selfClosingFlagAcknowledged = true; } - while (!isEmitPending) + while (!isEmitPending) { state.read(this, reader); + } - // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read: + // if emit is pending, a non-character token was found: return any chars + // in buffer, and leave token for next read: if (charBuffer.length() > 0) { String str = charBuffer.toString(); charBuffer.delete(0, charBuffer.length()); @@ -61,17 +65,20 @@ class Tokeniser { if (token.type == Token.TokenType.StartTag) { Token.StartTag startTag = (Token.StartTag) token; lastStartTag = startTag; - if (startTag.selfClosing) + if (startTag.selfClosing) { selfClosingFlagAcknowledged = false; + } } else if (token.type == Token.TokenType.EndTag) { Token.EndTag endTag = (Token.EndTag) token; - if (endTag.attributes.size() > 0) + if (endTag.attributes.size() > 0) { error("Attributes incorrectly present on end tag"); + } } } void emit(String str) { - // buffer strings up until last string token found, to emit only one token for a run of character refs etc. + // buffer strings up until last string token found, to emit only one + // token for a run of character refs etc. 
// does not set isEmitPending; read checks that charBuffer.append(str); } @@ -97,32 +104,40 @@ class Tokeniser { selfClosingFlagAcknowledged = true; } - Character consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { - if (reader.isEmpty()) + Character consumeCharacterReference(Character additionalAllowedCharacter, + boolean inAttribute) { + if (reader.isEmpty()) { return null; - if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) + } + if (additionalAllowedCharacter != null + && additionalAllowedCharacter == reader.current()) { return null; - if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) + } + if (reader.matchesAny('\t', '\n', '\f', ' ', '<', '&')) { return null; + } reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); - String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); + String numRef = isHexMode ? reader.consumeHexSequence() : reader + .consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } - if (!reader.matchConsume(";")) + if (!reader.matchConsume(";")) { characterReferenceError("missing semicolon"); // missing semi + } int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException e) { } // skip - if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { + if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) + || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); return replacementChar; } else { @@ -131,32 +146,40 @@ class Tokeniser { return (char) charval; } } else { // named - // get as many letters as possible, and look for matching entities. 
unconsume backwards till a match is found + // get as many letters as possible, and look for matching entities. + // unconsume backwards till a match is found String nameRef = reader.consumeLetterThenDigitSequence(); - String origNameRef = new String(nameRef); // for error reporting. nameRef gets chomped looking for matches + String origNameRef = new String(nameRef); // for error reporting. + // nameRef gets chomped + // looking for matches boolean looksLegit = reader.matches(';'); boolean found = false; while (nameRef.length() > 0 && !found) { - if (Entities.isNamedEntity(nameRef)) + if (Entities.isNamedEntity(nameRef)) { found = true; - else { - nameRef = nameRef.substring(0, nameRef.length()-1); + } else { + nameRef = nameRef.substring(0, nameRef.length() - 1); reader.unconsume(); } } if (!found) { - if (looksLegit) // named with semicolon - characterReferenceError(String.format("invalid named referenece '%s'", origNameRef)); + if (looksLegit) { + characterReferenceError(String.format( + "invalid named referenece '%s'", origNameRef)); + } reader.rewindToMark(); return null; } - if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { + if (inAttribute + && (reader.matchesLetter() || reader.matchesDigit() || reader + .matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } - if (!reader.matchConsume(";")) + if (!reader.matchConsume(";")) { characterReferenceError("missing semicolon"); // missing semi + } return Entities.getCharacterByName(nameRef); } } @@ -192,8 +215,9 @@ class Tokeniser { } boolean isAppropriateEndTagToken() { - if (lastStartTag == null) + if (lastStartTag == null) { return false; + } return tagPending.tagName.equals(lastStartTag.tagName); } @@ -202,23 +226,33 @@ class Tokeniser { } void error(TokeniserState state) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Unexpected character '%s' in input state [%s]", reader.current(), 
state)); + if (errors.canAddError()) { + errors.add(new ParseError(reader.pos(), + "Unexpected character '%s' in input state [%s]", reader + .current(), state)); + } } void eofError(TokeniserState state) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Unexpectedly reached end of file (EOF) in input state [%s]", state)); + if (errors.canAddError()) { + errors.add(new ParseError( + reader.pos(), + "Unexpectedly reached end of file (EOF) in input state [%s]", + state)); + } } private void characterReferenceError(String message) { - if (errors.canAddError()) - errors.add(new ParseError(reader.pos(), "Invalid character reference: %s", message)); + if (errors.canAddError()) { + errors.add(new ParseError(reader.pos(), + "Invalid character reference: %s", message)); + } } private void error(String errorMsg) { - if (errors.canAddError()) + if (errors.canAddError()) { errors.add(new ParseError(reader.pos(), errorMsg)); + } } boolean currentNodeInHtmlNS() { diff --git a/server/src/org/jsoup/parser/TokeniserState.java b/server/src/org/jsoup/parser/TokeniserState.java index e3013c73e9..7f7315d769 100644 --- a/server/src/org/jsoup/parser/TokeniserState.java +++ b/server/src/org/jsoup/parser/TokeniserState.java @@ -5,162 +5,174 @@ package org.jsoup.parser; */ enum TokeniserState { Data { - // in data state, gather characters until a character reference or tag is found + // in data state, gather characters until a character reference or tag + // is found + @Override void read(Tokeniser t, CharacterReader r) { switch (r.current()) { - case '&': - t.advanceTransition(CharacterReferenceInData); - break; - case '<': - t.advanceTransition(TagOpen); - break; - case nullChar: - t.error(this); // NOT replacement character (oddly?) 
- t.emit(r.consume()); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('&', '<', nullChar); - t.emit(data); - break; + case '&': + t.advanceTransition(CharacterReferenceInData); + break; + case '<': + t.advanceTransition(TagOpen); + break; + case nullChar: + t.error(this); // NOT replacement character (oddly?) + t.emit(r.consume()); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('&', '<', nullChar); + t.emit(data); + break; } } }, CharacterReferenceInData { // from & in data + @Override void read(Tokeniser t, CharacterReader r) { Character c = t.consumeCharacterReference(null, false); - if (c == null) + if (c == null) { t.emit('&'); - else + } else { t.emit(c); + } t.transition(Data); } }, Rcdata { - /// handles data in title, textarea etc + // / handles data in title, textarea etc + @Override void read(Tokeniser t, CharacterReader r) { switch (r.current()) { - case '&': - t.advanceTransition(CharacterReferenceInRcdata); - break; - case '<': - t.advanceTransition(RcdataLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('&', '<', nullChar); - t.emit(data); - break; + case '&': + t.advanceTransition(CharacterReferenceInRcdata); + break; + case '<': + t.advanceTransition(RcdataLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('&', '<', nullChar); + t.emit(data); + break; } } }, CharacterReferenceInRcdata { + @Override void read(Tokeniser t, CharacterReader r) { Character c = t.consumeCharacterReference(null, false); - if (c == null) + if (c == null) { t.emit('&'); - else + } else { t.emit(c); + } t.transition(Rcdata); } }, Rawtext { + @Override void read(Tokeniser t, 
CharacterReader r) { switch (r.current()) { - case '<': - t.advanceTransition(RawtextLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('<', nullChar); - t.emit(data); - break; + case '<': + t.advanceTransition(RawtextLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('<', nullChar); + t.emit(data); + break; } } }, ScriptData { + @Override void read(Tokeniser t, CharacterReader r) { switch (r.current()) { - case '<': - t.advanceTransition(ScriptDataLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeToAny('<', nullChar); - t.emit(data); - break; + case '<': + t.advanceTransition(ScriptDataLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeToAny('<', nullChar); + t.emit(data); + break; } } }, PLAINTEXT { + @Override void read(Tokeniser t, CharacterReader r) { switch (r.current()) { - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.emit(new Token.EOF()); - break; - default: - String data = r.consumeTo(nullChar); - t.emit(data); - break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.emit(new Token.EOF()); + break; + default: + String data = r.consumeTo(nullChar); + t.emit(data); + break; } } }, TagOpen { // from < in data + @Override void read(Tokeniser t, CharacterReader r) { switch (r.current()) { - case '!': - t.advanceTransition(MarkupDeclarationOpen); - break; - case '/': - 
t.advanceTransition(EndTagOpen); - break; - case '?': - t.advanceTransition(BogusComment); - break; - default: - if (r.matchesLetter()) { - t.createTagPending(true); - t.transition(TagName); - } else { - t.error(this); - t.emit('<'); // char that got us here - t.transition(Data); - } - break; + case '!': + t.advanceTransition(MarkupDeclarationOpen); + break; + case '/': + t.advanceTransition(EndTagOpen); + break; + case '?': + t.advanceTransition(BogusComment); + break; + default: + if (r.matchesLetter()) { + t.createTagPending(true); + t.transition(TagName); + } else { + t.error(this); + t.emit('<'); // char that got us here + t.transition(Data); + } + break; } } }, EndTagOpen { + @Override void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); @@ -180,43 +192,49 @@ enum TokeniserState { }, TagName { // from < or </ in data, will have start or end tag pending + @Override void read(Tokeniser t, CharacterReader r) { - // previous TagOpen state did NOT consume, will have a letter char in current - String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', nullChar).toLowerCase(); + // previous TagOpen state did NOT consume, will have a letter char + // in current + String tagName = r.consumeToAny('\t', '\n', '\f', ' ', '/', '>', + nullChar).toLowerCase(); t.tagPending.appendTagName(tagName); switch (r.consume()) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: // replacement - t.tagPending.appendTagName(replacementStr); - break; - case eof: // should emit pending tag? 
- t.eofError(this); - t.transition(Data); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: // replacement + t.tagPending.appendTagName(replacementStr); + break; + case eof: // should emit pending tag? + t.eofError(this); + t.transition(Data); // no default, as covered with above consumeToAny } } }, RcdataLessthanSign { // from < in rcdata + @Override void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); t.advanceTransition(RCDATAEndTagOpen); - } else if (r.matchesLetter() && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) { - // diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than + } else if (r.matchesLetter() + && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) { + // diverge from spec: got a start tag, but there's no + // appropriate end tag (</title>), so rather than // consuming to EOF; break out here t.tagPending = new Token.EndTag(t.appropriateEndTagName()); t.emitTagPending(); @@ -229,6 +247,7 @@ enum TokeniserState { } }, RCDATAEndTagOpen { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); @@ -242,6 +261,7 @@ enum TokeniserState { } }, RCDATAEndTagName { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); @@ -252,31 +272,33 @@ enum TokeniserState { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - if (t.isAppropriateEndTagToken()) - t.transition(BeforeAttributeName); - else - anythingElse(t, r); - break; - case '/': - if (t.isAppropriateEndTagToken()) - t.transition(SelfClosingStartTag); - else - anythingElse(t, r); - break; - case '>': - if (t.isAppropriateEndTagToken()) { - t.emitTagPending(); - 
t.transition(Data); - } - else - anythingElse(t, r); - break; - default: + case '\t': + case '\n': + case '\f': + case ' ': + if (t.isAppropriateEndTagToken()) { + t.transition(BeforeAttributeName); + } else { + anythingElse(t, r); + } + break; + case '/': + if (t.isAppropriateEndTagToken()) { + t.transition(SelfClosingStartTag); + } else { anythingElse(t, r); + } + break; + case '>': + if (t.isAppropriateEndTagToken()) { + t.emitTagPending(); + t.transition(Data); + } else { + anythingElse(t, r); + } + break; + default: + anythingElse(t, r); } } @@ -286,6 +308,7 @@ enum TokeniserState { } }, RawtextLessthanSign { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.createTempBuffer(); @@ -297,6 +320,7 @@ enum TokeniserState { } }, RawtextEndTagOpen { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); @@ -308,6 +332,7 @@ enum TokeniserState { } }, RawtextEndTagName { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); @@ -319,25 +344,26 @@ enum TokeniserState { if (t.isAppropriateEndTagToken() && !r.isEmpty()) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + default: + t.dataBuffer.append(c); + anythingElse(t, r); } - } else + } else { anythingElse(t, r); + } } private void anythingElse(Tokeniser t, CharacterReader r) { @@ -346,24 +372,26 @@ enum TokeniserState { } }, ScriptDataLessthanSign { + @Override void read(Tokeniser t, 
CharacterReader r) { switch (r.consume()) { - case '/': - t.createTempBuffer(); - t.transition(ScriptDataEndTagOpen); - break; - case '!': - t.emit("<!"); - t.transition(ScriptDataEscapeStart); - break; - default: - t.emit("<"); - r.unconsume(); - t.transition(ScriptData); + case '/': + t.createTempBuffer(); + t.transition(ScriptDataEndTagOpen); + break; + case '!': + t.emit("<!"); + t.transition(ScriptDataEscapeStart); + break; + default: + t.emit("<"); + r.unconsume(); + t.transition(ScriptData); } } }, ScriptDataEndTagOpen { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); @@ -376,6 +404,7 @@ enum TokeniserState { } }, ScriptDataEndTagName { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); @@ -387,22 +416,22 @@ enum TokeniserState { if (t.isAppropriateEndTagToken() && !r.isEmpty()) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + default: + t.dataBuffer.append(c); + anythingElse(t, r); } } else { anythingElse(t, r); @@ -415,6 +444,7 @@ enum TokeniserState { } }, ScriptDataEscapeStart { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matches('-')) { t.emit('-'); @@ -425,6 +455,7 @@ enum TokeniserState { } }, ScriptDataEscapeStartDash { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matches('-')) { t.emit('-'); @@ -435,6 +466,7 @@ enum TokeniserState { } }, ScriptDataEscaped { + @Override void read(Tokeniser t, CharacterReader 
r) { if (r.isEmpty()) { t.eofError(this); @@ -443,25 +475,26 @@ enum TokeniserState { } switch (r.current()) { - case '-': - t.emit('-'); - t.advanceTransition(ScriptDataEscapedDash); - break; - case '<': - t.advanceTransition(ScriptDataEscapedLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - default: - String data = r.consumeToAny('-', '<', nullChar); - t.emit(data); + case '-': + t.emit('-'); + t.advanceTransition(ScriptDataEscapedDash); + break; + case '<': + t.advanceTransition(ScriptDataEscapedLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + default: + String data = r.consumeToAny('-', '<', nullChar); + t.emit(data); } } }, ScriptDataEscapedDash { + @Override void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); @@ -471,25 +504,26 @@ enum TokeniserState { char c = r.consume(); switch (c) { - case '-': - t.emit(c); - t.transition(ScriptDataEscapedDashDash); - break; - case '<': - t.transition(ScriptDataEscapedLessthanSign); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataEscaped); - break; - default: - t.emit(c); - t.transition(ScriptDataEscaped); + case '-': + t.emit(c); + t.transition(ScriptDataEscapedDashDash); + break; + case '<': + t.transition(ScriptDataEscapedLessthanSign); + break; + case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataEscaped); + break; + default: + t.emit(c); + t.transition(ScriptDataEscaped); } } }, ScriptDataEscapedDashDash { + @Override void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); @@ -499,28 +533,29 @@ enum TokeniserState { char c = r.consume(); switch (c) { - case '-': - t.emit(c); - break; - case '<': - t.transition(ScriptDataEscapedLessthanSign); - break; - case '>': - t.emit(c); - t.transition(ScriptData); - break; - case nullChar: - t.error(this); - 
t.emit(replacementChar); - t.transition(ScriptDataEscaped); - break; - default: - t.emit(c); - t.transition(ScriptDataEscaped); + case '-': + t.emit(c); + break; + case '<': + t.transition(ScriptDataEscapedLessthanSign); + break; + case '>': + t.emit(c); + t.transition(ScriptData); + break; + case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataEscaped); + break; + default: + t.emit(c); + t.transition(ScriptDataEscaped); } } }, ScriptDataEscapedLessthanSign { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTempBuffer(); @@ -537,6 +572,7 @@ enum TokeniserState { } }, ScriptDataEscapedEndTagOpen { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createTagPending(false); @@ -550,6 +586,7 @@ enum TokeniserState { } }, ScriptDataEscapedEndTagName { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); @@ -561,35 +598,36 @@ enum TokeniserState { if (t.isAppropriateEndTagToken() && !r.isEmpty()) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - default: - t.dataBuffer.append(c); - anythingElse(t, r); - break; + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + default: + t.dataBuffer.append(c); + anythingElse(t, r); + break; } } else { anythingElse(t, r); } } - + private void anythingElse(Tokeniser t, CharacterReader r) { t.emit("</" + t.dataBuffer.toString()); t.transition(ScriptDataEscaped); } }, ScriptDataDoubleEscapeStart { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = 
r.consumeLetterSequence(); @@ -600,109 +638,114 @@ enum TokeniserState { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - case '/': - case '>': - if (t.dataBuffer.toString().equals("script")) - t.transition(ScriptDataDoubleEscaped); - else - t.transition(ScriptDataEscaped); - t.emit(c); - break; - default: - r.unconsume(); + case '\t': + case '\n': + case '\f': + case ' ': + case '/': + case '>': + if (t.dataBuffer.toString().equals("script")) { + t.transition(ScriptDataDoubleEscaped); + } else { t.transition(ScriptDataEscaped); + } + t.emit(c); + break; + default: + r.unconsume(); + t.transition(ScriptDataEscaped); } } }, ScriptDataDoubleEscaped { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.current(); switch (c) { - case '-': - t.emit(c); - t.advanceTransition(ScriptDataDoubleEscapedDash); - break; - case '<': - t.emit(c); - t.advanceTransition(ScriptDataDoubleEscapedLessthanSign); - break; - case nullChar: - t.error(this); - r.advance(); - t.emit(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - String data = r.consumeToAny('-', '<', nullChar); - t.emit(data); + case '-': + t.emit(c); + t.advanceTransition(ScriptDataDoubleEscapedDash); + break; + case '<': + t.emit(c); + t.advanceTransition(ScriptDataDoubleEscapedLessthanSign); + break; + case nullChar: + t.error(this); + r.advance(); + t.emit(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + String data = r.consumeToAny('-', '<', nullChar); + t.emit(data); } } }, ScriptDataDoubleEscapedDash { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '-': - t.emit(c); - t.transition(ScriptDataDoubleEscapedDashDash); - break; - case '<': - t.emit(c); - t.transition(ScriptDataDoubleEscapedLessthanSign); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - 
t.transition(ScriptDataDoubleEscaped); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.emit(c); - t.transition(ScriptDataDoubleEscaped); + case '-': + t.emit(c); + t.transition(ScriptDataDoubleEscapedDashDash); + break; + case '<': + t.emit(c); + t.transition(ScriptDataDoubleEscapedLessthanSign); + break; + case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataDoubleEscaped); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.emit(c); + t.transition(ScriptDataDoubleEscaped); } } }, ScriptDataDoubleEscapedDashDash { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '-': - t.emit(c); - break; - case '<': - t.emit(c); - t.transition(ScriptDataDoubleEscapedLessthanSign); - break; - case '>': - t.emit(c); - t.transition(ScriptData); - break; - case nullChar: - t.error(this); - t.emit(replacementChar); - t.transition(ScriptDataDoubleEscaped); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.emit(c); - t.transition(ScriptDataDoubleEscaped); + case '-': + t.emit(c); + break; + case '<': + t.emit(c); + t.transition(ScriptDataDoubleEscapedLessthanSign); + break; + case '>': + t.emit(c); + t.transition(ScriptData); + break; + case nullChar: + t.error(this); + t.emit(replacementChar); + t.transition(ScriptDataDoubleEscaped); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.emit(c); + t.transition(ScriptDataDoubleEscaped); } } }, ScriptDataDoubleEscapedLessthanSign { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matches('/')) { t.emit('/'); @@ -714,6 +757,7 @@ enum TokeniserState { } }, ScriptDataDoubleEscapeEnd { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); @@ -724,357 +768,377 @@ enum TokeniserState { char c = r.consume(); switch (c) { - case '\t': - case 
'\n': - case '\f': - case ' ': - case '/': - case '>': - if (t.dataBuffer.toString().equals("script")) - t.transition(ScriptDataEscaped); - else - t.transition(ScriptDataDoubleEscaped); - t.emit(c); - break; - default: - r.unconsume(); + case '\t': + case '\n': + case '\f': + case ' ': + case '/': + case '>': + if (t.dataBuffer.toString().equals("script")) { + t.transition(ScriptDataEscaped); + } else { t.transition(ScriptDataDoubleEscaped); + } + t.emit(c); + break; + default: + r.unconsume(); + t.transition(ScriptDataDoubleEscaped); } } }, BeforeAttributeName { // from tagname <xxx + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; // ignore whitespace - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - case '=': - t.error(this); - t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); - t.transition(AttributeName); - break; - default: // A-Z, anything else - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); + case '\t': + case '\n': + case '\f': + case ' ': + break; // ignore whitespace + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.newAttribute(); + r.unconsume(); + t.transition(AttributeName); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + case '=': + t.error(this); + t.tagPending.newAttribute(); + t.tagPending.appendAttributeName(c); + t.transition(AttributeName); + break; + default: // A-Z, anything else + t.tagPending.newAttribute(); + 
r.unconsume(); + t.transition(AttributeName); } } }, AttributeName { // from before attribute name + @Override void read(Tokeniser t, CharacterReader r) { - String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', nullChar, '"', '\'', '<'); + String name = r.consumeToAny('\t', '\n', '\f', ' ', '/', '=', '>', + nullChar, '"', '\'', '<'); t.tagPending.appendAttributeName(name.toLowerCase()); char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(AfterAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '=': - t.transition(BeforeAttributeValue); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeName(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - t.error(this); - t.tagPending.appendAttributeName(c); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(AfterAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '=': + t.transition(BeforeAttributeValue); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeName(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + t.error(this); + t.tagPending.appendAttributeName(c); // no default, as covered in consumeToAny } } }, AfterAttributeName { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - // ignore - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '=': - t.transition(BeforeAttributeValue); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeName(replacementChar); 
- t.transition(AttributeName); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - t.error(this); - t.tagPending.newAttribute(); - t.tagPending.appendAttributeName(c); - t.transition(AttributeName); - break; - default: // A-Z, anything else - t.tagPending.newAttribute(); - r.unconsume(); - t.transition(AttributeName); + case '\t': + case '\n': + case '\f': + case ' ': + // ignore + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '=': + t.transition(BeforeAttributeValue); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeName(replacementChar); + t.transition(AttributeName); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + t.error(this); + t.tagPending.newAttribute(); + t.tagPending.appendAttributeName(c); + t.transition(AttributeName); + break; + default: // A-Z, anything else + t.tagPending.newAttribute(); + r.unconsume(); + t.transition(AttributeName); } } }, BeforeAttributeValue { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - // ignore - break; - case '"': - t.transition(AttributeValue_doubleQuoted); - break; - case '&': - r.unconsume(); - t.transition(AttributeValue_unquoted); - break; - case '\'': - t.transition(AttributeValue_singleQuoted); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - t.transition(AttributeValue_unquoted); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '>': - t.error(this); - t.emitTagPending(); - t.transition(Data); - break; - case '<': - case '=': - case '`': - t.error(this); - t.tagPending.appendAttributeValue(c); - t.transition(AttributeValue_unquoted); - break; - default: - r.unconsume(); - t.transition(AttributeValue_unquoted); 
+ case '\t': + case '\n': + case '\f': + case ' ': + // ignore + break; + case '"': + t.transition(AttributeValue_doubleQuoted); + break; + case '&': + r.unconsume(); + t.transition(AttributeValue_unquoted); + break; + case '\'': + t.transition(AttributeValue_singleQuoted); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + t.transition(AttributeValue_unquoted); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '>': + t.error(this); + t.emitTagPending(); + t.transition(Data); + break; + case '<': + case '=': + case '`': + t.error(this); + t.tagPending.appendAttributeValue(c); + t.transition(AttributeValue_unquoted); + break; + default: + r.unconsume(); + t.transition(AttributeValue_unquoted); } } }, AttributeValue_doubleQuoted { + @Override void read(Tokeniser t, CharacterReader r) { String value = r.consumeToAny('"', '&', nullChar); - if (value.length() > 0) + if (value.length() > 0) { t.tagPending.appendAttributeValue(value); + } char c = r.consume(); switch (c) { - case '"': - t.transition(AfterAttributeValue_quoted); - break; - case '&': - Character ref = t.consumeCharacterReference('"', true); - if (ref != null) - t.tagPending.appendAttributeValue(ref); - else - t.tagPending.appendAttributeValue('&'); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - // no default, handled in consume to any above + case '"': + t.transition(AfterAttributeValue_quoted); + break; + case '&': + Character ref = t.consumeCharacterReference('"', true); + if (ref != null) { + t.tagPending.appendAttributeValue(ref); + } else { + t.tagPending.appendAttributeValue('&'); + } + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + // no default, handled in consume to any above } } 
}, AttributeValue_singleQuoted { + @Override void read(Tokeniser t, CharacterReader r) { String value = r.consumeToAny('\'', '&', nullChar); - if (value.length() > 0) + if (value.length() > 0) { t.tagPending.appendAttributeValue(value); + } char c = r.consume(); switch (c) { - case '\'': - t.transition(AfterAttributeValue_quoted); - break; - case '&': - Character ref = t.consumeCharacterReference('\'', true); - if (ref != null) - t.tagPending.appendAttributeValue(ref); - else - t.tagPending.appendAttributeValue('&'); - break; - case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - // no default, handled in consume to any above + case '\'': + t.transition(AfterAttributeValue_quoted); + break; + case '&': + Character ref = t.consumeCharacterReference('\'', true); + if (ref != null) { + t.tagPending.appendAttributeValue(ref); + } else { + t.tagPending.appendAttributeValue('&'); + } + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + // no default, handled in consume to any above } } }, AttributeValue_unquoted { + @Override void read(Tokeniser t, CharacterReader r) { - String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', nullChar, '"', '\'', '<', '=', '`'); - if (value.length() > 0) + String value = r.consumeToAny('\t', '\n', '\f', ' ', '&', '>', + nullChar, '"', '\'', '<', '=', '`'); + if (value.length() > 0) { t.tagPending.appendAttributeValue(value); + } char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '&': - Character ref = t.consumeCharacterReference('>', true); - if (ref != null) - t.tagPending.appendAttributeValue(ref); - else - t.tagPending.appendAttributeValue('&'); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - 
case nullChar: - t.error(this); - t.tagPending.appendAttributeValue(replacementChar); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - case '"': - case '\'': - case '<': - case '=': - case '`': - t.error(this); - t.tagPending.appendAttributeValue(c); - break; - // no default, handled in consume to any above + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '&': + Character ref = t.consumeCharacterReference('>', true); + if (ref != null) { + t.tagPending.appendAttributeValue(ref); + } else { + t.tagPending.appendAttributeValue('&'); + } + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.tagPending.appendAttributeValue(replacementChar); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + case '"': + case '\'': + case '<': + case '=': + case '`': + t.error(this); + t.tagPending.appendAttributeValue(c); + break; + // no default, handled in consume to any above } } }, // CharacterReferenceInAttributeValue state handled inline AfterAttributeValue_quoted { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeAttributeName); - break; - case '/': - t.transition(SelfClosingStartTag); - break; - case '>': - t.emitTagPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.error(this); - r.unconsume(); - t.transition(BeforeAttributeName); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeAttributeName); + break; + case '/': + t.transition(SelfClosingStartTag); + break; + case '>': + t.emitTagPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.error(this); + r.unconsume(); + t.transition(BeforeAttributeName); } } }, SelfClosingStartTag { + @Override void 
read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '>': - t.tagPending.selfClosing = true; - t.emitTagPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BeforeAttributeName); + case '>': + t.tagPending.selfClosing = true; + t.emitTagPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.transition(Data); + break; + default: + t.error(this); + t.transition(BeforeAttributeName); } } }, BogusComment { + @Override void read(Tokeniser t, CharacterReader r) { - // todo: handle bogus comment starting from eof. when does that trigger? + // todo: handle bogus comment starting from eof. when does that + // trigger? // rewind to capture character that lead us here r.unconsume(); Token.Comment comment = new Token.Comment(); @@ -1085,6 +1149,7 @@ enum TokeniserState { } }, MarkupDeclarationOpen { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchConsume("--")) { t.createCommentPending(); @@ -1092,202 +1157,214 @@ enum TokeniserState { } else if (r.matchConsumeIgnoreCase("DOCTYPE")) { t.transition(Doctype); } else if (r.matchConsume("[CDATA[")) { - // todo: should actually check current namepspace, and only non-html allows cdata. until namespace + // todo: should actually check current namepspace, and only + // non-html allows cdata. 
until namespace // is implemented properly, keep handling as cdata - //} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) { + // } else if (!t.currentNodeInHtmlNS() && + // r.matchConsume("[CDATA[")) { t.transition(CdataSection); } else { t.error(this); - t.advanceTransition(BogusComment); // advance so this character gets in bogus comment data's rewind + t.advanceTransition(BogusComment); // advance so this character + // gets in bogus comment + // data's rewind } } }, CommentStart { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '-': - t.transition(CommentStartDash); - break; - case nullChar: - t.error(this); - t.commentPending.data.append(replacementChar); - t.transition(Comment); - break; - case '>': - t.error(this); - t.emitCommentPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(c); - t.transition(Comment); + case '-': + t.transition(CommentStartDash); + break; + case nullChar: + t.error(this); + t.commentPending.data.append(replacementChar); + t.transition(Comment); + break; + case '>': + t.error(this); + t.emitCommentPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(c); + t.transition(Comment); } } }, CommentStartDash { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '-': - t.transition(CommentStartDash); - break; - case nullChar: - t.error(this); - t.commentPending.data.append(replacementChar); - t.transition(Comment); - break; - case '>': - t.error(this); - t.emitCommentPending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(c); - t.transition(Comment); + case '-': + 
t.transition(CommentStartDash); + break; + case nullChar: + t.error(this); + t.commentPending.data.append(replacementChar); + t.transition(Comment); + break; + case '>': + t.error(this); + t.emitCommentPending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(c); + t.transition(Comment); } } }, Comment { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.current(); switch (c) { - case '-': - t.advanceTransition(CommentEndDash); - break; - case nullChar: - t.error(this); - r.advance(); - t.commentPending.data.append(replacementChar); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append(r.consumeToAny('-', nullChar)); + case '-': + t.advanceTransition(CommentEndDash); + break; + case nullChar: + t.error(this); + r.advance(); + t.commentPending.data.append(replacementChar); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append(r.consumeToAny('-', nullChar)); } } }, CommentEndDash { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '-': - t.transition(CommentEnd); - break; - case nullChar: - t.error(this); - t.commentPending.data.append('-').append(replacementChar); - t.transition(Comment); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append('-').append(c); - t.transition(Comment); + case '-': + t.transition(CommentEnd); + break; + case nullChar: + t.error(this); + t.commentPending.data.append('-').append(replacementChar); + t.transition(Comment); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append('-').append(c); + t.transition(Comment); } } }, 
CommentEnd { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '>': - t.emitCommentPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.commentPending.data.append("--").append(replacementChar); - t.transition(Comment); - break; - case '!': - t.error(this); - t.transition(CommentEndBang); - break; - case '-': - t.error(this); - t.commentPending.data.append('-'); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.error(this); - t.commentPending.data.append("--").append(c); - t.transition(Comment); + case '>': + t.emitCommentPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + t.commentPending.data.append("--").append(replacementChar); + t.transition(Comment); + break; + case '!': + t.error(this); + t.transition(CommentEndBang); + break; + case '-': + t.error(this); + t.commentPending.data.append('-'); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.error(this); + t.commentPending.data.append("--").append(c); + t.transition(Comment); } } }, CommentEndBang { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '-': - t.commentPending.data.append("--!"); - t.transition(CommentEndDash); - break; - case '>': - t.emitCommentPending(); - t.transition(Data); - break; - case nullChar: - t.error(this); - t.commentPending.data.append("--!").append(replacementChar); - t.transition(Comment); - break; - case eof: - t.eofError(this); - t.emitCommentPending(); - t.transition(Data); - break; - default: - t.commentPending.data.append("--!").append(c); - t.transition(Comment); + case '-': + t.commentPending.data.append("--!"); + t.transition(CommentEndDash); + break; + case '>': + t.emitCommentPending(); + t.transition(Data); + break; + case nullChar: + t.error(this); + 
t.commentPending.data.append("--!").append(replacementChar); + t.transition(Comment); + break; + case eof: + t.eofError(this); + t.emitCommentPending(); + t.transition(Data); + break; + default: + t.commentPending.data.append("--!").append(c); + t.transition(Comment); } } }, Doctype { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypeName); - break; - case eof: - t.eofError(this); - t.createDoctypePending(); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BeforeDoctypeName); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeDoctypeName); + break; + case eof: + t.eofError(this); + t.createDoctypePending(); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.transition(BeforeDoctypeName); } } }, BeforeDoctypeName { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { t.createDoctypePending(); @@ -1296,31 +1373,32 @@ enum TokeniserState { } char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; // ignore whitespace - case nullChar: - t.error(this); - t.doctypePending.name.append(replacementChar); - t.transition(DoctypeName); - break; - case eof: - t.eofError(this); - t.createDoctypePending(); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.createDoctypePending(); - t.doctypePending.name.append(c); - t.transition(DoctypeName); + case '\t': + case '\n': + case '\f': + case ' ': + break; // ignore whitespace + case nullChar: + t.error(this); + t.doctypePending.name.append(replacementChar); + t.transition(DoctypeName); + break; + case eof: + t.eofError(this); + t.createDoctypePending(); + t.doctypePending.forceQuirks = true; + 
t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.createDoctypePending(); + t.doctypePending.name.append(c); + t.transition(DoctypeName); } } }, DoctypeName { + @Override void read(Tokeniser t, CharacterReader r) { if (r.matchesLetter()) { String name = r.consumeLetterSequence(); @@ -1329,32 +1407,33 @@ enum TokeniserState { } char c = r.consume(); switch (c) { - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(AfterDoctypeName); - break; - case nullChar: - t.error(this); - t.doctypePending.name.append(replacementChar); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.name.append(c); + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(AfterDoctypeName); + break; + case nullChar: + t.error(this); + t.doctypePending.name.append(replacementChar); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.name.append(c); } } }, AfterDoctypeName { + @Override void read(Tokeniser t, CharacterReader r) { if (r.isEmpty()) { t.eofError(this); @@ -1363,9 +1442,9 @@ enum TokeniserState { t.transition(Data); return; } - if (r.matchesAny('\t', '\n', '\f', ' ')) + if (r.matchesAny('\t', '\n', '\f', ' ')) { r.advance(); // ignore whitespace - else if (r.matches('>')) { + } else if (r.matches('>')) { t.emitDoctypePending(); t.advanceTransition(Data); } else if (r.matchConsumeIgnoreCase("PUBLIC")) { @@ -1381,385 +1460,398 @@ enum TokeniserState { } }, AfterDoctypePublicKeyword { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypePublicIdentifier); - break; - 
case '"': - t.error(this); - // set public id to empty string - t.transition(DoctypePublicIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // set public id to empty string - t.transition(DoctypePublicIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeDoctypePublicIdentifier); + break; + case '"': + t.error(this); + // set public id to empty string + t.transition(DoctypePublicIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // set public id to empty string + t.transition(DoctypePublicIdentifier_singleQuoted); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); } } }, BeforeDoctypePublicIdentifier { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '"': - // set public id to empty string - t.transition(DoctypePublicIdentifier_doubleQuoted); - break; - case '\'': - // set public id to empty string - t.transition(DoctypePublicIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - 
t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); + case '\t': + case '\n': + case '\f': + case ' ': + break; + case '"': + // set public id to empty string + t.transition(DoctypePublicIdentifier_doubleQuoted); + break; + case '\'': + // set public id to empty string + t.transition(DoctypePublicIdentifier_singleQuoted); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); } } }, DoctypePublicIdentifier_doubleQuoted { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '"': - t.transition(AfterDoctypePublicIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.publicIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.publicIdentifier.append(c); + case '"': + t.transition(AfterDoctypePublicIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.publicIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.publicIdentifier.append(c); } } }, DoctypePublicIdentifier_singleQuoted { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\'': - 
t.transition(AfterDoctypePublicIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.publicIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.publicIdentifier.append(c); + case '\'': + t.transition(AfterDoctypePublicIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.publicIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.publicIdentifier.append(c); } } }, AfterDoctypePublicIdentifier { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BetweenDoctypePublicAndSystemIdentifiers); - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BetweenDoctypePublicAndSystemIdentifiers); + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + 
t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); } } }, BetweenDoctypePublicAndSystemIdentifiers { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); + case '\t': + case '\n': + case '\f': + case ' ': + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); } } }, AfterDoctypeSystemKeyword { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - t.transition(BeforeDoctypeSystemIdentifier); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = 
true; - t.emitDoctypePending(); - t.transition(Data); - break; - case '"': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - t.error(this); - // system id empty - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); + case '\t': + case '\n': + case '\f': + case ' ': + t.transition(BeforeDoctypeSystemIdentifier); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case '"': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + t.error(this); + // system id empty + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); } } }, BeforeDoctypeSystemIdentifier { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '"': - // set system id to empty string - t.transition(DoctypeSystemIdentifier_doubleQuoted); - break; - case '\'': - // set public id to empty string - t.transition(DoctypeSystemIdentifier_singleQuoted); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.doctypePending.forceQuirks = true; - t.transition(BogusDoctype); + case '\t': + case '\n': + case '\f': + case ' 
': + break; + case '"': + // set system id to empty string + t.transition(DoctypeSystemIdentifier_doubleQuoted); + break; + case '\'': + // set public id to empty string + t.transition(DoctypeSystemIdentifier_singleQuoted); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.doctypePending.forceQuirks = true; + t.transition(BogusDoctype); } } }, DoctypeSystemIdentifier_doubleQuoted { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '"': - t.transition(AfterDoctypeSystemIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.systemIdentifier.append(replacementChar); - break; - case '>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.systemIdentifier.append(c); + case '"': + t.transition(AfterDoctypeSystemIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.systemIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.systemIdentifier.append(c); } } }, DoctypeSystemIdentifier_singleQuoted { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\'': - t.transition(AfterDoctypeSystemIdentifier); - break; - case nullChar: - t.error(this); - t.doctypePending.systemIdentifier.append(replacementChar); - break; - case 
'>': - t.error(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.doctypePending.systemIdentifier.append(c); + case '\'': + t.transition(AfterDoctypeSystemIdentifier); + break; + case nullChar: + t.error(this); + t.doctypePending.systemIdentifier.append(replacementChar); + break; + case '>': + t.error(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.doctypePending.systemIdentifier.append(c); } } }, AfterDoctypeSystemIdentifier { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '\t': - case '\n': - case '\f': - case ' ': - break; - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.eofError(this); - t.doctypePending.forceQuirks = true; - t.emitDoctypePending(); - t.transition(Data); - break; - default: - t.error(this); - t.transition(BogusDoctype); - // NOT force quirks + case '\t': + case '\n': + case '\f': + case ' ': + break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.eofError(this); + t.doctypePending.forceQuirks = true; + t.emitDoctypePending(); + t.transition(Data); + break; + default: + t.error(this); + t.transition(BogusDoctype); + // NOT force quirks } } }, BogusDoctype { + @Override void read(Tokeniser t, CharacterReader r) { char c = r.consume(); switch (c) { - case '>': - t.emitDoctypePending(); - t.transition(Data); - break; - case eof: - t.emitDoctypePending(); - t.transition(Data); - break; - default: - // ignore char - break; + case '>': + t.emitDoctypePending(); + t.transition(Data); + break; + case eof: + t.emitDoctypePending(); + 
t.transition(Data); + break; + default: + // ignore char + break; } } }, CdataSection { + @Override void read(Tokeniser t, CharacterReader r) { String data = r.consumeTo("]]>"); t.emit(data); @@ -1768,11 +1860,11 @@ enum TokeniserState { } }; - abstract void read(Tokeniser t, CharacterReader r); private static final char nullChar = '\u0000'; private static final char replacementChar = Tokeniser.replacementChar; - private static final String replacementStr = String.valueOf(Tokeniser.replacementChar); + private static final String replacementStr = String + .valueOf(Tokeniser.replacementChar); private static final char eof = CharacterReader.EOF; } diff --git a/server/src/org/jsoup/parser/TreeBuilder.java b/server/src/org/jsoup/parser/TreeBuilder.java index e06caad501..5e2dbebc66 100644 --- a/server/src/org/jsoup/parser/TreeBuilder.java +++ b/server/src/org/jsoup/parser/TreeBuilder.java @@ -5,9 +5,6 @@ import org.jsoup.helper.Validate; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; -import java.util.ArrayList; -import java.util.List; - /** * @author Jonathan Hedley */ @@ -15,12 +12,15 @@ abstract class TreeBuilder { CharacterReader reader; Tokeniser tokeniser; protected Document doc; // current doc we are building into - protected DescendableLinkedList<Element> stack; // the stack of open elements + protected DescendableLinkedList<Element> stack; // the stack of open + // elements protected String baseUri; // current base uri, for creating new elements - protected Token currentToken; // currentToken is used only for error tracking. + protected Token currentToken; // currentToken is used only for error + // tracking. 
protected ParseErrorList errors; // null when not tracking errors - protected void initialiseParse(String input, String baseUri, ParseErrorList errors) { + protected void initialiseParse(String input, String baseUri, + ParseErrorList errors) { Validate.notNull(input, "String input must not be null"); Validate.notNull(baseUri, "BaseURI must not be null"); @@ -47,8 +47,9 @@ abstract class TreeBuilder { Token token = tokeniser.read(); process(token); - if (token.type == Token.TokenType.EOF) + if (token.type == Token.TokenType.EOF) { break; + } } } diff --git a/server/src/org/jsoup/parser/XmlTreeBuilder.java b/server/src/org/jsoup/parser/XmlTreeBuilder.java index 3f03ad26ac..c2a3635b3d 100644 --- a/server/src/org/jsoup/parser/XmlTreeBuilder.java +++ b/server/src/org/jsoup/parser/XmlTreeBuilder.java @@ -1,43 +1,49 @@ package org.jsoup.parser; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.*; - import java.util.Iterator; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Comment; +import org.jsoup.nodes.DocumentType; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; + /** * @author Jonathan Hedley */ public class XmlTreeBuilder extends TreeBuilder { @Override - protected void initialiseParse(String input, String baseUri, ParseErrorList errors) { + protected void initialiseParse(String input, String baseUri, + ParseErrorList errors) { super.initialiseParse(input, baseUri, errors); - stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack) + stack.add(doc); // place the document onto the stack. 
differs from + // HtmlTreeBuilder (not on stack) } @Override protected boolean process(Token token) { // start tag, end tag, doctype, comment, character, eof switch (token.type) { - case StartTag: - insert(token.asStartTag()); - break; - case EndTag: - popStackToClose(token.asEndTag()); - break; - case Comment: - insert(token.asComment()); - break; - case Character: - insert(token.asCharacter()); - break; - case Doctype: - insert(token.asDoctype()); - break; - case EOF: // could put some normalisation here if desired - break; - default: - Validate.fail("Unexpected token type: " + token.type); + case StartTag: + insert(token.asStartTag()); + break; + case EndTag: + popStackToClose(token.asEndTag()); + break; + case Comment: + insert(token.asComment()); + break; + case Character: + insert(token.asCharacter()); + break; + case Doctype: + insert(token.asDoctype()); + break; + case EOF: // could put some normalisation here if desired + break; + default: + Validate.fail("Unexpected token type: " + token.type); } return true; } @@ -48,13 +54,15 @@ public class XmlTreeBuilder extends TreeBuilder { Element insert(Token.StartTag startTag) { Tag tag = Tag.valueOf(startTag.name()); - // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html. + // todo: wonder if for xml parsing, should treat all tags as unknown? + // because it's not html. Element el = new Element(tag, baseUri, startTag.attributes); insertNode(el); if (startTag.isSelfClosing()) { tokeniser.acknowledgeSelfClosingFlag(); - if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above. 
+ if (!tag.isKnownTag()) { tag.setSelfClosing(); + } } else { stack.add(el); } @@ -72,14 +80,15 @@ public class XmlTreeBuilder extends TreeBuilder { } void insert(Token.Doctype d) { - DocumentType doctypeNode = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri); + DocumentType doctypeNode = new DocumentType(d.getName(), + d.getPublicIdentifier(), d.getSystemIdentifier(), baseUri); insertNode(doctypeNode); } /** - * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not - * found, skips. - * + * If the stack contains an element with this tag's name, pop up the stack + * to remove the first occurrence. If not found, skips. + * * @param endTag */ private void popStackToClose(Token.EndTag endTag) { @@ -94,8 +103,9 @@ public class XmlTreeBuilder extends TreeBuilder { break; } } - if (firstFound == null) + if (firstFound == null) { return; // not found, skip + } it = stack.descendingIterator(); while (it.hasNext()) { diff --git a/server/src/org/jsoup/parser/package-info.java b/server/src/org/jsoup/parser/package-info.java index 168fdf4086..c6c3d9a029 100644 --- a/server/src/org/jsoup/parser/package-info.java +++ b/server/src/org/jsoup/parser/package-info.java @@ -2,3 +2,4 @@ Contains the HTML parser, tag specifications, and HTML tokeniser. 
*/ package org.jsoup.parser; + diff --git a/server/src/org/jsoup/safety/Cleaner.java b/server/src/org/jsoup/safety/Cleaner.java index eda67df86b..046efbbaa8 100644 --- a/server/src/org/jsoup/safety/Cleaner.java +++ b/server/src/org/jsoup/safety/Cleaner.java @@ -1,29 +1,41 @@ package org.jsoup.safety; +import java.util.List; + import org.jsoup.helper.Validate; -import org.jsoup.nodes.*; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; import org.jsoup.parser.Tag; -import java.util.List; - /** - The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes - that you are expecting; no junk, and no cross-site scripting attacks! - <p/> - The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain - HTML that is allowed by the whitelist. - <p/> - It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the - canned white-lists only allow body contained tags. - <p/> - Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}. + * The whitelist based HTML cleaner. Use to ensure that end-user provided HTML + * contains only the elements and attributes that you are expecting; no junk, + * and no cross-site scripting attacks! + * <p/> + * The HTML cleaner parses the input as HTML and then runs it through a + * white-list, so the output HTML can only contain HTML that is allowed by the + * whitelist. + * <p/> + * It is assumed that the input HTML is a body fragment; the clean methods only + * pull from the source's body, and the canned white-lists only allow body + * contained tags. 
+ * <p/> + * Rather than interacting directly with a Cleaner object, generally see the + * {@code clean} methods in {@link org.jsoup.Jsoup}. */ public class Cleaner { private Whitelist whitelist; /** - Create a new cleaner, that sanitizes documents using the supplied whitelist. - @param whitelist white-list to clean with + * Create a new cleaner, that sanitizes documents using the supplied + * whitelist. + * + * @param whitelist + * white-list to clean with */ public Cleaner(Whitelist whitelist) { Validate.notNull(whitelist); @@ -31,10 +43,14 @@ public class Cleaner { } /** - Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. - The original document is not modified. Only elements from the dirt document's <code>body</code> are used. - @param dirtyDocument Untrusted base document to clean. - @return cleaned document. + * Creates a new, clean document, from the original dirty document, + * containing only elements allowed by the whitelist. The original document + * is not modified. Only elements from the dirt document's <code>body</code> + * are used. + * + * @param dirtyDocument + * Untrusted base document to clean. + * @return cleaned document. */ public Document clean(Document dirtyDocument) { Validate.notNull(dirtyDocument); @@ -46,14 +62,20 @@ public class Cleaner { } /** - Determines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes - in the input HTML are allowed by the whitelist. - <p/> - This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully - using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document - to ensure enforced attributes are set correctly, and that the output is tidied. 
- @param dirtyDocument document to test - @return true if no tags or attributes need to be removed; false if they do + * Determines if the input document is valid, against the whitelist. It is + * considered valid if all the tags and attributes in the input HTML are + * allowed by the whitelist. + * <p/> + * This method can be used as a validator for user input forms. An invalid + * document will still be cleaned successfully using the + * {@link #clean(Document)} document. If using as a validator, it is + * recommended to still clean the document to ensure enforced attributes are + * set correctly, and that the output is tidied. + * + * @param dirtyDocument + * document to test + * @return true if no tags or attributes need to be removed; false if they + * do */ public boolean isValid(Document dirtyDocument) { Validate.notNull(dirtyDocument); @@ -64,10 +86,14 @@ public class Cleaner { } /** - Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. - @param source source of HTML - @param dest destination element to copy into - @return number of discarded elements (that were considered unsafe) + * Iterates the input and copies trusted nodes (tags, attributes, text) into + * the destination. 
+ * + * @param source + * source of HTML + * @param dest + * destination element to copy into + * @return number of discarded elements (that were considered unsafe) */ private int copySafeNodes(Element source, Element dest) { List<Node> sourceChildren = source.childNodes(); @@ -77,20 +103,24 @@ public class Cleaner { if (sourceChild instanceof Element) { Element sourceEl = (Element) sourceChild; - if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs + if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone + // and copy safe + // attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; dest.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; numDiscarded += copySafeNodes(sourceEl, destChild); // recurs - } else { // not a safe tag, but it may have children (els or text) that are, so recurse + } else { // not a safe tag, but it may have children (els or + // text) that are, so recurse numDiscarded++; numDiscarded += copySafeNodes(sourceEl, dest); } } else if (sourceChild instanceof TextNode) { TextNode sourceText = (TextNode) sourceChild; - TextNode destText = new TextNode(sourceText.getWholeText(), sourceChild.baseUri()); + TextNode destText = new TextNode(sourceText.getWholeText(), + sourceChild.baseUri()); dest.appendChild(destText); } // else, we don't care about comments, xml proc instructions, etc } @@ -100,15 +130,17 @@ public class Cleaner { private ElementMeta createSafeElement(Element sourceEl) { String sourceTag = sourceEl.tagName(); Attributes destAttrs = new Attributes(); - Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs); + Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), + destAttrs); int numDiscarded = 0; Attributes sourceAttrs = sourceEl.attributes(); for (Attribute sourceAttr : sourceAttrs) { - if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) + if (whitelist.isSafeAttribute(sourceTag, sourceEl, 
sourceAttr)) { destAttrs.put(sourceAttr); - else + } else { numDiscarded++; + } } Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag); destAttrs.addAll(enforcedAttrs); diff --git a/server/src/org/jsoup/safety/Whitelist.java b/server/src/org/jsoup/safety/Whitelist.java index 2c1150ce9e..b86cb5c6cf 100644 --- a/server/src/org/jsoup/safety/Whitelist.java +++ b/server/src/org/jsoup/safety/Whitelist.java @@ -1,171 +1,187 @@ package org.jsoup.safety; /* - Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/rgrove/sanitize/, which inspired - this whitelist configuration, and the initial defaults. + Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/rgrove/sanitize/, which inspired + this whitelist configuration, and the initial defaults. */ -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; -import org.jsoup.nodes.Element; - import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Element; /** - Whitelists define what HTML (elements and attributes) to allow through the cleaner. Everything else is removed. - <p/> - Start with one of the defaults: - <ul> - <li>{@link #none} - <li>{@link #simpleText} - <li>{@link #basic} - <li>{@link #basicWithImages} - <li>{@link #relaxed} - </ul> - <p/> - If you need to allow more through (please be careful!), tweak a base whitelist with: - <ul> - <li>{@link #addTags} - <li>{@link #addAttributes} - <li>{@link #addEnforcedAttribute} - <li>{@link #addProtocols} - </ul> - <p/> - The cleaner and these whitelists assume that you want to clean a <code>body</code> fragment of HTML (to add user - supplied HTML into a templated page), and not to clean a full HTML document. 
If the latter is the case, either wrap the - document HTML around the cleaned body HTML, or create a whitelist that allows <code>html</code> and <code>head</code> - elements as appropriate. - <p/> - If you are going to extend a whitelist, please be very careful. Make sure you understand what attributes may lead to - XSS attack vectors. URL attributes are particularly vulnerable and require careful validation. See - http://ha.ckers.org/xss.html for some XSS attack examples. - - @author Jonathan Hedley + * Whitelists define what HTML (elements and attributes) to allow through the + * cleaner. Everything else is removed. + * <p/> + * Start with one of the defaults: + * <ul> + * <li>{@link #none} + * <li>{@link #simpleText} + * <li>{@link #basic} + * <li>{@link #basicWithImages} + * <li>{@link #relaxed} + * </ul> + * <p/> + * If you need to allow more through (please be careful!), tweak a base + * whitelist with: + * <ul> + * <li>{@link #addTags} + * <li>{@link #addAttributes} + * <li>{@link #addEnforcedAttribute} + * <li>{@link #addProtocols} + * </ul> + * <p/> + * The cleaner and these whitelists assume that you want to clean a + * <code>body</code> fragment of HTML (to add user supplied HTML into a + * templated page), and not to clean a full HTML document. If the latter is the + * case, either wrap the document HTML around the cleaned body HTML, or create a + * whitelist that allows <code>html</code> and <code>head</code> elements as + * appropriate. + * <p/> + * If you are going to extend a whitelist, please be very careful. Make sure you + * understand what attributes may lead to XSS attack vectors. URL attributes are + * particularly vulnerable and require careful validation. See + * http://ha.ckers.org/xss.html for some XSS attack examples. + * + * @author Jonathan Hedley */ public class Whitelist { - private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span] - private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. 
allowed attributes [href] for a tag. - private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values - private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes + private Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, + // span] + private Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. + // allowed attributes + // [href] for a tag. + private Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always + // set + // these + // attribute + // values + private Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed + // URL + // protocols + // for + // attributes private boolean preserveRelativeLinks; // option to preserve relative links /** - This whitelist allows only text nodes: all HTML will be stripped. - - @return whitelist + * This whitelist allows only text nodes: all HTML will be stripped. + * + * @return whitelist */ public static Whitelist none() { return new Whitelist(); } /** - This whitelist allows only simple text formatting: <code>b, em, i, strong, u</code>. All other HTML (tags and - attributes) will be removed. - - @return whitelist + * This whitelist allows only simple text formatting: + * <code>b, em, i, strong, u</code>. All other HTML (tags and attributes) + * will be removed. + * + * @return whitelist */ public static Whitelist simpleText() { - return new Whitelist() - .addTags("b", "em", "i", "strong", "u") - ; + return new Whitelist().addTags("b", "em", "i", "strong", "u"); } /** - This whitelist allows a fuller range of text nodes: <code>a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, - ol, p, pre, q, small, strike, strong, sub, sup, u, ul</code>, and appropriate attributes. - <p/> - Links (<code>a</code> elements) can point to <code>http, https, ftp, mailto</code>, and have an enforced - <code>rel=nofollow</code> attribute. - <p/> - Does not allow images. 
- - @return whitelist + * This whitelist allows a fuller range of text nodes: + * <code>a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, + ol, p, pre, q, small, strike, strong, sub, sup, u, ul</code>, and + * appropriate attributes. + * <p/> + * Links (<code>a</code> elements) can point to + * <code>http, https, ftp, mailto</code>, and have an enforced + * <code>rel=nofollow</code> attribute. + * <p/> + * Does not allow images. + * + * @return whitelist */ public static Whitelist basic() { return new Whitelist() - .addTags( - "a", "b", "blockquote", "br", "cite", "code", "dd", "dl", "dt", "em", - "i", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub", - "sup", "u", "ul") + .addTags("a", "b", "blockquote", "br", "cite", "code", "dd", + "dl", "dt", "em", "i", "li", "ol", "p", "pre", "q", + "small", "strike", "strong", "sub", "sup", "u", "ul") - .addAttributes("a", "href") - .addAttributes("blockquote", "cite") + .addAttributes("a", "href").addAttributes("blockquote", "cite") .addAttributes("q", "cite") .addProtocols("a", "href", "ftp", "http", "https", "mailto") .addProtocols("blockquote", "cite", "http", "https") .addProtocols("cite", "cite", "http", "https") - .addEnforcedAttribute("a", "rel", "nofollow") - ; + .addEnforcedAttribute("a", "rel", "nofollow"); } /** - This whitelist allows the same text tags as {@link #basic}, and also allows <code>img</code> tags, with appropriate - attributes, with <code>src</code> pointing to <code>http</code> or <code>https</code>. - - @return whitelist + * This whitelist allows the same text tags as {@link #basic}, and also + * allows <code>img</code> tags, with appropriate attributes, with + * <code>src</code> pointing to <code>http</code> or <code>https</code>. 
+ * + * @return whitelist */ public static Whitelist basicWithImages() { return basic() .addTags("img") - .addAttributes("img", "align", "alt", "height", "src", "title", "width") - .addProtocols("img", "src", "http", "https") - ; + .addAttributes("img", "align", "alt", "height", "src", "title", + "width").addProtocols("img", "src", "http", "https"); } /** - This whitelist allows a full range of text and structural body HTML: <code>a, b, blockquote, br, caption, cite, + * This whitelist allows a full range of text and structural body HTML: + * <code>a, b, blockquote, br, caption, cite, code, col, colgroup, dd, dl, dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, u, ul</code> - <p/> - Links do not have an enforced <code>rel=nofollow</code> attribute, but you can add that if desired. - - @return whitelist + * <p/> + * Links do not have an enforced <code>rel=nofollow</code> attribute, but + * you can add that if desired. 
+ * + * @return whitelist */ public static Whitelist relaxed() { return new Whitelist() - .addTags( - "a", "b", "blockquote", "br", "caption", "cite", "code", "col", - "colgroup", "dd", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6", - "i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", - "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", - "ul") + .addTags("a", "b", "blockquote", "br", "caption", "cite", + "code", "col", "colgroup", "dd", "div", "dl", "dt", + "em", "h1", "h2", "h3", "h4", "h5", "h6", "i", "img", + "li", "ol", "p", "pre", "q", "small", "strike", + "strong", "sub", "sup", "table", "tbody", "td", + "tfoot", "th", "thead", "tr", "u", "ul") .addAttributes("a", "href", "title") .addAttributes("blockquote", "cite") .addAttributes("col", "span", "width") .addAttributes("colgroup", "span", "width") - .addAttributes("img", "align", "alt", "height", "src", "title", "width") + .addAttributes("img", "align", "alt", "height", "src", "title", + "width") .addAttributes("ol", "start", "type") .addAttributes("q", "cite") .addAttributes("table", "summary", "width") - .addAttributes("td", "abbr", "axis", "colspan", "rowspan", "width") - .addAttributes( - "th", "abbr", "axis", "colspan", "rowspan", "scope", + .addAttributes("td", "abbr", "axis", "colspan", "rowspan", "width") - .addAttributes("ul", "type") + .addAttributes("th", "abbr", "axis", "colspan", "rowspan", + "scope", "width").addAttributes("ul", "type") .addProtocols("a", "href", "ftp", "http", "https", "mailto") .addProtocols("blockquote", "cite", "http", "https") .addProtocols("img", "src", "http", "https") - .addProtocols("q", "cite", "http", "https") - ; + .addProtocols("q", "cite", "http", "https"); } /** - Create a new, empty whitelist. Generally it will be better to start with a default prepared whitelist instead. - - @see #basic() - @see #basicWithImages() - @see #simpleText() - @see #relaxed() + * Create a new, empty whitelist. 
Generally it will be better to start with + * a default prepared whitelist instead. + * + * @see #basic() + * @see #basicWithImages() + * @see #simpleText() + * @see #relaxed() */ public Whitelist() { tagNames = new HashSet<TagName>(); @@ -176,10 +192,12 @@ public class Whitelist { } /** - Add a list of allowed elements to a whitelist. (If a tag is not allowed, it will be removed from the HTML.) - - @param tags tag names to allow - @return this (for chaining) + * Add a list of allowed elements to a whitelist. (If a tag is not allowed, + * it will be removed from the HTML.) + * + * @param tags + * tag names to allow + * @return this (for chaining) */ public Whitelist addTags(String... tags) { Validate.notNull(tags); @@ -192,17 +210,22 @@ public class Whitelist { } /** - Add a list of allowed attributes to a tag. (If an attribute is not allowed on an element, it will be removed.) - <p/> - E.g.: <code>addAttributes("a", "href", "class")</code> allows <code>href</code> and <code>class</code> attributes - on <code>a</code> tags. - <p/> - To make an attribute valid for <b>all tags</b>, use the pseudo tag <code>:all</code>, e.g. - <code>addAttributes(":all", "class")</code>. - - @param tag The tag the attributes are for. The tag will be added to the allowed tag list if necessary. - @param keys List of valid attributes for the tag - @return this (for chaining) + * Add a list of allowed attributes to a tag. (If an attribute is not + * allowed on an element, it will be removed.) + * <p/> + * E.g.: <code>addAttributes("a", "href", "class")</code> allows + * <code>href</code> and <code>class</code> attributes on <code>a</code> + * tags. + * <p/> + * To make an attribute valid for <b>all tags</b>, use the pseudo tag + * <code>:all</code>, e.g. <code>addAttributes(":all", "class")</code>. + * + * @param tag + * The tag the attributes are for. The tag will be added to the + * allowed tag list if necessary. 
+ * @param keys + * List of valid attributes for the tag + * @return this (for chaining) */ public Whitelist addAttributes(String tag, String... keys) { Validate.notEmpty(tag); @@ -210,8 +233,9 @@ public class Whitelist { Validate.isTrue(keys.length > 0, "No attributes supplied."); TagName tagName = TagName.valueOf(tag); - if (!tagNames.contains(tagName)) + if (!tagNames.contains(tagName)) { tagNames.add(tagName); + } Set<AttributeKey> attributeSet = new HashSet<AttributeKey>(); for (String key : keys) { Validate.notEmpty(key); @@ -227,16 +251,22 @@ public class Whitelist { } /** - Add an enforced attribute to a tag. An enforced attribute will always be added to the element. If the element - already has the attribute set, it will be overridden. - <p/> - E.g.: <code>addEnforcedAttribute("a", "rel", "nofollow")</code> will make all <code>a</code> tags output as - <code><a href="..." rel="nofollow"></code> - - @param tag The tag the enforced attribute is for. The tag will be added to the allowed tag list if necessary. - @param key The attribute key - @param value The enforced attribute value - @return this (for chaining) + * Add an enforced attribute to a tag. An enforced attribute will always be + * added to the element. If the element already has the attribute set, it + * will be overridden. + * <p/> + * E.g.: <code>addEnforcedAttribute("a", "rel", "nofollow")</code> will make + * all <code>a</code> tags output as + * <code><a href="..." rel="nofollow"></code> + * + * @param tag + * The tag the enforced attribute is for. The tag will be added + * to the allowed tag list if necessary. 
+ * @param key + * The attribute key + * @param value + * The enforced attribute value + * @return this (for chaining) */ public Whitelist addEnforcedAttribute(String tag, String key, String value) { Validate.notEmpty(tag); @@ -244,8 +274,9 @@ public class Whitelist { Validate.notEmpty(value); TagName tagName = TagName.valueOf(tag); - if (!tagNames.contains(tagName)) + if (!tagNames.contains(tagName)) { tagNames.add(tagName); + } AttributeKey attrKey = AttributeKey.valueOf(key); AttributeValue attrVal = AttributeValue.valueOf(value); @@ -260,16 +291,21 @@ public class Whitelist { } /** - * Configure this Whitelist to preserve relative links in an element's URL attribute, or convert them to absolute - * links. By default, this is <b>false</b>: URLs will be made absolute (e.g. start with an allowed protocol, like - * e.g. {@code http://}. + * Configure this Whitelist to preserve relative links in an element's URL + * attribute, or convert them to absolute links. By default, this is + * <b>false</b>: URLs will be made absolute (e.g. start with an allowed + * protocol, like e.g. {@code http://}. * <p /> - * Note that when handling relative links, the input document must have an appropriate {@code base URI} set when - * parsing, so that the link's protocol can be confirmed. Regardless of the setting of the {@code preserve relative - * links} option, the link must be resolvable against the base URI to an allowed protocol; otherwise the attribute - * will be removed. - * - * @param preserve {@code true} to allow relative links, {@code false} (default) to deny + * Note that when handling relative links, the input document must have an + * appropriate {@code base URI} set when parsing, so that the link's + * protocol can be confirmed. Regardless of the setting of the + * {@code preserve relative + * links} option, the link must be resolvable against the base URI to an + * allowed protocol; otherwise the attribute will be removed. 
+ * + * @param preserve + * {@code true} to allow relative links, {@code false} (default) + * to deny * @return this Whitelist, for chaining. * @see #addProtocols */ @@ -279,15 +315,18 @@ public class Whitelist { } /** - Add allowed URL protocols for an element's URL attribute. This restricts the possible values of the attribute to - URLs with the defined protocol. - <p/> - E.g.: <code>addProtocols("a", "href", "ftp", "http", "https")</code> - - @param tag Tag the URL protocol is for - @param key Attribute key - @param protocols List of valid protocols - @return this, for chaining + * Add allowed URL protocols for an element's URL attribute. This restricts + * the possible values of the attribute to URLs with the defined protocol. + * <p/> + * E.g.: <code>addProtocols("a", "href", "ftp", "http", "https")</code> + * + * @param tag + * Tag the URL protocol is for + * @param key + * Attribute key + * @param protocols + * List of valid protocols + * @return this, for chaining */ public Whitelist addProtocols(String tag, String key, String... protocols) { Validate.notEmpty(tag); @@ -330,9 +369,11 @@ public class Whitelist { if (attributes.containsKey(tag)) { if (attributes.get(tag).contains(key)) { if (protocols.containsKey(tag)) { - Map<AttributeKey, Set<Protocol>> attrProts = protocols.get(tag); + Map<AttributeKey, Set<Protocol>> attrProts = protocols + .get(tag); // ok if not defined protocol; otherwise test - return !attrProts.containsKey(key) || testValidProtocol(el, attr, attrProts.get(key)); + return !attrProts.containsKey(key) + || testValidProtocol(el, attr, attrProts.get(key)); } else { // attribute found, no protocols defined, so OK return true; } @@ -342,15 +383,20 @@ public class Whitelist { return !tagName.equals(":all") && isSafeAttribute(":all", el, attr); } - private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) { - // try to resolve relative urls to abs, and optionally update the attribute so output html has abs. 
+ private boolean testValidProtocol(Element el, Attribute attr, + Set<Protocol> protocols) { + // try to resolve relative urls to abs, and optionally update the + // attribute so output html has abs. // rels without a baseuri get removed String value = el.absUrl(attr.getKey()); - if (value.length() == 0) - value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown protocols - if (!preserveRelativeLinks) + if (value.length() == 0) { + value = attr.getValue(); // if it could not be made abs, run as-is + // to allow custom unknown protocols + } + if (!preserveRelativeLinks) { attr.setValue(value); - + } + for (Protocol protocol : protocols) { String prot = protocol.toString() + ":"; if (value.toLowerCase().startsWith(prot)) { @@ -364,14 +410,17 @@ public class Whitelist { Attributes attrs = new Attributes(); TagName tag = TagName.valueOf(tagName); if (enforcedAttributes.containsKey(tag)) { - Map<AttributeKey, AttributeValue> keyVals = enforcedAttributes.get(tag); - for (Map.Entry<AttributeKey, AttributeValue> entry : keyVals.entrySet()) { - attrs.put(entry.getKey().toString(), entry.getValue().toString()); + Map<AttributeKey, AttributeValue> keyVals = enforcedAttributes + .get(tag); + for (Map.Entry<AttributeKey, AttributeValue> entry : keyVals + .entrySet()) { + attrs.put(entry.getKey().toString(), entry.getValue() + .toString()); } } return attrs; } - + // named types for config. All just hold strings, but here for my sanity. 
static class TagName extends TypedValue { @@ -432,13 +481,23 @@ public class Whitelist { @Override public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } TypedValue other = (TypedValue) obj; if (value == null) { - if (other.value != null) return false; - } else if (!value.equals(other.value)) return false; + if (other.value != null) { + return false; + } + } else if (!value.equals(other.value)) { + return false; + } return true; } @@ -448,4 +507,3 @@ public class Whitelist { } } } - diff --git a/server/src/org/jsoup/safety/package-info.java b/server/src/org/jsoup/safety/package-info.java index ac890f0607..acbff6665f 100644 --- a/server/src/org/jsoup/safety/package-info.java +++ b/server/src/org/jsoup/safety/package-info.java @@ -2,3 +2,4 @@ Contains the jsoup HTML cleaner, and whitelist definitions. */ package org.jsoup.safety; + diff --git a/server/src/org/jsoup/select/Collector.java b/server/src/org/jsoup/select/Collector.java index 8f01045768..20554e8653 100644 --- a/server/src/org/jsoup/select/Collector.java +++ b/server/src/org/jsoup/select/Collector.java @@ -5,7 +5,7 @@ import org.jsoup.nodes.Node; /** * Collects a list of elements that match the supplied criteria. - * + * * @author Jonathan Hedley */ public class Collector { @@ -14,12 +14,16 @@ public class Collector { } /** - Build a list of elements, by visiting root and every descendant of root, and testing it against the evaluator. - @param eval Evaluator to test elements against - @param root root of tree to descend - @return list of matches; empty if none + * Build a list of elements, by visiting root and every descendant of root, + * and testing it against the evaluator. 
+ * + * @param eval + * Evaluator to test elements against + * @param root + * root of tree to descend + * @return list of matches; empty if none */ - public static Elements collect (Evaluator eval, Element root) { + public static Elements collect(Evaluator eval, Element root) { Elements elements = new Elements(); new NodeTraversor(new Accumulator(root, elements, eval)).traverse(root); return elements; @@ -36,14 +40,17 @@ public class Collector { this.eval = eval; } + @Override public void head(Node node, int depth) { if (node instanceof Element) { Element el = (Element) node; - if (eval.matches(root, el)) + if (eval.matches(root, el)) { elements.add(el); + } } } + @Override public void tail(Node node, int depth) { // void } diff --git a/server/src/org/jsoup/select/CombiningEvaluator.java b/server/src/org/jsoup/select/CombiningEvaluator.java index a31ed2636f..c3f9a8af2e 100644 --- a/server/src/org/jsoup/select/CombiningEvaluator.java +++ b/server/src/org/jsoup/select/CombiningEvaluator.java @@ -1,13 +1,13 @@ package org.jsoup.select; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.Element; - import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; +import org.jsoup.helper.StringUtil; +import org.jsoup.nodes.Element; + /** * Base combining (and, or) evaluator. */ @@ -25,9 +25,10 @@ abstract class CombiningEvaluator extends Evaluator { } Evaluator rightMostEvaluator() { - return evaluators.size() > 0 ? evaluators.get(evaluators.size() - 1) : null; + return evaluators.size() > 0 ? 
evaluators.get(evaluators.size() - 1) + : null; } - + void replaceRightMostEvaluator(Evaluator replacement) { evaluators.set(evaluators.size() - 1, replacement); } @@ -44,8 +45,9 @@ abstract class CombiningEvaluator extends Evaluator { @Override public boolean matches(Element root, Element node) { for (Evaluator s : evaluators) { - if (!s.matches(root, node)) + if (!s.matches(root, node)) { return false; + } } return true; } @@ -58,15 +60,20 @@ abstract class CombiningEvaluator extends Evaluator { static final class Or extends CombiningEvaluator { /** - * Create a new Or evaluator. The initial evaluators are ANDed together and used as the first clause of the OR. - * @param evaluators initial OR clause (these are wrapped into an AND evaluator). + * Create a new Or evaluator. The initial evaluators are ANDed together + * and used as the first clause of the OR. + * + * @param evaluators + * initial OR clause (these are wrapped into an AND + * evaluator). */ Or(Collection<Evaluator> evaluators) { super(); - if (evaluators.size() > 1) + if (evaluators.size() > 1) { this.evaluators.add(new And(evaluators)); - else // 0 or 1 + } else { this.evaluators.addAll(evaluators); + } } Or() { @@ -80,8 +87,9 @@ abstract class CombiningEvaluator extends Evaluator { @Override public boolean matches(Element root, Element node) { for (Evaluator s : evaluators) { - if (s.matches(root, node)) + if (s.matches(root, node)) { return true; + } } return false; } diff --git a/server/src/org/jsoup/select/Elements.java b/server/src/org/jsoup/select/Elements.java index 8302da1e53..cddea67d96 100644 --- a/server/src/org/jsoup/select/Elements.java +++ b/server/src/org/jsoup/select/Elements.java @@ -1,17 +1,26 @@ package org.jsoup.select; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.ListIterator; + import org.jsoup.helper.Validate; 
import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; -import java.util.*; - /** - A list of {@link Element Elements}, with methods that act on every element in the list. - <p/> - To get an Elements object, use the {@link Element#select(String)} method. - - @author Jonathan Hedley, jonathan@hedley.net */ + * A list of {@link Element Elements}, with methods that act on every element in + * the list. + * <p/> + * To get an Elements object, use the {@link Element#select(String)} method. + * + * @author Jonathan Hedley, jonathan@hedley.net + */ public class Elements implements List<Element>, Cloneable { private List<Element> contents; @@ -26,59 +35,70 @@ public class Elements implements List<Element>, Cloneable { public Elements(Collection<Element> elements) { contents = new ArrayList<Element>(elements); } - + public Elements(List<Element> elements) { contents = elements; } - + public Elements(Element... elements) { this(Arrays.asList(elements)); } - + @Override - public Elements clone() { - List<Element> elements = new ArrayList<Element>(); - - for(Element e : contents) - elements.add(e.clone()); - - - return new Elements(elements); - } - - // attribute methods - /** - Get an attribute value from the first matched element that has the attribute. - @param attributeKey The attribute key. - @return The attribute value from the first matched element that has the attribute.. If no elements were matched (isEmpty() == true), - or if the no elements have the attribute, returns empty string. - @see #hasAttr(String) + public Elements clone() { + List<Element> elements = new ArrayList<Element>(); + + for (Element e : contents) { + elements.add(e.clone()); + } + + return new Elements(elements); + } + + // attribute methods + /** + * Get an attribute value from the first matched element that has the + * attribute. + * + * @param attributeKey + * The attribute key. + * @return The attribute value from the first matched element that has the + * attribute.. 
If no elements were matched (isEmpty() == true), or + * if the no elements have the attribute, returns empty string. + * @see #hasAttr(String) */ public String attr(String attributeKey) { for (Element element : contents) { - if (element.hasAttr(attributeKey)) + if (element.hasAttr(attributeKey)) { return element.attr(attributeKey); + } } return ""; } /** - Checks if any of the matched elements have this attribute set. - @param attributeKey attribute key - @return true if any of the elements have the attribute; false if none do. + * Checks if any of the matched elements have this attribute set. + * + * @param attributeKey + * attribute key + * @return true if any of the elements have the attribute; false if none do. */ public boolean hasAttr(String attributeKey) { for (Element element : contents) { - if (element.hasAttr(attributeKey)) + if (element.hasAttr(attributeKey)) { return true; + } } return false; } /** * Set an attribute on all matched elements. - * @param attributeKey attribute key - * @param attributeValue attribute value + * + * @param attributeKey + * attribute key + * @param attributeValue + * attribute value * @return this */ public Elements attr(String attributeKey, String attributeValue) { @@ -90,7 +110,9 @@ public class Elements implements List<Element>, Cloneable { /** * Remove an attribute from every matched element. - * @param attributeKey The attribute to remove. + * + * @param attributeKey + * The attribute to remove. * @return this (for chaining) */ public Elements removeAttr(String attributeKey) { @@ -101,9 +123,11 @@ public class Elements implements List<Element>, Cloneable { } /** - Add the class name to every matched element's {@code class} attribute. - @param className class name to add - @return this + * Add the class name to every matched element's {@code class} attribute. 
+ * + * @param className + * class name to add + * @return this */ public Elements addClass(String className) { for (Element element : contents) { @@ -113,9 +137,12 @@ public class Elements implements List<Element>, Cloneable { } /** - Remove the class name from every matched element's {@code class} attribute, if present. - @param className class name to remove - @return this + * Remove the class name from every matched element's {@code class} + * attribute, if present. + * + * @param className + * class name to remove + * @return this */ public Elements removeClass(String className) { for (Element element : contents) { @@ -125,9 +152,12 @@ public class Elements implements List<Element>, Cloneable { } /** - Toggle the class name on every matched element's {@code class} attribute. - @param className class name to add if missing, or remove if present, from every element. - @return this + * Toggle the class name on every matched element's {@code class} attribute. + * + * @param className + * class name to add if missing, or remove if present, from every + * element. + * @return this */ public Elements toggleClass(String className) { for (Element element : contents) { @@ -137,69 +167,83 @@ public class Elements implements List<Element>, Cloneable { } /** - Determine if any of the matched elements have this class name set in their {@code class} attribute. - @param className class name to check for - @return true if any do, false if none do + * Determine if any of the matched elements have this class name set in + * their {@code class} attribute. + * + * @param className + * class name to check for + * @return true if any do, false if none do */ public boolean hasClass(String className) { for (Element element : contents) { - if (element.hasClass(className)) + if (element.hasClass(className)) { return true; + } } return false; } - + /** * Get the form element's value of the first matched element. + * * @return The form element's value, or empty if not set. 
* @see Element#val() */ public String val() { - if (size() > 0) + if (size() > 0) { return first().val(); - else + } else { return ""; + } } - + /** * Set the form element's value in each of the matched elements. - * @param value The value to set into each matched element + * + * @param value + * The value to set into each matched element * @return this (for chaining) */ public Elements val(String value) { - for (Element element : contents) + for (Element element : contents) { element.val(value); + } return this; } - + /** * Get the combined text of all the matched elements. * <p> - * Note that it is possible to get repeats if the matched elements contain both parent elements and their own - * children, as the Element.text() method returns the combined text of a parent and all its children. + * Note that it is possible to get repeats if the matched elements contain + * both parent elements and their own children, as the Element.text() method + * returns the combined text of a parent and all its children. + * * @return string of all text: unescaped and no HTML. * @see Element#text() */ public String text() { StringBuilder sb = new StringBuilder(); for (Element element : contents) { - if (sb.length() != 0) + if (sb.length() != 0) { sb.append(" "); + } sb.append(element.text()); } return sb.toString(); } public boolean hasText() { - for (Element element: contents) { - if (element.hasText()) + for (Element element : contents) { + if (element.hasText()) { return true; + } } return false; } - + /** * Get the combined inner HTML of all matched elements. + * * @return string of all element's inner HTML. 
* @see #text() * @see #outerHtml() @@ -207,15 +251,17 @@ public class Elements implements List<Element>, Cloneable { public String html() { StringBuilder sb = new StringBuilder(); for (Element element : contents) { - if (sb.length() != 0) + if (sb.length() != 0) { sb.append("\n"); + } sb.append(element.html()); } return sb.toString(); } - + /** * Get the combined outer HTML of all matched elements. + * * @return string of all element's outer HTML. * @see #text() * @see #html() @@ -223,27 +269,33 @@ public class Elements implements List<Element>, Cloneable { public String outerHtml() { StringBuilder sb = new StringBuilder(); for (Element element : contents) { - if (sb.length() != 0) + if (sb.length() != 0) { sb.append("\n"); + } sb.append(element.outerHtml()); } return sb.toString(); } /** - * Get the combined outer HTML of all matched elements. Alias of {@link #outerHtml()}. + * Get the combined outer HTML of all matched elements. Alias of + * {@link #outerHtml()}. + * * @return string of all element's outer HTML. * @see #text() * @see #html() */ + @Override public String toString() { return outerHtml(); } /** - * Update the tag name of each matched element. For example, to change each {@code <i>} to a {@code <em>}, do - * {@code doc.select("i").tagName("em");} - * @param tagName the new tag name + * Update the tag name of each matched element. For example, to change each + * {@code <i>} to a {@code <em>}, do {@code doc.select("i").tagName("em");} + * + * @param tagName + * the new tag name * @return this, for chaining * @see Element#tagName(String) */ @@ -253,10 +305,12 @@ public class Elements implements List<Element>, Cloneable { } return this; } - + /** * Set the inner HTML of each matched element. - * @param html HTML to parse and set into each matched element. + * + * @param html + * HTML to parse and set into each matched element. 
* @return this, for chaining * @see Element#html(String) */ @@ -266,10 +320,12 @@ public class Elements implements List<Element>, Cloneable { } return this; } - + /** * Add the supplied HTML to the start of each matched element's inner HTML. - * @param html HTML to add inside each element, before the existing HTML + * + * @param html + * HTML to add inside each element, before the existing HTML * @return this, for chaining * @see Element#prepend(String) */ @@ -279,10 +335,12 @@ public class Elements implements List<Element>, Cloneable { } return this; } - + /** * Add the supplied HTML to the end of each matched element's inner HTML. - * @param html HTML to add inside each element, after the existing HTML + * + * @param html + * HTML to add inside each element, after the existing HTML * @return this, for chaining * @see Element#append(String) */ @@ -292,10 +350,12 @@ public class Elements implements List<Element>, Cloneable { } return this; } - + /** * Insert the supplied HTML before each matched element's outer HTML. - * @param html HTML to insert before each element + * + * @param html + * HTML to insert before each element * @return this, for chaining * @see Element#before(String) */ @@ -305,10 +365,12 @@ public class Elements implements List<Element>, Cloneable { } return this; } - + /** * Insert the supplied HTML after each matched element's outer HTML. - * @param html HTML to insert after each element + * + * @param html + * HTML to insert after each element * @return this, for chaining * @see Element#after(String) */ @@ -320,13 +382,16 @@ public class Elements implements List<Element>, Cloneable { } /** - Wrap the supplied HTML around each matched elements. For example, with HTML - {@code <p><b>This</b> is <b>Jsoup</b></p>}, - <code>doc.select("b").wrap("<i></i>");</code> - becomes {@code <p><i><b>This</b></i> is <i><b>jsoup</b></i></p>} - @param html HTML to wrap around each element, e.g. {@code <div class="head"></div>}. Can be arbitrarily deep. 
- @return this (for chaining) - @see Element#wrap + * Wrap the supplied HTML around each matched elements. For example, with + * HTML {@code <p><b>This</b> is <b>Jsoup</b></p>}, + * <code>doc.select("b").wrap("<i></i>");</code> becomes + * {@code <p><i><b>This</b></i> is <i><b>jsoup</b></i></p>} + * + * @param html + * HTML to wrap around each element, e.g. + * {@code <div class="head"></div>}. Can be arbitrarily deep. + * @return this (for chaining) + * @see Element#wrap */ public Elements wrap(String html) { Validate.notEmpty(html); @@ -337,15 +402,18 @@ public class Elements implements List<Element>, Cloneable { } /** - * Removes the matched elements from the DOM, and moves their children up into their parents. This has the effect of - * dropping the elements but keeping their children. + * Removes the matched elements from the DOM, and moves their children up + * into their parents. This has the effect of dropping the elements but + * keeping their children. * <p/> - * This is useful for e.g removing unwanted formatting elements but keeping their contents. + * This is useful for e.g removing unwanted formatting elements but keeping + * their contents. * <p/> - * E.g. with HTML: {@code <div><font>One</font> <font><a href="/">Two</a></font></div>}<br/> + * E.g. with HTML: + * {@code <div><font>One</font> <font><a href="/">Two</a></font></div>}<br/> * {@code doc.select("font").unwrap();}<br/> * HTML = {@code <div>One <a href="/">Two</a></div>} - * + * * @return this (for chaining) * @see Node#unwrap */ @@ -357,12 +425,16 @@ public class Elements implements List<Element>, Cloneable { } /** - * Empty (remove all child nodes from) each matched element. This is similar to setting the inner HTML of each - * element to nothing. + * Empty (remove all child nodes from) each matched element. This is similar + * to setting the inner HTML of each element to nothing. * <p> * E.g. 
HTML: {@code <div><p>Hello <b>there</b></p> <p>now</p></div>}<br> * <code>doc.select("p").empty();</code><br> - * HTML = {@code <div><p></p> <p></p></div>} + * HTML = {@code <div> + * <p></p> + * <p></p> + * </div>} + * * @return this, for chaining * @see Element#empty() * @see #remove() @@ -375,13 +447,16 @@ public class Elements implements List<Element>, Cloneable { } /** - * Remove each matched element from the DOM. This is similar to setting the outer HTML of each element to nothing. + * Remove each matched element from the DOM. This is similar to setting the + * outer HTML of each element to nothing. * <p> * E.g. HTML: {@code <div><p>Hello</p> <p>there</p> <img /></div>}<br> * <code>doc.select("p").remove();</code><br> * HTML = {@code <div> <img /></div>} * <p> - * Note that this method should not be used to clean user-submitted HTML; rather, use {@link org.jsoup.safety.Cleaner} to clean HTML. + * Note that this method should not be used to clean user-submitted HTML; + * rather, use {@link org.jsoup.safety.Cleaner} to clean HTML. + * * @return this, for chaining * @see Element#empty() * @see #empty() @@ -392,12 +467,14 @@ public class Elements implements List<Element>, Cloneable { } return this; } - + // filters - + /** * Find matching elements within this element list. - * @param query A {@link Selector} query + * + * @param query + * A {@link Selector} query * @return the filtered list of elements, or an empty list if none match. 
*/ public Elements select(String query) { @@ -411,28 +488,37 @@ public class Elements implements List<Element>, Cloneable { * <code>Elements divs = doc.select("div").not("#logo");</code><br> * Result: {@code divs: [<div>Two</div>]} * <p> - * @param query the selector query whose results should be removed from these elements + * + * @param query + * the selector query whose results should be removed from these + * elements * @return a new elements list that contains only the filtered results */ public Elements not(String query) { Elements out = Selector.select(query, this); return Selector.filterOut(this, out); } - + /** * Get the <i>nth</i> matched element as an Elements object. * <p> * See also {@link #get(int)} to retrieve an Element. - * @param index the (zero-based) index of the element in the list to retain - * @return Elements containing only the specified element, or, if that element did not exist, an empty list. + * + * @param index + * the (zero-based) index of the element in the list to retain + * @return Elements containing only the specified element, or, if that + * element did not exist, an empty list. */ public Elements eq(int index) { - return contents.size() > index ? new Elements(get(index)) : new Elements(); + return contents.size() > index ? new Elements(get(index)) + : new Elements(); } - + /** * Test if any of the matched elements match the supplied query. - * @param query A selector + * + * @param query + * A selector * @return true if at least one element in the list matches the query. */ public boolean is(String query) { @@ -442,11 +528,12 @@ public class Elements implements List<Element>, Cloneable { /** * Get all of the parents and ancestor elements of the matched elements. 
+ * * @return all of the parents and ancestor elements of the matched elements */ public Elements parents() { HashSet<Element> combo = new LinkedHashSet<Element>(); - for (Element e: contents) { + for (Element e : contents) { combo.addAll(e.parents()); } return new Elements(combo); @@ -454,16 +541,20 @@ public class Elements implements List<Element>, Cloneable { // list-like methods /** - Get the first matched element. - @return The first matched element, or <code>null</code> if contents is empty; + * Get the first matched element. + * + * @return The first matched element, or <code>null</code> if contents is + * empty; */ public Element first() { return contents.isEmpty() ? null : contents.get(0); } /** - Get the last matched element. - @return The last matched element, or <code>null</code> if contents is empty. + * Get the last matched element. + * + * @return The last matched element, or <code>null</code> if contents is + * empty. */ public Element last() { return contents.isEmpty() ? null : contents.get(contents.size() - 1); @@ -471,66 +562,143 @@ public class Elements implements List<Element>, Cloneable { /** * Perform a depth-first traversal on each of the selected elements. 
- * @param nodeVisitor the visitor callbacks to perform on each node + * + * @param nodeVisitor + * the visitor callbacks to perform on each node * @return this, for chaining */ public Elements traverse(NodeVisitor nodeVisitor) { Validate.notNull(nodeVisitor); NodeTraversor traversor = new NodeTraversor(nodeVisitor); - for (Element el: contents) { + for (Element el : contents) { traversor.traverse(el); } return this; } // implements List<Element> delegates: - public int size() {return contents.size();} + @Override + public int size() { + return contents.size(); + } - public boolean isEmpty() {return contents.isEmpty();} + @Override + public boolean isEmpty() { + return contents.isEmpty(); + } - public boolean contains(Object o) {return contents.contains(o);} + @Override + public boolean contains(Object o) { + return contents.contains(o); + } - public Iterator<Element> iterator() {return contents.iterator();} + @Override + public Iterator<Element> iterator() { + return contents.iterator(); + } - public Object[] toArray() {return contents.toArray();} + @Override + public Object[] toArray() { + return contents.toArray(); + } - public <T> T[] toArray(T[] a) {return contents.toArray(a);} + @Override + public <T> T[] toArray(T[] a) { + return contents.toArray(a); + } - public boolean add(Element element) {return contents.add(element);} + @Override + public boolean add(Element element) { + return contents.add(element); + } - public boolean remove(Object o) {return contents.remove(o);} + @Override + public boolean remove(Object o) { + return contents.remove(o); + } - public boolean containsAll(Collection<?> c) {return contents.containsAll(c);} + @Override + public boolean containsAll(Collection<?> c) { + return contents.containsAll(c); + } - public boolean addAll(Collection<? extends Element> c) {return contents.addAll(c);} + @Override + public boolean addAll(Collection<? extends Element> c) { + return contents.addAll(c); + } - public boolean addAll(int index, Collection<? 
extends Element> c) {return contents.addAll(index, c);} + @Override + public boolean addAll(int index, Collection<? extends Element> c) { + return contents.addAll(index, c); + } - public boolean removeAll(Collection<?> c) {return contents.removeAll(c);} + @Override + public boolean removeAll(Collection<?> c) { + return contents.removeAll(c); + } - public boolean retainAll(Collection<?> c) {return contents.retainAll(c);} + @Override + public boolean retainAll(Collection<?> c) { + return contents.retainAll(c); + } - public void clear() {contents.clear();} + @Override + public void clear() { + contents.clear(); + } - public boolean equals(Object o) {return contents.equals(o);} + @Override + public boolean equals(Object o) { + return contents.equals(o); + } - public int hashCode() {return contents.hashCode();} + @Override + public int hashCode() { + return contents.hashCode(); + } - public Element get(int index) {return contents.get(index);} + @Override + public Element get(int index) { + return contents.get(index); + } - public Element set(int index, Element element) {return contents.set(index, element);} + @Override + public Element set(int index, Element element) { + return contents.set(index, element); + } - public void add(int index, Element element) {contents.add(index, element);} + @Override + public void add(int index, Element element) { + contents.add(index, element); + } - public Element remove(int index) {return contents.remove(index);} + @Override + public Element remove(int index) { + return contents.remove(index); + } - public int indexOf(Object o) {return contents.indexOf(o);} + @Override + public int indexOf(Object o) { + return contents.indexOf(o); + } - public int lastIndexOf(Object o) {return contents.lastIndexOf(o);} + @Override + public int lastIndexOf(Object o) { + return contents.lastIndexOf(o); + } - public ListIterator<Element> listIterator() {return contents.listIterator();} + @Override + public ListIterator<Element> listIterator() { + return 
contents.listIterator(); + } - public ListIterator<Element> listIterator(int index) {return contents.listIterator(index);} + @Override + public ListIterator<Element> listIterator(int index) { + return contents.listIterator(index); + } - public List<Element> subList(int fromIndex, int toIndex) {return contents.subList(fromIndex, toIndex);} + @Override + public List<Element> subList(int fromIndex, int toIndex) { + return contents.subList(fromIndex, toIndex); + } } diff --git a/server/src/org/jsoup/select/Evaluator.java b/server/src/org/jsoup/select/Evaluator.java index bd0cee481d..5dd4c91616 100644 --- a/server/src/org/jsoup/select/Evaluator.java +++ b/server/src/org/jsoup/select/Evaluator.java @@ -1,12 +1,11 @@ package org.jsoup.select; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Element; - import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Element; /** * Evaluates that an element matches the selector. @@ -17,9 +16,11 @@ public abstract class Evaluator { /** * Test if the element meets the evaluator's requirements. 
- * - * @param root UI of the matching subtree - * @param element tested element + * + * @param root + * UI of the matching subtree + * @param element + * tested element */ public abstract boolean matches(Element root, Element element); @@ -122,10 +123,12 @@ public abstract class Evaluator { @Override public boolean matches(Element root, Element element) { - List<org.jsoup.nodes.Attribute> values = element.attributes().asList(); + List<org.jsoup.nodes.Attribute> values = element.attributes() + .asList(); for (org.jsoup.nodes.Attribute attribute : values) { - if (attribute.getKey().startsWith(keyPrefix)) + if (attribute.getKey().startsWith(keyPrefix)) { return true; + } } return false; } @@ -147,7 +150,8 @@ public abstract class Evaluator { @Override public boolean matches(Element root, Element element) { - return element.hasAttr(key) && value.equalsIgnoreCase(element.attr(key)); + return element.hasAttr(key) + && value.equalsIgnoreCase(element.attr(key)); } @Override @@ -180,14 +184,20 @@ public abstract class Evaluator { /** * Evaluator for attribute name/value matching (value prefix) */ - public static final class AttributeWithValueStarting extends AttributeKeyPair { + public static final class AttributeWithValueStarting extends + AttributeKeyPair { public AttributeWithValueStarting(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { - return element.hasAttr(key) && element.attr(key).toLowerCase().startsWith(value); // value is lower case already + return element.hasAttr(key) + && element.attr(key).toLowerCase().startsWith(value); // value + // is + // lower + // case + // already } @Override @@ -207,7 +217,11 @@ public abstract class Evaluator { @Override public boolean matches(Element root, Element element) { - return element.hasAttr(key) && element.attr(key).toLowerCase().endsWith(value); // value is lower case + return element.hasAttr(key) + && element.attr(key).toLowerCase().endsWith(value); // 
value + // is + // lower + // case } @Override @@ -220,14 +234,19 @@ public abstract class Evaluator { /** * Evaluator for attribute name/value matching (value containing) */ - public static final class AttributeWithValueContaining extends AttributeKeyPair { + public static final class AttributeWithValueContaining extends + AttributeKeyPair { public AttributeWithValueContaining(String key, String value) { super(key, value); } @Override public boolean matches(Element root, Element element) { - return element.hasAttr(key) && element.attr(key).toLowerCase().contains(value); // value is lower case + return element.hasAttr(key) + && element.attr(key).toLowerCase().contains(value); // value + // is + // lower + // case } @Override @@ -251,7 +270,8 @@ public abstract class Evaluator { @Override public boolean matches(Element root, Element element) { - return element.hasAttr(key) && pattern.matcher(element.attr(key)).find(); + return element.hasAttr(key) + && pattern.matcher(element.attr(key)).find(); } @Override @@ -355,7 +375,7 @@ public abstract class Evaluator { /** * Abstract evaluator for sibling index matching - * + * * @author ant */ public abstract static class IndexEvaluator extends Evaluator { diff --git a/server/src/org/jsoup/select/NodeTraversor.java b/server/src/org/jsoup/select/NodeTraversor.java index 9bb081e56c..f94a7762fc 100644 --- a/server/src/org/jsoup/select/NodeTraversor.java +++ b/server/src/org/jsoup/select/NodeTraversor.java @@ -3,16 +3,21 @@ package org.jsoup.select; import org.jsoup.nodes.Node; /** - * Depth-first node traversor. Use to iterate through all nodes under and including the specified root node. + * Depth-first node traversor. Use to iterate through all nodes under and + * including the specified root node. * <p/> - * This implementation does not use recursion, so a deep DOM does not risk blowing the stack. + * This implementation does not use recursion, so a deep DOM does not risk + * blowing the stack. 
*/ public class NodeTraversor { private NodeVisitor visitor; /** * Create a new traversor. - * @param visitor a class implementing the {@link NodeVisitor} interface, to be called when visiting each node. + * + * @param visitor + * a class implementing the {@link NodeVisitor} interface, to be + * called when visiting each node. */ public NodeTraversor(NodeVisitor visitor) { this.visitor = visitor; @@ -20,12 +25,14 @@ public class NodeTraversor { /** * Start a depth-first traverse of the root and all of its descendants. - * @param root the root node point to traverse. + * + * @param root + * the root node point to traverse. */ public void traverse(Node root) { Node node = root; int depth = 0; - + while (node != null) { visitor.head(node, depth); if (node.childNodes().size() > 0) { @@ -38,8 +45,9 @@ public class NodeTraversor { depth--; } visitor.tail(node, depth); - if (node == root) + if (node == root) { break; + } node = node.nextSibling(); } } diff --git a/server/src/org/jsoup/select/NodeVisitor.java b/server/src/org/jsoup/select/NodeVisitor.java index 20112e8d29..9e827d6c55 100644 --- a/server/src/org/jsoup/select/NodeVisitor.java +++ b/server/src/org/jsoup/select/NodeVisitor.java @@ -3,28 +3,37 @@ package org.jsoup.select; import org.jsoup.nodes.Node; /** - * Node visitor interface. Provide an implementing class to {@link NodeTraversor} to iterate through nodes. + * Node visitor interface. Provide an implementing class to + * {@link NodeTraversor} to iterate through nodes. * <p/> - * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first - * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to + * This interface provides two methods, {@code head} and {@code tail}. The head + * method is called when the node is first seen, and the tail method when all of + * the node's children have been visited. 
As an example, head can be used to * create a start tag for a node, and tail to create the end tag. */ public interface NodeVisitor { /** * Callback for when a node is first visited. - * - * @param node the node being visited. - * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node - * of that will have depth 1. + * + * @param node + * the node being visited. + * @param depth + * the depth of the node, relative to the root node. E.g., the + * root node has depth 0, and a child node of that will have + * depth 1. */ public void head(Node node, int depth); /** - * Callback for when a node is last visited, after all of its descendants have been visited. - * - * @param node the node being visited. - * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node - * of that will have depth 1. + * Callback for when a node is last visited, after all of its descendants + * have been visited. + * + * @param node + * the node being visited. + * @param depth + * the depth of the node, relative to the root node. E.g., the + * root node has depth 0, and a child node of that will have + * depth 1. */ public void tail(Node node, int depth); } diff --git a/server/src/org/jsoup/select/QueryParser.java b/server/src/org/jsoup/select/QueryParser.java index d3cc36f91c..7a04899d82 100644 --- a/server/src/org/jsoup/select/QueryParser.java +++ b/server/src/org/jsoup/select/QueryParser.java @@ -12,7 +12,7 @@ import org.jsoup.parser.TokenQueue; * Parses a CSS selector into an Evaluator tree. */ class QueryParser { - private final static String[] combinators = {",", ">", "+", "~", " "}; + private final static String[] combinators = { ",", ">", "+", "~", " " }; private TokenQueue tq; private String query; @@ -20,16 +20,20 @@ class QueryParser { /** * Create a new QueryParser. 
- * @param query CSS query + * + * @param query + * CSS query */ private QueryParser(String query) { this.query = query; - this.tq = new TokenQueue(query); + tq = new TokenQueue(query); } /** * Parse a CSS query into an Evaluator. - * @param query CSS query + * + * @param query + * CSS query * @return Evaluator */ public static Evaluator parse(String query) { @@ -39,12 +43,14 @@ class QueryParser { /** * Parse the query + * * @return Evaluator */ Evaluator parse() { tq.consumeWhitespace(); - if (tq.matchesAny(combinators)) { // if starts with a combinator, use root as elements + if (tq.matchesAny(combinators)) { // if starts with a combinator, use + // root as elements evals.add(new StructuralEvaluator.Root()); combinator(tq.consume()); } else { @@ -64,8 +70,9 @@ class QueryParser { } } - if (evals.size() == 1) + if (evals.size() == 1) { return evals.get(0); + } return new CombiningEvaluator.And(evals); } @@ -75,33 +82,41 @@ class QueryParser { String subQuery = consumeSubQuery(); // support multi > childs Evaluator rootEval; // the new topmost evaluator - Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or. - Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator + Evaluator currentEval; // the evaluator the new eval will be combined + // to. could be root, or rightmost or. 
+ Evaluator newEval = parse(subQuery); // the evaluator to add into target + // evaluator boolean replaceRightMost = false; if (evals.size() == 1) { rootEval = currentEval = evals.get(0); // make sure OR (,) has precedence: if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') { - currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator(); + currentEval = ((CombiningEvaluator.Or) currentEval) + .rightMostEvaluator(); replaceRightMost = true; } - } - else { + } else { rootEval = currentEval = new CombiningEvaluator.And(evals); } evals.clear(); - // for most combinators: change the current eval into an AND of the current eval and the new eval - if (combinator == '>') - currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediateParent(currentEval)); - else if (combinator == ' ') - currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.Parent(currentEval)); - else if (combinator == '+') - currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.ImmediatePreviousSibling(currentEval)); - else if (combinator == '~') - currentEval = new CombiningEvaluator.And(newEval, new StructuralEvaluator.PreviousSibling(currentEval)); - else if (combinator == ',') { // group or. + // for most combinators: change the current eval into an AND of the + // current eval and the new eval + if (combinator == '>') { + currentEval = new CombiningEvaluator.And(newEval, + new StructuralEvaluator.ImmediateParent(currentEval)); + } else if (combinator == ' ') { + currentEval = new CombiningEvaluator.And(newEval, + new StructuralEvaluator.Parent(currentEval)); + } else if (combinator == '+') { + currentEval = new CombiningEvaluator.And(newEval, + new StructuralEvaluator.ImmediatePreviousSibling( + currentEval)); + } else if (combinator == '~') { + currentEval = new CombiningEvaluator.And(newEval, + new StructuralEvaluator.PreviousSibling(currentEval)); + } else if (combinator == ',') { // group or. 
CombiningEvaluator.Or or; if (currentEval instanceof CombiningEvaluator.Or) { or = (CombiningEvaluator.Or) currentEval; @@ -112,62 +127,70 @@ class QueryParser { or.add(newEval); } currentEval = or; + } else { + throw new Selector.SelectorParseException("Unknown combinator: " + + combinator); } - else - throw new Selector.SelectorParseException("Unknown combinator: " + combinator); - if (replaceRightMost) - ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval); - else rootEval = currentEval; + if (replaceRightMost) { + ((CombiningEvaluator.Or) rootEval) + .replaceRightMostEvaluator(currentEval); + } else { + rootEval = currentEval; + } evals.add(rootEval); } private String consumeSubQuery() { StringBuilder sq = new StringBuilder(); while (!tq.isEmpty()) { - if (tq.matches("(")) + if (tq.matches("(")) { sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); - else if (tq.matches("[")) + } else if (tq.matches("[")) { sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); - else if (tq.matchesAny(combinators)) + } else if (tq.matchesAny(combinators)) { break; - else + } else { sq.append(tq.consume()); + } } return sq.toString(); } private void findElements() { - if (tq.matchChomp("#")) + if (tq.matchChomp("#")) { byId(); - else if (tq.matchChomp(".")) + } else if (tq.matchChomp(".")) { byClass(); - else if (tq.matchesWord()) + } else if (tq.matchesWord()) { byTag(); - else if (tq.matches("[")) + } else if (tq.matches("[")) { byAttribute(); - else if (tq.matchChomp("*")) + } else if (tq.matchChomp("*")) { allElements(); - else if (tq.matchChomp(":lt(")) + } else if (tq.matchChomp(":lt(")) { indexLessThan(); - else if (tq.matchChomp(":gt(")) + } else if (tq.matchChomp(":gt(")) { indexGreaterThan(); - else if (tq.matchChomp(":eq(")) + } else if (tq.matchChomp(":eq(")) { indexEquals(); - else if (tq.matches(":has(")) + } else if (tq.matches(":has(")) { has(); - else if (tq.matches(":contains(")) + } else if (tq.matches(":contains(")) { 
contains(false); - else if (tq.matches(":containsOwn(")) + } else if (tq.matches(":containsOwn(")) { contains(true); - else if (tq.matches(":matches(")) + } else if (tq.matches(":matches(")) { matches(false); - else if (tq.matches(":matchesOwn(")) + } else if (tq.matches(":matchesOwn(")) { matches(true); - else if (tq.matches(":not(")) + } else if (tq.matches(":not(")) { not(); - else // unhandled - throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); + } else { + throw new Selector.SelectorParseException( + "Could not parse query '%s': unexpected token at '%s'", + query, tq.remainder()); + } } @@ -187,44 +210,58 @@ class QueryParser { String tagName = tq.consumeElementSelector(); Validate.notEmpty(tagName); - // namespaces: if element name is "abc:def", selector must be "abc|def", so flip: - if (tagName.contains("|")) + // namespaces: if element name is "abc:def", selector must be "abc|def", + // so flip: + if (tagName.contains("|")) { tagName = tagName.replace("|", ":"); + } evals.add(new Evaluator.Tag(tagName.trim().toLowerCase())); } private void byAttribute() { - TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue - String key = cq.consumeToAny("=", "!=", "^=", "$=", "*=", "~="); // eq, not, start, end, contain, match, (no val) + TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content + // queue + String key = cq.consumeToAny("=", "!=", "^=", "$=", "*=", "~="); // eq, + // not, + // start, + // end, + // contain, + // match, + // (no + // val) Validate.notEmpty(key); cq.consumeWhitespace(); if (cq.isEmpty()) { - if (key.startsWith("^")) + if (key.startsWith("^")) { evals.add(new Evaluator.AttributeStarting(key.substring(1))); - else + } else { evals.add(new Evaluator.Attribute(key)); + } } else { - if (cq.matchChomp("=")) + if (cq.matchChomp("=")) { evals.add(new Evaluator.AttributeWithValue(key, cq.remainder())); - - else if (cq.matchChomp("!=")) - 
evals.add(new Evaluator.AttributeWithValueNot(key, cq.remainder())); - - else if (cq.matchChomp("^=")) - evals.add(new Evaluator.AttributeWithValueStarting(key, cq.remainder())); - - else if (cq.matchChomp("$=")) - evals.add(new Evaluator.AttributeWithValueEnding(key, cq.remainder())); - - else if (cq.matchChomp("*=")) - evals.add(new Evaluator.AttributeWithValueContaining(key, cq.remainder())); - - else if (cq.matchChomp("~=")) - evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()))); - else - throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder()); + } else if (cq.matchChomp("!=")) { + evals.add(new Evaluator.AttributeWithValueNot(key, cq + .remainder())); + } else if (cq.matchChomp("^=")) { + evals.add(new Evaluator.AttributeWithValueStarting(key, cq + .remainder())); + } else if (cq.matchChomp("$=")) { + evals.add(new Evaluator.AttributeWithValueEnding(key, cq + .remainder())); + } else if (cq.matchChomp("*=")) { + evals.add(new Evaluator.AttributeWithValueContaining(key, cq + .remainder())); + } else if (cq.matchChomp("~=")) { + evals.add(new Evaluator.AttributeWithValueMatching(key, Pattern + .compile(cq.remainder()))); + } else { + throw new Selector.SelectorParseException( + "Could not parse attribute query '%s': unexpected token at '%s'", + query, cq.remainder()); + } } } @@ -264,29 +301,33 @@ class QueryParser { tq.consume(own ? ":containsOwn" : ":contains"); String searchText = TokenQueue.unescape(tq.chompBalanced('(', ')')); Validate.notEmpty(searchText, ":contains(text) query must not be empty"); - if (own) + if (own) { evals.add(new Evaluator.ContainsOwnText(searchText)); - else + } else { evals.add(new Evaluator.ContainsText(searchText)); + } } // :matches(regex), matchesOwn(regex) private void matches(boolean own) { tq.consume(own ? 
":matchesOwn" : ":matches"); - String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex bits will be escaped + String regex = tq.chompBalanced('(', ')'); // don't unescape, as regex + // bits will be escaped Validate.notEmpty(regex, ":matches(regex) query must not be empty"); - if (own) + if (own) { evals.add(new Evaluator.MatchesOwn(Pattern.compile(regex))); - else + } else { evals.add(new Evaluator.Matches(Pattern.compile(regex))); + } } // :not(selector) private void not() { tq.consume(":not"); String subQuery = tq.chompBalanced('(', ')'); - Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty"); + Validate.notEmpty(subQuery, + ":not(selector) subselect must not be empty"); evals.add(new StructuralEvaluator.Not(parse(subQuery))); } diff --git a/server/src/org/jsoup/select/Selector.java b/server/src/org/jsoup/select/Selector.java index 8fc6286798..d5ea6f2dc9 100644 --- a/server/src/org/jsoup/select/Selector.java +++ b/server/src/org/jsoup/select/Selector.java @@ -1,55 +1,201 @@ package org.jsoup.select; -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Element; - import java.util.Collection; import java.util.LinkedHashSet; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Element; + /** * CSS-like element selector, that finds elements matching a query. * <p/> * <h2>Selector syntax</h2> - * A selector is a chain of simple selectors, separated by combinators. Selectors are case insensitive (including against - * elements, attributes, and attribute values). + * A selector is a chain of simple selectors, separated by combinators. + * Selectors are case insensitive (including against elements, attributes, and + * attribute values). * <p/> - * The universal selector (*) is implicit when no element selector is supplied (i.e. {@code *.header} and {@code .header} - * is equivalent). + * The universal selector (*) is implicit when no element selector is supplied + * (i.e. 
{@code *.header} and {@code .header} is equivalent). * <p/> * <table> - * <tr><th>Pattern</th><th>Matches</th><th>Example</th></tr> - * <tr><td><code>*</code></td><td>any element</td><td><code>*</code></td></tr> - * <tr><td><code>tag</code></td><td>elements with the given tag name</td><td><code>div</code></td></tr> - * <tr><td><code>ns|E</code></td><td>elements of type E in the namespace <i>ns</i></td><td><code>fb|name</code> finds <code><fb:name></code> elements</td></tr> - * <tr><td><code>#id</code></td><td>elements with attribute ID of "id"</td><td><code>div#wrap</code>, <code>#logo</code></td></tr> - * <tr><td><code>.class</code></td><td>elements with a class name of "class"</td><td><code>div.left</code>, <code>.result</code></td></tr> - * <tr><td><code>[attr]</code></td><td>elements with an attribute named "attr" (with any value)</td><td><code>a[href]</code>, <code>[title]</code></td></tr> - * <tr><td><code>[^attrPrefix]</code></td><td>elements with an attribute name starting with "attrPrefix". 
Use to find elements with HTML5 datasets</td><td><code>[^data-]</code>, <code>div[^data-]</code></td></tr> - * <tr><td><code>[attr=val]</code></td><td>elements with an attribute named "attr", and value equal to "val"</td><td><code>img[width=500]</code>, <code>a[rel=nofollow]</code></td></tr> - * <tr><td><code>[attr^=valPrefix]</code></td><td>elements with an attribute named "attr", and value starting with "valPrefix"</td><td><code>a[href^=http:]</code></code></td></tr> - * <tr><td><code>[attr$=valSuffix]</code></td><td>elements with an attribute named "attr", and value ending with "valSuffix"</td><td><code>img[src$=.png]</code></td></tr> - * <tr><td><code>[attr*=valContaining]</code></td><td>elements with an attribute named "attr", and value containing "valContaining"</td><td><code>a[href*=/search/]</code></td></tr> - * <tr><td><code>[attr~=<em>regex</em>]</code></td><td>elements with an attribute named "attr", and value matching the regular expression</td><td><code>img[src~=(?i)\\.(png|jpe?g)]</code></td></tr> - * <tr><td></td><td>The above may be combined in any order</td><td><code>div.header[title]</code></td></tr> - * <tr><td><td colspan="3"><h3>Combinators</h3></td></tr> - * <tr><td><code>E F</code></td><td>an F element descended from an E element</td><td><code>div a</code>, <code>.logo h1</code></td></tr> - * <tr><td><code>E > F</code></td><td>an F direct child of E</td><td><code>ol > li</code></td></tr> - * <tr><td><code>E + F</code></td><td>an F element immediately preceded by sibling E</td><td><code>li + li</code>, <code>div.head + div</code></td></tr> - * <tr><td><code>E ~ F</code></td><td>an F element preceded by sibling E</td><td><code>h1 ~ p</code></td></tr> - * <tr><td><code>E, F, G</code></td><td>all matching elements E, F, or G</td><td><code>a[href], div, h3</code></td></tr> - * <tr><td><td colspan="3"><h3>Pseudo selectors</h3></td></tr> - * <tr><td><code>:lt(<em>n</em>)</code></td><td>elements whose sibling index is less than 
<em>n</em></td><td><code>td:lt(3)</code> finds the first 2 cells of each row</td></tr> - * <tr><td><code>:gt(<em>n</em>)</code></td><td>elements whose sibling index is greater than <em>n</em></td><td><code>td:gt(1)</code> finds cells after skipping the first two</td></tr> - * <tr><td><code>:eq(<em>n</em>)</code></td><td>elements whose sibling index is equal to <em>n</em></td><td><code>td:eq(0)</code> finds the first cell of each row</td></tr> - * <tr><td><code>:has(<em>selector</em>)</code></td><td>elements that contains at least one element matching the <em>selector</em></td><td><code>div:has(p)</code> finds divs that contain p elements </td></tr> - * <tr><td><code>:not(<em>selector</em>)</code></td><td>elements that do not match the <em>selector</em>. See also {@link Elements#not(String)}</td><td><code>div:not(.logo)</code> finds all divs that do not have the "logo" class.<br /><code>div:not(:has(div))</code> finds divs that do not contain divs.</code></td></tr> - * <tr><td><code>:contains(<em>text</em>)</code></td><td>elements that contains the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants.</td><td><code>p:contains(jsoup)</code> finds p elements containing the text "jsoup".</td></tr> - * <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements whose text matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr> - * <tr><td><code>:containsOwn(<em>text</em>)</code></td><td>elements that directly contains the specified text. The search is case insensitive. 
The text must appear in the found element, not any of its descendants.</td><td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup".</td></tr> - * <tr><td><code>:matchesOwn(<em>regex</em>)</code></td><td>elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the text, case insensitively.</td></tr> - * <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr> + * <tr> + * <th>Pattern</th> + * <th>Matches</th> + * <th>Example</th> + * </tr> + * <tr> + * <td><code>*</code></td> + * <td>any element</td> + * <td><code>*</code></td> + * </tr> + * <tr> + * <td><code>tag</code></td> + * <td>elements with the given tag name</td> + * <td><code>div</code></td> + * </tr> + * <tr> + * <td><code>ns|E</code></td> + * <td>elements of type E in the namespace <i>ns</i></td> + * <td><code>fb|name</code> finds <code><fb:name></code> elements</td> + * </tr> + * <tr> + * <td><code>#id</code></td> + * <td>elements with attribute ID of "id"</td> + * <td><code>div#wrap</code>, <code>#logo</code></td> + * </tr> + * <tr> + * <td><code>.class</code></td> + * <td>elements with a class name of "class"</td> + * <td><code>div.left</code>, <code>.result</code></td> + * </tr> + * <tr> + * <td><code>[attr]</code></td> + * <td>elements with an attribute named "attr" (with any value)</td> + * <td><code>a[href]</code>, <code>[title]</code></td> + * </tr> + * <tr> + * <td><code>[^attrPrefix]</code></td> + * <td>elements with an attribute name starting with "attrPrefix". 
Use to find + * elements with HTML5 datasets</td> + * <td><code>[^data-]</code>, <code>div[^data-]</code></td> + * </tr> + * <tr> + * <td><code>[attr=val]</code></td> + * <td>elements with an attribute named "attr", and value equal to "val"</td> + * <td><code>img[width=500]</code>, <code>a[rel=nofollow]</code></td> + * </tr> + * <tr> + * <td><code>[attr^=valPrefix]</code></td> + * <td>elements with an attribute named "attr", and value starting with + * "valPrefix"</td> + * <td><code>a[href^=http:]</code></code></td> + * </tr> + * <tr> + * <td><code>[attr$=valSuffix]</code></td> + * <td>elements with an attribute named "attr", and value ending with + * "valSuffix"</td> + * <td><code>img[src$=.png]</code></td> + * </tr> + * <tr> + * <td><code>[attr*=valContaining]</code></td> + * <td>elements with an attribute named "attr", and value containing + * "valContaining"</td> + * <td><code>a[href*=/search/]</code></td> + * </tr> + * <tr> + * <td><code>[attr~=<em>regex</em>]</code></td> + * <td>elements with an attribute named "attr", and value matching the regular + * expression</td> + * <td><code>img[src~=(?i)\\.(png|jpe?g)]</code></td> + * </tr> + * <tr> + * <td></td> + * <td>The above may be combined in any order</td> + * <td><code>div.header[title]</code></td> + * </tr> + * <tr> + * <td> + * <td colspan="3"> + * <h3>Combinators</h3></td> + * </tr> + * <tr> + * <td><code>E F</code></td> + * <td>an F element descended from an E element</td> + * <td><code>div a</code>, <code>.logo h1</code></td> + * </tr> + * <tr> + * <td><code>E > F</code></td> + * <td>an F direct child of E</td> + * <td><code>ol > li</code></td> + * </tr> + * <tr> + * <td><code>E + F</code></td> + * <td>an F element immediately preceded by sibling E</td> + * <td><code>li + li</code>, <code>div.head + div</code></td> + * </tr> + * <tr> + * <td><code>E ~ F</code></td> + * <td>an F element preceded by sibling E</td> + * <td><code>h1 ~ p</code></td> + * </tr> + * <tr> + * <td><code>E, F, G</code></td> + * 
<td>all matching elements E, F, or G</td> + * <td><code>a[href], div, h3</code></td> + * </tr> + * <tr> + * <td> + * <td colspan="3"> + * <h3>Pseudo selectors</h3></td> + * </tr> + * <tr> + * <td><code>:lt(<em>n</em>)</code></td> + * <td>elements whose sibling index is less than <em>n</em></td> + * <td><code>td:lt(3)</code> finds the first 2 cells of each row</td> + * </tr> + * <tr> + * <td><code>:gt(<em>n</em>)</code></td> + * <td>elements whose sibling index is greater than <em>n</em></td> + * <td><code>td:gt(1)</code> finds cells after skipping the first two</td> + * </tr> + * <tr> + * <td><code>:eq(<em>n</em>)</code></td> + * <td>elements whose sibling index is equal to <em>n</em></td> + * <td><code>td:eq(0)</code> finds the first cell of each row</td> + * </tr> + * <tr> + * <td><code>:has(<em>selector</em>)</code></td> + * <td>elements that contains at least one element matching the + * <em>selector</em></td> + * <td><code>div:has(p)</code> finds divs that contain p elements</td> + * </tr> + * <tr> + * <td><code>:not(<em>selector</em>)</code></td> + * <td>elements that do not match the <em>selector</em>. See also + * {@link Elements#not(String)}</td> + * <td><code>div:not(.logo)</code> finds all divs that do not have the "logo" + * class.<br /> + * <code>div:not(:has(div))</code> finds divs that do not contain divs.</code></td> + * </tr> + * <tr> + * <td><code>:contains(<em>text</em>)</code></td> + * <td>elements that contains the specified text. The search is case + * insensitive. The text may appear in the found element, or any of its + * descendants.</td> + * <td><code>p:contains(jsoup)</code> finds p elements containing the text + * "jsoup".</td> + * </tr> + * <tr> + * <td><code>:matches(<em>regex</em>)</code></td> + * <td>elements whose text matches the specified regular expression. The text + * may appear in the found element, or any of its descendants.</td> + * <td><code>td:matches(\\d+)</code> finds table cells containing digits. 
+ * <code>div:matches((?i)login)</code> finds divs containing the text, case + * insensitively.</td> + * </tr> + * <tr> + * <td><code>:containsOwn(<em>text</em>)</code></td> + * <td>elements that directly contain the specified text. The search is case + * insensitive. The text must appear in the found element, not any of its + * descendants.</td> + * <td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup". + * </td> + * </tr> + * <tr> + * <td><code>:matchesOwn(<em>regex</em>)</code></td> + * <td>elements whose own text matches the specified regular expression. The + * text must appear in the found element, not any of its descendants.</td> + * <td><code>td:matchesOwn(\\d+)</code> finds table cells directly containing + * digits. <code>div:matchesOwn((?i)login)</code> finds divs containing the + * text, case insensitively.</td> + * </tr> + * <tr> + * <td></td> + * <td>The above may be combined in any order and with other selectors</td> + * <td><code>.light:contains(name):eq(0)</code></td> + * </tr> * </table> - * + * * @author Jonathan Hedley, jonathan@hedley.net * @see Element#select(String) */ @@ -63,16 +209,18 @@ public class Selector { Validate.notEmpty(query); Validate.notNull(root); - this.evaluator = QueryParser.parse(query); + evaluator = QueryParser.parse(query); this.root = root; } /** * Find elements matching selector. - * - * @param query CSS selector - * @param root root element to descend into + * + * @param query + * CSS selector + * @param root + * root element to descend into * @return matching elements, empty if not */ public static Elements select(String query, Element root) { @@ -81,9 +229,11 @@ public class Selector { /** * Find elements matching selector. 
- * - * @param query CSS selector - * @param roots root elements to descend into + * + * @param query + * CSS selector + * @param roots + * root elements to descend into * @return matching elements, empty if not */ public static Elements select(String query, Iterable<Element> roots) { @@ -102,7 +252,8 @@ public class Selector { } // exclude set. package open so that Elements can implement .not() selector. - static Elements filterOut(Collection<Element> elements, Collection<Element> outs) { + static Elements filterOut(Collection<Element> elements, + Collection<Element> outs) { Elements output = new Elements(); for (Element el : elements) { boolean found = false; @@ -112,8 +263,9 @@ public class Selector { break; } } - if (!found) + if (!found) { output.add(el); + } } return output; } diff --git a/server/src/org/jsoup/select/StructuralEvaluator.java b/server/src/org/jsoup/select/StructuralEvaluator.java index 69e8a62e58..dea2413fb8 100644 --- a/server/src/org/jsoup/select/StructuralEvaluator.java +++ b/server/src/org/jsoup/select/StructuralEvaluator.java @@ -9,6 +9,7 @@ abstract class StructuralEvaluator extends Evaluator { Evaluator evaluator; static class Root extends Evaluator { + @Override public boolean matches(Element root, Element element) { return root == element; } @@ -19,14 +20,17 @@ abstract class StructuralEvaluator extends Evaluator { this.evaluator = evaluator; } + @Override public boolean matches(Element root, Element element) { for (Element e : element.getAllElements()) { - if (e != element && evaluator.matches(root, e)) + if (e != element && evaluator.matches(root, e)) { return true; + } } return false; } + @Override public String toString() { return String.format(":has(%s)", evaluator); } @@ -37,10 +41,12 @@ abstract class StructuralEvaluator extends Evaluator { this.evaluator = evaluator; } + @Override public boolean matches(Element root, Element node) { return !evaluator.matches(root, node); } + @Override public String toString() { return 
String.format(":not%s", evaluator); } @@ -51,19 +57,23 @@ abstract class StructuralEvaluator extends Evaluator { this.evaluator = evaluator; } + @Override public boolean matches(Element root, Element element) { - if (root == element) + if (root == element) { return false; + } Element parent = element.parent(); while (parent != root) { - if (evaluator.matches(root, parent)) + if (evaluator.matches(root, parent)) { return true; + } parent = parent.parent(); } return false; } + @Override public String toString() { return String.format(":parent%s", evaluator); } @@ -74,14 +84,17 @@ abstract class StructuralEvaluator extends Evaluator { this.evaluator = evaluator; } + @Override public boolean matches(Element root, Element element) { - if (root == element) + if (root == element) { return false; + } Element parent = element.parent(); return parent != null && evaluator.matches(root, parent); } + @Override public String toString() { return String.format(":ImmediateParent%s", evaluator); } @@ -92,21 +105,25 @@ abstract class StructuralEvaluator extends Evaluator { this.evaluator = evaluator; } + @Override public boolean matches(Element root, Element element) { - if (root == element) + if (root == element) { return false; + } Element prev = element.previousElementSibling(); while (prev != null) { - if (evaluator.matches(root, prev)) + if (evaluator.matches(root, prev)) { return true; + } prev = prev.previousElementSibling(); } return false; } + @Override public String toString() { return String.format(":prev*%s", evaluator); } @@ -117,14 +134,17 @@ abstract class StructuralEvaluator extends Evaluator { this.evaluator = evaluator; } + @Override public boolean matches(Element root, Element element) { - if (root == element) + if (root == element) { return false; + } Element prev = element.previousElementSibling(); return prev != null && evaluator.matches(root, prev); } + @Override public String toString() { return String.format(":prev%s", evaluator); } |