about summary refs log tree commit diff stats
path: root/server/src/org/jsoup/Jsoup.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/Jsoup.java')
-rw-r--r-- server/src/org/jsoup/Jsoup.java 324
1 files changed, 194 insertions, 130 deletions
diff --git a/server/src/org/jsoup/Jsoup.java b/server/src/org/jsoup/Jsoup.java
index 8c6afcee36..b5429d9410 100644
--- a/server/src/org/jsoup/Jsoup.java
+++ b/server/src/org/jsoup/Jsoup.java
@@ -1,178 +1,233 @@
package org.jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.parser.Parser;
-import org.jsoup.safety.Cleaner;
-import org.jsoup.safety.Whitelist;
-import org.jsoup.helper.DataUtil;
-import org.jsoup.helper.HttpConnection;
-
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-/**
- The core public access point to the jsoup functionality.
+import org.jsoup.helper.DataUtil;
+import org.jsoup.helper.HttpConnection;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+import org.jsoup.safety.Cleaner;
+import org.jsoup.safety.Whitelist;
- @author Jonathan Hedley */
+/**
+ * The core public access point to the jsoup functionality.
+ *
+ * @author Jonathan Hedley
+ */
public class Jsoup {
- private Jsoup() {}
+ private Jsoup() {
+ }
/**
- Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
-
- @param html HTML to parse
- @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
- before the HTML declares a {@code <base href>} tag.
- @return sane HTML
+ * Parse HTML into a Document. The parser will make a sensible, balanced
+ * document tree out of any HTML.
+ *
+ * @param html
+ * HTML to parse
+ * @param baseUri
+ * The URL where the HTML was retrieved from. Used to resolve
+ * relative URLs to absolute URLs, that occur before the HTML
+ * declares a {@code <base href>} tag.
+ * @return sane HTML
*/
public static Document parse(String html, String baseUri) {
return Parser.parse(html, baseUri);
}
/**
- Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
- (non-HTML) parser.
-
- @param html HTML to parse
- @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur
- before the HTML declares a {@code <base href>} tag.
- @param parser alternate {@link Parser#xmlParser() parser} to use.
- @return sane HTML
+ * Parse HTML into a Document, using the provided Parser. You can provide an
+ * alternate parser, such as a simple XML (non-HTML) parser.
+ *
+ * @param html
+ * HTML to parse
+ * @param baseUri
+ * The URL where the HTML was retrieved from. Used to resolve
+ * relative URLs to absolute URLs, that occur before the HTML
+ * declares a {@code <base href>} tag.
+ * @param parser
+ * alternate {@link Parser#xmlParser() parser} to use.
+ * @return sane HTML
*/
public static Document parse(String html, String baseUri, Parser parser) {
return parser.parseInput(html, baseUri);
}
/**
- Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a
- {@code <base href>} tag.
-
- @param html HTML to parse
- @return sane HTML
-
- @see #parse(String, String)
+ * Parse HTML into a Document. As no base URI is specified, absolute URL
+ * detection relies on the HTML including a {@code <base href>} tag.
+ *
+ * @param html
+ * HTML to parse
+ * @return sane HTML
+ * @see #parse(String, String)
*/
public static Document parse(String html) {
return Parser.parse(html, "");
}
/**
- * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page.
+ * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML
+ * page.
* <p>
* Use examples:
* <ul>
- * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
- * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();
+ * <li>
+ * <code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code>
+ * </li>
+ * <li>
+ * <code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
* </ul>
- * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
- * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute.
+ *
+ * @param url
+ * URL to connect to. The protocol must be {@code http} or
+ * {@code https}.
+ * @return the connection. You can add data, cookies, and headers; set the
+ * user-agent, referrer, method; and then execute.
*/
public static Connection connect(String url) {
return HttpConnection.connect(url);
}
/**
- Parse the contents of a file as HTML.
-
- @param in file to load HTML from
- @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
- present, or fall back to {@code UTF-8} (which is often safe to do).
- @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
- @return sane HTML
-
- @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+ * Parse the contents of a file as HTML.
+ *
+ * @param in
+ * file to load HTML from
+ * @param charsetName
+ * (optional) character set of file contents. Set to {@code null}
+ * to determine from {@code http-equiv} meta tag, if present, or
+ * fall back to {@code UTF-8} (which is often safe to do).
+ * @param baseUri
+ * The URL where the HTML was retrieved from, to resolve relative
+ * links against.
+ * @return sane HTML
+ * @throws IOException
+ * if the file could not be found, or read, or if the
+ * charsetName is invalid.
*/
- public static Document parse(File in, String charsetName, String baseUri) throws IOException {
+ public static Document parse(File in, String charsetName, String baseUri)
+ throws IOException {
return DataUtil.load(in, charsetName, baseUri);
}
/**
- Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
-
- @param in file to load HTML from
- @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
- present, or fall back to {@code UTF-8} (which is often safe to do).
- @return sane HTML
-
- @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
- @see #parse(File, String, String)
+ * Parse the contents of a file as HTML. The location of the file is used as
+ * the base URI to qualify relative URLs.
+ *
+ * @param in
+ * file to load HTML from
+ * @param charsetName
+ * (optional) character set of file contents. Set to {@code null}
+ * to determine from {@code http-equiv} meta tag, if present, or
+ * fall back to {@code UTF-8} (which is often safe to do).
+ * @return sane HTML
+ * @throws IOException
+ * if the file could not be found, or read, or if the
+ * charsetName is invalid.
+ * @see #parse(File, String, String)
*/
- public static Document parse(File in, String charsetName) throws IOException {
+ public static Document parse(File in, String charsetName)
+ throws IOException {
return DataUtil.load(in, charsetName, in.getAbsolutePath());
}
- /**
- Read an input stream, and parse it to a Document.
-
- @param in input stream to read. Make sure to close it after parsing.
- @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
- present, or fall back to {@code UTF-8} (which is often safe to do).
- @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
- @return sane HTML
-
- @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+ /**
+ * Read an input stream, and parse it to a Document.
+ *
+ * @param in
+ * input stream to read. Make sure to close it after parsing.
+ * @param charsetName
+ * (optional) character set of file contents. Set to {@code null}
+ * to determine from {@code http-equiv} meta tag, if present, or
+ * fall back to {@code UTF-8} (which is often safe to do).
+ * @param baseUri
+ * The URL where the HTML was retrieved from, to resolve relative
+ * links against.
+ * @return sane HTML
+ * @throws IOException
+ * if the file could not be found, or read, or if the
+ * charsetName is invalid.
*/
- public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException {
+ public static Document parse(InputStream in, String charsetName,
+ String baseUri) throws IOException {
return DataUtil.load(in, charsetName, baseUri);
}
/**
- Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
- (non-HTML) parser.
-
- @param in input stream to read. Make sure to close it after parsing.
- @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
- present, or fall back to {@code UTF-8} (which is often safe to do).
- @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
- @param parser alternate {@link Parser#xmlParser() parser} to use.
- @return sane HTML
-
- @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+ * Read an input stream, and parse it to a Document. You can provide an
+ * alternate parser, such as a simple XML (non-HTML) parser.
+ *
+ * @param in
+ * input stream to read. Make sure to close it after parsing.
+ * @param charsetName
+ * (optional) character set of file contents. Set to {@code null}
+ * to determine from {@code http-equiv} meta tag, if present, or
+ * fall back to {@code UTF-8} (which is often safe to do).
+ * @param baseUri
+ * The URL where the HTML was retrieved from, to resolve relative
+ * links against.
+ * @param parser
+ * alternate {@link Parser#xmlParser() parser} to use.
+ * @return sane HTML
+ * @throws IOException
+ * if the file could not be found, or read, or if the
+ * charsetName is invalid.
*/
- public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
+ public static Document parse(InputStream in, String charsetName,
+ String baseUri, Parser parser) throws IOException {
return DataUtil.load(in, charsetName, baseUri, parser);
}
/**
- Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
-
- @param bodyHtml body HTML fragment
- @param baseUri URL to resolve relative URLs against.
- @return sane HTML document
-
- @see Document#body()
+ * Parse a fragment of HTML, with the assumption that it forms the
+ * {@code body} of the HTML.
+ *
+ * @param bodyHtml
+ * body HTML fragment
+ * @param baseUri
+ * URL to resolve relative URLs against.
+ * @return sane HTML document
+ * @see Document#body()
*/
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
return Parser.parseBodyFragment(bodyHtml, baseUri);
}
/**
- Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
-
- @param bodyHtml body HTML fragment
- @return sane HTML document
-
- @see Document#body()
+ * Parse a fragment of HTML, with the assumption that it forms the
+ * {@code body} of the HTML.
+ *
+ * @param bodyHtml
+ * body HTML fragment
+ * @return sane HTML document
+ * @see Document#body()
*/
public static Document parseBodyFragment(String bodyHtml) {
return Parser.parseBodyFragment(bodyHtml, "");
}
/**
- Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
- <p>
- The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}.
-
- @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
- @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
- @return The parsed HTML.
-
- @throws IOException If the final server response != 200 OK (redirects are followed), or if there's an error reading
- the response stream.
-
- @see #connect(String)
+ * Fetch a URL, and parse it as HTML. Provided for compatibility; in most
+ * cases use {@link #connect(String)} instead.
+ * <p>
+ * The encoding character set is determined by the content-type header or
+ * http-equiv meta tag, or falls back to {@code UTF-8}.
+ *
+ * @param url
+ * URL to fetch (with a GET). The protocol must be {@code http}
+ * or {@code https}.
+ * @param timeoutMillis
+ * Connection and read timeout, in milliseconds. If exceeded,
+ * IOException is thrown.
+ * @return The parsed HTML.
+ * @throws IOException
+ * If the final server response != 200 OK (redirects are
+ * followed), or if there's an error reading the response
+ * stream.
+ * @see #connect(String)
*/
public static Document parse(URL url, int timeoutMillis) throws IOException {
Connection con = HttpConnection.connect(url);
@@ -181,17 +236,20 @@ public class Jsoup {
}
/**
- Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
- tags and attributes.
-
- @param bodyHtml input untrusted HTML
- @param baseUri URL to resolve relative URLs against
- @param whitelist white-list of permitted HTML elements
- @return safe HTML
-
- @see Cleaner#clean(Document)
+ * Get safe HTML from untrusted input HTML, by parsing input HTML and
+ * filtering it through a white-list of permitted tags and attributes.
+ *
+ * @param bodyHtml
+ * input untrusted HTML
+ * @param baseUri
+ * URL to resolve relative URLs against
+ * @param whitelist
+ * white-list of permitted HTML elements
+ * @return safe HTML
+ * @see Cleaner#clean(Document)
*/
- public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) {
+ public static String clean(String bodyHtml, String baseUri,
+ Whitelist whitelist) {
Document dirty = parseBodyFragment(bodyHtml, baseUri);
Cleaner cleaner = new Cleaner(whitelist);
Document clean = cleaner.clean(dirty);
@@ -199,31 +257,37 @@ public class Jsoup {
}
/**
- Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
- tags and attributes.
-
- @param bodyHtml input untrusted HTML
- @param whitelist white-list of permitted HTML elements
- @return safe HTML
-
- @see Cleaner#clean(Document)
+ * Get safe HTML from untrusted input HTML, by parsing input HTML and
+ * filtering it through a white-list of permitted tags and attributes.
+ *
+ * @param bodyHtml
+ * input untrusted HTML
+ * @param whitelist
+ * white-list of permitted HTML elements
+ * @return safe HTML
+ * @see Cleaner#clean(Document)
*/
public static String clean(String bodyHtml, Whitelist whitelist) {
return clean(bodyHtml, "", whitelist);
}
/**
- Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. The input HTML should
- still be run through the cleaner to set up enforced attributes, and to tidy the output.
- @param bodyHtml HTML to test
- @param whitelist whitelist to test against
- @return true if no tags or attributes were removed; false otherwise
- @see #clean(String, org.jsoup.safety.Whitelist)
+ * Test if the input HTML has only tags and attributes allowed by the
+ * Whitelist. Useful for form validation. The input HTML should still be run
+ * through the cleaner to set up enforced attributes, and to tidy the
+ * output.
+ *
+ * @param bodyHtml
+ * HTML to test
+ * @param whitelist
+ * whitelist to test against
+ * @return true if no tags or attributes were removed; false otherwise
+ * @see #clean(String, org.jsoup.safety.Whitelist)
*/
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
Document dirty = parseBodyFragment(bodyHtml, "");
Cleaner cleaner = new Cleaner(whitelist);
return cleaner.isValid(dirty);
}
-
+
}