summary | refs | log | tree | commit | diff | stats
path: root/src/org/jsoup/Jsoup.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/org/jsoup/Jsoup.java')
-rw-r--r-- src/org/jsoup/Jsoup.java 229
1 files changed, 229 insertions, 0 deletions
diff --git a/src/org/jsoup/Jsoup.java b/src/org/jsoup/Jsoup.java
new file mode 100644
index 0000000000..8c6afcee36
--- /dev/null
+++ b/src/org/jsoup/Jsoup.java
@@ -0,0 +1,229 @@
+package org.jsoup;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+import org.jsoup.safety.Cleaner;
+import org.jsoup.safety.Whitelist;
+import org.jsoup.helper.DataUtil;
+import org.jsoup.helper.HttpConnection;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+/**
+ The core public access point to the jsoup functionality.
+
+ @author Jonathan Hedley */
+public class Jsoup {
+ private Jsoup() {} // private: every member of this class is static, so no instance is ever needed
+
+ /**
+ Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
+
+ @param html HTML to parse
+ @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs that occur before the
+ HTML declares a {@code <base href>} tag into absolute URLs.
+ @return the parsed Document (sane HTML)
+ */
+ public static Document parse(String html, String baseUri) {
+ return Parser.parse(html, baseUri);
+ }
+
+ /**
+ Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML
+ (non-HTML) parser.
+
+ @param html HTML to parse
+ @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs that occur before the
+ HTML declares a {@code <base href>} tag into absolute URLs.
+ @param parser alternate {@link Parser#xmlParser() parser} to use. Must not be {@code null}: it is invoked directly.
+ @return the Document produced by the supplied parser
+ */
+ public static Document parse(String html, String baseUri, Parser parser) {
+ return parser.parseInput(html, baseUri);
+ }
+
+ /**
+ Parse HTML into a Document, without a base URI. Because none is given, absolute URL detection relies on the HTML
+ including a {@code <base href>} tag.
+
+ @param html HTML to parse
+ @return the parsed Document (sane HTML)
+
+ @see #parse(String, String)
+ */
+ public static Document parse(String html) {
+ return parse(html, ""); // same as parse(String, String) with an empty base URI
+ }
+
+ /**
+ * Creates a new {@link Connection} to a URL. Use to fetch and parse an HTML page.
+ * <p>
+ * Use examples:
+ * <ul>
+ * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
+ * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
+ * </ul>
+ * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
+ * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute.
+ */
+ public static Connection connect(String url) {
+ return HttpConnection.connect(url);
+ }
+
+ /**
+ Parse the contents of a file as HTML.
+
+ @param in file to load HTML from
+ @param charsetName (optional) character set of file contents. Set to {@code null} to determine from the {@code http-equiv} meta tag, if
+ present, or fall back to {@code UTF-8} (which is often safe to do).
+ @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
+ @return the parsed Document (sane HTML)
+
+ @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+ */
+ public static Document parse(File in, String charsetName, String baseUri) throws IOException {
+ return DataUtil.load(in, charsetName, baseUri);
+ }
+
+ /**
+ Parse the contents of a file as HTML, using the file's absolute path as the base URI to qualify relative URLs.
+
+ @param in file to load HTML from
+ @param charsetName (optional) character set of file contents. Set to {@code null} to determine from the {@code http-equiv} meta tag, if
+ present, or fall back to {@code UTF-8} (which is often safe to do).
+ @return the parsed Document (sane HTML)
+
+ @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
+ @see #parse(File, String, String)
+ */
+ public static Document parse(File in, String charsetName) throws IOException {
+ return parse(in, charsetName, in.getAbsolutePath());
+ }
+
+ /**
+ Read an input stream, and parse it to a Document.
+
+ @param in input stream to read. Make sure to close it after parsing.
+ @param charsetName (optional) character set of the stream contents. Set to {@code null} to determine from the {@code http-equiv} meta tag, if
+ present, or fall back to {@code UTF-8} (which is often safe to do).
+ @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
+ @return the parsed Document (sane HTML)
+
+ @throws IOException if the stream could not be read, or if the charsetName is invalid.
+ */
+ public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException {
+ return DataUtil.load(in, charsetName, baseUri);
+ }
+
+ /**
+ Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML
+ (non-HTML) parser.
+
+ @param in input stream to read. Make sure to close it after parsing.
+ @param charsetName (optional) character set of the stream contents. Set to {@code null} to determine from the {@code http-equiv} meta tag, if
+ present, or fall back to {@code UTF-8} (which is often safe to do).
+ @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
+ @param parser alternate {@link Parser#xmlParser() parser} to use.
+ @return the Document produced using the supplied parser
+
+ @throws IOException if the stream could not be read, or if the charsetName is invalid.
+ */
+ public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
+ return DataUtil.load(in, charsetName, baseUri, parser);
+ }
+
+ /**
+ Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
+
+ @param bodyHtml body HTML fragment
+ @param baseUri URL to resolve relative URLs against.
+ @return the parsed Document (sane HTML); the fragment is reachable through {@link Document#body()}
+
+ @see Document#body()
+ */
+ public static Document parseBodyFragment(String bodyHtml, String baseUri) {
+ return Parser.parseBodyFragment(bodyHtml, baseUri);
+ }
+
+ /**
+ Parse a fragment of HTML as the {@code body} of a document, with an empty base URI.
+
+ @param bodyHtml body HTML fragment
+ @return the parsed Document (sane HTML)
+
+ @see Document#body()
+ */
+ public static Document parseBodyFragment(String bodyHtml) {
+ return parseBodyFragment(bodyHtml, ""); // two-argument overload with an empty base URI
+ }
+
+ /**
+ Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead.
+ <p>
+ The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}.
+
+ @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
+ @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
+ @return The parsed HTML.
+
+ @throws IOException If the final server response is not 200 OK (redirects are followed), or if there's an error
+ reading the response stream.
+
+ @see #connect(String)
+ */
+ public static Document parse(URL url, int timeoutMillis) throws IOException {
+ final Connection connection = HttpConnection.connect(url);
+ connection.timeout(timeoutMillis); // set before the GET below is issued
+ return connection.get();
+ }
+
+ /**
+ Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
+ tags and attributes.
+
+ @param bodyHtml input untrusted HTML
+ @param baseUri URL to resolve relative URLs against
+ @param whitelist white-list of permitted HTML elements
+ @return safe HTML: the inner HTML of the cleaned document's {@code body}
+
+ @see Cleaner#clean(Document)
+ */
+ public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) {
+ // Parse the untrusted input as a body fragment, then filter it through the whitelist.
+ final Document untrusted = parseBodyFragment(bodyHtml, baseUri);
+ final Document sanitized = new Cleaner(whitelist).clean(untrusted);
+ return sanitized.body().html();
+ }
+
+ /**
+ Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted
+ tags and attributes. Same as {@link #clean(String, String, Whitelist)} called with an empty base URI.
+
+ @param bodyHtml input untrusted HTML
+ @param whitelist white-list of permitted HTML elements
+ @return safe HTML
+
+ @see Cleaner#clean(Document)
+ */
+ public static String clean(String bodyHtml, Whitelist whitelist) {
+ return clean(bodyHtml, "", whitelist);
+ }
+
+ /**
+ Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. The input
+ HTML should still be run through the cleaner to set up enforced attributes, and to tidy the output.
+ @param bodyHtml HTML to test
+ @param whitelist whitelist to test against
+ @return true if no tags or attributes were removed; false otherwise
+ @see #clean(String, org.jsoup.safety.Whitelist)
+ */
+ public static boolean isValid(String bodyHtml, Whitelist whitelist) {
+ // The input is parsed as a body fragment with an empty base URI before being checked.
+ final Document candidate = parseBodyFragment(bodyHtml, "");
+ return new Cleaner(whitelist).isValid(candidate);
+ }
+
+}