package org.jsoup;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
/**
* The core public access point to the jsoup functionality.
*
* @author Jonathan Hedley
*/
public class Jsoup {

    /** Not instantiable: every operation on this class is a static method. */
    private Jsoup() {
    }

    /**
     * Parses an HTML string into a {@link Document}. The parser builds a
     * sensible, balanced document tree out of any HTML.
     *
     * @param html
     *            the HTML to parse
     * @param baseUri
     *            the URL the HTML was retrieved from. It is used to resolve
     *            relative URLs to absolute URLs for links that occur before
     *            the HTML declares a {@code <base href>} tag.
     * @return the parsed document
     */
    public static Document parse(String html, String baseUri) {
        final Document parsed = Parser.parse(html, baseUri);
        return parsed;
    }

    /**
     * Parses an HTML string into a {@link Document} with a caller-supplied
     * {@link Parser}. This lets you substitute an alternate parser, such as a
     * simple XML (non-HTML) parser.
     *
     * @param html
     *            the HTML to parse
     * @param baseUri
     *            the URL the HTML was retrieved from. It is used to resolve
     *            relative URLs to absolute URLs for links that occur before
     *            the HTML declares a {@code <base href>} tag.
     * @param parser
     *            the alternate {@link Parser#xmlParser() parser} to use
     * @return the parsed document
     */
    public static Document parse(String html, String baseUri, Parser parser) {
        final Document parsed = parser.parseInput(html, baseUri);
        return parsed;
    }

    /**
     * Parses an HTML string into a {@link Document} with an empty base URI.
     * Because no base URI is given, absolute URL detection relies on the HTML
     * itself including a {@code <base href>} tag.
     *
     * @param html
     *            the HTML to parse
     * @return the parsed document
     * @see #parse(String, String)
     */
    public static Document parse(String html) {
        // Same as the two-argument overload with an empty base URI.
        return parse(html, "");
    }

    /**
     * Creates a new {@link Connection} to a URL, which can then be used to
     * fetch and parse an HTML page.
     * <p>
     * Usage examples:
     * <ul>
     * <li>{@code Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();}</li>
     * <li>{@code Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();}</li>
     * </ul>
     *
     * @param url
     *            the URL to connect to. The protocol must be {@code http} or
     *            {@code https}.
     * @return the connection. You can add data, cookies, and headers; set the
     *         user-agent, referrer, and method; and then execute it.
     */
    public static Connection connect(String url) {
        return HttpConnection.connect(url);
    }

    /**
     * Loads a file and parses its contents as HTML.
     *
     * @param in
     *            the file to load HTML from
     * @param charsetName
     *            (optional) character set of the file contents. Pass
     *            {@code null} to determine it from an {@code http-equiv} meta
     *            tag, if present, or otherwise fall back to {@code UTF-8}
     *            (which is often safe to do).
     * @param baseUri
     *            the URL the HTML was retrieved from, used to resolve relative
     *            links against
     * @return the parsed document
     * @throws IOException
     *             if the file could not be found or read, or if the
     *             charsetName is invalid
     */
    public static Document parse(File in, String charsetName, String baseUri)
            throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Loads a file and parses its contents as HTML. The file's absolute path
     * is used as the base URI to qualify relative URLs.
     *
     * @param in
     *            the file to load HTML from
     * @param charsetName
     *            (optional) character set of the file contents. Pass
     *            {@code null} to determine it from an {@code http-equiv} meta
     *            tag, if present, or otherwise fall back to {@code UTF-8}
     *            (which is often safe to do).
     * @return the parsed document
     * @throws IOException
     *             if the file could not be found or read, or if the
     *             charsetName is invalid
     * @see #parse(File, String, String)
     */
    public static Document parse(File in, String charsetName)
            throws IOException {
        final String fileLocation = in.getAbsolutePath();
        return parse(in, charsetName, fileLocation);
    }

    /**
     * Reads an input stream and parses its contents into a {@link Document}.
     *
     * @param in
     *            the input stream to read. Make sure to close it after
     *            parsing.
     * @param charsetName
     *            (optional) character set of the stream contents. Pass
     *            {@code null} to determine it from an {@code http-equiv} meta
     *            tag, if present, or otherwise fall back to {@code UTF-8}
     *            (which is often safe to do).
     * @param baseUri
     *            the URL the HTML was retrieved from, used to resolve relative
     *            links against
     * @return the parsed document
     * @throws IOException
     *             if the stream could not be read, or if the charsetName is
     *             invalid
     */
    public static Document parse(InputStream in, String charsetName,
            String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Reads an input stream and parses its contents into a {@link Document}
     * with a caller-supplied {@link Parser}. This lets you substitute an
     * alternate parser, such as a simple XML (non-HTML) parser.
     *
     * @param in
     *            the input stream to read. Make sure to close it after
     *            parsing.
     * @param charsetName
     *            (optional) character set of the stream contents. Pass
     *            {@code null} to determine it from an {@code http-equiv} meta
     *            tag, if present, or otherwise fall back to {@code UTF-8}
     *            (which is often safe to do).
     * @param baseUri
     *            the URL the HTML was retrieved from, used to resolve relative
     *            links against
     * @param parser
     *            the alternate {@link Parser#xmlParser() parser} to use
     * @return the parsed document
     * @throws IOException
     *             if the stream could not be read, or if the charsetName is
     *             invalid
     */
    public static Document parse(InputStream in, String charsetName,
            String baseUri, Parser parser) throws IOException {
        return DataUtil.load(in, charsetName, baseUri, parser);
    }

    /**
     * Parses a fragment of HTML, on the assumption that it forms the
     * {@code body} of the HTML.
     *
     * @param bodyHtml
     *            the body HTML fragment
     * @param baseUri
     *            the URL to resolve relative URLs against
     * @return a document wrapping the parsed fragment
     * @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        final Document wrapped = Parser.parseBodyFragment(bodyHtml, baseUri);
        return wrapped;
    }

    /**
     * Parses a fragment of HTML with an empty base URI, on the assumption
     * that it forms the {@code body} of the HTML.
     *
     * @param bodyHtml
     *            the body HTML fragment
     * @return a document wrapping the parsed fragment
     * @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml) {
        // Same as the two-argument overload with an empty base URI.
        return parseBodyFragment(bodyHtml, "");
    }

    /**
     * Fetches a URL and parses the response as HTML. Provided for
     * compatibility; in most cases use {@link #connect(String)} instead.
     * <p>
     * The character encoding is determined by the content-type header or an
     * http-equiv meta tag, or falls back to {@code UTF-8}.
     *
     * @param url
     *            the URL to fetch (with a GET). The protocol must be
     *            {@code http} or {@code https}.
     * @param timeoutMillis
     *            connection and read timeout, in milliseconds. If it is
     *            exceeded, an IOException is thrown.
     * @return the parsed HTML
     * @throws IOException
     *             if the final server response is not 200 OK (redirects are
     *             followed), or if there is an error reading the response
     *             stream
     * @see #connect(String)
     */
    public static Document parse(URL url, int timeoutMillis) throws IOException {
        final Connection connection = HttpConnection.connect(url);
        connection.timeout(timeoutMillis);
        return connection.get();
    }

    /**
     * Produces safe HTML from untrusted input HTML by parsing the input and
     * filtering it through a white-list of permitted tags and attributes.
     *
     * @param bodyHtml
     *            the untrusted input HTML
     * @param baseUri
     *            the URL to resolve relative URLs against
     * @param whitelist
     *            the white-list of permitted HTML elements
     * @return the safe HTML
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, String baseUri,
            Whitelist whitelist) {
        // The input is parsed before the Cleaner is constructed.
        final Document untrusted = parseBodyFragment(bodyHtml, baseUri);
        final Document sanitized = new Cleaner(whitelist).clean(untrusted);
        return sanitized.body().html();
    }

    /**
     * Produces safe HTML from untrusted input HTML by parsing the input, with
     * an empty base URI, and filtering it through a white-list of permitted
     * tags and attributes.
     *
     * @param bodyHtml
     *            the untrusted input HTML
     * @param whitelist
     *            the white-list of permitted HTML elements
     * @return the safe HTML
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, Whitelist whitelist) {
        final String noBaseUri = "";
        return clean(bodyHtml, noBaseUri, whitelist);
    }

    /**
     * Tests whether the input HTML contains only tags and attributes allowed
     * by the Whitelist. Useful for form validation. The input HTML should
     * still be run through the cleaner to set up enforced attributes and to
     * tidy the output.
     *
     * @param bodyHtml
     *            the HTML to test
     * @param whitelist
     *            the whitelist to test against
     * @return true if no tags or attributes were removed; false otherwise
     * @see #clean(String, org.jsoup.safety.Whitelist)
     */
    public static boolean isValid(String bodyHtml, Whitelist whitelist) {
        // The input is parsed (with an empty base URI) before the Cleaner is
        // constructed.
        final Document candidate = parseBodyFragment(bodyHtml);
        return new Cleaner(whitelist).isValid(candidate);
    }
}