package org.jsoup; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.URL; import org.jsoup.helper.DataUtil; import org.jsoup.helper.HttpConnection; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.jsoup.safety.Cleaner; import org.jsoup.safety.Whitelist; /** * The core public access point to the jsoup functionality. * * @author Jonathan Hedley */ public class Jsoup { private Jsoup() { } /** * Parse HTML into a Document. The parser will make a sensible, balanced * document tree out of any HTML. * * @param html * HTML to parse * @param baseUri * The URL where the HTML was retrieved from. Used to resolve * relative URLs to absolute URLs, that occur before the HTML * declares a {@code } tag. * @return sane HTML */ public static Document parse(String html, String baseUri) { return Parser.parse(html, baseUri); } /** * Parse HTML into a Document, using the provided Parser. You can provide an * alternate parser, such as a simple XML (non-HTML) parser. * * @param html * HTML to parse * @param baseUri * The URL where the HTML was retrieved from. Used to resolve * relative URLs to absolute URLs, that occur before the HTML * declares a {@code } tag. * @param parser * alternate {@link Parser#xmlParser() parser} to use. * @return sane HTML */ public static Document parse(String html, String baseUri, Parser parser) { return parser.parseInput(html, baseUri); } /** * Parse HTML into a Document. As no base URI is specified, absolute URL * detection relies on the HTML including a {@code } tag. * * @param html * HTML to parse * @return sane HTML * @see #parse(String, String) */ public static Document parse(String html) { return Parser.parse(html, ""); } /** * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML * page. *

* Use examples: *

* * @param url * URL to connect to. The protocol must be {@code http} or * {@code https}. * @return the connection. You can add data, cookies, and headers; set the * user-agent, referrer, method; and then execute. */ public static Connection connect(String url) { return HttpConnection.connect(url); } /** * Parse the contents of a file as HTML. * * @param in * file to load HTML from * @param charsetName * (optional) character set of file contents. Set to {@code null} * to determine from {@code http-equiv} meta tag, if present, or * fall back to {@code UTF-8} (which is often safe to do). * @param baseUri * The URL where the HTML was retrieved from, to resolve relative * links against. * @return sane HTML * @throws IOException * if the file could not be found, or read, or if the * charsetName is invalid. */ public static Document parse(File in, String charsetName, String baseUri) throws IOException { return DataUtil.load(in, charsetName, baseUri); } /** * Parse the contents of a file as HTML. The location of the file is used as * the base URI to qualify relative URLs. * * @param in * file to load HTML from * @param charsetName * (optional) character set of file contents. Set to {@code null} * to determine from {@code http-equiv} meta tag, if present, or * fall back to {@code UTF-8} (which is often safe to do). * @return sane HTML * @throws IOException * if the file could not be found, or read, or if the * charsetName is invalid. * @see #parse(File, String, String) */ public static Document parse(File in, String charsetName) throws IOException { return DataUtil.load(in, charsetName, in.getAbsolutePath()); } /** * Read an input stream, and parse it to a Document. * * @param in * input stream to read. Make sure to close it after parsing. * @param charsetName * (optional) character set of file contents. Set to {@code null} * to determine from {@code http-equiv} meta tag, if present, or * fall back to {@code UTF-8} (which is often safe to do). * @param baseUri * The URL where the HTML was retrieved from, to resolve relative * links against. * @return sane HTML * @throws IOException * if the file could not be found, or read, or if the * charsetName is invalid. */ public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException { return DataUtil.load(in, charsetName, baseUri); } /** * Read an input stream, and parse it to a Document. You can provide an * alternate parser, such as a simple XML (non-HTML) parser. * * @param in * input stream to read. Make sure to close it after parsing. * @param charsetName * (optional) character set of file contents. Set to {@code null} * to determine from {@code http-equiv} meta tag, if present, or * fall back to {@code UTF-8} (which is often safe to do). * @param baseUri * The URL where the HTML was retrieved from, to resolve relative * links against. * @param parser * alternate {@link Parser#xmlParser() parser} to use. * @return sane HTML * @throws IOException * if the file could not be found, or read, or if the * charsetName is invalid. */ public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException { return DataUtil.load(in, charsetName, baseUri, parser); } /** * Parse a fragment of HTML, with the assumption that it forms the * {@code body} of the HTML. * * @param bodyHtml * body HTML fragment * @param baseUri * URL to resolve relative URLs against. * @return sane HTML document * @see Document#body() */ public static Document parseBodyFragment(String bodyHtml, String baseUri) { return Parser.parseBodyFragment(bodyHtml, baseUri); } /** * Parse a fragment of HTML, with the assumption that it forms the * {@code body} of the HTML. * * @param bodyHtml * body HTML fragment * @return sane HTML document * @see Document#body() */ public static Document parseBodyFragment(String bodyHtml) { return Parser.parseBodyFragment(bodyHtml, ""); } /** * Fetch a URL, and parse it as HTML. Provided for compatibility; in most * cases use {@link #connect(String)} instead. *

* The encoding character set is determined by the content-type header or * http-equiv meta tag, or falls back to {@code UTF-8}. * * @param url * URL to fetch (with a GET). The protocol must be {@code http} * or {@code https}. * @param timeoutMillis * Connection and read timeout, in milliseconds. If exceeded, * IOException is thrown. * @return The parsed HTML. * @throws IOException * If the final server response != 200 OK (redirects are * followed), or if there's an error reading the response * stream. * @see #connect(String) */ public static Document parse(URL url, int timeoutMillis) throws IOException { Connection con = HttpConnection.connect(url); con.timeout(timeoutMillis); return con.get(); } /** * Get safe HTML from untrusted input HTML, by parsing input HTML and * filtering it through a white-list of permitted tags and attributes. * * @param bodyHtml * input untrusted HTML * @param baseUri * URL to resolve relative URLs against * @param whitelist * white-list of permitted HTML elements * @return safe HTML * @see Cleaner#clean(Document) */ public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) { Document dirty = parseBodyFragment(bodyHtml, baseUri); Cleaner cleaner = new Cleaner(whitelist); Document clean = cleaner.clean(dirty); return clean.body().html(); } /** * Get safe HTML from untrusted input HTML, by parsing input HTML and * filtering it through a white-list of permitted tags and attributes. * * @param bodyHtml * input untrusted HTML * @param whitelist * white-list of permitted HTML elements * @return safe HTML * @see Cleaner#clean(Document) */ public static String clean(String bodyHtml, Whitelist whitelist) { return clean(bodyHtml, "", whitelist); } /** * Test if the input HTML has only tags and attributes allowed by the * Whitelist. Useful for form validation. The input HTML should still be run * through the cleaner to set up enforced attributes, and to tidy the * output. * * @param bodyHtml * HTML to test * @param whitelist * whitelist to test against * @return true if no tags or attributes were removed; false otherwise * @see #clean(String, org.jsoup.safety.Whitelist) */ public static boolean isValid(String bodyHtml, Whitelist whitelist) { Document dirty = parseBodyFragment(bodyHtml, ""); Cleaner cleaner = new Cleaner(whitelist); return cleaner.isValid(dirty); } }