diff options
Diffstat (limited to 'src/org/jsoup/Jsoup.java')
-rw-r--r-- | src/org/jsoup/Jsoup.java | 229 |
1 files changed, 229 insertions, 0 deletions
diff --git a/src/org/jsoup/Jsoup.java b/src/org/jsoup/Jsoup.java new file mode 100644 index 0000000000..8c6afcee36 --- /dev/null +++ b/src/org/jsoup/Jsoup.java @@ -0,0 +1,229 @@ +package org.jsoup; + +import org.jsoup.nodes.Document; +import org.jsoup.parser.Parser; +import org.jsoup.safety.Cleaner; +import org.jsoup.safety.Whitelist; +import org.jsoup.helper.DataUtil; +import org.jsoup.helper.HttpConnection; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +/** + The core public access point to the jsoup functionality. + + @author Jonathan Hedley */ +public class Jsoup { + private Jsoup() {} + + /** + Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. + + @param html HTML to parse + @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur + before the HTML declares a {@code <base href>} tag. + @return sane HTML + */ + public static Document parse(String html, String baseUri) { + return Parser.parse(html, baseUri); + } + + /** + Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML + (non-HTML) parser. + + @param html HTML to parse + @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur + before the HTML declares a {@code <base href>} tag. + @param parser alternate {@link Parser#xmlParser() parser} to use. + @return sane HTML + */ + public static Document parse(String html, String baseUri, Parser parser) { + return parser.parseInput(html, baseUri); + } + + /** + Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML including a + {@code <base href>} tag. + + @param html HTML to parse + @return sane HTML + + @see #parse(String, String) + */ + public static Document parse(String html) { + return Parser.parse(html, ""); + } + + /** + * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page. + * <p> + * Use examples: + * <ul> + * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li> + * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post(); + * </ul> + * @param url URL to connect to. The protocol must be {@code http} or {@code https}. + * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. + */ + public static Connection connect(String url) { + return HttpConnection.connect(url); + } + + /** + Parse the contents of a file as HTML. + + @param in file to load HTML from + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + */ + public static Document parse(File in, String charsetName, String baseUri) throws IOException { + return DataUtil.load(in, charsetName, baseUri); + } + + /** + Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. + + @param in file to load HTML from + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + @see #parse(File, String, String) + */ + public static Document parse(File in, String charsetName) throws IOException { + return DataUtil.load(in, charsetName, in.getAbsolutePath()); + } + + /** + Read an input stream, and parse it to a Document. + + @param in input stream to read. Make sure to close it after parsing. + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + */ + public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException { + return DataUtil.load(in, charsetName, baseUri); + } + + /** + Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML + (non-HTML) parser. + + @param in input stream to read. Make sure to close it after parsing. + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @param parser alternate {@link Parser#xmlParser() parser} to use. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + */ + public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException { + return DataUtil.load(in, charsetName, baseUri, parser); + } + + /** + Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. + + @param bodyHtml body HTML fragment + @param baseUri URL to resolve relative URLs against. + @return sane HTML document + + @see Document#body() + */ + public static Document parseBodyFragment(String bodyHtml, String baseUri) { + return Parser.parseBodyFragment(bodyHtml, baseUri); + } + + /** + Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. + + @param bodyHtml body HTML fragment + @return sane HTML document + + @see Document#body() + */ + public static Document parseBodyFragment(String bodyHtml) { + return Parser.parseBodyFragment(bodyHtml, ""); + } + + /** + Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use {@link #connect(String)} instead. + <p> + The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to {@code UTF-8}. + + @param url URL to fetch (with a GET). The protocol must be {@code http} or {@code https}. + @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. + @return The parsed HTML. + + @throws IOException If the final server response != 200 OK (redirects are followed), or if there's an error reading + the response stream. + + @see #connect(String) + */ + public static Document parse(URL url, int timeoutMillis) throws IOException { + Connection con = HttpConnection.connect(url); + con.timeout(timeoutMillis); + return con.get(); + } + + /** + Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted + tags and attributes. + + @param bodyHtml input untrusted HTML + @param baseUri URL to resolve relative URLs against + @param whitelist white-list of permitted HTML elements + @return safe HTML + + @see Cleaner#clean(Document) + */ + public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) { + Document dirty = parseBodyFragment(bodyHtml, baseUri); + Cleaner cleaner = new Cleaner(whitelist); + Document clean = cleaner.clean(dirty); + return clean.body().html(); + } + + /** + Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of permitted + tags and attributes. + + @param bodyHtml input untrusted HTML + @param whitelist white-list of permitted HTML elements + @return safe HTML + + @see Cleaner#clean(Document) + */ + public static String clean(String bodyHtml, Whitelist whitelist) { + return clean(bodyHtml, "", whitelist); + } + + /** + Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form validation. The input HTML should + still be run through the cleaner to set up enforced attributes, and to tidy the output. + @param bodyHtml HTML to test + @param whitelist whitelist to test against + @return true if no tags or attributes were removed; false otherwise + @see #clean(String, org.jsoup.safety.Whitelist) + */ + public static boolean isValid(String bodyHtml, Whitelist whitelist) { + Document dirty = parseBodyFragment(bodyHtml, ""); + Cleaner cleaner = new Cleaner(whitelist); + return cleaner.isValid(dirty); + } + +} |