package org.jsoup;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

import org.jsoup.helper.DataUtil;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;

/**
 * The core public access point to the jsoup functionality. Every method is
 * static and delegates to the parser, data-loading, connection or cleaner
 * classes; the class itself cannot be instantiated.
 *
 * @author Jonathan Hedley
 */
public class Jsoup {
    // Static facade only: the private constructor blocks instantiation.
    private Jsoup() {
    }

    /**
     * Parse HTML into a Document. The parser will make a sensible, balanced
     * document tree out of any HTML.
     *
     * @param html HTML to parse
     * @param baseUri The URL where the HTML was retrieved from. Used to
     *            resolve relative URLs to absolute URLs, that occur before
     *            the HTML declares a {@code <base href>} tag.
     * @return the parsed Document (sane HTML)
     */
    public static Document parse(String html, String baseUri) {
        return Parser.parse(html, baseUri);
    }

    /**
     * Parse HTML into a Document, using the provided Parser. You can provide
     * an alternate parser, such as a simple XML (non-HTML) parser.
     *
     * @param html HTML to parse
     * @param baseUri The URL where the HTML was retrieved from. Used to
     *            resolve relative URLs to absolute URLs, that occur before
     *            the HTML declares a {@code <base href>} tag.
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return the parsed Document
     */
    public static Document parse(String html, String baseUri, Parser parser) {
        return parser.parseInput(html, baseUri);
    }

    /**
     * Parse HTML into a Document. As no base URI is specified (an empty
     * string is used), absolute URL detection relies on the HTML including a
     * {@code <base href>} tag.
     *
     * @param html HTML to parse
     * @return the parsed Document (sane HTML)
     * @see #parse(String, String)
     */
    public static Document parse(String html) {
        // Same as the two-argument overload with an empty base URI.
        return parse(html, "");
    }

    /**
     * Creates a new {@link Connection} to a URL. Use to fetch and parse a
     * HTML page.
     * <p>
     * Use examples:
     * <ul>
     * <li>
     * <code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code>
     * </li>
     * <li>
     * <code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code>
     * </li>
     * </ul>
     *
     * @param url URL to connect to. The protocol must be {@code http} or
     *            {@code https}.
     * @return the connection. You can add data, cookies, and headers; set the
     *         user-agent, referrer, method; and then execute.
     */
    public static Connection connect(String url) {
        return HttpConnection.connect(url);
    }

    /**
     * Parse the contents of a file as HTML.
     *
     * @param in file to load HTML from
     * @param charsetName (optional) character set of file contents. Set to
     *            {@code null} to determine from {@code http-equiv} meta tag,
     *            if present, or fall back to {@code UTF-8} (which is often
     *            safe to do).
     * @param baseUri The URL where the HTML was retrieved from, to resolve
     *            relative links against.
     * @return the parsed Document (sane HTML)
     * @throws IOException if the file could not be found, or read, or if the
     *             charsetName is invalid.
     */
    public static Document parse(File in, String charsetName, String baseUri)
            throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Parse the contents of a file as HTML. The absolute path of the file is
     * used as the base URI to qualify relative URLs.
     *
     * @param in file to load HTML from
     * @param charsetName (optional) character set of file contents. Set to
     *            {@code null} to determine from {@code http-equiv} meta tag,
     *            if present, or fall back to {@code UTF-8} (which is often
     *            safe to do).
     * @return the parsed Document (sane HTML)
     * @throws IOException if the file could not be found, or read, or if the
     *             charsetName is invalid.
     * @see #parse(File, String, String)
     */
    public static Document parse(File in, String charsetName)
            throws IOException {
        // The file's own absolute path stands in for the base URI.
        return parse(in, charsetName, in.getAbsolutePath());
    }

    /**
     * Read an input stream, and parse it to a Document.
     *
     * @param in input stream to read. Make sure to close it after parsing.
     * @param charsetName (optional) character set of the stream contents.
     *            Set to {@code null} to determine from {@code http-equiv}
     *            meta tag, if present, or fall back to {@code UTF-8} (which
     *            is often safe to do).
     * @param baseUri The URL where the HTML was retrieved from, to resolve
     *            relative links against.
     * @return the parsed Document (sane HTML)
     * @throws IOException if the stream could not be read, or if the
     *             charsetName is invalid.
     */
    public static Document parse(InputStream in, String charsetName,
            String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Read an input stream, and parse it to a Document. You can provide an
     * alternate parser, such as a simple XML (non-HTML) parser.
     *
     * @param in input stream to read. Make sure to close it after parsing.
     * @param charsetName (optional) character set of the stream contents.
     *            Set to {@code null} to determine from {@code http-equiv}
     *            meta tag, if present, or fall back to {@code UTF-8} (which
     *            is often safe to do).
     * @param baseUri The URL where the HTML was retrieved from, to resolve
     *            relative links against.
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return the parsed Document
     * @throws IOException if the stream could not be read, or if the
     *             charsetName is invalid.
     */
    public static Document parse(InputStream in, String charsetName,
            String baseUri, Parser parser) throws IOException {
        return DataUtil.load(in, charsetName, baseUri, parser);
    }

    /**
     * Parse a fragment of HTML, with the assumption that it forms the
     * {@code body} of the HTML.
     *
     * @param bodyHtml body HTML fragment
     * @param baseUri URL to resolve relative URLs against.
     * @return sane HTML document
     * @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        return Parser.parseBodyFragment(bodyHtml, baseUri);
    }

    /**
     * Parse a fragment of HTML, with the assumption that it forms the
     * {@code body} of the HTML. An empty string is used as the base URI.
     *
     * @param bodyHtml body HTML fragment
     * @return sane HTML document
     * @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml) {
        // Same as the two-argument overload with an empty base URI.
        return parseBodyFragment(bodyHtml, "");
    }

    /**
     * Fetch a URL, and parse it as HTML. Provided for compatibility; in most
     * cases use {@link #connect(String)} instead.
     * <p>
     * The encoding character set is determined by the content-type header or
     * http-equiv meta tag, or falls back to {@code UTF-8}.
     *
     * @param url URL to fetch (with a GET). The protocol must be
     *            {@code http} or {@code https}.
     * @param timeoutMillis Connection and read timeout, in milliseconds. If
     *            exceeded, IOException is thrown.
     * @return The parsed HTML.
     * @throws IOException If the final server response != 200 OK (redirects
     *             are followed), or if there's an error reading the response
     *             stream.
     * @see #connect(String)
     */
    public static Document parse(URL url, int timeoutMillis) throws IOException {
        Connection connection = HttpConnection.connect(url);
        // Configure the timeout on the connection before issuing the GET.
        connection.timeout(timeoutMillis);
        return connection.get();
    }

    /**
     * Get safe HTML from untrusted input HTML, by parsing input HTML and
     * filtering it through a white-list of permitted tags and attributes.
     *
     * @param bodyHtml input untrusted HTML
     * @param baseUri URL to resolve relative URLs against
     * @param whitelist white-list of permitted HTML elements
     * @return safe HTML: the inner HTML of the cleaned document's body
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, String baseUri,
            Whitelist whitelist) {
        // Parse first, then filter; only the body's inner HTML is returned.
        Document untrusted = parseBodyFragment(bodyHtml, baseUri);
        Document sanitized = new Cleaner(whitelist).clean(untrusted);
        return sanitized.body().html();
    }

    /**
     * Get safe HTML from untrusted input HTML, by parsing input HTML and
     * filtering it through a white-list of permitted tags and attributes. An
     * empty string is used as the base URI.
     *
     * @param bodyHtml input untrusted HTML
     * @param whitelist white-list of permitted HTML elements
     * @return safe HTML
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, Whitelist whitelist) {
        return clean(bodyHtml, "", whitelist);
    }

    /**
     * Test if the input HTML has only tags and attributes allowed by the
     * Whitelist. Useful for form validation. The input HTML should still be
     * run through the cleaner to set up enforced attributes, and to tidy the
     * output.
     *
     * @param bodyHtml HTML to test
     * @param whitelist whitelist to test against
     * @return true if no tags or attributes were removed; false otherwise
     * @see #clean(String, org.jsoup.safety.Whitelist)
     */
    public static boolean isValid(String bodyHtml, Whitelist whitelist) {
        // The input is parsed as a body fragment with an empty base URI.
        Document candidate = parseBodyFragment(bodyHtml, "");
        return new Cleaner(whitelist).isValid(candidate);
    }

}