diff options
Diffstat (limited to 'server/src/org/jsoup/parser/Parser.java')
-rw-r--r-- | server/src/org/jsoup/parser/Parser.java | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/server/src/org/jsoup/parser/Parser.java b/server/src/org/jsoup/parser/Parser.java new file mode 100644 index 0000000000..2236219c06 --- /dev/null +++ b/server/src/org/jsoup/parser/Parser.java @@ -0,0 +1,157 @@ +package org.jsoup.parser; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; + +import java.util.List; + +/** + * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods + * in {@link org.jsoup.Jsoup}. + */ +public class Parser { + private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled. + + private TreeBuilder treeBuilder; + private int maxErrors = DEFAULT_MAX_ERRORS; + private ParseErrorList errors; + + /** + * Create a new Parser, using the specified TreeBuilder + * @param treeBuilder TreeBuilder to use to parse input into Documents. + */ + public Parser(TreeBuilder treeBuilder) { + this.treeBuilder = treeBuilder; + } + + public Document parseInput(String html, String baseUri) { + errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); + Document doc = treeBuilder.parse(html, baseUri, errors); + return doc; + } + + // gets & sets + /** + * Get the TreeBuilder currently in use. + * @return current TreeBuilder. + */ + public TreeBuilder getTreeBuilder() { + return treeBuilder; + } + + /** + * Update the TreeBuilder used when parsing content. + * @param treeBuilder current TreeBuilder + * @return this, for chaining + */ + public Parser setTreeBuilder(TreeBuilder treeBuilder) { + this.treeBuilder = treeBuilder; + return this; + } + + /** + * Check if parse error tracking is enabled. + * @return current track error state. + */ + public boolean isTrackErrors() { + return maxErrors > 0; + } + + /** + * Enable or disable parse error tracking for the next parse. + * @param maxErrors the maximum number of errors to track. Set to 0 to disable. + * @return this, for chaining + */ + public Parser setTrackErrors(int maxErrors) { + this.maxErrors = maxErrors; + return this; + } + + /** + * Retrieve the parse errors, if any, from the last parse. + * @return list of parse errors, up to the size of the maximum errors tracked. + */ + public List<ParseError> getErrors() { + return errors; + } + + // static parse functions below + /** + * Parse HTML into a Document. + * + * @param html HTML to parse + * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return parsed Document + */ + public static Document parse(String html, String baseUri) { + TreeBuilder treeBuilder = new HtmlTreeBuilder(); + return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking()); + } + + /** + * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. + * + * @param fragmentHtml the fragment of HTML to parse + * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This + * provides stack context (for implicit element creation). + * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. + */ + public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { + HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); + return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking()); + } + + /** + * Parse a fragment of HTML into the {@code body} of a Document. + * + * @param bodyHtml fragment of HTML + * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return Document, with empty head, and HTML parsed into body + */ + public static Document parseBodyFragment(String bodyHtml, String baseUri) { + Document doc = Document.createShell(baseUri); + Element body = doc.body(); + List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); + Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented + for (Node node : nodes) { + body.appendChild(node); + } + return doc; + } + + /** + * @param bodyHtml HTML to parse + * @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return parsed Document + * @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead. + */ + public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) { + return parse(bodyHtml, baseUri); + } + + // builders + + /** + * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, + * based on a knowledge of the semantics of the incoming tags. + * @return a new HTML parser. + */ + public static Parser htmlParser() { + return new Parser(new HtmlTreeBuilder()); + } + + /** + * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, + * rather creates a simple tree directly from the input. + * @return a new simple XML parser. + */ + public static Parser xmlParser() { + return new Parser(new XmlTreeBuilder()); + } +} |