1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
|
package org.jsoup.parser;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import java.util.List;
/**
* Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods
* in {@link org.jsoup.Jsoup}.
*/
public class Parser {
private static final int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled.
private TreeBuilder treeBuilder;
private int maxErrors = DEFAULT_MAX_ERRORS;
private ParseErrorList errors;
/**
* Create a new Parser, using the specified TreeBuilder
* @param treeBuilder TreeBuilder to use to parse input into Documents.
*/
public Parser(TreeBuilder treeBuilder) {
this.treeBuilder = treeBuilder;
}
public Document parseInput(String html, String baseUri) {
errors = isTrackErrors() ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
Document doc = treeBuilder.parse(html, baseUri, errors);
return doc;
}
// gets & sets
/**
* Get the TreeBuilder currently in use.
* @return current TreeBuilder.
*/
public TreeBuilder getTreeBuilder() {
return treeBuilder;
}
/**
* Update the TreeBuilder used when parsing content.
* @param treeBuilder current TreeBuilder
* @return this, for chaining
*/
public Parser setTreeBuilder(TreeBuilder treeBuilder) {
this.treeBuilder = treeBuilder;
return this;
}
/**
* Check if parse error tracking is enabled.
* @return current track error state.
*/
public boolean isTrackErrors() {
return maxErrors > 0;
}
/**
* Enable or disable parse error tracking for the next parse.
* @param maxErrors the maximum number of errors to track. Set to 0 to disable.
* @return this, for chaining
*/
public Parser setTrackErrors(int maxErrors) {
this.maxErrors = maxErrors;
return this;
}
/**
* Retrieve the parse errors, if any, from the last parse.
* @return list of parse errors, up to the size of the maximum errors tracked.
*/
public List<ParseError> getErrors() {
return errors;
}
// static parse functions below
/**
* Parse HTML into a Document.
*
* @param html HTML to parse
* @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
*
* @return parsed Document
*/
public static Document parse(String html, String baseUri) {
TreeBuilder treeBuilder = new HtmlTreeBuilder();
return treeBuilder.parse(html, baseUri, ParseErrorList.noTracking());
}
/**
* Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
*
* @param fragmentHtml the fragment of HTML to parse
* @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
* provides stack context (for implicit element creation).
* @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
*
* @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
*/
public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
return treeBuilder.parseFragment(fragmentHtml, context, baseUri, ParseErrorList.noTracking());
}
/**
* Parse a fragment of HTML into the {@code body} of a Document.
*
* @param bodyHtml fragment of HTML
* @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
*
* @return Document, with empty head, and HTML parsed into body
*/
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
Document doc = Document.createShell(baseUri);
Element body = doc.body();
List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
Node[] nodes = nodeList.toArray(new Node[nodeList.size()]); // the node list gets modified when re-parented
for (Node node : nodes) {
body.appendChild(node);
}
return doc;
}
/**
* @param bodyHtml HTML to parse
* @param baseUri baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
*
* @return parsed Document
* @deprecated Use {@link #parseBodyFragment} or {@link #parseFragment} instead.
*/
public static Document parseBodyFragmentRelaxed(String bodyHtml, String baseUri) {
return parse(bodyHtml, baseUri);
}
// builders
/**
* Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
* based on a knowledge of the semantics of the incoming tags.
* @return a new HTML parser.
*/
public static Parser htmlParser() {
return new Parser(new HtmlTreeBuilder());
}
/**
* Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
* rather creates a simple tree directly from the input.
* @return a new simple XML parser.
*/
public static Parser xmlParser() {
return new Parser(new XmlTreeBuilder());
}
}
|