1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
|
package org.jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.HttpConnection;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
/**
 * The core public access point to the jsoup functionality.
 *
 * @author Jonathan Hedley
 */
public class Jsoup {
    // Only static entry points are offered, so instantiation is blocked.
    private Jsoup() {}

    /**
     * Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML.
     *
     * @param html    HTML to parse
     * @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs,
     *                that occur before the HTML declares a {@code <base href>} tag.
     * @return sane HTML
     */
    public static Document parse(String html, String baseUri) {
        return Parser.parse(html, baseUri);
    }

    /**
     * Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a
     * simple XML (non-HTML) parser.
     *
     * @param html    HTML to parse
     * @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs,
     *                that occur before the HTML declares a {@code <base href>} tag.
     * @param parser  alternate {@link Parser#xmlParser() parser} to use.
     * @return sane HTML
     */
    public static Document parse(String html, String baseUri, Parser parser) {
        return parser.parseInput(html, baseUri);
    }

    /**
     * Parse HTML into a Document. As no base URI is specified, absolute URL detection relies on the HTML
     * including a {@code <base href>} tag.
     *
     * @param html HTML to parse
     * @return sane HTML
     * @see #parse(String, String)
     */
    public static Document parse(String html) {
        // Same as the two-argument form with an empty base URI.
        return parse(html, "");
    }

    /**
     * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page.
     * <p>
     * Use examples:
     * <ul>
     * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
     * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
     * </ul>
     *
     * @param url URL to connect to. The protocol must be {@code http} or {@code https}.
     * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method;
     *         and then execute.
     */
    public static Connection connect(String url) {
        return HttpConnection.connect(url);
    }

    /**
     * Parse the contents of a file as HTML.
     *
     * @param in          file to load HTML from
     * @param charsetName (optional) character set of file contents. Set to {@code null} to determine from
     *                    {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often
     *                    safe to do).
     * @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
     * @return sane HTML
     * @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
     */
    public static Document parse(File in, String charsetName, String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify
     * relative URLs.
     *
     * @param in          file to load HTML from
     * @param charsetName (optional) character set of file contents. Set to {@code null} to determine from
     *                    {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often
     *                    safe to do).
     * @return sane HTML
     * @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
     * @see #parse(File, String, String)
     */
    public static Document parse(File in, String charsetName) throws IOException {
        // The file's absolute path stands in for the base URI.
        return parse(in, charsetName, in.getAbsolutePath());
    }

    /**
     * Read an input stream, and parse it to a Document.
     *
     * @param in          input stream to read. Make sure to close it after parsing.
     * @param charsetName (optional) character set of the stream contents. Set to {@code null} to determine from
     *                    {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often
     *                    safe to do).
     * @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
     * @return sane HTML
     * @throws IOException if the stream could not be read, or if the charsetName is invalid.
     */
    public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple
     * XML (non-HTML) parser.
     *
     * @param in          input stream to read. Make sure to close it after parsing.
     * @param charsetName (optional) character set of the stream contents. Set to {@code null} to determine from
     *                    {@code http-equiv} meta tag, if present, or fall back to {@code UTF-8} (which is often
     *                    safe to do).
     * @param baseUri     The URL where the HTML was retrieved from, to resolve relative links against.
     * @param parser      alternate {@link Parser#xmlParser() parser} to use.
     * @return sane HTML
     * @throws IOException if the stream could not be read, or if the charsetName is invalid.
     */
    public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser)
            throws IOException {
        return DataUtil.load(in, charsetName, baseUri, parser);
    }

    /**
     * Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML.
     *
     * @param bodyHtml body HTML fragment
     * @param baseUri  URL to resolve relative URLs against.
     * @return sane HTML document
     * @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        return Parser.parseBodyFragment(bodyHtml, baseUri);
    }

    /**
     * Parse a fragment of HTML, with the assumption that it forms the {@code body} of the HTML. No base URI
     * is supplied (an empty string is used).
     *
     * @param bodyHtml body HTML fragment
     * @return sane HTML document
     * @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml) {
        return parseBodyFragment(bodyHtml, "");
    }

    /**
     * Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use
     * {@link #connect(String)} instead.
     * <p>
     * The encoding character set is determined by the content-type header or http-equiv meta tag, or falls
     * back to {@code UTF-8}.
     *
     * @param url           URL to fetch (with a GET). The protocol must be {@code http} or {@code https}.
     * @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown.
     * @return The parsed HTML.
     * @throws IOException If the final server response != 200 OK (redirects are followed), or if there's an
     *                     error reading the response stream.
     * @see #connect(String)
     */
    public static Document parse(URL url, int timeoutMillis) throws IOException {
        final Connection connection = HttpConnection.connect(url);
        connection.timeout(timeoutMillis);
        return connection.get();
    }

    /**
     * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of
     * permitted tags and attributes.
     *
     * @param bodyHtml  input untrusted HTML
     * @param baseUri   URL to resolve relative URLs against
     * @param whitelist white-list of permitted HTML elements
     * @return safe HTML
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) {
        // Parse first, then build the cleaner, then emit the HTML of the cleaned document's body.
        final Document untrusted = parseBodyFragment(bodyHtml, baseUri);
        final Document sanitized = new Cleaner(whitelist).clean(untrusted);
        return sanitized.body().html();
    }

    /**
     * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a white-list of
     * permitted tags and attributes. No base URI is supplied (an empty string is used).
     *
     * @param bodyHtml  input untrusted HTML
     * @param whitelist white-list of permitted HTML elements
     * @return safe HTML
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, Whitelist whitelist) {
        return clean(bodyHtml, "", whitelist);
    }

    /**
     * Test if the input HTML has only tags and attributes allowed by the Whitelist. Useful for form
     * validation. The input HTML should still be run through the cleaner to set up enforced attributes, and
     * to tidy the output.
     *
     * @param bodyHtml  HTML to test
     * @param whitelist whitelist to test against
     * @return true if no tags or attributes were removed; false otherwise
     * @see #clean(String, org.jsoup.safety.Whitelist)
     */
    public static boolean isValid(String bodyHtml, Whitelist whitelist) {
        final Document untrusted = parseBodyFragment(bodyHtml);
        return new Cleaner(whitelist).isValid(untrusted);
    }
}
|