diff options
Diffstat (limited to 'server/src/org/jsoup/helper/HttpConnection.java')
-rw-r--r-- | server/src/org/jsoup/helper/HttpConnection.java | 658 |
1 files changed, 658 insertions, 0 deletions
package org.jsoup.helper;

import org.jsoup.Connection;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.parser.TokenQueue;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.*;
import java.util.zip.GZIPInputStream;

/**
 * Implementation of {@link Connection}.
 * <p>
 * The outer class is a thin fluent facade: every setter delegates to the current
 * {@link Connection.Request}, and {@link #execute()} replaces the current
 * {@link Connection.Response} with the result of running that request.
 *
 * @see org.jsoup.Jsoup#connect(String)
 */
public class HttpConnection implements Connection {
    /**
     * Creates a new connection for the given URL string.
     *
     * @param url URL to connect to; must be non-empty and parseable by {@link URL}
     * @return the new connection
     * @throws IllegalArgumentException if the URL is malformed
     */
    public static Connection connect(String url) {
        Connection con = new HttpConnection();
        con.url(url);
        return con;
    }

    /**
     * Creates a new connection for the given URL.
     *
     * @param url URL to connect to; must not be null
     * @return the new connection
     */
    public static Connection connect(URL url) {
        Connection con = new HttpConnection();
        con.url(url);
        return con;
    }

    private Connection.Request req;
    private Connection.Response res;

    private HttpConnection() {
        req = new Request();
        res = new Response();
    }

    public Connection url(URL url) {
        req.url(url);
        return this;
    }

    /**
     * Sets the request URL from a string.
     *
     * @throws IllegalArgumentException if the string is not a well-formed URL
     *         (the original {@link MalformedURLException} is kept as the cause)
     */
    public Connection url(String url) {
        Validate.notEmpty(url, "Must supply a valid URL");
        try {
            req.url(new URL(url));
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Malformed URL: " + url, e);
        }
        return this;
    }

    public Connection userAgent(String userAgent) {
        Validate.notNull(userAgent, "User agent must not be null");
        req.header("User-Agent", userAgent);
        return this;
    }

    public Connection timeout(int millis) {
        req.timeout(millis);
        return this;
    }

    public Connection followRedirects(boolean followRedirects) {
        req.followRedirects(followRedirects);
        return this;
    }

    public Connection referrer(String referrer) {
        Validate.notNull(referrer, "Referrer must not be null");
        req.header("Referer", referrer); // (sic) the HTTP header name is spelled this way
        return this;
    }

    public Connection method(Method method) {
        req.method(method);
        return this;
    }

    public Connection ignoreHttpErrors(boolean ignoreHttpErrors) {
        req.ignoreHttpErrors(ignoreHttpErrors);
        return this;
    }

    public Connection ignoreContentType(boolean ignoreContentType) {
        req.ignoreContentType(ignoreContentType);
        return this;
    }

    public Connection data(String key, String value) {
        req.data(KeyVal.create(key, value));
        return this;
    }

    public Connection data(Map<String, String> data) {
        Validate.notNull(data, "Data map must not be null");
        for (Map.Entry<String, String> entry : data.entrySet()) {
            req.data(KeyVal.create(entry.getKey(), entry.getValue()));
        }
        return this;
    }

    /**
     * Adds request data from alternating key, value arguments.
     *
     * @param keyvals an even number of strings: key1, value1, key2, value2, ...
     */
    public Connection data(String... keyvals) {
        Validate.notNull(keyvals, "Data key value pairs must not be null");
        Validate.isTrue(keyvals.length % 2 == 0, "Must supply an even number of key value pairs");
        for (int i = 0; i < keyvals.length; i += 2) {
            String key = keyvals[i];
            String value = keyvals[i + 1];
            Validate.notEmpty(key, "Data key must not be empty");
            Validate.notNull(value, "Data value must not be null");
            req.data(KeyVal.create(key, value));
        }
        return this;
    }

    public Connection header(String name, String value) {
        req.header(name, value);
        return this;
    }

    public Connection cookie(String name, String value) {
        req.cookie(name, value);
        return this;
    }

    public Connection cookies(Map<String, String> cookies) {
        Validate.notNull(cookies, "Cookie map must not be null");
        for (Map.Entry<String, String> entry : cookies.entrySet()) {
            req.cookie(entry.getKey(), entry.getValue());
        }
        return this;
    }

    public Connection parser(Parser parser) {
        req.parser(parser);
        return this;
    }

    /** Executes the request as a GET and parses the response. */
    public Document get() throws IOException {
        req.method(Method.GET);
        execute();
        return res.parse();
    }

    /** Executes the request as a POST and parses the response. */
    public Document post() throws IOException {
        req.method(Method.POST);
        execute();
        return res.parse();
    }

    public Connection.Response execute() throws IOException {
        res = Response.execute(req);
        return res;
    }

    public Connection.Request request() {
        return req;
    }

    public Connection request(Connection.Request request) {
        req = request;
        return this;
    }

    public Connection.Response response() {
        return res;
    }

    public Connection response(Connection.Response response) {
        res = response;
        return this;
    }

    /**
     * State shared by requests and responses: URL, method, headers and cookies.
     * Header lookups and removals are case insensitive; cookie names are case sensitive.
     */
    // the unchecked warnings come from the (T) this casts used for fluent chaining
    @SuppressWarnings({"unchecked"})
    private static abstract class Base<T extends Connection.Base> implements Connection.Base<T> {
        URL url;
        Method method;
        Map<String, String> headers;
        Map<String, String> cookies;

        private Base() {
            headers = new LinkedHashMap<String, String>();
            cookies = new LinkedHashMap<String, String>();
        }

        public URL url() {
            return url;
        }

        public T url(URL url) {
            Validate.notNull(url, "URL must not be null");
            this.url = url;
            return (T) this;
        }

        public Method method() {
            return method;
        }

        public T method(Method method) {
            Validate.notNull(method, "Method must not be null");
            this.method = method;
            return (T) this;
        }

        public String header(String name) {
            Validate.notNull(name, "Header name must not be null");
            return getHeaderCaseInsensitive(name);
        }

        public T header(String name, String value) {
            Validate.notEmpty(name, "Header name must not be empty");
            Validate.notNull(value, "Header value must not be null");
            removeHeader(name); // ensures we don't get an "accept-encoding" and a "Accept-Encoding"
            headers.put(name, value);
            return (T) this;
        }

        public boolean hasHeader(String name) {
            Validate.notEmpty(name, "Header name must not be empty");
            return getHeaderCaseInsensitive(name) != null;
        }

        public T removeHeader(String name) {
            Validate.notEmpty(name, "Header name must not be empty");
            Map.Entry<String, String> entry = scanHeaders(name); // remove is case insensitive too
            if (entry != null)
                headers.remove(entry.getKey()); // ensures correct case
            return (T) this;
        }

        public Map<String, String> headers() {
            return headers;
        }

        /**
         * Looks a header up ignoring case: exact match first, then the lower-cased
         * name, then a full scan of the stored headers.
         *
         * @return the header value, or null if no header matches
         */
        private String getHeaderCaseInsensitive(String name) {
            Validate.notNull(name, "Header name must not be null");
            // quick evals for common case of title case, lower case, then scan for mixed
            String value = headers.get(name);
            if (value == null)
                // fixed locale: the default-locale toLowerCase() mis-folds 'I' under e.g. Turkish
                value = headers.get(name.toLowerCase(Locale.ENGLISH));
            if (value == null) {
                Map.Entry<String, String> entry = scanHeaders(name);
                if (entry != null)
                    value = entry.getValue();
            }
            return value;
        }

        /**
         * Finds the first stored header whose name equals {@code name} ignoring case.
         *
         * @return the matching entry (with the key in its stored case), or null
         */
        private Map.Entry<String, String> scanHeaders(String name) {
            String lc = name.toLowerCase(Locale.ENGLISH);
            for (Map.Entry<String, String> entry : headers.entrySet()) {
                if (entry.getKey().toLowerCase(Locale.ENGLISH).equals(lc))
                    return entry;
            }
            return null;
        }

        public String cookie(String name) {
            Validate.notNull(name, "Cookie name must not be null");
            return cookies.get(name);
        }

        public T cookie(String name, String value) {
            Validate.notEmpty(name, "Cookie name must not be empty");
            Validate.notNull(value, "Cookie value must not be null");
            cookies.put(name, value);
            return (T) this;
        }

        public boolean hasCookie(String name) {
            // validate the argument itself; previously only the message literal was checked
            Validate.notEmpty(name, "Cookie name must not be empty");
            return cookies.containsKey(name);
        }

        public T removeCookie(String name) {
            // validate the argument itself; previously only the message literal was checked
            Validate.notEmpty(name, "Cookie name must not be empty");
            cookies.remove(name);
            return (T) this;
        }

        public Map<String, String> cookies() {
            return cookies;
        }
    }

    /**
     * Mutable request settings. Defaults set by the constructor: GET method,
     * 3000 ms timeout, redirects followed, {@code Accept-Encoding: gzip},
     * HTML parser, HTTP errors and content type not ignored.
     */
    public static class Request extends Base<Connection.Request> implements Connection.Request {
        private int timeoutMilliseconds;
        private boolean followRedirects;
        private Collection<Connection.KeyVal> data;
        private boolean ignoreHttpErrors = false;
        private boolean ignoreContentType = false;
        private Parser parser;

        private Request() {
            timeoutMilliseconds = 3000;
            followRedirects = true;
            data = new ArrayList<Connection.KeyVal>();
            method = Connection.Method.GET;
            headers.put("Accept-Encoding", "gzip");
            parser = Parser.htmlParser();
        }

        public int timeout() {
            return timeoutMilliseconds;
        }

        /**
         * @param millis timeout applied to both connect and read; must be zero or
         *        greater (the validation message describes 0 as infinite)
         */
        public Request timeout(int millis) {
            Validate.isTrue(millis >= 0, "Timeout milliseconds must be 0 (infinite) or greater");
            timeoutMilliseconds = millis;
            return this;
        }

        public boolean followRedirects() {
            return followRedirects;
        }

        public Connection.Request followRedirects(boolean followRedirects) {
            this.followRedirects = followRedirects;
            return this;
        }

        public boolean ignoreHttpErrors() {
            return ignoreHttpErrors;
        }

        public Connection.Request ignoreHttpErrors(boolean ignoreHttpErrors) {
            this.ignoreHttpErrors = ignoreHttpErrors;
            return this;
        }

        public boolean ignoreContentType() {
            return ignoreContentType;
        }

        public Connection.Request ignoreContentType(boolean ignoreContentType) {
            this.ignoreContentType = ignoreContentType;
            return this;
        }

        public Request data(Connection.KeyVal keyval) {
            Validate.notNull(keyval, "Key val must not be null");
            data.add(keyval);
            return this;
        }

        public Collection<Connection.KeyVal> data() {
            return data;
        }

        public Request parser(Parser parser) {
            this.parser = parser;
            return this;
        }

        public Parser parser() {
            return parser;
        }
    }

    /**
     * Result of executing a {@link Connection.Request}: status, headers, cookies
     * and the fully buffered response body.
     */
    public static class Response extends Base<Connection.Response> implements Connection.Response {
        private static final int MAX_REDIRECTS = 20;
        private int statusCode;
        private String statusMessage;
        private ByteBuffer byteData;
        private String charset;
        private String contentType;
        private boolean executed = false;
        private int numRedirects = 0;
        private Connection.Request req;

        Response() {
            super();
        }

        /**
         * Creates a response that continues a redirect chain.
         *
         * @param previousResponse the response that redirected here, or null for the first request
         * @throws IOException if the chain has reached {@link #MAX_REDIRECTS}
         */
        private Response(Response previousResponse) throws IOException {
            super();
            if (previousResponse != null) {
                numRedirects = previousResponse.numRedirects + 1;
                if (numRedirects >= MAX_REDIRECTS)
                    throw new IOException(String.format("Too many redirects occurred trying to load URL %s", previousResponse.url()));
            }
        }

        static Response execute(Connection.Request req) throws IOException {
            return execute(req, null);
        }

        /**
         * Executes the request and buffers the response body.
         * <p>
         * Note that the request is mutated: GET data is folded into the URL, and when
         * a redirect is followed the request is switched to GET, its data is cleared,
         * its URL becomes the redirect target and the response cookies are copied onto it.
         *
         * @param req request to run; only http and https URLs are accepted
         * @param previousResponse the response that redirected to this request, or null
         * @return the executed response
         * @throws IOException on connection failure, on a non-200 and non-redirect status
         *         (unless the request ignores HTTP errors), on too many redirects, or on a
         *         redirect without a Location header
         */
        static Response execute(Connection.Request req, Response previousResponse) throws IOException {
            Validate.notNull(req, "Request must not be null");
            String protocol = req.url().getProtocol();
            Validate.isTrue(protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported");

            // set up the request for execution
            if (req.method() == Connection.Method.GET && req.data().size() > 0)
                serialiseRequestUrl(req); // appends query string
            HttpURLConnection conn = createConnection(req);
            try {
                conn.connect();
                if (req.method() == Connection.Method.POST)
                    writePost(req.data(), conn.getOutputStream());

                int status = conn.getResponseCode();
                boolean needsRedirect = false;
                if (status != HttpURLConnection.HTTP_OK) {
                    if (status == HttpURLConnection.HTTP_MOVED_TEMP || status == HttpURLConnection.HTTP_MOVED_PERM || status == HttpURLConnection.HTTP_SEE_OTHER)
                        needsRedirect = true;
                    else if (!req.ignoreHttpErrors())
                        throw new IOException(status + " error loading URL " + req.url().toString());
                }
                Response res = new Response(previousResponse);
                res.setupFromConnection(conn, previousResponse);
                if (needsRedirect && req.followRedirects()) {
                    String location = res.header("Location");
                    // fail with a clear message before touching the request, rather than
                    // passing null into the URL constructor
                    if (location == null)
                        throw new MalformedURLException("Redirect (status " + status + ") without a Location header from URL " + req.url());
                    req.method(Method.GET); // always redirect with a get. any data param from original req are dropped.
                    req.data().clear();
                    req.url(new URL(req.url(), location));
                    for (Map.Entry<String, String> cookie : res.cookies.entrySet()) { // add response cookies to request (for e.g. login posts)
                        req.cookie(cookie.getKey(), cookie.getValue());
                    }
                    return execute(req, res);
                }
                res.req = req;

                InputStream bodyStream = null;
                InputStream dataStream = null;
                try {
                    dataStream = conn.getErrorStream() != null ? conn.getErrorStream() : conn.getInputStream();
                    bodyStream = res.hasHeader("Content-Encoding") && res.header("Content-Encoding").equalsIgnoreCase("gzip") ?
                            new BufferedInputStream(new GZIPInputStream(dataStream)) :
                            new BufferedInputStream(dataStream);

                    res.byteData = DataUtil.readToByteBuffer(bodyStream);
                    res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it
                } finally {
                    if (bodyStream != null) bodyStream.close();
                    if (dataStream != null) dataStream.close();
                }

                res.executed = true;
                return res;
            } finally {
                // the body is fully buffered (or we redirected / failed), so the
                // connection is no longer needed on any path
                conn.disconnect();
            }
        }

        public int statusCode() {
            return statusCode;
        }

        public String statusMessage() {
            return statusMessage;
        }

        public String charset() {
            return charset;
        }

        public String contentType() {
            return contentType;
        }

        /**
         * Parses the buffered body into a document, using the request's parser.
         * Afterwards {@link #charset()} reflects the charset of the parsed document.
         *
         * @throws IOException if the content type is not text/*, application/xml or
         *         application/xhtml+xml and the request does not ignore content type
         */
        public Document parse() throws IOException {
            Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before parsing response");
            if (!req.ignoreContentType() && (contentType == null || !(contentType.startsWith("text/") || contentType.startsWith("application/xml") || contentType.startsWith("application/xhtml+xml"))))
                throw new IOException(String.format("Unhandled content type \"%s\" on URL %s. Must be text/*, application/xml, or application/xhtml+xml",
                        contentType, url.toString()));
            Document doc = DataUtil.parseByteData(byteData, charset, url.toExternalForm(), req.parser());
            byteData.rewind(); // leave the buffer readable for later body() / parse() calls
            charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
            return doc;
        }

        /**
         * Decodes the buffered body to a string, using the known charset or
         * {@code DataUtil.defaultCharset} when none is known.
         */
        public String body() {
            Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body");
            // charset gets set from header on execute, and from meta-equiv on parse. parse may not have happened yet
            String body;
            if (charset == null)
                body = Charset.forName(DataUtil.defaultCharset).decode(byteData).toString();
            else
                body = Charset.forName(charset).decode(byteData).toString();
            byteData.rewind(); // decode() consumed the buffer; reset it for later reads
            return body;
        }

        public byte[] bodyAsBytes() {
            Validate.isTrue(executed, "Request must be executed (with .execute(), .get(), or .post() before getting response body");
            return byteData.array();
        }

        // set up connection defaults, and details from request
        private static HttpURLConnection createConnection(Connection.Request req) throws IOException {
            HttpURLConnection conn = (HttpURLConnection) req.url().openConnection();
            conn.setRequestMethod(req.method().name());
            conn.setInstanceFollowRedirects(false); // don't rely on native redirection support
            conn.setConnectTimeout(req.timeout());
            conn.setReadTimeout(req.timeout());
            if (req.method() == Method.POST)
                conn.setDoOutput(true);
            if (req.cookies().size() > 0)
                conn.addRequestProperty("Cookie", getRequestCookieString(req));
            for (Map.Entry<String, String> header : req.headers().entrySet()) {
                conn.addRequestProperty(header.getKey(), header.getValue());
            }
            return conn;
        }

        // set up url, method, header, cookies
        private void setupFromConnection(HttpURLConnection conn, Connection.Response previousResponse) throws IOException {
            method = Connection.Method.valueOf(conn.getRequestMethod());
            url = conn.getURL();
            statusCode = conn.getResponseCode();
            statusMessage = conn.getResponseMessage();
            contentType = conn.getContentType();

            Map<String, List<String>> resHeaders = conn.getHeaderFields();
            processResponseHeaders(resHeaders);

            // if from a redirect, map previous response cookies into this response
            if (previousResponse != null) {
                for (Map.Entry<String, String> prevCookie : previousResponse.cookies().entrySet()) {
                    if (!hasCookie(prevCookie.getKey()))
                        cookie(prevCookie.getKey(), prevCookie.getValue());
                }
            }
        }

        /**
         * Copies response headers into this response. Set-Cookie headers are stored
         * as cookies (name and value only); for every other header only the first
         * value is kept.
         */
        void processResponseHeaders(Map<String, List<String>> resHeaders) {
            for (Map.Entry<String, List<String>> entry : resHeaders.entrySet()) {
                String name = entry.getKey();
                if (name == null)
                    continue; // http/1.1 line

                List<String> values = entry.getValue();
                if (name.equalsIgnoreCase("Set-Cookie")) {
                    for (String value : values) {
                        if (value == null)
                            continue;
                        TokenQueue cd = new TokenQueue(value);
                        String cookieName = cd.chompTo("=").trim();
                        String cookieVal = cd.consumeTo(";").trim();
                        // ignores path, date, domain, secure et al. req'd?
                        // trim() never returns null, so only the blank-name case needs a guard
                        if (cookieName.length() > 0)
                            cookie(cookieName, cookieVal);
                    }
                } else { // only take the first instance of each header
                    if (!values.isEmpty())
                        header(name, values.get(0));
                }
            }
        }

        /**
         * Writes the data as a URL-encoded form body ({@code k=v&k=v}) and closes the stream.
         */
        private static void writePost(Collection<Connection.KeyVal> data, OutputStream outputStream) throws IOException {
            OutputStreamWriter w = new OutputStreamWriter(outputStream, DataUtil.defaultCharset);
            try {
                boolean first = true;
                for (Connection.KeyVal keyVal : data) {
                    if (!first)
                        w.append('&');
                    else
                        first = false;

                    w.write(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset));
                    w.write('=');
                    w.write(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset));
                }
            } finally {
                w.close(); // also closes the underlying output stream, even if a write failed
            }
        }

        // joins the request cookies as "name=value; name=value"
        private static String getRequestCookieString(Connection.Request req) {
            StringBuilder sb = new StringBuilder();
            boolean first = true;
            for (Map.Entry<String, String> cookie : req.cookies().entrySet()) {
                if (!first)
                    sb.append("; ");
                else
                    first = false;
                sb.append(cookie.getKey()).append('=').append(cookie.getValue());
                // todo: spec says only ascii, no escaping / encoding defined. validate on set? or escape somehow here?
            }
            return sb.toString();
        }

        // for get url reqs, serialise the data map into the url
        private static void serialiseRequestUrl(Connection.Request req) throws IOException {
            URL in = req.url();
            StringBuilder url = new StringBuilder();
            boolean first = true;
            // reconstitute the query, ready for appends
            url
                .append(in.getProtocol())
                .append("://")
                .append(in.getAuthority()) // includes host, port
                .append(in.getPath())
                .append("?");
            if (in.getQuery() != null) {
                url.append(in.getQuery());
                first = false;
            }
            for (Connection.KeyVal keyVal : req.data()) {
                if (!first)
                    url.append('&');
                else
                    first = false;
                url
                    .append(URLEncoder.encode(keyVal.key(), DataUtil.defaultCharset))
                    .append('=')
                    .append(URLEncoder.encode(keyVal.value(), DataUtil.defaultCharset));
            }
            req.url(new URL(url.toString()));
            req.data().clear(); // moved into url as get params
        }
    }

    /**
     * A request data pair. The key must be non-empty and the value non-null.
     */
    public static class KeyVal implements Connection.KeyVal {
        private String key;
        private String value;

        public static KeyVal create(String key, String value) {
            Validate.notEmpty(key, "Data key must not be empty");
            Validate.notNull(value, "Data value must not be null");
            return new KeyVal(key, value);
        }

        private KeyVal(String key, String value) {
            this.key = key;
            this.value = value;
        }

        public KeyVal key(String key) {
            Validate.notEmpty(key, "Data key must not be empty");
            this.key = key;
            return this;
        }

        public String key() {
            return key;
        }

        public KeyVal value(String value) {
            Validate.notNull(value, "Data value must not be null");
            this.value = value;
            return this;
        }

        public String value() {
            return value;
        }

        @Override
        public String toString() {
            return key + "=" + value;
        }
    }
}