summaryrefslogtreecommitdiffstats
path: root/server/src/org/jsoup/helper/DataUtil.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/helper/DataUtil.java')
-rw-r--r--server/src/org/jsoup/helper/DataUtil.java135
1 files changed, 135 insertions, 0 deletions
diff --git a/server/src/org/jsoup/helper/DataUtil.java b/server/src/org/jsoup/helper/DataUtil.java
new file mode 100644
index 0000000000..9adfe42153
--- /dev/null
+++ b/server/src/org/jsoup/helper/DataUtil.java
@@ -0,0 +1,135 @@
+package org.jsoup.helper;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.parser.Parser;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Internal static utilities for handling data: reads raw bytes from files or streams, decodes them with the
+ * requested (or detected) character set, and hands the text to a {@link Parser}.
+ */
+public class DataUtil {
+    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
+    static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
+    private static final int bufferSize = 0x20000; // 128 KiB (131,072 bytes)
+    private static final char byteOrderMark = 0xFEFF; // U+FEFF, i.e. decimal 65279
+
+    private DataUtil() {}
+
+    /**
+     * Loads a file to a Document, using the HTML parser.
+     * @param in file to load
+     * @param charsetName character set of input; if null, the charset is detected from the document's meta tags
+     * @param baseUri base URI of document, to resolve relative links against
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(File in, String charsetName, String baseUri) throws IOException {
+        FileInputStream inStream = null;
+        try {
+            inStream = new FileInputStream(in);
+            ByteBuffer byteData = readToByteBuffer(inStream);
+            return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
+        } finally {
+            // the stream is opened here, so it is closed here, whether or not parsing succeeded
+            if (inStream != null)
+                inStream.close();
+        }
+    }
+
+    /**
+     * Parses a Document from an input stream, using the HTML parser.
+     * @param in input stream to parse. You will need to close it.
+     * @param charsetName character set of input; if null, the charset is detected from the document's meta tags
+     * @param baseUri base URI of document, to resolve relative links against
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(InputStream in, String charsetName, String baseUri) throws IOException {
+        ByteBuffer byteData = readToByteBuffer(in);
+        return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
+    }
+
+    /**
+     * Parses a Document from an input stream, using the provided Parser.
+     * @param in input stream to parse. You will need to close it.
+     * @param charsetName character set of input; if null, the charset is detected from the document's meta tags
+     * @param baseUri base URI of document, to resolve relative links against
+     * @param parser alternate {@link Parser#xmlParser() parser} to use.
+     * @return Document
+     * @throws IOException on IO error
+     */
+    public static Document load(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
+        ByteBuffer byteData = readToByteBuffer(in);
+        return parseByteData(byteData, charsetName, baseUri, parser);
+    }
+
+    /**
+     * Decodes the buffered bytes and parses them into a Document.
+     * <p>
+     * The bytes are buffered first and decoded afterwards so that the charset can be switched when a meta tag
+     * in the document declares one: with a null {@code charsetName} the data is decoded as UTF-8, parsed, and,
+     * if a meta tag names a different charset that this JVM supports, rewound, re-decoded and re-parsed. A
+     * declared charset that is unknown or malformed is ignored and the UTF-8 parse is kept as the best attempt.
+     *
+     * @param byteData the complete document bytes, positioned at the start
+     * @param charsetName character set to decode with, or null to detect it from the document
+     * @param baseUri base URI of document, to resolve relative links against
+     * @param parser parser used to build the Document
+     * @return the parsed Document
+     */
+    static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
+        String docData;
+        Document doc = null;
+        if (charsetName == null) { // determine from meta. safe parse as UTF-8
+            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
+            docData = Charset.forName(defaultCharset).decode(byteData).toString();
+            doc = parser.parseInput(docData, baseUri);
+            Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
+            if (meta != null) { // if not found, will keep utf-8 as best attempt
+                String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset");
+                // the name comes from document content, so it may be unknown or malformed; only re-decode
+                // when the JVM can actually provide the charset, instead of letting Charset.forName throw
+                if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)
+                        && isSupportedCharset(foundCharset)) { // need to re-decode
+                    charsetName = foundCharset;
+                    byteData.rewind();
+                    docData = Charset.forName(foundCharset).decode(byteData).toString();
+                    doc = null;
+                }
+            }
+        } else { // specified by content type header (or by user on file load)
+            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
+            docData = Charset.forName(charsetName).decode(byteData).toString();
+        }
+        if (doc == null) {
+            // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
+            // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
+            // into head mode. The length check keeps empty input from failing on charAt(0).
+            if (docData.length() > 0 && docData.charAt(0) == byteOrderMark)
+                docData = docData.substring(1);
+
+            doc = parser.parseInput(docData, baseUri);
+            doc.outputSettings().charset(charsetName);
+        }
+        return doc;
+    }
+
+    /**
+     * Tests whether a charset name can be passed to {@link Charset#forName(String)} without it throwing.
+     * @param name candidate charset name (non-null)
+     * @return true if the name is legal and the charset is available in this JVM
+     */
+    private static boolean isSupportedCharset(String name) {
+        try {
+            return Charset.isSupported(name);
+        } catch (IllegalArgumentException ignored) {
+            // IllegalCharsetNameException: the name is syntactically invalid, so it is certainly not supported
+            return false;
+        }
+    }
+
+    /**
+     * Reads the stream to its end and returns everything read, wrapped in a ByteBuffer.
+     * The stream is not closed.
+     * @param inStream stream to drain
+     * @return buffer wrapping all bytes read
+     * @throws IOException on IO error
+     */
+    static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
+        byte[] buffer = new byte[bufferSize];
+        ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
+        int read;
+        while ((read = inStream.read(buffer)) != -1) {
+            outStream.write(buffer, 0, read);
+        }
+        return ByteBuffer.wrap(outStream.toByteArray());
+    }
+
+    /**
+     * Parse out a charset from a content type header.
+     * @param contentType e.g. "text/html; charset=EUC-JP"
+     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
+     */
+    static String getCharsetFromContentType(String contentType) {
+        if (contentType == null) return null;
+        Matcher m = charsetPattern.matcher(contentType);
+        if (m.find()) {
+            // fixed locale: under e.g. a Turkish default locale, "iso-8859-1" would otherwise be uppercased
+            // with a dotted capital I and no longer be a valid charset name
+            return m.group(1).trim().toUpperCase(Locale.ENGLISH);
+        }
+        return null;
+    }
+
+
+}