summaryrefslogtreecommitdiffstats
path: root/src/org/jsoup/helper/DataUtil.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/org/jsoup/helper/DataUtil.java')
-rw-r--r--src/org/jsoup/helper/DataUtil.java135
1 files changed, 0 insertions, 135 deletions
diff --git a/src/org/jsoup/helper/DataUtil.java b/src/org/jsoup/helper/DataUtil.java
deleted file mode 100644
index 9adfe42153..0000000000
--- a/src/org/jsoup/helper/DataUtil.java
+++ /dev/null
@@ -1,135 +0,0 @@
-package org.jsoup.helper;
-
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.parser.Parser;
-
-import java.io.*;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * Internal static utilities for handling data.
- *
- */
-public class DataUtil {
- private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
- static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
- private static final int bufferSize = 0x20000; // ~130K.
-
- private DataUtil() {}
-
- /**
- * Loads a file to a Document.
- * @param in file to load
- * @param charsetName character set of input
- * @param baseUri base URI of document, to resolve relative links against
- * @return Document
- * @throws IOException on IO error
- */
- public static Document load(File in, String charsetName, String baseUri) throws IOException {
- FileInputStream inStream = null;
- try {
- inStream = new FileInputStream(in);
- ByteBuffer byteData = readToByteBuffer(inStream);
- return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
- } finally {
- if (inStream != null)
- inStream.close();
- }
- }
-
- /**
- * Parses a Document from an input stream.
- * @param in input stream to parse. You will need to close it.
- * @param charsetName character set of input
- * @param baseUri base URI of document, to resolve relative links against
- * @return Document
- * @throws IOException on IO error
- */
- public static Document load(InputStream in, String charsetName, String baseUri) throws IOException {
- ByteBuffer byteData = readToByteBuffer(in);
- return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
- }
-
- /**
- * Parses a Document from an input stream, using the provided Parser.
- * @param in input stream to parse. You will need to close it.
- * @param charsetName character set of input
- * @param baseUri base URI of document, to resolve relative links against
- * @param parser alternate {@link Parser#xmlParser() parser} to use.
- * @return Document
- * @throws IOException on IO error
- */
- public static Document load(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
- ByteBuffer byteData = readToByteBuffer(in);
- return parseByteData(byteData, charsetName, baseUri, parser);
- }
-
- // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
- // switching the charset midstream when a meta http-equiv tag defines the charset.
- static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
- String docData;
- Document doc = null;
- if (charsetName == null) { // determine from meta. safe parse as UTF-8
- // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
- docData = Charset.forName(defaultCharset).decode(byteData).toString();
- doc = parser.parseInput(docData, baseUri);
- Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
- if (meta != null) { // if not found, will keep utf-8 as best attempt
- String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset");
- if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) { // need to re-decode
- charsetName = foundCharset;
- byteData.rewind();
- docData = Charset.forName(foundCharset).decode(byteData).toString();
- doc = null;
- }
- }
- } else { // specified by content type header (or by user on file load)
- Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
- docData = Charset.forName(charsetName).decode(byteData).toString();
- }
- if (doc == null) {
- // Sometimes there is a spurious byte-order mark (BOM) at the start of the text. It shouldn't be present
- // in UTF-8. If a BOM is still there after decoding, strip it; otherwise it will cause the parser to go
- // straight into head mode.
- if (docData.charAt(0) == 65279)
- docData = docData.substring(1);
-
- doc = parser.parseInput(docData, baseUri);
- doc.outputSettings().charset(charsetName);
- }
- return doc;
- }
-
- static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
- byte[] buffer = new byte[bufferSize];
- ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
- int read;
- while(true) {
- read = inStream.read(buffer);
- if (read == -1) break;
- outStream.write(buffer, 0, read);
- }
- ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());
- return byteData;
- }
-
- /**
- * Parse out a charset from a content type header.
- * @param contentType e.g. "text/html; charset=EUC-JP"
- * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
- */
- static String getCharsetFromContentType(String contentType) {
- if (contentType == null) return null;
- Matcher m = charsetPattern.matcher(contentType);
- if (m.find()) {
- return m.group(1).trim().toUpperCase();
- }
- return null;
- }
-
-
-}