aboutsummaryrefslogtreecommitdiffstats
path: root/server/src/org/jsoup/helper/DataUtil.java
diff options
context:
space:
mode:
Diffstat (limited to 'server/src/org/jsoup/helper/DataUtil.java')
-rw-r--r--server/src/org/jsoup/helper/DataUtil.java186
1 files changed, 0 insertions, 186 deletions
diff --git a/server/src/org/jsoup/helper/DataUtil.java b/server/src/org/jsoup/helper/DataUtil.java
deleted file mode 100644
index 26b85ea7dc..0000000000
--- a/server/src/org/jsoup/helper/DataUtil.java
+++ /dev/null
@@ -1,186 +0,0 @@
-package org.jsoup.helper;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.parser.Parser;
-
-/**
- * Internal static utilities for handling data.
- *
- */
-public class DataUtil {
- private static final Pattern charsetPattern = Pattern
- .compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
- static final String defaultCharset = "UTF-8"; // used if not found in header
- // or meta charset
- private static final int bufferSize = 0x20000; // ~130K.
-
- private DataUtil() {
- }
-
- /**
- * Loads a file to a Document.
- *
- * @param in
- * file to load
- * @param charsetName
- * character set of input
- * @param baseUri
- * base URI of document, to resolve relative links against
- * @return Document
- * @throws IOException
- * on IO error
- */
- public static Document load(File in, String charsetName, String baseUri)
- throws IOException {
- FileInputStream inStream = null;
- try {
- inStream = new FileInputStream(in);
- ByteBuffer byteData = readToByteBuffer(inStream);
- return parseByteData(byteData, charsetName, baseUri,
- Parser.htmlParser());
- } finally {
- if (inStream != null) {
- inStream.close();
- }
- }
- }
-
- /**
- * Parses a Document from an input steam.
- *
- * @param in
- * input stream to parse. You will need to close it.
- * @param charsetName
- * character set of input
- * @param baseUri
- * base URI of document, to resolve relative links against
- * @return Document
- * @throws IOException
- * on IO error
- */
- public static Document load(InputStream in, String charsetName,
- String baseUri) throws IOException {
- ByteBuffer byteData = readToByteBuffer(in);
- return parseByteData(byteData, charsetName, baseUri,
- Parser.htmlParser());
- }
-
- /**
- * Parses a Document from an input steam, using the provided Parser.
- *
- * @param in
- * input stream to parse. You will need to close it.
- * @param charsetName
- * character set of input
- * @param baseUri
- * base URI of document, to resolve relative links against
- * @param parser
- * alternate {@link Parser#xmlParser() parser} to use.
- * @return Document
- * @throws IOException
- * on IO error
- */
- public static Document load(InputStream in, String charsetName,
- String baseUri, Parser parser) throws IOException {
- ByteBuffer byteData = readToByteBuffer(in);
- return parseByteData(byteData, charsetName, baseUri, parser);
- }
-
- // reads bytes first into a buffer, then decodes with the appropriate
- // charset. done this way to support
- // switching the chartset midstream when a meta http-equiv tag defines the
- // charset.
- static Document parseByteData(ByteBuffer byteData, String charsetName,
- String baseUri, Parser parser) {
- String docData;
- Document doc = null;
- if (charsetName == null) { // determine from meta. safe parse as UTF-8
- // look for <meta http-equiv="Content-Type"
- // content="text/html;charset=gb2312"> or HTML5 <meta
- // charset="gb2312">
- docData = Charset.forName(defaultCharset).decode(byteData)
- .toString();
- doc = parser.parseInput(docData, baseUri);
- Element meta = doc.select(
- "meta[http-equiv=content-type], meta[charset]").first();
- if (meta != null) { // if not found, will keep utf-8 as best attempt
- String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta
- .attr("content")) : meta.attr("charset");
- if (foundCharset != null && foundCharset.length() != 0
- && !foundCharset.equals(defaultCharset)) { // need to
- // re-decode
- charsetName = foundCharset;
- byteData.rewind();
- docData = Charset.forName(foundCharset).decode(byteData)
- .toString();
- doc = null;
- }
- }
- } else { // specified by content type header (or by user on file load)
- Validate.notEmpty(
- charsetName,
- "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
- docData = Charset.forName(charsetName).decode(byteData).toString();
- }
- if (doc == null) {
- // there are times where there is a spurious byte-order-mark at the
- // start of the text. Shouldn't be present
- // in utf-8. If after decoding, there is a BOM, strip it; otherwise
- // will cause the parser to go straight
- // into head mode
- if (docData.charAt(0) == 65279) {
- docData = docData.substring(1);
- }
-
- doc = parser.parseInput(docData, baseUri);
- doc.outputSettings().charset(charsetName);
- }
- return doc;
- }
-
- static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
- byte[] buffer = new byte[bufferSize];
- ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
- int read;
- while (true) {
- read = inStream.read(buffer);
- if (read == -1) {
- break;
- }
- outStream.write(buffer, 0, read);
- }
- ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());
- return byteData;
- }
-
- /**
- * Parse out a charset from a content type header.
- *
- * @param contentType
- * e.g. "text/html; charset=EUC-JP"
- * @return "EUC-JP", or null if not found. Charset is trimmed and
- * uppercased.
- */
- static String getCharsetFromContentType(String contentType) {
- if (contentType == null) {
- return null;
- }
- Matcher m = charsetPattern.matcher(contentType);
- if (m.find()) {
- return m.group(1).trim().toUpperCase();
- }
- return null;
- }
-
-}