about | summary | refs | log | tree | commit | diff | stats
path: root/server/src/org/jsoup/helper/DataUtil.java
diff options
context:
space:
mode:
author: Leif Åstrand <leif@vaadin.com> 2012-09-05 19:50:56 +0300
committer: Leif Åstrand <leif@vaadin.com> 2012-09-05 19:51:15 +0300
commit: 7d25670284b11c7c62ba25183f265227cb3dba83 (patch)
tree: c8e76eb70dd3cdd5cf59a99419635f2188b25c24 /server/src/org/jsoup/helper/DataUtil.java
parent: 1d0c96de9595c243d88471476d21e5f248be63f7 (diff)
download: vaadin-framework-7d25670284b11c7c62ba25183f265227cb3dba83.tar.gz
download: vaadin-framework-7d25670284b11c7c62ba25183f265227cb3dba83.zip
Reformat project
Diffstat (limited to 'server/src/org/jsoup/helper/DataUtil.java')
-rw-r--r-- server/src/org/jsoup/helper/DataUtil.java 149
1 files changed, 100 insertions, 49 deletions
diff --git a/server/src/org/jsoup/helper/DataUtil.java b/server/src/org/jsoup/helper/DataUtil.java
index 9adfe42153..26b85ea7dc 100644
--- a/server/src/org/jsoup/helper/DataUtil.java
+++ b/server/src/org/jsoup/helper/DataUtil.java
@@ -1,102 +1,147 @@
package org.jsoup.helper;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.parser.Parser;
-
-import java.io.*;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.parser.Parser;
+
/**
* Internal static utilities for handling data.
- *
+ *
*/
public class DataUtil {
- private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
- static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
+ private static final Pattern charsetPattern = Pattern
+ .compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
+ static final String defaultCharset = "UTF-8"; // used if not found in header
+ // or meta charset
private static final int bufferSize = 0x20000; // ~130K.
- private DataUtil() {}
+ private DataUtil() {
+ }
/**
* Loads a file to a Document.
- * @param in file to load
- * @param charsetName character set of input
- * @param baseUri base URI of document, to resolve relative links against
+ *
+ * @param in
+ * file to load
+ * @param charsetName
+ * character set of input
+ * @param baseUri
+ * base URI of document, to resolve relative links against
* @return Document
- * @throws IOException on IO error
+ * @throws IOException
+ * on IO error
*/
- public static Document load(File in, String charsetName, String baseUri) throws IOException {
+ public static Document load(File in, String charsetName, String baseUri)
+ throws IOException {
FileInputStream inStream = null;
try {
inStream = new FileInputStream(in);
ByteBuffer byteData = readToByteBuffer(inStream);
- return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
+ return parseByteData(byteData, charsetName, baseUri,
+ Parser.htmlParser());
} finally {
- if (inStream != null)
+ if (inStream != null) {
inStream.close();
+ }
}
}
/**
* Parses a Document from an input stream.
- * @param in input stream to parse. You will need to close it.
- * @param charsetName character set of input
- * @param baseUri base URI of document, to resolve relative links against
+ *
+ * @param in
+ * input stream to parse. You will need to close it.
+ * @param charsetName
+ * character set of input
+ * @param baseUri
+ * base URI of document, to resolve relative links against
* @return Document
- * @throws IOException on IO error
+ * @throws IOException
+ * on IO error
*/
- public static Document load(InputStream in, String charsetName, String baseUri) throws IOException {
+ public static Document load(InputStream in, String charsetName,
+ String baseUri) throws IOException {
ByteBuffer byteData = readToByteBuffer(in);
- return parseByteData(byteData, charsetName, baseUri, Parser.htmlParser());
+ return parseByteData(byteData, charsetName, baseUri,
+ Parser.htmlParser());
}
/**
* Parses a Document from an input stream, using the provided Parser.
- * @param in input stream to parse. You will need to close it.
- * @param charsetName character set of input
- * @param baseUri base URI of document, to resolve relative links against
- * @param parser alternate {@link Parser#xmlParser() parser} to use.
+ *
+ * @param in
+ * input stream to parse. You will need to close it.
+ * @param charsetName
+ * character set of input
+ * @param baseUri
+ * base URI of document, to resolve relative links against
+ * @param parser
+ * alternate {@link Parser#xmlParser() parser} to use.
* @return Document
- * @throws IOException on IO error
+ * @throws IOException
+ * on IO error
*/
- public static Document load(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
+ public static Document load(InputStream in, String charsetName,
+ String baseUri, Parser parser) throws IOException {
ByteBuffer byteData = readToByteBuffer(in);
return parseByteData(byteData, charsetName, baseUri, parser);
}
- // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
- // switching the chartset midstream when a meta http-equiv tag defines the charset.
- static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
+ // reads bytes first into a buffer, then decodes with the appropriate
+ // charset. done this way to support
+ // switching the charset midstream when a meta http-equiv tag defines the
+ // charset.
+ static Document parseByteData(ByteBuffer byteData, String charsetName,
+ String baseUri, Parser parser) {
String docData;
Document doc = null;
if (charsetName == null) { // determine from meta. safe parse as UTF-8
- // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
- docData = Charset.forName(defaultCharset).decode(byteData).toString();
+ // look for <meta http-equiv="Content-Type"
+ // content="text/html;charset=gb2312"> or HTML5 <meta
+ // charset="gb2312">
+ docData = Charset.forName(defaultCharset).decode(byteData)
+ .toString();
doc = parser.parseInput(docData, baseUri);
- Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
+ Element meta = doc.select(
+ "meta[http-equiv=content-type], meta[charset]").first();
if (meta != null) { // if not found, will keep utf-8 as best attempt
- String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset");
- if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) { // need to re-decode
+ String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta
+ .attr("content")) : meta.attr("charset");
+ if (foundCharset != null && foundCharset.length() != 0
+ && !foundCharset.equals(defaultCharset)) { // need to
+ // re-decode
charsetName = foundCharset;
byteData.rewind();
- docData = Charset.forName(foundCharset).decode(byteData).toString();
+ docData = Charset.forName(foundCharset).decode(byteData)
+ .toString();
doc = null;
}
}
} else { // specified by content type header (or by user on file load)
- Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
+ Validate.notEmpty(
+ charsetName,
+ "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
docData = Charset.forName(charsetName).decode(byteData).toString();
}
if (doc == null) {
- // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
- // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
+ // there are times where there is a spurious byte-order-mark at the
+ // start of the text. Shouldn't be present
+ // in utf-8. If after decoding, there is a BOM, strip it; otherwise
+ // will cause the parser to go straight
// into head mode
- if (docData.charAt(0) == 65279)
+ if (docData.charAt(0) == 65279) {
docData = docData.substring(1);
+ }
doc = parser.parseInput(docData, baseUri);
doc.outputSettings().charset(charsetName);
@@ -108,9 +153,11 @@ public class DataUtil {
byte[] buffer = new byte[bufferSize];
ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
int read;
- while(true) {
- read = inStream.read(buffer);
- if (read == -1) break;
+ while (true) {
+ read = inStream.read(buffer);
+ if (read == -1) {
+ break;
+ }
outStream.write(buffer, 0, read);
}
ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());
@@ -119,17 +166,21 @@ public class DataUtil {
/**
* Parse out a charset from a content type header.
- * @param contentType e.g. "text/html; charset=EUC-JP"
- * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
+ *
+ * @param contentType
+ * e.g. "text/html; charset=EUC-JP"
+ * @return "EUC-JP", or null if not found. Charset is trimmed and
+ * uppercased.
*/
static String getCharsetFromContentType(String contentType) {
- if (contentType == null) return null;
+ if (contentType == null) {
+ return null;
+ }
Matcher m = charsetPattern.matcher(contentType);
if (m.find()) {
return m.group(1).trim().toUpperCase();
}
return null;
}
-
-
+
}