summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Nick Burch <nick@apache.org> 2011-04-01 15:02:14 +0000
committer: Nick Burch <nick@apache.org> 2011-04-01 15:02:14 +0000
commit: 296abdab3b027ec2aa8ccdc2e74e2b66963efbf9 (patch)
tree: 6e8b06f13d0821f887ce3b9a445a79630ea5c823
parent: dcdb268d3bffa6fb652dbd4a15fd7afe788da608 (diff)
download: poi-296abdab3b027ec2aa8ccdc2e74e2b66963efbf9.tar.gz
download: poi-296abdab3b027ec2aa8ccdc2e74e2b66963efbf9.zip
Update OutlookTextExtractor to request 7 bit encoding guessing
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1087734 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- src/documentation/content/xdocs/status.xml | 1
-rw-r--r-- src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java | 48
-rw-r--r-- src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java | 15
-rw-r--r-- src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java | 17
4 files changed, 62 insertions, 19 deletions
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index c619413f20..a4a3250d17 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
<changes>
<release version="3.8-beta3" date="2011-??-??">
+ <action dev="poi-developers" type="fix">OutlookTextExtractor now requests 7 bit encoding guessing</action>
<action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
<action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
</release>
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
index 19a2f13efa..133389bbcc 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
@@ -32,9 +32,8 @@ import org.apache.poi.util.StringUtil;
public class StringChunk extends Chunk {
private static final String DEFAULT_ENCODING = "CP1252";
private String encoding7Bit = DEFAULT_ENCODING;
- private String value;
- /** Only kept around for 7 bit strings */
private byte[] rawValue;
+ private String value;
/**
* Creates a String Chunk.
@@ -72,23 +71,22 @@ public class StringChunk extends Chunk {
// Re-read the String if we're a 7 bit one
if(type == Types.ASCII_STRING) {
- parseString(rawValue);
+ parseString();
}
}
public void readValue(InputStream value) throws IOException {
- byte[] data = IOUtils.toByteArray(value);
- parseString(data);
+ rawValue = IOUtils.toByteArray(value);
+ parseString();
}
- private void parseString(byte[] data) {
+ private void parseString() {
String tmpValue;
switch(type) {
case Types.ASCII_STRING:
- tmpValue = parseAs7BitData(data, encoding7Bit);
- this.rawValue = data;
+ tmpValue = parseAs7BitData(rawValue, encoding7Bit);
break;
case Types.UNICODE_STRING:
- tmpValue = StringUtil.getFromUnicodeLE(data);
+ tmpValue = StringUtil.getFromUnicodeLE(rawValue);
break;
default:
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
@@ -99,34 +97,46 @@ public class StringChunk extends Chunk {
}
public void writeValue(OutputStream out) throws IOException {
- byte[] data;
-
+ out.write(rawValue);
+ }
+ private void storeString() {
switch(type) {
case Types.ASCII_STRING:
try {
- data = value.getBytes(encoding7Bit);
+ rawValue = value.getBytes(encoding7Bit);
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
}
break;
case Types.UNICODE_STRING:
- data = new byte[value.length()*2];
- StringUtil.putUnicodeLE(value, data, 0);
+ rawValue = new byte[value.length()*2];
+ StringUtil.putUnicodeLE(value, rawValue, 0);
break;
default:
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
}
-
- out.write(data);
}
+ /**
+ * Returns the Text value of the chunk
+ */
public String getValue() {
return this.value;
}
- public String toString() {
- return this.value;
- }
+
+ public byte[] getRawValue() {
+ return this.rawValue;
+ }
+ public void setValue(String str) {
+ this.value = str;
+ storeString();
+ }
+
+ public String toString() {
+ return this.value;
+ }
+
/**
* Parses as non-unicode, supposedly 7 bit CP1252 data
* and returns the string that that yields.
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
index 34e4737973..bc12df433c 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
@@ -16,6 +16,7 @@
==================================================================== */
package org.apache.poi.hsmf.extractor;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
@@ -56,6 +57,15 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
public OutlookTextExtactor(InputStream inp) throws IOException {
this(new MAPIMessage(inp));
}
+
+ public static void main(String[] args) throws Exception {
+ for(String filename : args) {
+ OutlookTextExtactor extractor = new OutlookTextExtactor(
+ new NPOIFSFileSystem(new File(filename))
+ );
+ System.out.println( extractor.getText() );
+ }
+ }
/**
* Returns the underlying MAPI message
@@ -71,6 +81,11 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
MAPIMessage msg = (MAPIMessage)document;
StringBuffer s = new StringBuffer();
+ // See if we can get a suitable encoding for any
+ // non unicode text in the file
+ msg.guess7BitEncoding();
+
+ // Off we go
StringsIterator emails;
try {
emails = new StringsIterator(
diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
index ccbb681f59..1c86712b9c 100644
--- a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
@@ -199,4 +199,21 @@ public final class TestOutlookTextExtractor extends TestCase {
// Embeded bits are checked in
// TestExtractorFactory
}
+
+ public void testEncodings() throws Exception {
+ POIFSFileSystem simple = new POIFSFileSystem(
+ new FileInputStream(samples.getFile("chinese-traditional.msg"))
+ );
+ MAPIMessage msg = new MAPIMessage(simple);
+ OutlookTextExtactor ext = new OutlookTextExtactor(msg);
+ String text = ext.getText();
+
+ // Check the english bits
+ assertContains(text, "From: Tests Chang@FT");
+ assertContains(text, "tests.chang@fengttt.com");
+
+ // And check some chinese bits
+ assertContains(text, "(\u5f35\u6bd3\u502b)");
+ assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
+ }
}