From 296abdab3b027ec2aa8ccdc2e74e2b66963efbf9 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Fri, 1 Apr 2011 15:02:14 +0000 Subject: [PATCH] Update OutlookTextExtractor to request 7 bit encoding guessing git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1087734 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../poi/hsmf/datatypes/StringChunk.java | 48 +++++++++++-------- .../hsmf/extractor/OutlookTextExtactor.java | 15 ++++++ .../extractor/TestOutlookTextExtractor.java | 17 +++++++ 4 files changed, 62 insertions(+), 19 deletions(-) diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index c619413f20..a4a3250d17 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + OutlookTextExtractor now requests 7 bit encoding guessing Improve HSMF encoding guessing for 7 bit fields in MAPIMessage Allow HSMF access to the HTML body contents in MAPIMessage diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java index 19a2f13efa..133389bbcc 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java @@ -32,9 +32,8 @@ import org.apache.poi.util.StringUtil; public class StringChunk extends Chunk { private static final String DEFAULT_ENCODING = "CP1252"; private String encoding7Bit = DEFAULT_ENCODING; - private String value; - /** Only kept around for 7 bit strings */ private byte[] rawValue; + private String value; /** * Creates a String Chunk. @@ -72,23 +71,22 @@ public class StringChunk extends Chunk { // Re-read the String if we're a 7 bit one if(type == Types.ASCII_STRING) { - parseString(rawValue); + parseString(); } } public void readValue(InputStream value) throws IOException { - byte[] data = IOUtils.toByteArray(value); - parseString(data); + rawValue = IOUtils.toByteArray(value); + parseString(); } - private void parseString(byte[] data) { + private void parseString() { String tmpValue; switch(type) { case Types.ASCII_STRING: - tmpValue = parseAs7BitData(data, encoding7Bit); - this.rawValue = data; + tmpValue = parseAs7BitData(rawValue, encoding7Bit); break; case Types.UNICODE_STRING: - tmpValue = StringUtil.getFromUnicodeLE(data); + tmpValue = StringUtil.getFromUnicodeLE(rawValue); break; default: throw new IllegalArgumentException("Invalid type " + type + " for String Chunk"); @@ -99,34 +97,46 @@ public class StringChunk extends Chunk { } public void writeValue(OutputStream out) throws IOException { - byte[] data; - + out.write(rawValue); + } + private void storeString() { switch(type) { case Types.ASCII_STRING: try { - data = value.getBytes(encoding7Bit); + rawValue = value.getBytes(encoding7Bit); } catch (UnsupportedEncodingException e) { throw new RuntimeException("Encoding not found - " + encoding7Bit, e); } break; case Types.UNICODE_STRING: - data = new byte[value.length()*2]; - StringUtil.putUnicodeLE(value, data, 0); + rawValue = new byte[value.length()*2]; + StringUtil.putUnicodeLE(value, rawValue, 0); break; default: throw new IllegalArgumentException("Invalid type " + type + " for String Chunk"); } - - out.write(data); } + /** + * Returns the Text value of the chunk + */ public String getValue() { return this.value; } - public String toString() { - return this.value; - } + + public byte[] getRawValue() { + return this.rawValue; + } + public void setValue(String str) { + this.value = str; + storeString(); + } + + public String toString() { + return this.value; + } + /** * Parses as non-unicode, supposedly 7 bit CP1252 data * and returns the string that that yields. diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java index 34e4737973..bc12df433c 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java @@ -16,6 +16,7 @@ ==================================================================== */ package org.apache.poi.hsmf.extractor; +import java.io.File; import java.io.IOException; import java.io.InputStream; import java.text.SimpleDateFormat; @@ -56,6 +57,15 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor { public OutlookTextExtactor(InputStream inp) throws IOException { this(new MAPIMessage(inp)); } + + public static void main(String[] args) throws Exception { + for(String filename : args) { + OutlookTextExtactor extractor = new OutlookTextExtactor( + new NPOIFSFileSystem(new File(filename)) + ); + System.out.println( extractor.getText() ); + } + } /** * Returns the underlying MAPI message @@ -71,6 +81,11 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor { MAPIMessage msg = (MAPIMessage)document; StringBuffer s = new StringBuffer(); + // See if we can get a suitable encoding for any + // non unicode text in the file + msg.guess7BitEncoding(); + + // Off we go StringsIterator emails; try { emails = new StringsIterator( diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java index ccbb681f59..1c86712b9c 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java @@ -199,4 +199,21 @@ public final class TestOutlookTextExtractor extends TestCase { // Embeded bits are checked in // TestExtractorFactory } + + public void testEncodings() throws Exception { + POIFSFileSystem simple = new POIFSFileSystem( + new FileInputStream(samples.getFile("chinese-traditional.msg")) + ); + MAPIMessage msg = new MAPIMessage(simple); + OutlookTextExtactor ext = new OutlookTextExtactor(msg); + String text = ext.getText(); + + // Check the english bits + assertContains(text, "From: Tests Chang@FT"); + assertContains(text, "tests.chang@fengttt.com"); + + // And check some chinese bits + assertContains(text, "(\u5f35\u6bd3\u502b)"); + assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); + } } -- 2.39.5