aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Burch <nick@apache.org>2010-08-03 16:06:21 +0000
committerNick Burch <nick@apache.org>2010-08-03 16:06:21 +0000
commit62499bc4655a6409834d75825ccd68c1524e6775 (patch)
treee1647d235100c6a744dde791196c8a053cc2dcdd
parent452fa021823a9ebe51068f92885a26a93cd84bf6 (diff)
downloadpoi-62499bc4655a6409834d75825ccd68c1524e6775.tar.gz
poi-62499bc4655a6409834d75825ccd68c1524e6775.zip
Fix bug #49441 - Allow overriding and guessing of HSMF non-unicode string encodings
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@981947 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r--src/documentation/content/xdocs/status.xml1
-rw-r--r--src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java53
-rw-r--r--src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java39
-rw-r--r--src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java19
4 files changed, 105 insertions, 7 deletions
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index 3c6ba2742c..840486d617 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
<changes>
<release version="3.7-beta2" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="add">49441 - Allow overriding and guessing of HSMF non-unicode string encodings</action>
<action dev="POI-DEVELOPERS" type="fix">49689 - Allow the setting of user style names on newly created HSSF cell styles</action>
<action dev="POI-DEVELOPERS" type="add">Make it easier to tell which content types each POIXMLTextExtractor handles</action>
<action dev="POI-DEVELOPERS" type="fix">49649 - Added clone support for UserSView* and Feat* families of records</action>
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
index e04d299b13..3db17180fd 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
@@ -25,10 +25,13 @@ import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.poi.POIDocument;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.datatypes.AttachmentChunks.AttachmentChunksSorter;
+import org.apache.poi.hsmf.datatypes.Chunk;
import org.apache.poi.hsmf.datatypes.ChunkGroup;
import org.apache.poi.hsmf.datatypes.Chunks;
import org.apache.poi.hsmf.datatypes.NameIdChunks;
@@ -286,10 +289,58 @@ public class MAPIMessage extends POIDocument {
return names;
}
+
+ /**
+ * Many messages store their strings as unicode, which is
+ * nice and easy. Some use one-byte encodings for their
+ * strings, but don't easily store the encoding anywhere
+ * in the file!
+ * This method looks at the headers for the message, and
+ * uses the charset from the first Content-Type header that
+ * declares one as the 7 bit encoding of every string chunk.
+ * Bug #49441 has more on why this is needed
+ */
+ public void guess7BitEncoding() {
+ try {
+ String[] headers = getHeaders();
+ if(headers == null || headers.length == 0) {
+ return;
+ }
+ // Names are case insensitive, and other parameters may follow the charset
+ Pattern p = Pattern.compile("^Content-Type:.*?charset=[\"']?([^\"';\\s]+)", Pattern.CASE_INSENSITIVE);
+ for(String header : headers) {
+ Matcher m = p.matcher(header);
+ if(m.find()) {
+ // Found it! Tell all the string chunks, then stop looking
+ String charset = m.group(1);
+ for(Chunk c : mainChunks.getAll()) {
+ if(c instanceof StringChunk) {
+ ((StringChunk)c).set7BitEncoding(charset);
+ }
+ }
+ for(Chunk c : nameIdChunks.getAll()) {
+ if(c instanceof StringChunk) {
+ ((StringChunk)c).set7BitEncoding(charset);
+ }
+ }
+ for(RecipientChunks rc : recipientChunks) {
+ for(Chunk c : rc.getAll()) {
+ if(c instanceof StringChunk) {
+ ((StringChunk)c).set7BitEncoding(charset);
+ }
+ }
+ }
+ return;
+ }
+ }
+ } catch(ChunkNotFoundException e) {
+ // No headers chunk, so there is nothing to guess from
+ }
+ }
/**
- *
+ * Returns all the headers, one entry per line
*/
public String[] getHeaders() throws ChunkNotFoundException {
String headers = getStringFromChunk(mainChunks.messageHeaders);
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
index b735058dc7..057bb07aad 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
@@ -30,8 +30,8 @@ import org.apache.poi.util.StringUtil;
* A Chunk made up of a single string.
*/
public class StringChunk extends Chunk {
-
private String value;
+ private String encoding7Bit = "CP1252";
/**
* Creates a String Chunk.
@@ -48,13 +48,33 @@ public class StringChunk extends Chunk {
super(chunkId, type);
}
+ /**
+ * Returns the Encoding that will be used to
+ * decode any "7 bit" (non unicode) data.
+ * Most files default to CP1252
+ * @return the encoding name currently set on this chunk,
+ * which is "CP1252" until {@link #set7BitEncoding(String)}
+ * is called
+ */
+ public String get7BitEncoding() {
+ return encoding7Bit;
+ }
+
+ /**
+ * Sets the Encoding that will be used to
+ * decode any "7 bit" (non unicode) data.
+ * This doesn't appear to be stored anywhere
+ * specific in the file, so you may need
+ * to guess by looking at headers etc
+ * The name is stored as given and is not checked here; an
+ * unsupported name only shows up later, as a RuntimeException
+ * when the chunk next decodes or encodes its 7 bit data.
+ * NOTE(review): this only records the name. A value that was
+ * already read presumably stays decoded with the encoding that
+ * was set at read time - confirm against readValue.
+ * @param encoding name of the encoding to use from now on
+ */
+ public void set7BitEncoding(String encoding) {
+ this.encoding7Bit = encoding;
+ }
+
public void readValue(InputStream value) throws IOException {
String tmpValue;
byte[] data = IOUtils.toByteArray(value);
switch(type) {
case Types.ASCII_STRING:
- tmpValue = parseAs7BitData(data);
+ tmpValue = parseAs7BitData(data, encoding7Bit);
break;
case Types.UNICODE_STRING:
tmpValue = StringUtil.getFromUnicodeLE(data);
@@ -73,9 +93,9 @@ public class StringChunk extends Chunk {
switch(type) {
case Types.ASCII_STRING:
try {
- data = value.getBytes("CP1252");
+ data = value.getBytes(encoding7Bit);
} catch (UnsupportedEncodingException e) {
- throw new RuntimeException("Core encoding not found, JVM broken?", e);
+ throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
}
break;
case Types.UNICODE_STRING:
@@ -101,10 +121,17 @@ public class StringChunk extends Chunk {
* and returns the string that that yields.
*/
protected static String parseAs7BitData(byte[] data) {
+ // Callers that give no encoding keep getting CP1252
+ return parseAs7BitData(data, "CP1252");
+ }
+ /**
+ * Decodes non-unicode, supposedly 7 bit data using the
+ * given encoding, and returns the resulting string.
+ * @param data the raw bytes of the string
+ * @param encoding name of the encoding to decode the bytes with
+ * @return the decoded string
+ * @throws RuntimeException if the encoding is not supported,
+ * wrapping the UnsupportedEncodingException
+ */
+ protected static String parseAs7BitData(byte[] data, String encoding) {
try {
- return new String(data, "CP1252");
+ return new String(data, encoding);
} catch (UnsupportedEncodingException e) {
- throw new RuntimeException("Core encoding not found, JVM broken?", e);
+ throw new RuntimeException("Encoding not found - " + encoding, e);
}
}
}
diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
index 18a316491f..bb7c87262c 100644
--- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
+++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
@@ -34,6 +34,7 @@ public final class TestBasics extends TestCase {
private MAPIMessage outlook30;
private MAPIMessage attachments;
private MAPIMessage noRecipientAddress;
+ private MAPIMessage cyrillic;
/**
* Initialize this test, load up the blank.msg mapi message.
@@ -46,6 +47,7 @@ public final class TestBasics extends TestCase {
outlook30 = new MAPIMessage(samples.openResourceAsStream("outlook_30_msg.msg"));
attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
+ cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
}
/**
@@ -177,4 +179,21 @@ public final class TestBasics extends TestCase {
noRecipientAddress.setReturnNullOnMissingChunk(false);
}
+
+ /**
+ * We default to CP1252, but can sometimes do better
+ * if needed.
+ * This file is really CP1251, according to the person
+ * who submitted it in bug #49441
+ * Only the encoding name recorded on the recipient display
+ * name chunks is checked here, not the decoded text.
+ */
+ public void testEncoding() throws Exception {
+ // Before guessing, both recipients report the default encoding
+ assertEquals(2, cyrillic.getRecipientDetailsChunks().length);
+ assertEquals("CP1252", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
+ assertEquals("CP1252", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
+
+ cyrillic.guess7BitEncoding();
+
+ // The name is compared exactly, so presumably "Cp1251" is the
+ // spelling used in the sample file's headers - confirm
+ assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
+ assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
+ }
}