aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/documentation/content/xdocs/status.xml4
-rw-r--r--src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java77
-rw-r--r--src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java6
-rw-r--r--src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java33
-rw-r--r--src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java24
-rw-r--r--test-data/hsmf/chinese-traditional.msgbin0 -> 48129 bytes
6 files changed, 115 insertions, 29 deletions
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index 2730545cce..c619413f20 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -33,6 +33,10 @@
</developers>
<changes>
+ <release version="3.8-beta3" date="2011-??-??">
+ <action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
+ <action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
+ </release>
<release version="3.8-beta2" date="2011-??-??">
<action dev="poi-developers" type="add">Implement the load method on MemoryPackagePart</action>
<action dev="poi-developers" type="add">50967 - Support for continued ExtSSTRecords</action>
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
index f220ebfe55..e9cc82d14d 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
@@ -177,6 +177,16 @@ public class MAPIMessage extends POIDocument {
}
/**
+ * Gets the HTML body of this Outlook Message, if this email
+ * contains an HTML version.
+ * @return The string representation of the HTML version of the body, if available.
+ * @throws ChunkNotFoundException
+ */
+ public String getHmtlBody() throws ChunkNotFoundException {
+ return getStringFromChunk(mainChunks.htmlBodyChunk);
+ }
+
+ /**
* Gets the subject line of the Outlook Message
* @throws ChunkNotFoundException
*/
@@ -331,28 +341,59 @@ public class MAPIMessage extends POIDocument {
if(m.matches()) {
// Found it! Tell all the string chunks
String charset = m.group(1);
-
- for(Chunk c : mainChunks.getAll()) {
- if(c instanceof StringChunk) {
- ((StringChunk)c).set7BitEncoding(charset);
- }
- }
- for(Chunk c : nameIdChunks.getAll()) {
- if(c instanceof StringChunk) {
- ((StringChunk)c).set7BitEncoding(charset);
- }
- }
- for(RecipientChunks rc : recipientChunks) {
- for(Chunk c : rc.getAll()) {
- if(c instanceof StringChunk) {
- ((StringChunk)c).set7BitEncoding(charset);
- }
- }
- }
+ set7BitEncoding(charset);
+ return;
}
}
}
} catch(ChunkNotFoundException e) {}
+
+ // Nothing suitable in the headers, try HTML
+ try {
+ String html = getHmtlBody();
+
+ // Look for a content type in the meta headers
+ Pattern p = Pattern.compile(
+ "<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\""
+ );
+ Matcher m = p.matcher(html);
+ if(m.find()) {
+ // Found it! Tell all the string chunks
+ String charset = m.group(1);
+ set7BitEncoding(charset);
+ return;
+ }
+ } catch(ChunkNotFoundException e) {}
+ }
+
+ /**
+ * Many messages store their strings as unicode, which is
+ * nice and easy. Some use one-byte encodings for their
+ * strings, but don't easily store the encoding anywhere
+ * in the file!
+ * If you know what the encoding is of your file, you can
+ * use this method to set the 7 bit encoding for all
+ * the non unicode strings in the file.
+ * @see #guess7BitEncoding()
+ */
+ public void set7BitEncoding(String charset) {
+ for(Chunk c : mainChunks.getAll()) {
+ if(c instanceof StringChunk) {
+ ((StringChunk)c).set7BitEncoding(charset);
+ }
+ }
+ for(Chunk c : nameIdChunks.getAll()) {
+ if(c instanceof StringChunk) {
+ ((StringChunk)c).set7BitEncoding(charset);
+ }
+ }
+ for(RecipientChunks rc : recipientChunks) {
+ for(Chunk c : rc.getAll()) {
+ if(c instanceof StringChunk) {
+ ((StringChunk)c).set7BitEncoding(charset);
+ }
+ }
+ }
}
/**
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
index 08b7e98988..d9a060fe4a 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
@@ -37,6 +37,8 @@ public final class Chunks implements ChunkGroup {
public StringChunk messageClass;
/** BODY Chunk, for plain/text messages */
public StringChunk textBodyChunk;
+ /** BODY Html Chunk, for html messages */
+ public StringChunk htmlBodyChunk;
/** Subject link chunk, in plain/text */
public StringChunk subjectChunk;
/** Value that is in the TO field (not actually the addresses as they are stored in recip directory nodes */
@@ -117,6 +119,10 @@ public final class Chunks implements ChunkGroup {
else if(chunk.getChunkId() == MAPIProperty.BODY.id) {
textBodyChunk = (StringChunk)chunk;
}
+ else if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id &&
+ chunk instanceof StringChunk) {
+ htmlBodyChunk = (StringChunk)chunk;
+ }
// And add to the main list
allChunks.add(chunk);
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
index 057bb07aad..19a2f13efa 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
@@ -30,8 +30,11 @@ import org.apache.poi.util.StringUtil;
* A Chunk made up of a single string.
*/
public class StringChunk extends Chunk {
- private String value;
- private String encoding7Bit = "CP1252";
+ private static final String DEFAULT_ENCODING = "CP1252";
+ private String encoding7Bit = DEFAULT_ENCODING;
+ private String value;
+ /** Only kept around for 7 bit strings */
+ private byte[] rawValue;
/**
* Creates a String Chunk.
@@ -56,7 +59,7 @@ public class StringChunk extends Chunk {
public String get7BitEncoding() {
return encoding7Bit;
}
-
+
/**
* Sets the Encoding that will be used to
* decode any "7 bit" (non unicode) data.
@@ -66,25 +69,33 @@ public class StringChunk extends Chunk {
*/
public void set7BitEncoding(String encoding) {
this.encoding7Bit = encoding;
+
+ // Re-read the String if we're a 7 bit one
+ if(type == Types.ASCII_STRING) {
+ parseString(rawValue);
+ }
}
-
+
public void readValue(InputStream value) throws IOException {
- String tmpValue;
- byte[] data = IOUtils.toByteArray(value);
-
+ byte[] data = IOUtils.toByteArray(value);
+ parseString(data);
+ }
+ private void parseString(byte[] data) {
+ String tmpValue;
switch(type) {
case Types.ASCII_STRING:
tmpValue = parseAs7BitData(data, encoding7Bit);
- break;
+ this.rawValue = data;
+ break;
case Types.UNICODE_STRING:
tmpValue = StringUtil.getFromUnicodeLE(data);
break;
default:
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
}
-
+
// Clean up
- this.value = tmpValue.replace("\0", "");
+ this.value = tmpValue.replace("\0", "");
}
public void writeValue(OutputStream out) throws IOException {
@@ -121,7 +132,7 @@ public class StringChunk extends Chunk {
* and returns the string that that yields.
*/
protected static String parseAs7BitData(byte[] data) {
- return parseAs7BitData(data, "CP1252");
+ return parseAs7BitData(data, DEFAULT_ENCODING);
}
/**
* Parses as non-unicode, supposedly 7 bit data
diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
index bb7c87262c..e2f6fe39d4 100644
--- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
+++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
@@ -35,6 +35,7 @@ public final class TestBasics extends TestCase {
private MAPIMessage attachments;
private MAPIMessage noRecipientAddress;
private MAPIMessage cyrillic;
+ private MAPIMessage chinese;
/**
* Initialize this test, load up the blank.msg mapi message.
@@ -48,6 +49,7 @@ public final class TestBasics extends TestCase {
attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
+ chinese = new MAPIMessage(samples.openResourceAsStream("chinese-traditional.msg"));
}
/**
@@ -195,5 +197,27 @@ public final class TestBasics extends TestCase {
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
+
+ // Override it, check it's taken
+ cyrillic.set7BitEncoding("UTF-8");
+ assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
+ assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
+
+
+ // Check with a file that has no headers
+ try {
+ chinese.getHeaders();
+ fail("File doesn't have headers!");
+ } catch(ChunkNotFoundException e) {}
+
+ String html = chinese.getHmtlBody();
+ assertTrue("Charset not found:\n" + html, html.contains("text/html; charset=big5"));
+
+ // Defaults to CP1252
+ assertEquals("CP1252", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
+
+ // But after guessing goes to the correct one, Big 5
+ chinese.guess7BitEncoding();
+ assertEquals("big5", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
}
}
diff --git a/test-data/hsmf/chinese-traditional.msg b/test-data/hsmf/chinese-traditional.msg
new file mode 100644
index 0000000000..c2b84c0977
--- /dev/null
+++ b/test-data/hsmf/chinese-traditional.msg
Binary files differ