diff options
-rw-r--r-- | src/documentation/content/xdocs/status.xml | 4 | ||||
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java | 77 | ||||
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java | 6 | ||||
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java | 33 | ||||
-rw-r--r-- | src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java | 24 | ||||
-rw-r--r-- | test-data/hsmf/chinese-traditional.msg | bin | 0 -> 48129 bytes |
6 files changed, 115 insertions, 29 deletions
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 2730545cce..c619413f20 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,6 +33,10 @@ </developers> <changes> + <release version="3.8-beta3" date="2011-??-??"> + <action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action> + <action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action> + </release> <release version="3.8-beta2" date="2011-??-??"> <action dev="poi-developers" type="add">Implement the load method on MemoryPackagePart</action> <action dev="poi-developers" type="add">50967 - Support for continued ExtSSTRecords</action> diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java index f220ebfe55..e9cc82d14d 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java @@ -177,6 +177,16 @@ public class MAPIMessage extends POIDocument { } /** + * Gets the html body of this Outlook Message, if this email + * contains a html version. + * @return The string representation of the 'html' version of the body, if available. + * @throws ChunkNotFoundException + */ + public String getHmtlBody() throws ChunkNotFoundException { + return getStringFromChunk(mainChunks.htmlBodyChunk); + } + + /** * Gets the subject line of the Outlook Message * @throws ChunkNotFoundException */ @@ -331,28 +341,59 @@ public class MAPIMessage extends POIDocument { if(m.matches()) { // Found it! 
Tell all the string chunks String charset = m.group(1); - - for(Chunk c : mainChunks.getAll()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); - } - } - for(Chunk c : nameIdChunks.getAll()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); - } - } - for(RecipientChunks rc : recipientChunks) { - for(Chunk c : rc.getAll()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(charset); - } - } - } + set7BitEncoding(charset); + return; } } } } catch(ChunkNotFoundException e) {} + + // Nothing suitable in the headers, try HTML + try { + String html = getHmtlBody(); + + // Look for a content type in the meta headers + Pattern p = Pattern.compile( + "<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\"" + ); + Matcher m = p.matcher(html); + if(m.find()) { + // Found it! Tell all the string chunks + String charset = m.group(1); + set7BitEncoding(charset); + return; + } + } catch(ChunkNotFoundException e) {} + } + + /** + * Many messages store their strings as unicode, which is + * nice and easy. Some use one-byte encodings for their + * strings, but don't easily store the encoding anywhere + * in the file! + * If you know what the encoding is of your file, you can + * use this method to set the 7 bit encoding for all + * the non unicode strings in the file. 
+ * @see #guess7BitEncoding() + */ + public void set7BitEncoding(String charset) { + for(Chunk c : mainChunks.getAll()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(charset); + } + } + for(Chunk c : nameIdChunks.getAll()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(charset); + } + } + for(RecipientChunks rc : recipientChunks) { + for(Chunk c : rc.getAll()) { + if(c instanceof StringChunk) { + ((StringChunk)c).set7BitEncoding(charset); + } + } + } } /** diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java index 08b7e98988..d9a060fe4a 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java @@ -37,6 +37,8 @@ public final class Chunks implements ChunkGroup { public StringChunk messageClass; /** BODY Chunk, for plain/text messages */ public StringChunk textBodyChunk; + /** BODY Html Chunk, for html messages */ + public StringChunk htmlBodyChunk; /** Subject link chunk, in plain/text */ public StringChunk subjectChunk; /** Value that is in the TO field (not actually the addresses as they are stored in recip directory nodes */ @@ -117,6 +119,10 @@ public final class Chunks implements ChunkGroup { else if(chunk.getChunkId() == MAPIProperty.BODY.id) { textBodyChunk = (StringChunk)chunk; } + else if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id && + chunk instanceof StringChunk) { + htmlBodyChunk = (StringChunk)chunk; + } // And add to the main list allChunks.add(chunk); diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java index 057bb07aad..19a2f13efa 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java @@ -30,8 +30,11 @@ import org.apache.poi.util.StringUtil; * A 
Chunk made up of a single string. */ public class StringChunk extends Chunk { - private String value; - private String encoding7Bit = "CP1252"; + private static final String DEFAULT_ENCODING = "CP1252"; + private String encoding7Bit = DEFAULT_ENCODING; + private String value; + /** Only kept around for 7 bit strings */ + private byte[] rawValue; /** * Creates a String Chunk. @@ -56,7 +59,7 @@ public class StringChunk extends Chunk { public String get7BitEncoding() { return encoding7Bit; } - + /** * Sets the Encoding that will be used to * decode any "7 bit" (non unicode) data. @@ -66,25 +69,33 @@ public class StringChunk extends Chunk { */ public void set7BitEncoding(String encoding) { this.encoding7Bit = encoding; + + // Re-read the String if we're a 7 bit one + if(type == Types.ASCII_STRING) { + parseString(rawValue); + } } - + public void readValue(InputStream value) throws IOException { - String tmpValue; - byte[] data = IOUtils.toByteArray(value); - + byte[] data = IOUtils.toByteArray(value); + parseString(data); + } + private void parseString(byte[] data) { + String tmpValue; switch(type) { case Types.ASCII_STRING: tmpValue = parseAs7BitData(data, encoding7Bit); - break; + this.rawValue = data; + break; case Types.UNICODE_STRING: tmpValue = StringUtil.getFromUnicodeLE(data); break; default: throw new IllegalArgumentException("Invalid type " + type + " for String Chunk"); } - + // Clean up - this.value = tmpValue.replace("\0", ""); + this.value = tmpValue.replace("\0", ""); } public void writeValue(OutputStream out) throws IOException { @@ -121,7 +132,7 @@ public class StringChunk extends Chunk { * and returns the string that that yields. 
*/ protected static String parseAs7BitData(byte[] data) { - return parseAs7BitData(data, "CP1252"); + return parseAs7BitData(data, DEFAULT_ENCODING); } /** * Parses as non-unicode, supposedly 7 bit data diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java index bb7c87262c..e2f6fe39d4 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java @@ -35,6 +35,7 @@ public final class TestBasics extends TestCase { private MAPIMessage attachments; private MAPIMessage noRecipientAddress; private MAPIMessage cyrillic; + private MAPIMessage chinese; /** * Initialize this test, load up the blank.msg mapi message. @@ -48,6 +49,7 @@ public final class TestBasics extends TestCase { attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg")); noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg")); cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg")); + chinese = new MAPIMessage(samples.openResourceAsStream("chinese-traditional.msg")); } /** @@ -195,5 +197,27 @@ public final class TestBasics extends TestCase { assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding()); + + // Override it, check it's taken + cyrillic.set7BitEncoding("UTF-8"); + assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); + assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding()); + + + // Check with a file that has no headers + try { + chinese.getHeaders(); + fail("File doesn't have headers!"); + } catch(ChunkNotFoundException e) {} + + String html = chinese.getHmtlBody(); + 
assertTrue("Charset not found:\n" + html, html.contains("text/html; charset=big5")); + + // Defaults to CP1252 + assertEquals("CP1252", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); + + // But after guessing goes to the correct one, Big 5 + chinese.guess7BitEncoding(); + assertEquals("big5", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding()); } } diff --git a/test-data/hsmf/chinese-traditional.msg b/test-data/hsmf/chinese-traditional.msg Binary files differ new file mode 100644 index 0000000000..c2b84c0977 --- /dev/null +++ b/test-data/hsmf/chinese-traditional.msg |