From: Nick Burch Date: Thu, 3 Jun 2010 15:33:54 +0000 (+0000) Subject: Parse the HSMF headers chunk if present, and use it to find Dates in text extraction... X-Git-Tag: REL_3_7_BETA1~25 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=93f034976a9bf290118a6cc1a75b2d25c2f0c1b7;p=poi.git Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@951034 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 795d23d3ec..10c8334d44 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed 48494 - detect and support time formats like HH:MM;HH:MM 48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them 48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java index 05c14482a8..9b2ccd7659 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java @@ -282,6 +282,17 @@ public class MAPIMessage extends POIDocument { return names; } + + /** + * + */ + public String[] getHeaders() throws ChunkNotFoundException { + String headers = getStringFromChunk(mainChunks.messageHeaders); + if(headers == null) { + return null; + } + return headers.split("\\r?\\n"); + } /** * Gets the conversation topic of the parsed Outlook Message. diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java index 36b5202099..a92e60d017 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java @@ -35,6 +35,7 @@ public final class Chunks implements ChunkGroup { // 0x0050 -> 0x006F seem to be routing info or similar public static final int CONVERSATION_TOPIC = 0x0070; public static final int SENT_BY_SERVER_TYPE = 0x0075; + public static final int MESSAGE_HEADERS = 0x007D; // RECEIVEDEMAIL = 76 public static final int DISPLAY_TO = 0x0E04; public static final int DISPLAY_FROM = 0x0C1A; @@ -66,6 +67,8 @@ public final class Chunks implements ChunkGroup { public StringChunk conversationTopic; /** Type of server that the message originated from (SMTP, etc). */ public StringChunk sentByServerType; + /** The email headers */ + public StringChunk messageHeaders; /** TODO */ public MessageSubmissionChunk submissionChunk; /** TODO */ @@ -104,6 +107,9 @@ public final class Chunks implements ChunkGroup { case SENT_BY_SERVER_TYPE: sentByServerType = (StringChunk)chunk; break; + case MESSAGE_HEADERS: + messageHeaders = (StringChunk)chunk; + break; case DISPLAY_TO: displayToChunk = (StringChunk)chunk; break; diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java index 7c88efad7a..dcbb037128 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java @@ -87,10 +87,30 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor { handleEmails(s, "BCC", msg.getDisplayBCC(), emails); } catch(ChunkNotFoundException e) {} + // Date - try two ways to find it try { + // First try via the proper chunk SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss"); s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n"); - } catch(ChunkNotFoundException e) {} + } catch(ChunkNotFoundException e) { + try { + // Failing that try via the raw headers + String[] headers = msg.getHeaders(); + for(String header: headers) { + if(header.toLowerCase().startsWith("date:")) { + s.append( + "Date:" + + header.substring(header.indexOf(':')+1) + + "\n" + ); + break; + } + } + } catch(ChunkNotFoundException he) { + // We can't find the date, sorry... + } + } + try { s.append("Subject: " + msg.getSubject() + "\n"); } catch(ChunkNotFoundException e) {} diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java index 25c793339f..6e958c193f 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java @@ -75,6 +75,35 @@ public final class TestBasics extends TestCase { assertEquals("test pi\u00e8ce jointe 1", attachments.getSubject()); } + /** + * Test message headers + */ + public void testHeaders() throws Exception { + // Simple email first + assertEquals(26, simple.getHeaders().length); + assertTrue(simple.getHeaders()[0].startsWith("Return-path:")); + assertTrue(simple.getHeaders()[1].equals("Envelope-to: travis@overwrittenstack.com")); + assertTrue(simple.getHeaders()[25].startsWith("X-Antivirus-Scanner: Clean")); + + // Quick doesn't have them + try { + quick.getHeaders(); + fail(); + } catch(ChunkNotFoundException e) {} + + // Attachments doesn't have them + try { + attachments.getHeaders(); + fail(); + } catch(ChunkNotFoundException e) {} + + // Outlook30 has some + assertEquals(33, outlook30.getHeaders().length); + assertTrue(outlook30.getHeaders()[0].startsWith("Microsoft Mail Internet Headers")); + assertTrue(outlook30.getHeaders()[1].startsWith("x-mimeole:")); + assertTrue(outlook30.getHeaders()[32].startsWith("\t\"Williams")); // May need better parsing in future + } + /** * Test attachments */ diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java index b18a9eb26c..ff9ddde683 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java @@ -84,7 +84,7 @@ public final class TestOutlookTextExtractor extends TestCase { assertEquals(-1, text.indexOf("CC:")); assertEquals(-1, text.indexOf("BCC:")); assertContains(text, "Subject: test message\n"); - assertEquals(-1, text.indexOf("Date:")); + assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n"); assertContains(text, "This is a test message."); } @@ -171,7 +171,7 @@ public final class TestOutlookTextExtractor extends TestCase { "nick.burch@alfresco.com; 'Roy Wetherall' \n"); assertEquals(-1, text.indexOf("BCC:")); assertContains(text, "Subject: This is a test message please ignore\n"); - assertEquals(-1, text.indexOf("Date:")); + assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n"); assertContains(text, "The quick brown fox jumps over the lazy dog"); } }