From 93f034976a9bf290118a6cc1a75b2d25c2f0c1b7 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Thu, 3 Jun 2010 15:33:54 +0000 Subject: [PATCH] Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@951034 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../src/org/apache/poi/hsmf/MAPIMessage.java | 11 +++++++ .../org/apache/poi/hsmf/datatypes/Chunks.java | 6 ++++ .../hsmf/extractor/OutlookTextExtactor.java | 22 +++++++++++++- .../org/apache/poi/hsmf/TestBasics.java | 29 +++++++++++++++++++ .../extractor/TestOutlookTextExtractor.java | 4 +-- 6 files changed, 70 insertions(+), 3 deletions(-) diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 795d23d3ec..10c8334d44 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed 48494 - detect and support time formats like HH:MM;HH:MM 48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them 48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java index 05c14482a8..9b2ccd7659 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java @@ -282,6 +282,17 @@ public class MAPIMessage extends POIDocument { return names; } + + /** + * Gets the text of the message headers chunk split on line breaks (LF or CRLF), so each array entry is one physical line and a continuation line of a folded header is its own entry. NOTE(review): getStringFromChunk is not visible in this patch; the tests expect ChunkNotFoundException when the headers chunk is absent - confirm when the null return below can actually happen. + */ + public String[] getHeaders() throws ChunkNotFoundException { + String headers = getStringFromChunk(mainChunks.messageHeaders); + if(headers == null) { + return null; + } + return 
headers.split("\\r?\\n"); + } /** * Gets the conversation topic of the parsed Outlook Message. diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java index 36b5202099..a92e60d017 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java @@ -35,6 +35,7 @@ public final class Chunks implements ChunkGroup { // 0x0050 -> 0x006F seem to be routing info or similar public static final int CONVERSATION_TOPIC = 0x0070; public static final int SENT_BY_SERVER_TYPE = 0x0075; + public static final int MESSAGE_HEADERS = 0x007D; // RECEIVEDEMAIL = 76 public static final int DISPLAY_TO = 0x0E04; public static final int DISPLAY_FROM = 0x0C1A; @@ -66,6 +67,8 @@ public final class Chunks implements ChunkGroup { public StringChunk conversationTopic; /** Type of server that the message originated from (SMTP, etc). */ public StringChunk sentByServerType; + /** The email headers */ + public StringChunk messageHeaders; /** TODO */ public MessageSubmissionChunk submissionChunk; /** TODO */ @@ -104,6 +107,9 @@ public final class Chunks implements ChunkGroup { case SENT_BY_SERVER_TYPE: sentByServerType = (StringChunk)chunk; break; + case MESSAGE_HEADERS: + messageHeaders = (StringChunk)chunk; + break; case DISPLAY_TO: displayToChunk = (StringChunk)chunk; break; diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java index 7c88efad7a..dcbb037128 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java @@ -87,10 +87,30 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor { handleEmails(s, "BCC", msg.getDisplayBCC(), emails); } catch(ChunkNotFoundException e) {} + // Date - try two ways to find it try { + // 
First try via the proper chunk SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss"); s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n"); - } catch(ChunkNotFoundException e) {} + } catch(ChunkNotFoundException e) { + try { + // Failing that try via the raw headers. NOTE(review): getHeaders() has a null-return path and the loop below does not guard against null - confirm whether that path is reachable + String[] headers = msg.getHeaders(); + for(String header: headers) { + if(header.toLowerCase().startsWith("date:")) { + s.append( + "Date:" + + header.substring(header.indexOf(':')+1) + + "\n" + ); + break; + } + } + } catch(ChunkNotFoundException he) { + // We can't find the date, sorry... + } + } + try { s.append("Subject: " + msg.getSubject() + "\n"); } catch(ChunkNotFoundException e) {} diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java index 25c793339f..6e958c193f 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java @@ -75,6 +75,35 @@ public final class TestBasics extends TestCase { assertEquals("test pi\u00e8ce jointe 1", attachments.getSubject()); } + /** + * Test message headers + */ + public void testHeaders() throws Exception { + // Simple email first + assertEquals(26, simple.getHeaders().length); + assertTrue(simple.getHeaders()[0].startsWith("Return-path:")); + assertTrue(simple.getHeaders()[1].equals("Envelope-to: travis@overwrittenstack.com")); + assertTrue(simple.getHeaders()[25].startsWith("X-Antivirus-Scanner: Clean")); + + // Quick doesn't have them + try { + quick.getHeaders(); + fail(); + } catch(ChunkNotFoundException e) {} + + // Attachments doesn't have them + try { + attachments.getHeaders(); + fail(); + } catch(ChunkNotFoundException e) {} + + // Outlook30 has some + assertEquals(33, outlook30.getHeaders().length); + assertTrue(outlook30.getHeaders()[0].startsWith("Microsoft Mail Internet Headers")); + assertTrue(outlook30.getHeaders()[1].startsWith("x-mimeole:")); + 
assertTrue(outlook30.getHeaders()[32].startsWith("\t\"Williams")); // May need better parsing in future + } + /** * Test attachments */ diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java index b18a9eb26c..ff9ddde683 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java @@ -84,7 +84,7 @@ public final class TestOutlookTextExtractor extends TestCase { assertEquals(-1, text.indexOf("CC:")); assertEquals(-1, text.indexOf("BCC:")); assertContains(text, "Subject: test message\n"); - assertEquals(-1, text.indexOf("Date:")); + assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n"); assertContains(text, "This is a test message."); } @@ -171,7 +171,7 @@ public final class TestOutlookTextExtractor extends TestCase { "nick.burch@alfresco.com; 'Roy Wetherall' \n"); assertEquals(-1, text.indexOf("BCC:")); assertContains(text, "Subject: This is a test message please ignore\n"); - assertEquals(-1, text.indexOf("Date:")); + assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n"); assertContains(text, "The quick brown fox jumps over the lazy dog"); } } -- 2.39.5