aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Nick Burch <nick@apache.org> 2010-06-03 15:33:54 +0000
committer: Nick Burch <nick@apache.org> 2010-06-03 15:33:54 +0000
commit: 93f034976a9bf290118a6cc1a75b2d25c2f0c1b7 (patch)
tree: c11ff7cb2a395ae619b5db27397ef1e4ccf4cd40 /src
parent: 9c68267a68255e32aae5f4276af6b39f0def1162 (diff)
download: poi-93f034976a9bf290118a6cc1a75b2d25c2f0c1b7.tar.gz
poi-93f034976a9bf290118a6cc1a75b2d25c2f0c1b7.zip
Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@951034 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
-rw-r--r-- src/documentation/content/xdocs/status.xml 1
-rw-r--r-- src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java 11
-rw-r--r-- src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java 6
-rw-r--r-- src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java 22
-rw-r--r-- src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java 29
-rw-r--r-- src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java 4
6 files changed, 70 insertions, 3 deletions
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index 795d23d3ec..10c8334d44 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - detect and support time formats like HH:MM;HH:MM</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
index 05c14482a8..9b2ccd7659 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
@@ -282,6 +282,17 @@ public class MAPIMessage extends POIDocument {
return names;
}
+
+ /**
+ * Gets the raw headers of the message, one array entry per line.
+ * The text of the message headers chunk is split on CRLF or LF line
+ * breaks only; folded (continuation) header lines are not re-joined,
+ * so an entry may start with whitespace.
+ *
+ * @return the header lines, or null if the string read from the
+ * headers chunk is null
+ * @throws ChunkNotFoundException declared by this method; the tests
+ * expect it for messages that have no headers chunk
+ */
+ public String[] getHeaders() throws ChunkNotFoundException {
+ String headers = getStringFromChunk(mainChunks.messageHeaders);
+ if(headers == null) {
+ return null;
+ }
+ return headers.split("\\r?\\n");
+ }
/**
* Gets the conversation topic of the parsed Outlook Message.
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
index 36b5202099..a92e60d017 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
@@ -35,6 +35,7 @@ public final class Chunks implements ChunkGroup {
// 0x0050 -> 0x006F seem to be routing info or similar
public static final int CONVERSATION_TOPIC = 0x0070;
public static final int SENT_BY_SERVER_TYPE = 0x0075;
+ public static final int MESSAGE_HEADERS = 0x007D;
// RECEIVEDEMAIL = 76
public static final int DISPLAY_TO = 0x0E04;
public static final int DISPLAY_FROM = 0x0C1A;
@@ -66,6 +67,8 @@ public final class Chunks implements ChunkGroup {
public StringChunk conversationTopic;
/** Type of server that the message originated from (SMTP, etc). */
public StringChunk sentByServerType;
+ /** The email headers */
+ public StringChunk messageHeaders;
/** TODO */
public MessageSubmissionChunk submissionChunk;
/** TODO */
@@ -104,6 +107,9 @@ public final class Chunks implements ChunkGroup {
case SENT_BY_SERVER_TYPE:
sentByServerType = (StringChunk)chunk;
break;
+ case MESSAGE_HEADERS:
+ messageHeaders = (StringChunk)chunk;
+ break;
case DISPLAY_TO:
displayToChunk = (StringChunk)chunk;
break;
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
index 7c88efad7a..dcbb037128 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
@@ -87,10 +87,30 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
} catch(ChunkNotFoundException e) {}
+ // Date - try two ways to find it
try {
+ // First try via the proper chunk
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
- } catch(ChunkNotFoundException e) {}
+ } catch(ChunkNotFoundException e) {
+ try {
+ // Failing that try via the raw headers
+ String[] headers = msg.getHeaders();
+ for(String header: headers) {
+ if(header.toLowerCase().startsWith("date:")) {
+ s.append(
+ "Date:" +
+ header.substring(header.indexOf(':')+1) +
+ "\n"
+ );
+ break;
+ }
+ }
+ } catch(ChunkNotFoundException he) {
+ // We can't find the date, sorry...
+ }
+ }
+
try {
s.append("Subject: " + msg.getSubject() + "\n");
} catch(ChunkNotFoundException e) {}
diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
index 25c793339f..6e958c193f 100644
--- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
+++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
@@ -76,6 +76,35 @@ public final class TestBasics extends TestCase {
}
/**
+ * Test message headers: checks the number of header lines and a few
+ * selected lines for the simple and outlook30 messages, and that the
+ * quick and attachments messages throw ChunkNotFoundException.
+ */
+ public void testHeaders() throws Exception {
+ // Simple email first
+ assertEquals(26, simple.getHeaders().length);
+ assertTrue(simple.getHeaders()[0].startsWith("Return-path:"));
+ assertTrue(simple.getHeaders()[1].equals("Envelope-to: travis@overwrittenstack.com"));
+ assertTrue(simple.getHeaders()[25].startsWith("X-Antivirus-Scanner: Clean"));
+
+ // Quick doesn't have them
+ try {
+ quick.getHeaders();
+ fail();
+ } catch(ChunkNotFoundException e) {}
+
+ // Attachments doesn't have them
+ try {
+ attachments.getHeaders();
+ fail();
+ } catch(ChunkNotFoundException e) {}
+
+ // Outlook30 has some
+ assertEquals(33, outlook30.getHeaders().length);
+ assertTrue(outlook30.getHeaders()[0].startsWith("Microsoft Mail Internet Headers"));
+ assertTrue(outlook30.getHeaders()[1].startsWith("x-mimeole:"));
+ assertTrue(outlook30.getHeaders()[32].startsWith("\t\"Williams")); // May need better parsing in future
+ }
+
+ /**
* Test attachments
*/
public void testAttachments() throws Exception {
diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
index b18a9eb26c..ff9ddde683 100644
--- a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java
@@ -84,7 +84,7 @@ public final class TestOutlookTextExtractor extends TestCase {
assertEquals(-1, text.indexOf("CC:"));
assertEquals(-1, text.indexOf("BCC:"));
assertContains(text, "Subject: test message\n");
- assertEquals(-1, text.indexOf("Date:"));
+ assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n");
assertContains(text, "This is a test message.");
}
@@ -171,7 +171,7 @@ public final class TestOutlookTextExtractor extends TestCase {
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
assertEquals(-1, text.indexOf("BCC:"));
assertContains(text, "Subject: This is a test message please ignore\n");
- assertEquals(-1, text.indexOf("Date:"));
+ assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n");
assertContains(text, "The quick brown fox jumps over the lazy dog");
}
}