<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - detect and support time formats like HH:MM;HH:MM</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
return names;
}
+
+ /**
+ * Gets the raw message headers of the parsed Outlook Message,
+ * with one array entry per line of the headers chunk text.
+ * The text is split on LF or CRLF only; folded (continuation)
+ * header lines are not merged back into the header they belong
+ * to, so an entry may be the tail of the previous header.
+ *
+ * @return the header lines, or null if the headers chunk
+ * yielded no string value
+ * @throws ChunkNotFoundException declared by this method;
+ * presumably raised by getStringFromChunk when the message
+ * has no headers chunk (TODO confirm against that helper)
+ */
+ public String[] getHeaders() throws ChunkNotFoundException {
+ String headers = getStringFromChunk(mainChunks.messageHeaders);
+ if(headers == null) {
+ return null;
+ }
+ return headers.split("\\r?\\n");
+ }
/**
* Gets the conversation topic of the parsed Outlook Message.
// 0x0050 -> 0x006F seem to be routing info or similar
public static final int CONVERSATION_TOPIC = 0x0070;
public static final int SENT_BY_SERVER_TYPE = 0x0075;
+ public static final int MESSAGE_HEADERS = 0x007D;
// RECEIVEDEMAIL = 76
public static final int DISPLAY_TO = 0x0E04;
public static final int DISPLAY_FROM = 0x0C1A;
public StringChunk conversationTopic;
/** Type of server that the message originated from (SMTP, etc). */
public StringChunk sentByServerType;
+ /** The email headers */
+ public StringChunk messageHeaders;
/** TODO */
public MessageSubmissionChunk submissionChunk;
/** TODO */
case SENT_BY_SERVER_TYPE:
sentByServerType = (StringChunk)chunk;
break;
+ case MESSAGE_HEADERS:
+ messageHeaders = (StringChunk)chunk;
+ break;
case DISPLAY_TO:
displayToChunk = (StringChunk)chunk;
break;
handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
} catch(ChunkNotFoundException e) {}
+ // Date - try two ways to find it
try {
+ // First try via the proper chunk
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
- } catch(ChunkNotFoundException e) {}
+ } catch(ChunkNotFoundException e) {
+ try {
+ // Failing that try via the raw headers
+ String[] headers = msg.getHeaders();
+ // getHeaders() may return null when the chunk has no text,
+ // so guard the loop rather than hit a NullPointerException
+ if(headers != null) {
+ for(String header: headers) {
+ if(header.toLowerCase().startsWith("date:")) {
+ // Keep whatever follows the first colon verbatim,
+ // including its leading space and any timezone suffix
+ s.append(
+ "Date:" +
+ header.substring(header.indexOf(':')+1) +
+ "\n"
+ );
+ break;
+ }
+ }
+ }
+ } catch(ChunkNotFoundException he) {
+ // We can't find the date, sorry...
+ }
+ }
+
try {
s.append("Subject: " + msg.getSubject() + "\n");
} catch(ChunkNotFoundException e) {}
assertEquals("test pi\u00e8ce jointe 1", attachments.getSubject());
}
+ /**
+ * Test message headers: files that carry a headers chunk expose
+ * one array entry per header line, while files without one are
+ * expected to throw ChunkNotFoundException.
+ */
+ public void testHeaders() throws Exception {
+ // Simple email first
+ String[] simpleHeaders = simple.getHeaders();
+ assertEquals(26, simpleHeaders.length);
+ assertTrue(simpleHeaders[0].startsWith("Return-path:"));
+ assertEquals("Envelope-to: travis@overwrittenstack.com", simpleHeaders[1]);
+ assertTrue(simpleHeaders[25].startsWith("X-Antivirus-Scanner: Clean"));
+
+ // Quick doesn't have them
+ try {
+ quick.getHeaders();
+ fail("Expected ChunkNotFoundException for quick");
+ } catch(ChunkNotFoundException e) {
+ // Expected - this file has no headers chunk
+ }
+
+ // Attachments doesn't have them
+ try {
+ attachments.getHeaders();
+ fail("Expected ChunkNotFoundException for attachments");
+ } catch(ChunkNotFoundException e) {
+ // Expected - this file has no headers chunk
+ }
+
+ // Outlook30 has some
+ String[] outlook30Headers = outlook30.getHeaders();
+ assertEquals(33, outlook30Headers.length);
+ assertTrue(outlook30Headers[0].startsWith("Microsoft Mail Internet Headers"));
+ assertTrue(outlook30Headers[1].startsWith("x-mimeole:"));
+ assertTrue(outlook30Headers[32].startsWith("\t\"Williams")); // May need better parsing in future
+ }
+
/**
* Test attachments
*/
assertEquals(-1, text.indexOf("CC:"));
assertEquals(-1, text.indexOf("BCC:"));
assertContains(text, "Subject: test message\n");
- assertEquals(-1, text.indexOf("Date:"));
+ assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n");
assertContains(text, "This is a test message.");
}
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
assertEquals(-1, text.indexOf("BCC:"));
assertContains(text, "Subject: This is a test message please ignore\n");
- assertEquals(-1, text.indexOf("Date:"));
+ assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n");
assertContains(text, "The quick brown fox jumps over the lazy dog");
}
}