source.dussan.org Git - poi.git/commitdiff
Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed
author Nick Burch <nick@apache.org>
Thu, 3 Jun 2010 15:33:54 +0000 (15:33 +0000)
committer Nick Burch <nick@apache.org>
Thu, 3 Jun 2010 15:33:54 +0000 (15:33 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@951034 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java

index 795d23d3ec329d2a991fef517dcbb7076b1767a5..10c8334d448cdb0e7b7701d544e6d1ccac881189 100644 (file)
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed</action>
            <action dev="POI-DEVELOPERS" type="fix">48494 - detect and support time formats like HH:MM;HH:MM</action>
            <action dev="POI-DEVELOPERS" type="fix">48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
            <action dev="POI-DEVELOPERS" type="fix">48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
index 05c14482a8a3e918b81f63ba968f699fc71573c2..9b2ccd7659fc4893cde138f702a10e3c2281336a 100644 (file)
@@ -282,6 +282,17 @@ public class MAPIMessage extends POIDocument {
       return names;
    }
 
+   
+   /**
+    * Gets the raw message headers from the message headers chunk, split on LF or CRLF into one array entry per line; returns null if the chunk yields no string.
+    */
+   public String[] getHeaders() throws ChunkNotFoundException {
+      String headers = getStringFromChunk(mainChunks.messageHeaders);
+      if(headers == null) {
+         return null;
+      }
+      return headers.split("\\r?\\n");
+   }
 
    /**
     * Gets the conversation topic of the parsed Outlook Message.
index 36b52020991702d62770cd3c73a6a71e6e9af800..a92e60d0173b2706183a6a5c49b353182d1f0833 100644 (file)
@@ -35,6 +35,7 @@ public final class Chunks implements ChunkGroup {
    // 0x0050 -> 0x006F seem to be routing info or similar
    public static final int CONVERSATION_TOPIC  = 0x0070;
    public static final int SENT_BY_SERVER_TYPE = 0x0075;
+   public static final int MESSAGE_HEADERS     = 0x007D;
    // RECEIVEDEMAIL = 76
    public static final int DISPLAY_TO          = 0x0E04;
    public static final int DISPLAY_FROM        = 0x0C1A;
@@ -66,6 +67,8 @@ public final class Chunks implements ChunkGroup {
    public StringChunk conversationTopic;
    /** Type of server that the message originated from (SMTP, etc). */
    public StringChunk sentByServerType;
+   /** The email headers */
+   public StringChunk messageHeaders;
    /** TODO */
    public MessageSubmissionChunk submissionChunk; 
    /** TODO */
@@ -104,6 +107,9 @@ public final class Chunks implements ChunkGroup {
       case SENT_BY_SERVER_TYPE:
          sentByServerType = (StringChunk)chunk;
          break;
+      case MESSAGE_HEADERS:
+         messageHeaders = (StringChunk)chunk;
+         break;
       case DISPLAY_TO:
          displayToChunk = (StringChunk)chunk;
          break;
index 7c88efad7a3aaa4a7e73b332387c09d33b317301..dcbb037128ba113a4642e452b0409877a69be6b3 100644 (file)
@@ -87,10 +87,30 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
          handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
       } catch(ChunkNotFoundException e) {}
       
+      // Date - try two ways to find it
       try {
+         // First try via the proper chunk
          SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
          s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
-      } catch(ChunkNotFoundException e) {}
+      } catch(ChunkNotFoundException e) {
+         try {
+            // Failing that try via the raw headers 
+            String[] headers = msg.getHeaders();
+            for(String header: headers) {
+               if(header.toLowerCase().startsWith("date:")) {
+                  s.append(
+                        "Date:" + 
+                        header.substring(header.indexOf(':')+1) +
+                        "\n"
+                  );
+                  break;
+               }
+            }
+         } catch(ChunkNotFoundException he) {
+            // We can't find the date, sorry...
+         }
+      }
+      
       try {
          s.append("Subject: " + msg.getSubject() + "\n");
       } catch(ChunkNotFoundException e) {}
index 25c793339fa819261a3239104ef7d7aa4c4fd3cd..6e958c193f6609a8cd9c42a331c7858a7cdccec3 100644 (file)
@@ -75,6 +75,35 @@ public final class TestBasics extends TestCase {
       assertEquals("test pi\u00e8ce jointe 1", attachments.getSubject());
        }
        
+       /**
+        * Tests that getHeaders() returns the header lines in order, and throws ChunkNotFoundException for the messages that have no headers
+        */
+       public void testHeaders() throws Exception {
+          // Simple email first: 26 header lines, spot-checked at the start and at the end
+          assertEquals(26, simple.getHeaders().length);
+          assertTrue(simple.getHeaders()[0].startsWith("Return-path:"));
+      assertTrue(simple.getHeaders()[1].equals("Envelope-to: travis@overwrittenstack.com"));
+      assertTrue(simple.getHeaders()[25].startsWith("X-Antivirus-Scanner: Clean"));
+      
+      // Quick doesn't have them, so the call is expected to throw
+      try {
+         quick.getHeaders();
+         fail();
+      } catch(ChunkNotFoundException e) {}
+      
+      // Attachments doesn't have them either, so the call is expected to throw
+      try {
+         attachments.getHeaders();
+         fail();
+      } catch(ChunkNotFoundException e) {}
+      
+      // Outlook30 has some; the first entry is a banner line rather than a name: value header
+      assertEquals(33, outlook30.getHeaders().length);
+      assertTrue(outlook30.getHeaders()[0].startsWith("Microsoft Mail Internet Headers"));
+      assertTrue(outlook30.getHeaders()[1].startsWith("x-mimeole:"));
+      assertTrue(outlook30.getHeaders()[32].startsWith("\t\"Williams")); // Tab-led entry, presumably a folded continuation line kept as its own element - may need better parsing in future
+       }
+       
        /**
         * Test attachments
         */
index b18a9eb26c5c8269397abd2b29f362672961f896..ff9ddde6832e4f7f942bbf0b57d607f37d41db3c 100644 (file)
@@ -84,7 +84,7 @@ public final class TestOutlookTextExtractor extends TestCase {
       assertEquals(-1, text.indexOf("CC:"));
       assertEquals(-1, text.indexOf("BCC:"));
       assertContains(text, "Subject: test message\n");
-      assertEquals(-1, text.indexOf("Date:"));
+      assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n");
       assertContains(text, "This is a test message.");
    }
 
@@ -171,7 +171,7 @@ public final class TestOutlookTextExtractor extends TestCase {
                "nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
          assertEquals(-1, text.indexOf("BCC:"));
          assertContains(text, "Subject: This is a test message please ignore\n");
-         assertEquals(-1, text.indexOf("Date:"));
+         assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n");
          assertContains(text, "The quick brown fox jumps over the lazy dog");
       }
    }