source.dussan.org Git - poi.git/commitdiff
Add embedded (attachment) support to the Outlook text extractor
author: Nick Burch <nick@apache.org>
Fri, 8 Jan 2010 16:44:08 +0000 (16:44 +0000)
committer: Nick Burch <nick@apache.org>
Fri, 8 Jan 2010 16:44:08 +0000 (16:44 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java

index 15b37508159eb2658a14676ece156335a59d45a0..83042ae6b0a747fd68d9644bd63464c22918b0c2 100644 (file)
@@ -34,7 +34,8 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
-           <action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
+           <action dev="POI-DEVELOPERS" type="add">Support attachments as embeded documents within the new OutlookTextExtractor</action>
+           <action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
            <action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
            <action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
            <action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted()  </action>
index 7657635e750f857b0e7b5d6d8b990e571f5a440a..55d8499f5e2e805660ad9d96a01d77dab963472b 100644 (file)
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.extractor;
 
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
@@ -139,9 +142,14 @@ public class ExtractorFactory {
                        if(entry.getName().equals("VisioDocument")) {
                                return new VisioTextExtractor(poifsDir, fs);
                        }
-                       if(entry.getName().equals("__substg1.0_1000001E") ||
+                       if(
+                             entry.getName().equals("__substg1.0_1000001E") ||
+               entry.getName().equals("__substg1.0_1000001F") ||
                              entry.getName().equals("__substg1.0_0047001E") ||
-                             entry.getName().equals("__substg1.0_0037001E")) {
+               entry.getName().equals("__substg1.0_0047001F") ||
+                             entry.getName().equals("__substg1.0_0037001E") ||
+               entry.getName().equals("__substg1.0_0037001F")
+                       ) {
                           return new OutlookTextExtactor(poifsDir, fs);
                        }
                }
@@ -157,8 +165,12 @@ public class ExtractorFactory {
         *  {@link POITextExtractor} for each embeded file.
         */
        public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
-               // Find all the embeded directories
+          // All the embded directories we spotted
                ArrayList<Entry> dirs = new ArrayList<Entry>();
+               // For anything else not directly held in as a POIFS directory
+               ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+               
+      // Find all the embeded directories
                POIFSFileSystem fs = ext.getFileSystem();
                if(fs == null) {
                        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
@@ -189,20 +201,44 @@ public class ExtractorFactory {
                } else if(ext instanceof PowerPointExtractor) {
                        // Tricky, not stored directly in poifs
                        // TODO
+               } else if(ext instanceof OutlookTextExtactor) {
+                  // Stored in the Attachment blocks
+                  MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
+                  for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
+                     if(attachment.attachData != null) {
+                     byte[] data = attachment.attachData.getValue();
+                     nonPOIFS.add( new ByteArrayInputStream(data) );
+                     }
+                  }
                }
                
                // Create the extractors
-               if(dirs == null || dirs.size() == 0) {
+               if(
+                     (dirs == null || dirs.size() == 0) &&
+                     (nonPOIFS == null || nonPOIFS.size() == 0)
+               ){
                        return new POITextExtractor[0];
                }
                
-               POITextExtractor[] te = new POITextExtractor[dirs.size()];
-               for(int i=0; i<te.length; i++) {
-                       te[i] = createExtractor(
+               ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
+               for(int i=0; i<dirs.size(); i++) {
+                       e.add( createExtractor(
                                        (DirectoryNode)dirs.get(i), ext.getFileSystem()
-                       );
+                       ) );
+               }
+               for(int i=0; i<nonPOIFS.size(); i++) {
+                  try {
+                     e.add( createExtractor(nonPOIFS.get(i)) );
+         } catch(IllegalArgumentException ie) {
+            // Ignore, just means it didn't contain
+            //  a format we support as yet
+                  } catch(XmlException xe) {
+                     throw new IOException(xe.getMessage());
+                  } catch(OpenXML4JException oe) {
+                     throw new IOException(oe.getMessage());
+                  }
                }
-               return te;
+               return e.toArray(new POITextExtractor[e.size()]);
        }
 
        /**
index f9b88794bd49cc8cd46fadf6e2f94a2a1578b1fd..d327c55828b4fcbe00f1fa14ab45928d2dd667c0 100644 (file)
@@ -59,6 +59,8 @@ public class TestExtractorFactory extends TestCase {
    private File pptx;
 
    private File msg;
+   private File msgEmb;
+   
    private File vsd;
 
    protected void setUp() throws Exception {
@@ -86,6 +88,7 @@ public class TestExtractorFactory extends TestCase {
       
       POIDataSamples olTests = POIDataSamples.getHSMFInstance();
       msg = olTests.getFile("quick.msg");
+      msgEmb = olTests.getFile("attachment_test_msg.msg");
    }
 
    public void testFile() throws Exception {
@@ -404,9 +407,25 @@ public class TestExtractorFactory extends TestCase {
       assertEquals(1, numPpt);
       assertEquals(2, numXls);
       assertEquals(1, numWord);
+      
+      // Outlook
+      ext = (OutlookTextExtactor)
+      ExtractorFactory.createExtractor(msgEmb);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      numWord = 0; numXls = 0; numPpt = 0;
+      assertEquals(1, embeds.length);
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+      }
+      assertEquals(0, numPpt);
+      assertEquals(0, numXls);
+      assertEquals(1, numWord);
 
       // TODO - PowerPoint
       // TODO - Visio
-      // TODO - Outlook
    }
 }
index 54a2ddda67c283f782eaafe4c7c18a7bfc652478..a6ada5bb95795a41857498b4328c4b6fb48677dd 100644 (file)
@@ -44,6 +44,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
       this(new MAPIMessage(inp));
    }
 
+   /**
+    * Returns the underlying MAPI message
+    */
+   public MAPIMessage getMAPIMessage() {
+      return (MAPIMessage)document;
+   }
+   
    /**
     * Outputs something a little like a RFC822 email
     */