]> source.dussan.org Git - poi.git/commitdiff
Wire up the new HSMFTextExtactor to the ExtractorFactory
author Nick Burch <nick@apache.org>
Fri, 8 Jan 2010 16:14:27 +0000 (16:14 +0000)
committer Nick Burch <nick@apache.org>
Fri, 8 Jan 2010 16:14:27 +0000 (16:14 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897246 13f79535-47bb-0310-9956-ffa450edef68

src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java

index 34e4829de5a36010fb28a133aea9cdfdda1a81bc..28af05b3e2bb635c82b43751d2b8533b39189151 100644 (file)
@@ -31,6 +31,7 @@ import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -138,6 +139,11 @@ public class ExtractorFactory {
                        if(entry.getName().equals("VisioDocument")) {
                                return new VisioTextExtractor(poifsDir, fs);
                        }
+                       if(entry.getName().equals("__substg1.0_1000001E") ||
+                             entry.getName().equals("__substg1.0_0047001E") ||
+                             entry.getName().equals("__substg1.0_0037001E")) {
+                          return new HSMFTextExtactor(poifsDir, fs);
+                       }
                }
                throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
        }
index 7ef1f002c399397424280c6c9a5379ac8a0eb84c..8518b6eab15f930dbef30e596acba21df927ae63 100644 (file)
@@ -25,6 +25,7 @@ import org.apache.poi.POITextExtractor;
 import org.apache.poi.POIDataSamples;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -42,132 +43,145 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
  */
 public class TestExtractorFactory extends TestCase {
 
-       private File txt;
-       
-       private File xls;
-       private File xlsx;
-    private File xltx;
-    private File xlsEmb;
-
-       private File doc;
-       private File docx;
-    private File dotx;
-    private File docEmb;
-
-       private File ppt;
-       private File pptx;
-       
-       private File vsd;
+   private File txt;
 
-       protected void setUp() throws Exception {
-               super.setUp();
-               
-        POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
-        xls = ssTests.getFile("SampleSS.xls");
-               xlsx = ssTests.getFile("SampleSS.xlsx");
-        xltx = ssTests.getFile("test.xltx");
-        xlsEmb = ssTests.getFile("excel_with_embeded.xls");
-
-        POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
-               doc = wpTests.getFile("SampleDoc.doc");
-               docx = wpTests.getFile("SampleDoc.docx");
-        dotx = wpTests.getFile("test.dotx");
-        docEmb = wpTests.getFile("word_with_embeded.doc");
-
-        POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
-               ppt = slTests.getFile("SampleShow.ppt");
-               pptx = slTests.getFile("SampleShow.pptx");
-        txt = slTests.getFile("SampleShow.txt");
-
-        POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
-               vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
-       }
+   private File xls;
+   private File xlsx;
+   private File xltx;
+   private File xlsEmb;
 
-       public void testFile() throws Exception {
-               // Excel
-               assertTrue(
-                               ExtractorFactory.createExtractor(xls)
-                               instanceof ExcelExtractor
-               );
-               assertTrue(
-                               ExtractorFactory.createExtractor(xls).getText().length() > 200
-               );
-               
-               assertTrue(
-                               ExtractorFactory.createExtractor(xlsx)
-                               instanceof XSSFExcelExtractor
-               );
-               assertTrue(
-                               ExtractorFactory.createExtractor(xlsx).getText().length() > 200
-               );
+   private File doc;
+   private File docx;
+   private File dotx;
+   private File docEmb;
 
-                assertTrue(
-                                ExtractorFactory.createExtractor(xltx)
-                                instanceof XSSFExcelExtractor
-                );
-                assertTrue(
-                                ExtractorFactory.createExtractor(xltx).getText().contains("test")
-                );
+   private File ppt;
+   private File pptx;
 
-               
-               // Word
-               assertTrue(
-                               ExtractorFactory.createExtractor(doc)
-                               instanceof WordExtractor
-               );
-               assertTrue(
-                               ExtractorFactory.createExtractor(doc).getText().length() > 120
-               );
-               
-               assertTrue(
-                               ExtractorFactory.createExtractor(docx)
-                               instanceof XWPFWordExtractor
-               );
-               assertTrue(
-                               ExtractorFactory.createExtractor(docx).getText().length() > 120
-               );
+   private File msg;
+   private File vsd;
 
-                assertTrue(
-                                ExtractorFactory.createExtractor(dotx)
-                                instanceof XWPFWordExtractor
-                );
-                assertTrue(
-                                ExtractorFactory.createExtractor(dotx).getText().contains("Test")
-                );
+   protected void setUp() throws Exception {
+      super.setUp();
 
-               // PowerPoint
-               assertTrue(
-                               ExtractorFactory.createExtractor(ppt)
-                               instanceof PowerPointExtractor
-               );
-               assertTrue(
-                               ExtractorFactory.createExtractor(ppt).getText().length() > 120
-               );
-               
-               assertTrue(
-                               ExtractorFactory.createExtractor(pptx)
-                               instanceof XSLFPowerPointExtractor
-               );
-               assertTrue(
-                               ExtractorFactory.createExtractor(pptx).getText().length() > 120
-               );
-               
-               // Visio
-               assertTrue(
-                               ExtractorFactory.createExtractor(vsd)
-                               instanceof VisioTextExtractor
-               );
-               assertTrue(
-                               ExtractorFactory.createExtractor(vsd).getText().length() > 50
-               );
-               
-               // Text
-               try {
-                       ExtractorFactory.createExtractor(txt);
-                       fail();
-               } catch(IllegalArgumentException e) {
-                       // Good
-               }
+      POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
+      xls = ssTests.getFile("SampleSS.xls");
+      xlsx = ssTests.getFile("SampleSS.xlsx");
+      xltx = ssTests.getFile("test.xltx");
+      xlsEmb = ssTests.getFile("excel_with_embeded.xls");
+
+      POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
+      doc = wpTests.getFile("SampleDoc.doc");
+      docx = wpTests.getFile("SampleDoc.docx");
+      dotx = wpTests.getFile("test.dotx");
+      docEmb = wpTests.getFile("word_with_embeded.doc");
+
+      POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
+      ppt = slTests.getFile("SampleShow.ppt");
+      pptx = slTests.getFile("SampleShow.pptx");
+      txt = slTests.getFile("SampleShow.txt");
+
+      POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
+      vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
+      
+      POIDataSamples olTests = POIDataSamples.getHSMFInstance();
+      msg = olTests.getFile("quick.msg");
+   }
+
+   public void testFile() throws Exception {
+      // Excel
+      assertTrue(
+            ExtractorFactory.createExtractor(xls)
+            instanceof ExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(xls).getText().length() > 200
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(xlsx)
+            instanceof XSSFExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(xlsx).getText().length() > 200
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(xltx)
+            instanceof XSSFExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(xltx).getText().contains("test")
+      );
+
+
+      // Word
+      assertTrue(
+            ExtractorFactory.createExtractor(doc)
+            instanceof WordExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(doc).getText().length() > 120
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(docx)
+            instanceof XWPFWordExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(docx).getText().length() > 120
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(dotx)
+            instanceof XWPFWordExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(dotx).getText().contains("Test")
+      );
+
+      // PowerPoint
+      assertTrue(
+            ExtractorFactory.createExtractor(ppt)
+            instanceof PowerPointExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(ppt).getText().length() > 120
+      );
+
+      assertTrue(
+            ExtractorFactory.createExtractor(pptx)
+            instanceof XSLFPowerPointExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(pptx).getText().length() > 120
+      );
+
+      // Visio
+      assertTrue(
+            ExtractorFactory.createExtractor(vsd)
+            instanceof VisioTextExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(vsd).getText().length() > 50
+      );
+      
+      // Outlook msg
+      assertTrue(
+            ExtractorFactory.createExtractor(msg)
+            instanceof HSMFTextExtactor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(msg).getText().length() > 50
+      );
+
+      // Text
+      try {
+         ExtractorFactory.createExtractor(txt);
+         fail();
+      } catch(IllegalArgumentException e) {
+         // Good
+      }
        }
        
        public void testInputStream() throws Exception {
@@ -231,6 +245,15 @@ public class TestExtractorFactory extends TestCase {
                                ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
                );
                
+               // Outlook msg
+      assertTrue(
+            ExtractorFactory.createExtractor(new FileInputStream(msg))
+            instanceof HSMFTextExtactor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
+      );
+               
                // Text
                try {
                        ExtractorFactory.createExtractor(new FileInputStream(txt));
@@ -277,6 +300,15 @@ public class TestExtractorFactory extends TestCase {
                                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
                );
                
+      // Outlook msg
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
+            instanceof HSMFTextExtactor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
+      );
+      
                // Text
                try {
                        ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
@@ -323,57 +355,58 @@ public class TestExtractorFactory extends TestCase {
                }
        }
 
-       /**
-        * Test embeded docs text extraction. For now, only
-        *  does poifs embeded, but will do ooxml ones 
-        *  at some point.
-        */
-       public void testEmbeded() throws Exception {
-               POIOLE2TextExtractor ext;
-               POITextExtractor[] embeds;
-
-               // No embedings
-               ext = (POIOLE2TextExtractor)
-                               ExtractorFactory.createExtractor(xls);
-               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-               assertEquals(0, embeds.length);
-               
-               // Excel
-               ext = (POIOLE2TextExtractor)
-                               ExtractorFactory.createExtractor(xlsEmb);
-               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
-               assertEquals(6, embeds.length);
-               int numWord = 0, numXls = 0, numPpt = 0;
-        for(int i=0; i<embeds.length; i++) {
-                       assertTrue(embeds[i].getText().length() > 20);
-
-            if(embeds[i] instanceof PowerPointExtractor) numPpt++;
-            else if(embeds[i] instanceof ExcelExtractor) numXls++;
-            else if(embeds[i] instanceof WordExtractor) numWord++;
-        }
-               assertEquals(2, numPpt);
-        assertEquals(2, numXls);
-        assertEquals(2, numWord);
-
-        // Word
-               ext = (POIOLE2TextExtractor)
-                               ExtractorFactory.createExtractor(docEmb);
-               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-               
-        numWord = 0; numXls = 0; numPpt = 0;
-               assertEquals(4, embeds.length);
-               for(int i=0; i<embeds.length; i++) {
-                       assertTrue(embeds[i].getText().length() > 20);
-            if(embeds[i] instanceof PowerPointExtractor) numPpt++;
-            else if(embeds[i] instanceof ExcelExtractor) numXls++;
-            else if(embeds[i] instanceof WordExtractor) numWord++;
-               }
-        assertEquals(1, numPpt);
-        assertEquals(2, numXls);
-        assertEquals(1, numWord);
+   /**
+    * Test embeded docs text extraction. For now, only
+    *  does poifs embeded, but will do ooxml ones 
+    *  at some point.
+    */
+   public void testEmbeded() throws Exception {
+      POIOLE2TextExtractor ext;
+      POITextExtractor[] embeds;
 
-               // TODO - PowerPoint
-               // TODO - Visio
-       }
+      // No embedings
+      ext = (POIOLE2TextExtractor)
+      ExtractorFactory.createExtractor(xls);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+      assertEquals(0, embeds.length);
+
+      // Excel
+      ext = (POIOLE2TextExtractor)
+      ExtractorFactory.createExtractor(xlsEmb);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      assertEquals(6, embeds.length);
+      int numWord = 0, numXls = 0, numPpt = 0;
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+      }
+      assertEquals(2, numPpt);
+      assertEquals(2, numXls);
+      assertEquals(2, numWord);
+
+      // Word
+      ext = (POIOLE2TextExtractor)
+      ExtractorFactory.createExtractor(docEmb);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      numWord = 0; numXls = 0; numPpt = 0;
+      assertEquals(4, embeds.length);
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+      }
+      assertEquals(1, numPpt);
+      assertEquals(2, numXls);
+      assertEquals(1, numWord);
+
+      // TODO - PowerPoint
+      // TODO - Visio
+      // TODO - Outlook
+   }
 }
index 7e693b0c8d9e3928bb798159df450bf0554f94c7..ea7335c5b17cf878e0be84943bd1d19bd8827b0c 100644 (file)
@@ -34,6 +34,7 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks;
 import org.apache.poi.hsmf.datatypes.StringChunk;
 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
 import org.apache.poi.hsmf.parsers.POIFSChunkParser;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -78,15 +79,24 @@ public class MAPIMessage extends POIDocument {
           this(new POIFSFileSystem(in));
        }
    /**
-    * Constructor for reading MSG Files from an input stream.
+    * Constructor for reading MSG Files from a POIFS filesystem
     * @param in
     * @throws IOException
     */
    public MAPIMessage(POIFSFileSystem fs) throws IOException {
-               super(fs);
-               
+               this(fs.getRoot(), fs);
+   }
+   /**
+    * Constructor for reading MSG Files from a certain
+    *  point within a POIFS filesystem
+    * @param in
+    * @throws IOException
+    */
+   public MAPIMessage(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+      super(poifsDir, fs);
+      
                // Grab all the chunks
-               ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs);
+               ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir);
                
                // Grab interesting bits
                ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();
index 63bbeb35182a8f7bc6e145230cfefb21b1bbbc15..fd72feaa56a0b645bbb6e96249f675f1d4eb67ac 100644 (file)
@@ -23,12 +23,16 @@ import java.text.SimpleDateFormat;
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hsmf.MAPIMessage;
 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 public class HSMFTextExtactor extends POIOLE2TextExtractor {
    public HSMFTextExtactor(MAPIMessage msg) {
       super(msg);
    }
+   public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+      this(new MAPIMessage(poifsDir, fs));
+   }
    public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
       this(new MAPIMessage(fs));
    }