Initial ExtractorFactory support for building TextExtractors for embedded documents

author Nick Burch <nick@apache.org>

Tue, 2 Sep 2008 19:37:52 +0000 (19:37 +0000)

committer Nick Burch <nick@apache.org>

Tue, 2 Sep 2008 19:37:52 +0000 (19:37 +0000)
author Nick Burch <nick@apache.org>
Tue, 2 Sep 2008 19:37:52 +0000 (19:37 +0000)
committer Nick Burch <nick@apache.org>
Tue, 2 Sep 2008 19:37:52 +0000 (19:37 +0000)
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml

index fc7206925ca5356508de964e28b0c61e4c2ae64a..14dcd3b4fc8088efdbacf34749a4874a3a793f1b 100644 (file)
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -41,6 +41,7 @@
          </release>
  -->
          <release version="3.5.1-beta2" date="2008-08-20">
+           <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
             <action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
             <action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
             <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 94d1a47207a7309713203a657226147f1e4ba24a..e92ca9b704d99c42861ee15aa9b19a06761082de 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -38,6 +38,7 @@
          </release>
  -->
          <release version="3.5.1-beta2" date="2008-08-20">
+           <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
             <action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
             <action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
             <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java

index d46c7e4aadd893a74d42736493eedb6b9f8c6776..f198c193372f7a29d36dbe4a0190b43c65bf9aa3 100644 (file)
--- a/src/java/org/apache/poi/POIOLE2TextExtractor.java
+++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java
@@ -19,6 +19,7 @@ package org.apache.poi;
  import org.apache.poi.hpsf.DocumentSummaryInformation;
  import org.apache.poi.hpsf.SummaryInformation;
  import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  /**
   * Common Parent for OLE2 based Text Extractors
@@ -59,4 +60,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
         public POITextExtractor getMetadataTextExtractor() {
                 return new HPSFPropertiesExtractor(this);
         }
+
+       /**
+        * Return the underlying POIFS FileSystem of
+        *  this document.
+        */
+       public POIFSFileSystem getFileSystem() {
+               return document.filesystem;
+       }
  }
diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java

index 0381a0457341b544a8006ea21d58e9b73befdbb2..b9750fc58a98781af32795fd8d00923fb2daed39 100644 (file)
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@@ -26,6 +26,7 @@ import org.apache.poi.hssf.usermodel.HSSFRichTextString;
  import org.apache.poi.hssf.usermodel.HSSFRow;
  import org.apache.poi.hssf.usermodel.HSSFSheet;
  import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  /**
@@ -48,7 +49,10 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                 this.wb = wb;
         }
         public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-               this(new HSSFWorkbook(fs));
+               this(fs.getRoot(), fs);
+       }
+       public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+               this(new HSSFWorkbook(dir, fs, true));
         }
         
  
diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java

index 55a47065b9f76252a1b948b1d38e10fe424c97e4..8e07afd1fdc6dd291ea72db14845a2561d1e87dc 100644 (file)
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -18,9 +18,11 @@ package org.apache.poi.extractor;
  
  import java.io.File;
  import java.io.FileInputStream;
+import java.io.FileNotFoundException;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.PushbackInputStream;
+import java.util.ArrayList;
  import java.util.Iterator;
  
  import org.apache.poi.POIOLE2TextExtractor;
@@ -31,6 +33,8 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  import org.apache.poi.hslf.extractor.PowerPointExtractor;
  import org.apache.poi.hssf.extractor.ExcelExtractor;
  import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.Entry;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  import org.apache.poi.xslf.XSLFSlideShow;
@@ -105,24 +109,95 @@ public class ExtractorFactory {
         }
         
         public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
+               return createExtractor(fs.getRoot(), fs);
+       }
+       public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
                 // Look for certain entries in the stream, to figure it
                 //  out from
-               for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
+               for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) {
                         Entry entry = (Entry)entries.next();
                         
                         if(entry.getName().equals("Workbook")) {
-                               return new ExcelExtractor(fs);
+                               return new ExcelExtractor(poifsDir, fs);
                         }
                         if(entry.getName().equals("WordDocument")) {
-                               return new WordExtractor(fs);
+                               return new WordExtractor(poifsDir, fs);
                         }
                         if(entry.getName().equals("PowerPoint Document")) {
-                               return new PowerPointExtractor(fs);
+                               return new PowerPointExtractor(poifsDir, fs);
                         }
                         if(entry.getName().equals("VisioDocument")) {
-                               return new VisioTextExtractor(fs);
+                               return new VisioTextExtractor(poifsDir, fs);
                         }
                 }
                 throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
         }
+       
+       
+       /**
+        * Returns an array of text extractors, one for each of
+        *  the embedded documents in the file (if there are any).
+        * If there are no embedded documents, you'll get back an
+        *  empty array. Otherwise, you'll get one open 
+        *  {@link POITextExtractor} for each embedded file.
+        */
+       public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+               // Find all the embedded directories
+               ArrayList dirs = new ArrayList();
+               POIFSFileSystem fs = ext.getFileSystem();
+               if(fs == null) {
+                       throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+               }
+               
+               if(ext instanceof ExcelExtractor) {
+                       // These are in MBD... under the root
+                       Iterator it = fs.getRoot().getEntries();
+                       while(it.hasNext()) {
+                               Entry entry = (Entry)it.next();
+                               if(entry.getName().startsWith("MBD")) {
+                                       dirs.add(entry);
+                               }
+                       }
+               } else if(ext instanceof WordExtractor) {
+                       // These are in ObjectPool -> _... under the root
+                       try {
+                               DirectoryEntry op = (DirectoryEntry)
+                                       fs.getRoot().getEntry("ObjectPool");
+                               Iterator it = op.getEntries();
+                               while(it.hasNext()) {
+                                       Entry entry = (Entry)it.next();
+                                       if(entry.getName().startsWith("_")) {
+                                               dirs.add(entry);
+                                       }
+                               }
+                       } catch(FileNotFoundException e) {}
+               } else if(ext instanceof PowerPointExtractor) {
+                       // Tricky, not stored directly in poifs
+                       // TODO
+               }
+               
+               // Create the extractors
+               if(dirs == null || dirs.size() == 0) {
+                       return new POITextExtractor[0];
+               }
+               
+               POITextExtractor[] te = new POITextExtractor[dirs.size()];
+               for(int i=0; i<te.length; i++) {
+                       te[i] = createExtractor(
+                                       (DirectoryNode)dirs.get(i), ext.getFileSystem()
+                       );
+               }
+               return te;
+       }
+
+       /**
+        * Returns an array of text extractors, one for each of
+        *  the embedded documents in the file (if there are any).
+        * If there are no embedded documents, you'll get back an
+        *  empty array. Otherwise, you'll get one open 
+        *  {@link POITextExtractor} for each embedded file.
+        */
+       public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
+               throw new IllegalStateException("Not yet supported");
+       }
  }
diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java

index 762eb92ec7cb4760d1e1747770dbb600dd3cd73e..e3db8edcd896e48e5a48b97ca9952d77074e4dab 100644 (file)
--- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
+++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
@@ -20,6 +20,8 @@ import java.io.File;
  import java.io.FileInputStream;
  import java.io.IOException;
  
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.POITextExtractor;
  import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  import org.apache.poi.hslf.extractor.PowerPointExtractor;
  import org.apache.poi.hssf.extractor.ExcelExtractor;
@@ -42,6 +44,7 @@ public class TestExtractorFactory extends TestCase {
         private String word_dir;
         private String powerpoint_dir;
         private String visio_dir;
+       private String poifs_dir;
         
         private File txt;
         
@@ -63,6 +66,12 @@ public class TestExtractorFactory extends TestCase {
                 word_dir = System.getProperty("HWPF.testdata.path");
                 powerpoint_dir = System.getProperty("HSLF.testdata.path");
                 visio_dir = System.getProperty("HDGF.testdata.path");
+               poifs_dir = System.getProperty("POIFS.testdata.path");
+               assertNotNull(excel_dir);
+               assertNotNull(word_dir);
+               assertNotNull(powerpoint_dir);
+               assertNotNull(visio_dir);
+               assertNotNull(poifs_dir);
                 
                 txt = new File(powerpoint_dir, "SampleShow.txt");
                 
@@ -300,4 +309,56 @@ public class TestExtractorFactory extends TestCase {
                         // Good
                 }
         }
+
+       /**
+        * Test embedded docs text extraction. For now, only
+        *  does poifs embedded, but will do ooxml ones 
+        *  at some point.
+        */
+       public void testEmbeded() throws Exception {
+               POIOLE2TextExtractor ext;
+               POITextExtractor[] embeds;
+               File f;
+               
+               // No embeddings
+               ext = (POIOLE2TextExtractor)
+                               ExtractorFactory.createExtractor(xls);
+               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+               assertEquals(0, embeds.length);
+               
+               // Excel
+               f = new File(poifs_dir, "excel_with_embeded.xls");
+               ext = (POIOLE2TextExtractor)
+                               ExtractorFactory.createExtractor(f);
+               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+               
+               assertEquals(6, embeds.length);
+               assertTrue(embeds[0] instanceof PowerPointExtractor);
+               assertTrue(embeds[1] instanceof ExcelExtractor);
+               assertTrue(embeds[2] instanceof ExcelExtractor);
+               assertTrue(embeds[3] instanceof PowerPointExtractor);
+               assertTrue(embeds[4] instanceof WordExtractor);
+               assertTrue(embeds[5] instanceof WordExtractor);
+               for(int i=0; i<embeds.length; i++) {
+                       assertTrue(embeds[i].getText().length() > 20);
+               }
+               
+               // Word
+               f = new File(poifs_dir, "word_with_embeded.doc");
+               ext = (POIOLE2TextExtractor)
+                               ExtractorFactory.createExtractor(f);
+               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+               
+               assertEquals(4, embeds.length);
+               assertTrue(embeds[0] instanceof WordExtractor);
+               assertTrue(embeds[1] instanceof ExcelExtractor);
+               assertTrue(embeds[2] instanceof ExcelExtractor);
+               assertTrue(embeds[3] instanceof PowerPointExtractor);
+               for(int i=0; i<embeds.length; i++) {
+                       assertTrue(embeds[i].getText().length() > 20);
+               }
+               
+               // TODO - PowerPoint
+               // TODO - Visio
+       }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java

index bb58933de8aa3ecc2c37a01d43eff31bd0ec232a..34a2f4f89360dc86f4a384771fb02177a1665446 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
@@ -64,6 +64,10 @@ public class ChunkFactory {
         private void processChunkParseCommands() throws IOException {
                 String line;
                 InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName);
+               if(cpd == null) {
+                       throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName);
+               }
+               
                 BufferedReader inp = new BufferedReader(new InputStreamReader(cpd));
                 while( (line = inp.readLine()) != null ) {
                         if(line.startsWith("#")) continue;
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java

index 9b1307cee34ce9d2f952a0e2b66e685f17f6fa60..9b861d6d62e83f2230cf046ac253b82186cd26b9 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
@@ -28,6 +28,7 @@ import org.apache.poi.hdgf.chunks.Chunk.Command;
  import org.apache.poi.hdgf.streams.ChunkStream;
  import org.apache.poi.hdgf.streams.PointerContainingStream;
  import org.apache.poi.hdgf.streams.Stream;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  /**
@@ -44,7 +45,10 @@ public class VisioTextExtractor extends POIOLE2TextExtractor {
                 this.hdgf = hdgf;
         }
         public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
-               this(new HDGFDiagram(fs));
+               this(fs.getRoot(), fs);
+       }
+       public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+               this(new HDGFDiagram(dir, fs));
                 this.fs = fs;
         }
         public VisioTextExtractor(InputStream inp) throws IOException {
diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java

index 841bd38f9c785b2c16d287fb40e05b703cd482ae..cc0205647b4d063314c2739a312848317d78e2a3 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@@ -30,6 +30,7 @@ import org.apache.poi.hslf.model.Notes;
  import org.apache.poi.hslf.model.Slide;
  import org.apache.poi.hslf.model.TextRun;
  import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  /**
@@ -96,6 +97,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
         public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
                 this(new HSLFSlideShow(fs));
         }
+       public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+               this(new HSLFSlideShow(dir, fs));
+       }
  
         /**
          * Creates a PowerPointExtractor, from a HSLFSlideShow
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

index deaffc79afba891ae1c89d91d1a770203e03f81a..816b2c1221b55d686b00eed4f767f4e5235a1d44 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -28,6 +28,7 @@ import org.apache.poi.hwpf.model.TextPiece;
  import org.apache.poi.hwpf.usermodel.HeaderStories;
  import org.apache.poi.hwpf.usermodel.Paragraph;
  import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  /**
@@ -58,6 +59,10 @@ public class WordExtractor extends POIOLE2TextExtractor {
                 this(new HWPFDocument(fs));
                 this.fs = fs;
         }
+       public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+               this(new HWPFDocument(dir, fs));
+               this.fs = fs;
+       }
         
         /**
          * Create a new Word Extractor
diff --git a/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx

new file mode 100755 (executable)

index 0000000..411de4a

Binary files /dev/null and b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx differ
author	Nick Burch <nick@apache.org>
	Tue, 2 Sep 2008 19:37:52 +0000 (19:37 +0000)
committer	Nick Burch <nick@apache.org>
	Tue, 2 Sep 2008 19:37:52 +0000 (19:37 +0000)
src/documentation/content/xdocs/changes.xml		patch \| blob \| history
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/java/org/apache/poi/POIOLE2TextExtractor.java		patch \| blob \| history
src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java		patch \| blob \| history
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java		patch \| blob \| history
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java		patch \| blob \| history
src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx	[new file with mode: 0755]	patch \| blob