source.dussan.org Git - poi.git/commitdiff
Initial ExtractorFactory support for building TextExtractors for embeded documents
author Nick Burch <nick@apache.org>
Tue, 2 Sep 2008 19:37:52 +0000 (19:37 +0000)
committer Nick Burch <nick@apache.org>
Tue, 2 Sep 2008 19:37:52 +0000 (19:37 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@691351 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/changes.xml
src/documentation/content/xdocs/status.xml
src/java/org/apache/poi/POIOLE2TextExtractor.java
src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx [new file with mode: 0755]

index fc7206925ca5356508de964e28b0c61e4c2ae64a..14dcd3b4fc8088efdbacf34749a4874a3a793f1b 100644 (file)
@@ -41,6 +41,7 @@
         </release>
 -->
         <release version="3.5.1-beta2" date="2008-08-20">
+           <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embeded documents</action>
            <action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
            <action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
            <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
index 94d1a47207a7309713203a657226147f1e4ba24a..e92ca9b704d99c42861ee15aa9b19a06761082de 100644 (file)
@@ -38,6 +38,7 @@
         </release>
 -->
         <release version="3.5.1-beta2" date="2008-08-20">
+           <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embeded documents</action>
            <action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
            <action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
            <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
index d46c7e4aadd893a74d42736493eedb6b9f8c6776..f198c193372f7a29d36dbe4a0190b43c65bf9aa3 100644 (file)
@@ -19,6 +19,7 @@ package org.apache.poi;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
  * Common Parent for OLE2 based Text Extractors
@@ -59,4 +60,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
        public POITextExtractor getMetadataTextExtractor() {
                return new HPSFPropertiesExtractor(this);
        }
+
+       /**
+        * Returns the underlying POIFS FileSystem of this document, as held by the
+        *  wrapped document object. NOTE(review): may be null; ExtractorFactory null-checks it.
+        */
+       public POIFSFileSystem getFileSystem() {
+               return document.filesystem;
+       }
 }
index 0381a0457341b544a8006ea21d58e9b73befdbb2..b9750fc58a98781af32795fd8d00923fb2daed39 100644 (file)
@@ -26,6 +26,7 @@ import org.apache.poi.hssf.usermodel.HSSFRichTextString;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -48,7 +49,10 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                this.wb = wb;
        }
        public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-               this(new HSSFWorkbook(fs));
+               this(fs.getRoot(), fs);
+       }
+       public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+               this(new HSSFWorkbook(dir, fs, true));
        }
        
 
index 55a47065b9f76252a1b948b1d38e10fe424c97e4..8e07afd1fdc6dd291ea72db14845a2561d1e87dc 100644 (file)
@@ -18,9 +18,11 @@ package org.apache.poi.extractor;
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
+import java.util.ArrayList;
 import java.util.Iterator;
 
 import org.apache.poi.POIOLE2TextExtractor;
@@ -31,6 +33,8 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xslf.XSLFSlideShow;
@@ -105,24 +109,95 @@ public class ExtractorFactory {
        }
        
        public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
+               return createExtractor(fs.getRoot(), fs);
+       }
+       public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
                // Look for certain entries in the stream, to figure it
                //  out from
-               for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
+               for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) {
                        Entry entry = (Entry)entries.next();
                        
                        if(entry.getName().equals("Workbook")) {
-                               return new ExcelExtractor(fs);
+                               return new ExcelExtractor(poifsDir, fs);
                        }
                        if(entry.getName().equals("WordDocument")) {
-                               return new WordExtractor(fs);
+                               return new WordExtractor(poifsDir, fs);
                        }
                        if(entry.getName().equals("PowerPoint Document")) {
-                               return new PowerPointExtractor(fs);
+                               return new PowerPointExtractor(poifsDir, fs);
                        }
                        if(entry.getName().equals("VisioDocument")) {
-                               return new VisioTextExtractor(fs);
+                               return new VisioTextExtractor(poifsDir, fs);
                        }
                }
                throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
        }
+       
+       
+       /**
+        * Builds one open {@link POITextExtractor} per embedded document found in
+        *  the POIFS filesystem behind the given OLE2 extractor. Only Excel parents
+        *  (root entries named MBD...) and Word parents (ObjectPool entries named
+        *  _...) are searched; any other extractor type yields an empty array.
+        * Throws IllegalStateException if the extractor has no POIFS filesystem.
+        */
+       public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+               // Collect the directory entries that hold embedded documents
+               ArrayList dirs = new ArrayList();
+               POIFSFileSystem fs = ext.getFileSystem();
+               if(fs == null) {
+                       throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+               }
+               
+               if(ext instanceof ExcelExtractor) {
+                       // Excel: embedded documents are the root entries whose name starts with MBD
+                       Iterator it = fs.getRoot().getEntries();
+                       while(it.hasNext()) {
+                               Entry entry = (Entry)it.next();
+                               if(entry.getName().startsWith("MBD")) {
+                                       dirs.add(entry);
+                               }
+                       }
+               } else if(ext instanceof WordExtractor) {
+                       // Word: embedded documents are the ObjectPool entries whose name starts with _
+                       try {
+                               DirectoryEntry op = (DirectoryEntry)
+                                       fs.getRoot().getEntry("ObjectPool");
+                               Iterator it = op.getEntries();
+                               while(it.hasNext()) {
+                                       Entry entry = (Entry)it.next();
+                                       if(entry.getName().startsWith("_")) {
+                                               dirs.add(entry);
+                                       }
+                               }
+                       } catch(FileNotFoundException e) {} // deliberately ignored: presumably means there is no ObjectPool entry, so nothing is embedded
+               } else if(ext instanceof PowerPointExtractor) {
+                       // Tricky, not stored directly in poifs
+                       // TODO - not implemented, so PowerPoint parents currently return an empty array
+               }
+               
+               // Create the extractors (dirs is assigned above and never null, so the null test is purely defensive)
+               if(dirs == null || dirs.size() == 0) {
+                       return new POITextExtractor[0];
+               }
+               
+               POITextExtractor[] te = new POITextExtractor[dirs.size()];
+               for(int i=0; i<te.length; i++) { // createExtractor throws IllegalArgumentException if a directory holds no supported document
+                       te[i] = createExtractor(
+                                       (DirectoryNode)dirs.get(i), ext.getFileSystem() // NOTE(review): assumes every collected entry is a DirectoryNode - confirm
+                       );
+               }
+               return te;
+       }
+
+       /**
+        * Placeholder for embedded-document extraction from OOXML files.
+        * The intended contract is one open {@link POITextExtractor} per
+        *  embedded document (an empty array when there are none), but
+        *  this is not implemented yet: every call currently throws an
+        *  IllegalStateException with the message "Not yet supported".
+        */
+       public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
+               throw new IllegalStateException("Not yet supported");
+       }
 }
index 762eb92ec7cb4760d1e1747770dbb600dd3cd73e..e3db8edcd896e48e5a48b97ca9952d77074e4dab 100644 (file)
@@ -20,6 +20,8 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.POITextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
@@ -42,6 +44,7 @@ public class TestExtractorFactory extends TestCase {
        private String word_dir;
        private String powerpoint_dir;
        private String visio_dir;
+       private String poifs_dir;
        
        private File txt;
        
@@ -63,6 +66,12 @@ public class TestExtractorFactory extends TestCase {
                word_dir = System.getProperty("HWPF.testdata.path");
                powerpoint_dir = System.getProperty("HSLF.testdata.path");
                visio_dir = System.getProperty("HDGF.testdata.path");
+               poifs_dir = System.getProperty("POIFS.testdata.path");
+               assertNotNull(excel_dir);
+               assertNotNull(word_dir);
+               assertNotNull(powerpoint_dir);
+               assertNotNull(visio_dir);
+               assertNotNull(poifs_dir);
                
                txt = new File(powerpoint_dir, "SampleShow.txt");
                
@@ -300,4 +309,56 @@ public class TestExtractorFactory extends TestCase {
                        // Good
                }
        }
+
+       /**
+        * Tests text extraction from embedded documents. For now this
+        *  only covers documents embedded in POIFS (OLE2) files; the
+        *  OOXML equivalents are to be added at some point.
+        */
+       public void testEmbeded() throws Exception {
+               POIOLE2TextExtractor ext;
+               POITextExtractor[] embeds;
+               File f;
+               
+               // No embeddings: expect an empty array
+               ext = (POIOLE2TextExtractor)
+                               ExtractorFactory.createExtractor(xls);
+               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+               assertEquals(0, embeds.length);
+               
+               // Excel parent: six embedded documents, in this order
+               f = new File(poifs_dir, "excel_with_embeded.xls");
+               ext = (POIOLE2TextExtractor)
+                               ExtractorFactory.createExtractor(f);
+               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+               
+               assertEquals(6, embeds.length);
+               assertTrue(embeds[0] instanceof PowerPointExtractor);
+               assertTrue(embeds[1] instanceof ExcelExtractor);
+               assertTrue(embeds[2] instanceof ExcelExtractor);
+               assertTrue(embeds[3] instanceof PowerPointExtractor);
+               assertTrue(embeds[4] instanceof WordExtractor);
+               assertTrue(embeds[5] instanceof WordExtractor);
+               for(int i=0; i<embeds.length; i++) { // each embedded document must yield more than 20 characters of text
+                       assertTrue(embeds[i].getText().length() > 20);
+               }
+               
+               // Word parent: four embedded documents, in this order
+               f = new File(poifs_dir, "word_with_embeded.doc");
+               ext = (POIOLE2TextExtractor)
+                               ExtractorFactory.createExtractor(f);
+               embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+               
+               assertEquals(4, embeds.length);
+               assertTrue(embeds[0] instanceof WordExtractor);
+               assertTrue(embeds[1] instanceof ExcelExtractor);
+               assertTrue(embeds[2] instanceof ExcelExtractor);
+               assertTrue(embeds[3] instanceof PowerPointExtractor);
+               for(int i=0; i<embeds.length; i++) { // each embedded document must yield more than 20 characters of text
+                       assertTrue(embeds[i].getText().length() > 20);
+               }
+               
+               // TODO - PowerPoint
+               // TODO - Visio
+       }
 }
index bb58933de8aa3ecc2c37a01d43eff31bd0ec232a..34a2f4f89360dc86f4a384771fb02177a1665446 100644 (file)
@@ -64,6 +64,10 @@ public class ChunkFactory {
        private void processChunkParseCommands() throws IOException {
                String line;
                InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName);
+               if(cpd == null) {
+                       throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName);
+               }
+               
                BufferedReader inp = new BufferedReader(new InputStreamReader(cpd));
                while( (line = inp.readLine()) != null ) {
                        if(line.startsWith("#")) continue;
index 9b1307cee34ce9d2f952a0e2b66e685f17f6fa60..9b861d6d62e83f2230cf046ac253b82186cd26b9 100644 (file)
@@ -28,6 +28,7 @@ import org.apache.poi.hdgf.chunks.Chunk.Command;
 import org.apache.poi.hdgf.streams.ChunkStream;
 import org.apache.poi.hdgf.streams.PointerContainingStream;
 import org.apache.poi.hdgf.streams.Stream;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -44,7 +45,10 @@ public class VisioTextExtractor extends POIOLE2TextExtractor {
                this.hdgf = hdgf;
        }
        public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
-               this(new HDGFDiagram(fs));
+               this(fs.getRoot(), fs);
+       }
+       public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+               this(new HDGFDiagram(dir, fs));
                this.fs = fs;
        }
        public VisioTextExtractor(InputStream inp) throws IOException {
index 841bd38f9c785b2c16d287fb40e05b703cd482ae..cc0205647b4d063314c2739a312848317d78e2a3 100644 (file)
@@ -30,6 +30,7 @@ import org.apache.poi.hslf.model.Notes;
 import org.apache.poi.hslf.model.Slide;
 import org.apache.poi.hslf.model.TextRun;
 import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -96,6 +97,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
        public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
                this(new HSLFSlideShow(fs));
        }
+       public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+               this(new HSLFSlideShow(dir, fs));
+       }
 
        /**
         * Creates a PowerPointExtractor, from a HSLFSlideShow
index deaffc79afba891ae1c89d91d1a770203e03f81a..816b2c1221b55d686b00eed4f767f4e5235a1d44 100644 (file)
@@ -28,6 +28,7 @@ import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.HeaderStories;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -58,6 +59,10 @@ public class WordExtractor extends POIOLE2TextExtractor {
                this(new HWPFDocument(fs));
                this.fs = fs;
        }
+       public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+               this(new HWPFDocument(dir, fs));
+               this.fs = fs;
+       }
        
        /**
         * Create a new Word Extractor
diff --git a/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx
new file mode 100755 (executable)
index 0000000..411de4a
Binary files /dev/null and b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx differ