From: Nick Burch Date: Tue, 2 Sep 2008 19:37:52 +0000 (+0000) Subject: Initial ExtractorFactory support for building TextExtractors for embeded documents X-Git-Tag: REL_3_5_BETA3~35 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=e4ff06ec794e82ac0e7c50d71e4fe7dabbfc2ece;p=poi.git Initial ExtractorFactory support for building TextExtractors for embeded documents git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@691351 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index fc7206925c..14dcd3b4fc 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -41,6 +41,7 @@ --> + Initial ExtractorFactory support for building TextExtractors for embeded documents Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor 45539 - Improve XWPFWordExtractor to extract headers and footers diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 94d1a47207..e92ca9b704 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -38,6 +38,7 @@ --> + Initial ExtractorFactory support for building TextExtractors for embeded documents Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor 45539 - Improve XWPFWordExtractor to extract headers and footers diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java index d46c7e4aad..f198c19337 100644 --- a/src/java/org/apache/poi/POIOLE2TextExtractor.java +++ 
b/src/java/org/apache/poi/POIOLE2TextExtractor.java @@ -19,6 +19,7 @@ package org.apache.poi; import org.apache.poi.hpsf.DocumentSummaryInformation; import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * Common Parent for OLE2 based Text Extractors @@ -59,4 +60,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { public POITextExtractor getMetadataTextExtractor() { return new HPSFPropertiesExtractor(this); } + + /** + * Return the underlying POIFS FileSystem of + * this document. + */ + public POIFSFileSystem getFileSystem() { + return document.filesystem; + } } diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java index 0381a04573..b9750fc58a 100644 --- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java @@ -26,6 +26,7 @@ import org.apache.poi.hssf.usermodel.HSSFRichTextString; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -48,7 +49,10 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p this.wb = wb; } public ExcelExtractor(POIFSFileSystem fs) throws IOException { - this(new HSSFWorkbook(fs)); + this(fs.getRoot(), fs); + } + public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { + this(new HSSFWorkbook(dir, fs, true)); } diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 55a47065b9..8e07afd1fd 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ 
b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -18,9 +18,11 @@ package org.apache.poi.extractor; import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.PushbackInputStream; +import java.util.ArrayList; import java.util.Iterator; import org.apache.poi.POIOLE2TextExtractor; @@ -31,6 +33,8 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xslf.XSLFSlideShow; @@ -105,24 +109,95 @@ public class ExtractorFactory { } public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException { + return createExtractor(fs.getRoot(), fs); + } + public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException { // Look for certain entries in the stream, to figure it // out from - for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) { + for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) { Entry entry = (Entry)entries.next(); if(entry.getName().equals("Workbook")) { - return new ExcelExtractor(fs); + return new ExcelExtractor(poifsDir, fs); } if(entry.getName().equals("WordDocument")) { - return new WordExtractor(fs); + return new WordExtractor(poifsDir, fs); } if(entry.getName().equals("PowerPoint Document")) { - return new PowerPointExtractor(fs); + return new PowerPointExtractor(poifsDir, fs); } if(entry.getName().equals("VisioDocument")) { - return new VisioTextExtractor(fs); + return new VisioTextExtractor(poifsDir, fs); } } throw new 
IllegalArgumentException("No supported documents found in the OLE2 stream"); } + + + /** + * Returns an array of text extractors, one for each of + * the embedded documents in the file (if there are any). + * If there are no embedded documents, you'll get back an + * empty array. Otherwise, you'll get one open + * {@link POITextExtractor} for each embedded file. + */ + public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { + // Find all the embedded directories + ArrayList dirs = new ArrayList(); + POIFSFileSystem fs = ext.getFileSystem(); + if(fs == null) { + throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); + } + + if(ext instanceof ExcelExtractor) { + // These are in MBD... under the root + Iterator it = fs.getRoot().getEntries(); + while(it.hasNext()) { + Entry entry = (Entry)it.next(); + if(entry.getName().startsWith("MBD")) { + dirs.add(entry); + } + } + } else if(ext instanceof WordExtractor) { + // These are in ObjectPool -> _... 
under the root + try { + DirectoryEntry op = (DirectoryEntry) + fs.getRoot().getEntry("ObjectPool"); + Iterator it = op.getEntries(); + while(it.hasNext()) { + Entry entry = (Entry)it.next(); + if(entry.getName().startsWith("_")) { + dirs.add(entry); + } + } + } catch(FileNotFoundException e) {} + } else if(ext instanceof PowerPointExtractor) { + // Tricky, not stored directly in poifs + // TODO + } + + // Create the extractors + if(dirs == null || dirs.size() == 0) { + return new POITextExtractor[0]; + } + + POITextExtractor[] te = new POITextExtractor[dirs.size()]; + for(int i=0; i 20); + } + + // Word + f = new File(poifs_dir, "word_with_embeded.doc"); + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(f); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + assertEquals(4, embeds.length); + assertTrue(embeds[0] instanceof WordExtractor); + assertTrue(embeds[1] instanceof ExcelExtractor); + assertTrue(embeds[2] instanceof ExcelExtractor); + assertTrue(embeds[3] instanceof PowerPointExtractor); + for(int i=0; i 20); + } + + // TODO - PowerPoint + // TODO - Visio + } } diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java index bb58933de8..34a2f4f893 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java @@ -64,6 +64,10 @@ public class ChunkFactory { private void processChunkParseCommands() throws IOException { String line; InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName); + if(cpd == null) { + throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName); + } + BufferedReader inp = new BufferedReader(new InputStreamReader(cpd)); while( (line = inp.readLine()) != null ) { if(line.startsWith("#")) continue; diff --git 
a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java index 9b1307cee3..9b861d6d62 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java @@ -28,6 +28,7 @@ import org.apache.poi.hdgf.chunks.Chunk.Command; import org.apache.poi.hdgf.streams.ChunkStream; import org.apache.poi.hdgf.streams.PointerContainingStream; import org.apache.poi.hdgf.streams.Stream; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -44,7 +45,10 @@ public class VisioTextExtractor extends POIOLE2TextExtractor { this.hdgf = hdgf; } public VisioTextExtractor(POIFSFileSystem fs) throws IOException { - this(new HDGFDiagram(fs)); + this(fs.getRoot(), fs); + } + public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { + this(new HDGFDiagram(dir, fs)); this.fs = fs; } public VisioTextExtractor(InputStream inp) throws IOException { diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java index 841bd38f9c..cc0205647b 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java @@ -30,6 +30,7 @@ import org.apache.poi.hslf.model.Notes; import org.apache.poi.hslf.model.Slide; import org.apache.poi.hslf.model.TextRun; import org.apache.poi.hslf.usermodel.SlideShow; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -96,6 +97,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { public PowerPointExtractor(POIFSFileSystem fs) throws IOException { this(new HSLFSlideShow(fs)); } + public PowerPointExtractor(DirectoryNode 
dir, POIFSFileSystem fs) throws IOException { + this(new HSLFSlideShow(dir, fs)); + } /** * Creates a PowerPointExtractor, from a HSLFSlideShow diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index deaffc79af..816b2c1221 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -28,6 +28,7 @@ import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.usermodel.HeaderStories; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -58,6 +59,10 @@ public class WordExtractor extends POIOLE2TextExtractor { this(new HWPFDocument(fs)); this.fs = fs; } + public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { + this(new HWPFDocument(dir, fs)); + this.fs = fs; + } /** * Create a new Word Extractor diff --git a/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx new file mode 100755 index 0000000000..411de4a08c Binary files /dev/null and b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx differ