</release>
-->
<release version="3.5.1-beta2" date="2008-08-20">
+ <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
<action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
<action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
</release>
-->
<release version="3.5.1-beta2" date="2008-08-20">
+ <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
<action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
<action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Common Parent for OLE2 based Text Extractors
public POITextExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(this);
}
+
+ /**
+ * Returns the POIFS FileSystem held by the underlying
+ * document, i.e. the value of its <code>filesystem</code> field.
+ *
+ * @return the document's POIFS FileSystem (NOTE(review): may
+ *  presumably be null if the document never recorded one - confirm)
+ */
+ public POIFSFileSystem getFileSystem() {
+ return document.filesystem;
+ }
}
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
this.wb = wb;
}
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
- this(new HSSFWorkbook(fs));
+ this(fs.getRoot(), fs);
+ }
+ public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HSSFWorkbook(dir, fs, true));
}
import java.io.File;
import java.io.FileInputStream;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
+import java.util.ArrayList;
import java.util.Iterator;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.XSLFSlideShow;
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
+ return createExtractor(fs.getRoot(), fs);
+ }
+ public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
// Look for certain entries in the stream, to figure it
// out from
- for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
+ for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) {
Entry entry = (Entry)entries.next();
if(entry.getName().equals("Workbook")) {
- return new ExcelExtractor(fs);
+ return new ExcelExtractor(poifsDir, fs);
}
if(entry.getName().equals("WordDocument")) {
- return new WordExtractor(fs);
+ return new WordExtractor(poifsDir, fs);
}
if(entry.getName().equals("PowerPoint Document")) {
- return new PowerPointExtractor(fs);
+ return new PowerPointExtractor(poifsDir, fs);
}
if(entry.getName().equals("VisioDocument")) {
- return new VisioTextExtractor(fs);
+ return new VisioTextExtractor(poifsDir, fs);
}
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
+
+
+ /**
+  * Builds one text extractor for each document embedded in the
+  * OLE2 file behind the given extractor.
+  * <p>For an {@link ExcelExtractor} the embedded documents are the
+  * directories named <code>MBD...</code> directly under the root. For a
+  * {@link WordExtractor} they are the directories named <code>_...</code>
+  * inside the root's <code>ObjectPool</code> directory. PowerPoint
+  * embeddings are not handled yet, so a {@link PowerPointExtractor}
+  * (like any other extractor type) yields an empty array.</p>
+  * <p>Entries that match the name prefix but are not directories are
+  * ignored, as they cannot hold an embedded document.</p>
+  *
+  * @param ext the extractor whose underlying POIFS FileSystem is searched
+  * @return one open {@link POITextExtractor} per embedded document found,
+  *  or an empty array if there are none
+  * @throws IOException if an embedded document cannot be read
+  * @throws IllegalStateException if the extractor has no POIFS FileSystem
+  * @throws IllegalArgumentException if an embedded directory holds no
+  *  supported document type
+  */
+ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+     POIFSFileSystem fs = ext.getFileSystem();
+     if(fs == null) {
+         throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+     }
+
+     // Find all the embedded directories
+     ArrayList dirs = new ArrayList();
+     if(ext instanceof ExcelExtractor) {
+         // These are in MBD... under the root
+         addEmbeddedDirectories(fs.getRoot().getEntries(), "MBD", dirs);
+     } else if(ext instanceof WordExtractor) {
+         // These are in ObjectPool -> _... under the root
+         try {
+             Entry op = fs.getRoot().getEntry("ObjectPool");
+             // Guard the cast: only a directory can contain embedded documents
+             if(op instanceof DirectoryEntry) {
+                 addEmbeddedDirectories(((DirectoryEntry)op).getEntries(), "_", dirs);
+             }
+         } catch(FileNotFoundException e) {
+             // Deliberately ignored: no ObjectPool entry simply means
+             //  that the document has nothing embedded in it
+         }
+     } else if(ext instanceof PowerPointExtractor) {
+         // Tricky, not stored directly in poifs
+         // TODO
+     }
+
+     // Create the extractors (an empty list gives an empty array)
+     POITextExtractor[] te = new POITextExtractor[dirs.size()];
+     for(int i=0; i<te.length; i++) {
+         te[i] = createExtractor((DirectoryNode)dirs.get(i), fs);
+     }
+     return te;
+ }
+
+ /**
+  * Adds to <code>dirs</code> every entry that is a {@link DirectoryNode}
+  * and whose name starts with the given prefix.
+  */
+ private static void addEmbeddedDirectories(Iterator entries, String prefix, ArrayList dirs) {
+     while(entries.hasNext()) {
+         Entry entry = (Entry)entries.next();
+         if(entry instanceof DirectoryNode && entry.getName().startsWith(prefix)) {
+             dirs.add(entry);
+         }
+     }
+ }
+
+ /**
+ * Placeholder for building text extractors for the documents
+ * embedded in a file handled by a {@link POIXMLTextExtractor}.
+ * This is not implemented yet, so the method never returns
+ * normally at present.
+ *
+ * @param ext the extractor to find embedded documents for (currently unused)
+ * @return nothing at present, as the method always throws
+ * @throws IllegalStateException always, with the message "Not yet supported"
+ */
+ public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
+ throw new IllegalStateException("Not yet supported");
+ }
}
import java.io.FileInputStream;
import java.io.IOException;
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.POITextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
private String word_dir;
private String powerpoint_dir;
private String visio_dir;
+ private String poifs_dir;
private File txt;
word_dir = System.getProperty("HWPF.testdata.path");
powerpoint_dir = System.getProperty("HSLF.testdata.path");
visio_dir = System.getProperty("HDGF.testdata.path");
+ poifs_dir = System.getProperty("POIFS.testdata.path");
+ assertNotNull(excel_dir);
+ assertNotNull(word_dir);
+ assertNotNull(powerpoint_dir);
+ assertNotNull(visio_dir);
+ assertNotNull(poifs_dir);
txt = new File(powerpoint_dir, "SampleShow.txt");
// Good
}
}
+
+ /**
+ * Tests text extraction from embedded documents. For each
+ * sample file it checks the number of extractors returned,
+ * the concrete extractor type at each position, and that
+ * every extractor yields more than 20 characters of text.
+ * For now this only covers poifs embedded documents, but
+ * it will cover ooxml ones at some point.
+ */
+ public void testEmbeded() throws Exception {
+ POIOLE2TextExtractor ext;
+ POITextExtractor[] embeds;
+ File f;
+
+ // No embeddings: expect an empty array
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(xls);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+ assertEquals(0, embeds.length);
+
+ // Excel file with six embedded documents
+ f = new File(poifs_dir, "excel_with_embeded.xls");
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(f);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ assertEquals(6, embeds.length);
+ assertTrue(embeds[0] instanceof PowerPointExtractor);
+ assertTrue(embeds[1] instanceof ExcelExtractor);
+ assertTrue(embeds[2] instanceof ExcelExtractor);
+ assertTrue(embeds[3] instanceof PowerPointExtractor);
+ assertTrue(embeds[4] instanceof WordExtractor);
+ assertTrue(embeds[5] instanceof WordExtractor);
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+ }
+
+ // Word file with four embedded documents
+ f = new File(poifs_dir, "word_with_embeded.doc");
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(f);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ assertEquals(4, embeds.length);
+ assertTrue(embeds[0] instanceof WordExtractor);
+ assertTrue(embeds[1] instanceof ExcelExtractor);
+ assertTrue(embeds[2] instanceof ExcelExtractor);
+ assertTrue(embeds[3] instanceof PowerPointExtractor);
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+ }
+
+ // TODO - PowerPoint (embedded document lookup not implemented yet)
+ // TODO - Visio
+ }
}
private void processChunkParseCommands() throws IOException {
String line;
InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName);
+ if(cpd == null) {
+ throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName);
+ }
+
BufferedReader inp = new BufferedReader(new InputStreamReader(cpd));
while( (line = inp.readLine()) != null ) {
if(line.startsWith("#")) continue;
import org.apache.poi.hdgf.streams.ChunkStream;
import org.apache.poi.hdgf.streams.PointerContainingStream;
import org.apache.poi.hdgf.streams.Stream;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
this.hdgf = hdgf;
}
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
- this(new HDGFDiagram(fs));
+ this(fs.getRoot(), fs);
+ }
+ public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HDGFDiagram(dir, fs));
this.fs = fs;
}
public VisioTextExtractor(InputStream inp) throws IOException {
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
this(new HSLFSlideShow(fs));
}
+ public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HSLFSlideShow(dir, fs));
+ }
/**
* Creates a PowerPointExtractor, from a HSLFSlideShow
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
this(new HWPFDocument(fs));
this.fs = fs;
}
+ public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this(new HWPFDocument(dir, fs));
+ this.fs = fs;
+ }
/**
* Create a new Word Extractor