throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
}
- public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
- return createExtractor(fs.getRoot(), fs);
+ public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+ // Delegates to the DirectoryNode overload using the root of the FS; the cast relies on that call only ever yielding an OLE2 extractor here
+ return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs); // NOTE(review): the overload returns the OPCPackage-based extractor when it meets a "Package" entry - confirm that cannot occur at the root, otherwise this cast may fail
}
- public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+ public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// Walk the entries of the directory and use their names to
// work out which kind of document this is
for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
) {
return new OutlookTextExtactor(poifsDir, fs);
}
+ if(entry.getName().equals("Package")) { // An entry named "Package" is opened as an OPC package rather than read as OLE2
+ OPCPackage pkg = OPCPackage.open(
+ poifsDir.createDocumentInputStream(entry.getName()) // NOTE(review): this stream is not closed in the visible code - confirm OPCPackage.open does not require the caller to close it
+ );
+ return createExtractor(pkg); // Hands over to the OPCPackage overload; presumably why the return type was widened to POITextExtractor
+ }
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); // Reached only when the loop ends without any branch returning an extractor
}
* empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embedded file.
*/
- public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
private File docx;
private File dotx;
private File docEmb;
+ private File docEmbOOXML;
private File ppt;
private File pptx;
docx = wpTests.getFile("SampleDoc.docx");
dotx = wpTests.getFile("test.dotx");
docEmb = wpTests.getFile("word_with_embeded.doc");
+ docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc");
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
ppt = slTests.getFile("SampleShow.ppt");
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(6, embeds.length);
- int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0;
+ int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
for(int i=0; i<embeds.length; i++) {
assertTrue(embeds[i].getText().length() > 20);
assertEquals(1, numWord);
assertEquals(0, numMsg);
+ // Word which contains an OOXML file
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(docEmbOOXML);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
+ assertEquals(3, embeds.length);
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+ if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+ else if(embeds[i] instanceof ExcelExtractor) numXls++;
+ else if(embeds[i] instanceof WordExtractor) numWord++;
+ else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
+ else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
+ }
+ assertEquals(1, numPpt);
+ assertEquals(1, numXls);
+ assertEquals(0, numWord);
+ assertEquals(1, numWordX);
+ assertEquals(0, numMsg);
+
// Outlook
ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmb);