git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049802 13f79535-47bb-0310-9956-ffa450edef68
tags/REL_3_8_BETA1
@@ -191,10 +191,11 @@ public class ExtractorFactory { | |||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); | |||
} | |||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException { | |||
return createExtractor(fs.getRoot(), fs); | |||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { | |||
// Only ever an OLE2 one from the root of the FS | |||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs); | |||
} | |||
public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException { | |||
public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { | |||
// Look for certain entries in the stream, to figure it | |||
// out from | |||
for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) { | |||
@@ -234,6 +235,12 @@ public class ExtractorFactory { | |||
) { | |||
return new OutlookTextExtactor(poifsDir, fs); | |||
} | |||
if(entry.getName().equals("Package")) { | |||
OPCPackage pkg = OPCPackage.open( | |||
poifsDir.createDocumentInputStream(entry.getName()) | |||
); | |||
return createExtractor(pkg); | |||
} | |||
} | |||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); | |||
} | |||
@@ -246,7 +253,7 @@ public class ExtractorFactory { | |||
* empty array. Otherwise, you'll get one open | |||
* {@link POITextExtractor} for each embedded file. | |||
*/ | |||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { | |||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { | |||
// All the embedded directories we spotted | |||
ArrayList<Entry> dirs = new ArrayList<Entry>(); | |||
// For anything else not directly held in as a POIFS directory |
@@ -60,6 +60,7 @@ public class TestExtractorFactory extends TestCase { | |||
private File docx; | |||
private File dotx; | |||
private File docEmb; | |||
private File docEmbOOXML; | |||
private File ppt; | |||
private File pptx; | |||
@@ -88,6 +89,7 @@ public class TestExtractorFactory extends TestCase { | |||
docx = wpTests.getFile("SampleDoc.docx"); | |||
dotx = wpTests.getFile("test.dotx"); | |||
docEmb = wpTests.getFile("word_with_embeded.doc"); | |||
docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc"); | |||
POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); | |||
ppt = slTests.getFile("SampleShow.ppt"); | |||
@@ -536,7 +538,7 @@ public class TestExtractorFactory extends TestCase { | |||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); | |||
assertEquals(6, embeds.length); | |||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0; | |||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX; | |||
for(int i=0; i<embeds.length; i++) { | |||
assertTrue(embeds[i].getText().length() > 20); | |||
@@ -569,6 +571,27 @@ public class TestExtractorFactory extends TestCase { | |||
assertEquals(1, numWord); | |||
assertEquals(0, numMsg); | |||
// Word which contains an OOXML file | |||
ext = (POIOLE2TextExtractor) | |||
ExtractorFactory.createExtractor(docEmbOOXML); | |||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); | |||
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0; | |||
assertEquals(3, embeds.length); | |||
for(int i=0; i<embeds.length; i++) { | |||
assertTrue(embeds[i].getText().length() > 20); | |||
if(embeds[i] instanceof PowerPointExtractor) numPpt++; | |||
else if(embeds[i] instanceof ExcelExtractor) numXls++; | |||
else if(embeds[i] instanceof WordExtractor) numWord++; | |||
else if(embeds[i] instanceof OutlookTextExtactor) numMsg++; | |||
else if(embeds[i] instanceof XWPFWordExtractor) numWordX++; | |||
} | |||
assertEquals(1, numPpt); | |||
assertEquals(1, numXls); | |||
assertEquals(0, numWord); | |||
assertEquals(1, numWordX); | |||
assertEquals(0, numMsg); | |||
// Outlook | |||
ext = (OutlookTextExtactor) | |||
ExtractorFactory.createExtractor(msgEmb); |