source.dussan.org Git - poi.git/commitdiff
Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them
author Nick Burch <nick@apache.org>
Thu, 16 Dec 2010 07:39:21 +0000 (07:39 +0000)
committer Nick Burch <nick@apache.org>
Thu, 16 Dec 2010 07:39:21 +0000 (07:39 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049802 13f79535-47bb-0310-9956-ffa450edef68

src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
test-data/document/word_with_embeded_ooxml.doc [new file with mode: 0644]

index 52912848e51cddeac3e9f13bd20dc61d547472c3..57b9aa914fa446870da8c1ab53b75c65774527f5 100644 (file)
@@ -191,10 +191,11 @@ public class ExtractorFactory {
        throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
        }
        
-       public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
-               return createExtractor(fs.getRoot(), fs);
+       public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+          // Only ever an OLE2 one from the root of the FS
+               return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
        }
-       public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+       public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
                // Look for certain entries in the stream, to figure it
                //  out from
                for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
@@ -234,6 +235,12 @@ public class ExtractorFactory {
                        ) {
                           return new OutlookTextExtactor(poifsDir, fs);
                        }
+                       if(entry.getName().equals("Package")) {
+                          OPCPackage pkg = OPCPackage.open(
+                                poifsDir.createDocumentInputStream(entry.getName())
+                          );
+                          return createExtractor(pkg);
+                       }
                }
                throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
        }
@@ -246,7 +253,7 @@ public class ExtractorFactory {
         *  empty array. Otherwise, you'll get one open 
         *  {@link POITextExtractor} for each embeded file.
         */
-       public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+       public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
           // All the embded directories we spotted
                ArrayList<Entry> dirs = new ArrayList<Entry>();
                // For anything else not directly held in as a POIFS directory
index 4def3d326825743b7b18bb5c89765ab98bdc0aea..57574c6ab3d5f25745bf9ce59a35d8780a6a314e 100644 (file)
@@ -60,6 +60,7 @@ public class TestExtractorFactory extends TestCase {
    private File docx;
    private File dotx;
    private File docEmb;
+   private File docEmbOOXML;
 
    private File ppt;
    private File pptx;
@@ -88,6 +89,7 @@ public class TestExtractorFactory extends TestCase {
       docx = wpTests.getFile("SampleDoc.docx");
       dotx = wpTests.getFile("test.dotx");
       docEmb = wpTests.getFile("word_with_embeded.doc");
+      docEmbOOXML = wpTests.getFile("word_with_embeded_ooxml.doc");
 
       POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
       ppt = slTests.getFile("SampleShow.ppt");
@@ -536,7 +538,7 @@ public class TestExtractorFactory extends TestCase {
       embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
 
       assertEquals(6, embeds.length);
-      int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0;
+      int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
       for(int i=0; i<embeds.length; i++) {
          assertTrue(embeds[i].getText().length() > 20);
 
@@ -569,6 +571,27 @@ public class TestExtractorFactory extends TestCase {
       assertEquals(1, numWord);
       assertEquals(0, numMsg);
       
+      // Word which contains an OOXML file
+      ext = (POIOLE2TextExtractor)
+      ExtractorFactory.createExtractor(docEmbOOXML);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
+      assertEquals(3, embeds.length);
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+         else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
+         else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
+      }
+      assertEquals(1, numPpt);
+      assertEquals(1, numXls);
+      assertEquals(0, numWord);
+      assertEquals(1, numWordX);
+      assertEquals(0, numMsg);
+      
       // Outlook
       ext = (OutlookTextExtactor)
       ExtractorFactory.createExtractor(msgEmb);
diff --git a/test-data/document/word_with_embeded_ooxml.doc b/test-data/document/word_with_embeded_ooxml.doc
new file mode 100644 (file)
index 0000000..f25cacf
Binary files /dev/null and b/test-data/document/word_with_embeded_ooxml.doc differ