source.dussan.org Git - poi.git/commitdiff
#62319 - Decommission XSLF-/PowerPointExtractor
author Andreas Beeker <kiwiwings@apache.org>
Fri, 20 Apr 2018 12:52:59 +0000 (12:52 +0000)
committer Andreas Beeker <kiwiwings@apache.org>
Fri, 20 Apr 2018 12:52:59 +0000 (12:52 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1829653 13f79535-47bb-0310-9956-ffa450edef68

27 files changed:
src/integrationtest/org/apache/poi/TestAllFiles.java
src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
src/java/org/apache/poi/POIOLE2TextExtractor.java
src/java/org/apache/poi/POITextExtractor.java
src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
src/java/org/apache/poi/poifs/filesystem/DocumentFactoryHelper.java
src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java
src/java/org/apache/poi/sl/usermodel/SlideShow.java
src/java/org/apache/poi/sl/usermodel/SlideShowFactory.java
src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java
src/ooxml/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java
src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFPlaceholderDetails.java
src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFSlide.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
src/ooxml/testcases/org/apache/poi/poifs/crypt/TestHxxFEncryption.java
src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java
src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java
src/scratchpad/src/org/apache/poi/extractor/OLE2ScratchpadExtractorFactory.java
src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java
src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowFactory.java
src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java
src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java
test-data/slideshow/SampleShow.pptx

index 333a8ebcdc611aa4d1f74a552130388c881844e8..8721c50948c0c07ef51accb655a3a219bc513505 100644 (file)
@@ -330,8 +330,6 @@ public class TestAllFiles {
     );
 
     private static final Set<String> IGNORED = unmodifiableHashSet(
-        // need JDK8+ - https://bugs.openjdk.java.net/browse/JDK-8038081
-        "slideshow/42474-2.ppt",
         // OPC handler works / XSSF handler fails
         "spreadsheet/57181.xlsm",
         "spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop
index 7aab657650d322f9dfb5aa1f4d4c06873f4fe81e..80fe858249b9a732945f4c384585e1a343bf69f2 100644 (file)
@@ -24,6 +24,7 @@ import java.io.FileInputStream;
 import java.io.InputStream;
 
 import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@@ -53,12 +54,19 @@ public class XSLFFileHandler extends SlideShowHandler {
         
         // additionally try the other getText() methods
 
-               try (XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file)) {
+               try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
                        assertNotNull(extractor);
+                       extractor.setSlidesByDefault(true);
+                       extractor.setNotesByDefault(true);
+                       extractor.setMasterByDefault(true);
 
-                       assertNotNull(extractor.getText(true, true, true));
-                       assertEquals("With all options disabled we should not get text",
-                                       "", extractor.getText(false, false, false));
+                       assertNotNull(extractor.getText());
+
+                       extractor.setSlidesByDefault(false);
+                       extractor.setNotesByDefault(false);
+                       extractor.setMasterByDefault(false);
+
+                       assertEquals("With all options disabled we should not get text", "", extractor.getText());
                }
     }
 
index 05c7781057d87e3f5f8be1d3792033b742b62185..0fccf71c4a3df96c5286821c6879ac47630f4811 100644 (file)
@@ -105,6 +105,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
      *
      * @return the underlying POIDocument
      */
+    @Override
     public POIDocument getDocument() {
         return document;
     }
index 58cf5e6ddbcbe54232627a4a9d487e1394f8e85b..55d0832f16a3d2d5f95af9ac06ff0f954345feee 100644 (file)
@@ -74,4 +74,9 @@ public abstract class POITextExtractor implements Closeable {
                    fsToClose.close();
                }
        }
+
+       /**
+        * @return the processed document
+        */
+       public abstract Object getDocument();
 }
index 5f895dc3752896ff1b9e99fc3e778816c2eb6f0c..5e52f9d6c2ab07b252f36c27ce93f53266d14246 100644 (file)
@@ -115,26 +115,23 @@ public class OLE2ExtractorFactory {
         return threadPreferEventExtractors.get();
     }
 
-    public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
-        // Only ever an OLE2 one from the root of the FS
-        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
+        return (T)createExtractor(fs.getRoot());
     }
-    public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException {
-        // Only ever an OLE2 one from the root of the FS
-        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException {
+        return (T)createExtractor(fs.getRoot());
     }
-    public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException {
-        // Only ever an OLE2 one from the root of the FS
-        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException {
+        return (T)createExtractor(fs.getRoot());
     }
 
-    public static POITextExtractor createExtractor(InputStream input) throws IOException {
+    public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
         Class<?> cls = getOOXMLClass();
         if (cls != null) {
             // Use Reflection to get us the full OOXML-enabled version
             try {
                 Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
-                return (POITextExtractor)m.invoke(null, input);
+                return (T)m.invoke(null, input);
             } catch (IllegalArgumentException iae) {
                 throw iae;
             } catch (Exception e) {
index 0657b9bb63a96626e784ba06f49cbada5f074121..57b57a1757c538cda0722e20604b0031307cde6b 100644 (file)
@@ -44,8 +44,30 @@ public class DocumentFactoryHelper {
      * @throws IOException If an error occurs while decrypting or if the password does not match
      */
     public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password)
+    throws IOException {
+        // wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
+        // as well when the resulting OPCPackage is closed
+        return new FilterInputStream(getDecryptedStream(fs.getRoot(), password)) {
+            @Override
+            public void close() throws IOException {
+                fs.close();
+                super.close();
+            }
+        };
+    }
+
+    /**
+     * Wrap the OLE2 data of the DirectoryNode into a decrypted stream by using
+     * the given password.
+     *
+     * @param root The OLE2 directory node for the document
+     * @param password The password, null if the default password should be used
+     * @return A stream for reading the decrypted data
+     * @throws IOException If an error occurs while decrypting or if the password does not match
+     */
+    public static InputStream getDecryptedStream(final DirectoryNode root, String password)
             throws IOException {
-        EncryptionInfo info = new EncryptionInfo(fs);
+        EncryptionInfo info = new EncryptionInfo(root);
         Decryptor d = Decryptor.getInstance(info);
 
         try {
@@ -58,21 +80,11 @@ public class DocumentFactoryHelper {
             }
 
             if (passwordCorrect) {
-                // wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
-                // as well when the resulting OPCPackage is closed
-                return new FilterInputStream(d.getDataStream(fs.getRoot())) {
-                    @Override
-                    public void close() throws IOException {
-                        fs.close();
-
-                        super.close();
-                    }
-                };
+                return d.getDataStream(root);
+            } else if (password != null) {
+                throw new EncryptedDocumentException("Password incorrect");
             } else {
-                if (password != null)
-                    throw new EncryptedDocumentException("Password incorrect");
-                else
-                    throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
+                throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
             }
         } catch (GeneralSecurityException e) {
             throw new IOException(e);
index bf234f9d635853b0e2494e8a55a6224ef823c590..a7283acbee0eae1f1592b2e92c54bfe9b7ca5820 100644 (file)
@@ -1,3 +1,20 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
 package org.apache.poi.sl.extractor;
 
 import java.util.ArrayList;
@@ -48,6 +65,16 @@ public class SlideShowExtractor<
         this.slideshow = slideshow;
     }
 
+    /**
+     * Returns opened document
+     *
+     * @return the opened document
+     */
+    @Override
+    public final Object getDocument() {
+        return slideshow.getPersistDocument();
+    }
+
     /**
      * Should a call to getText() return slide text? Default is yes
      */
@@ -219,7 +246,6 @@ public class SlideShowExtractor<
             return;
         }
         for (final P para : paraList) {
-            final int oldLen = sb.length();
             for (final TextRun tr : para) {
                 final String str = tr.getRawText().replace("\r", "");
                 final String newStr;
index 8ddd96217c2870a4f090ebdef1e87778a2643eba..9894e687d8204722a2a5cf12c7a977f2c83c3ba5 100644 (file)
@@ -126,4 +126,13 @@ public interface SlideShow<
      * @since POI 4.0.0
      */
     POITextExtractor getMetadataTextExtractor();
+
+    /**
+     * @return the instance which handles the persisting of the slideshow,
+     * which is either a subclass of {@link org.apache.poi.POIDocument}
+     * or {@link org.apache.poi.POIXMLDocument}
+     *
+     * @since POI 4.0.0
+     */
+    Object getPersistDocument();
 }
index cdee782c3210166224619e97a23f30f5f75b1eaa..02e9fcb1466b1d60d9f19a51ade75b4220bc1214 100644 (file)
@@ -60,13 +60,40 @@ public class SlideShowFactory {
      * @throws IOException if an error occurs while reading the data
      */
     public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException {
-        DirectoryNode root = fs.getRoot();
+        return create(fs.getRoot(), password);
+    }
+
+    /**
+     * Creates a SlideShow from the given NPOIFSFileSystem.
+     *
+     * @param root The {@link DirectoryNode} to start reading the document from
+     *
+     * @return The created SlideShow
+     *
+     * @throws IOException if an error occurs while reading the data
+     */
+    public static SlideShow<?,?> create(final DirectoryNode root) throws IOException {
+        return create(root, null);
+    }
 
+
+    /**
+     * Creates a SlideShow from the given NPOIFSFileSystem, which may
+     * be password protected
+     *
+     * @param root The {@link DirectoryNode} to start reading the document from
+     * @param password The password that should be used or null if no password is necessary.
+     *
+     * @return The created SlideShow
+     *
+     * @throws IOException if an error occurs while reading the data
+     */
+    public static SlideShow<?,?> create(final DirectoryNode root, String password) throws IOException {
         // Encrypted OOXML files go inside OLE2 containers, is this one?
         if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
             InputStream stream = null;
             try {
-                stream = DocumentFactoryHelper.getDecryptedStream(fs, password);
+                stream = DocumentFactoryHelper.getDecryptedStream(root, password);
 
                 return createXSLFSlideShow(stream);
             } finally {
@@ -82,7 +109,7 @@ public class SlideShowFactory {
             passwordSet = true;
         }
         try {
-            return createHSLFSlideShow(fs);
+            return createHSLFSlideShow(root);
         } finally {
             if (passwordSet) {
                 Biff8EncryptionKey.setCurrentUserPassword(null);
index 26b2cd84c2e3317dd4d9786f5d546deb6fd7017d..003fe353f161df9f1ef2776142fb744501240d7d 100644 (file)
@@ -68,6 +68,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
         * 
         * @return the opened document
         */
+       @Override
        public final POIXMLDocument getDocument() {
                return _document;
        }
index 910baf6a648a2e383b7402539996de2a653f41de..9a7765af0dc60b0ce061d8a1430270dc6c7a56f9 100644 (file)
@@ -51,6 +51,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
 import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.util.IOUtils;
 import org.apache.poi.util.NotImplemented;
 import org.apache.poi.util.POILogFactory;
@@ -58,6 +59,7 @@ import org.apache.poi.util.POILogger;
 import org.apache.poi.util.Removal;
 import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFSlideShow;
 import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
@@ -127,20 +129,20 @@ public class ExtractorFactory {
          return OLE2ExtractorFactory.getPreferEventExtractor();
     }
 
-    public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
+    public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
         NPOIFSFileSystem fs = null;
         try {
             fs = new NPOIFSFileSystem(f);
             if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
-                return createEncryptedOOXMLExtractor(fs);
+                return (T)createEncryptedOOXMLExtractor(fs);
             }
-            POIOLE2TextExtractor extractor = createExtractor(fs);
+            POITextExtractor extractor = createExtractor(fs);
             extractor.setFilesystem(fs);
-            return extractor;
+            return (T)extractor;
         } catch (OfficeXmlFileException e) {
             // ensure file-handle release
             IOUtils.closeQuietly(fs);
-            return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
+            return (T)createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
         } catch (NotOLE2FileException ne) {
             // ensure file-handle release
             IOUtils.closeQuietly(fs);
@@ -179,7 +181,7 @@ public class ExtractorFactory {
      * @throws XmlException If an XML parsing error occurs.
      * @throws IllegalArgumentException If no matching file type could be found.
      */
-    public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
+    public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
         try {
             // Check for the normal Office core document
             PackageRelationshipCollection core;
@@ -226,13 +228,13 @@ public class ExtractorFactory {
             // Is it XSLF?
             for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
                 if ( rel.getContentType().equals( contentType ) ) {
-                    return new XSLFPowerPointExtractor(pkg);
+                    return new SlideShowExtractor(new XMLSlideShow(pkg));
                 }
             }
      
             // special handling for SlideShow-Theme-files, 
             if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
-                return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+                return new SlideShowExtractor(new XMLSlideShow(pkg));
             }
 
             // How about xlsb?
@@ -252,28 +254,28 @@ public class ExtractorFactory {
         }
     }
 
-    public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
-        return OLE2ExtractorFactory.createExtractor(fs);
+    public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        return createExtractor(fs.getRoot());
     }
-    public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
-        return OLE2ExtractorFactory.createExtractor(fs);
+    public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        return createExtractor(fs.getRoot());
     }
-    public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
-        return OLE2ExtractorFactory.createExtractor(fs);
+    public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        return createExtractor(fs.getRoot());
     }
 
-    public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
+    public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
     {
         // First, check for OOXML
         for (String entryName : poifsDir.getEntryNames()) {
             if (entryName.equals("Package")) {
                 OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
-                return createExtractor(pkg);
+                return (T)createExtractor(pkg);
             }
         }
 
         // If not, ask the OLE2 code to check, with Scratchpad if possible
-        return OLE2ExtractorFactory.createExtractor(poifsDir);
+        return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
     }
 
     /**
@@ -403,7 +405,7 @@ public class ExtractorFactory {
         throw new IllegalStateException("Not yet supported");
     }
     
-    private static POIXMLTextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
+    private static POITextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
     throws IOException {
         String pass = Biff8EncryptionKey.getCurrentUserPassword();
         if (pass == null) {
index ef9097f15e334ed49b07895c47e6a6b015037d92..b3cc7c58795c747861e88a61d86f9c07f2a43164 100644 (file)
@@ -37,7 +37,7 @@ import org.apache.xmlbeans.XmlException;
  * @deprecated use {@link SlideShowExtractor}
  */
 @Deprecated
-@Removal(version="4.2.0")
+@Removal(version="5.0.0")
 public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
     public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{
             XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
index 8734f3603e35e24d2366470bc164131e9203721c..ffc6f0d86bff4435b71bb40e7fcff74e440b6b88 100644 (file)
@@ -631,4 +631,9 @@ public class XMLSlideShow extends POIXMLDocument
     public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
         return new POIXMLPropertiesTextExtractor(this);
     }
+
+    @Override
+    public Object getPersistDocument() {
+        return this;
+    }
 }
index 25fbd6344367574b33246ac135d591fce5d6cde6..e5d321bc0d958aa8ff0ff75d2571610448971578 100644 (file)
@@ -1,3 +1,20 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
 package org.apache.poi.xslf.usermodel;
 
 import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS;
index ce650c152db503403730a857e79221c24f8b2e10..7ba272ed63dad306abcb52e457f9cab22fb6af05 100644 (file)
@@ -182,12 +182,20 @@ implements Slide<XSLFShape,XSLFTextParagraph> {
      */
     public XSLFCommentAuthors getCommentAuthorsPart() {
         if(_commentAuthors == null) {
+            // first scan the slide relations
             for (POIXMLDocumentPart p : getRelations()) {
                 if (p instanceof XSLFCommentAuthors) {
                     _commentAuthors = (XSLFCommentAuthors)p;
                     return _commentAuthors;
                 }
             }
+            // then scan the presentation relations
+            for (POIXMLDocumentPart p : getSlideShow().getRelations()) {
+                if (p instanceof XSLFCommentAuthors) {
+                    _commentAuthors = (XSLFCommentAuthors)p;
+                    return _commentAuthors;
+                }
+            }
         }
 
         return null;
index 3670f6fc7a7f7b7f68e65ecf211921885790395d..0a885377a6283a5b18cd37ac1c9c5197cdf8c9e2 100644 (file)
@@ -27,16 +27,15 @@ import static org.junit.Assert.fail;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.util.Locale;
 
 import org.apache.poi.POIDataSamples;
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.POITextExtractor;
-import org.apache.poi.POIXMLException;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.UnsupportedFileFormatException;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
-import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
 import org.apache.poi.hssf.HSSFTestDataSamples;
 import org.apache.poi.hssf.OldExcelFormatException;
@@ -44,18 +43,20 @@ import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
 import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
 import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
-import org.junit.BeforeClass;
+import org.apache.xmlbeans.XmlException;
 import org.junit.Test;
 
 /**
@@ -65,34 +66,39 @@ public class TestExtractorFactory {
 
     private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class);
 
-    private static File txt;
-
-    private static File xls;
-    private static File xlsx;
-    private static File xlsxStrict;
-    private static File xltx;
-    private static File xlsEmb;
-    private static File xlsb;
-
-    private static File doc;
-    private static File doc6;
-    private static File doc95;
-    private static File docx;
-    private static File dotx;
-    private static File docEmb;
-    private static File docEmbOOXML;
-
-    private static File ppt;
-    private static File pptx;
-
-    private static File msg;
-    private static File msgEmb;
-    private static File msgEmbMsg;
-
-    private static File vsd;
-    private static File vsdx;
-
-    private static File pub;
+    private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
+    private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
+    private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
+    private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
+    private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
+    private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
+    private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
+
+    private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
+    private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
+    private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
+    private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
+    private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
+    private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
+    private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
+    private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
+
+    private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
+    private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
+    private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
+    private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
+
+    private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
+    private static final File msg = getFileAndCheck(olTests, "quick.msg");
+    private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
+    private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
+
+    private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
+    private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
+    private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
+
+    private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
+    private static File pub = getFileAndCheck(pubTests, "Simple.pub");
 
     private static File getFileAndCheck(POIDataSamples samples, String name) {
         File file = samples.getFile(name);
@@ -104,595 +110,133 @@ public class TestExtractorFactory {
         return file;
     }
 
-    @BeforeClass
-    public static void setUp() throws Exception {
-
-        POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
-        xls = getFileAndCheck(ssTests, "SampleSS.xls");
-        xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
-        xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
-        xltx = getFileAndCheck(ssTests, "test.xltx");
-        xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
-        xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
-
-        POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
-        doc = getFileAndCheck(wpTests, "SampleDoc.doc");
-        doc6 = getFileAndCheck(wpTests, "Word6.doc");
-        doc95 = getFileAndCheck(wpTests, "Word95.doc");
-        docx = getFileAndCheck(wpTests, "SampleDoc.docx");
-        dotx = getFileAndCheck(wpTests, "test.dotx");
-        docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
-        docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
-
-        POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
-        ppt = getFileAndCheck(slTests, "SampleShow.ppt");
-        pptx = getFileAndCheck(slTests, "SampleShow.pptx");
-        txt = getFileAndCheck(slTests, "SampleShow.txt");
-
-        POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
-        vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
-        vsdx = getFileAndCheck(dgTests, "test.vsdx");
-
-        POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
-        pub = getFileAndCheck(pubTests, "Simple.pub");
-
-        POIDataSamples olTests = POIDataSamples.getHSMFInstance();
-        msg = getFileAndCheck(olTests, "quick.msg");
-        msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
-        msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
-    }
-
-    @Test
-    public void testFile() throws Exception {
-        // Excel
-        POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
-        assertNotNull("Had empty extractor for " + xls, xlsExtractor);
-        assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(), 
-                xlsExtractor
-                instanceof ExcelExtractor
-        );
-        assertTrue(
-                xlsExtractor.getText().length() > 200
-        );
-        xlsExtractor.close();
-
-        POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
-        assertTrue(
-                extractor.getClass().getName(),
-                extractor
-                instanceof XSSFExcelExtractor
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(xlsx);
-        assertTrue(
-                extractor.getText().length() > 200
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(xltx);
-        assertTrue(
-                extractor.getClass().getName(),
-                extractor
-                instanceof XSSFExcelExtractor
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(xlsb);
-        assertContains(extractor.getText(), "test");
-        extractor.close();
-
-
-        extractor = ExtractorFactory.createExtractor(xltx);
-        assertContains(extractor.getText(), "test");
-        extractor.close();
+    private static final Object[] TEST_SET = {
+        "Excel", xls, ExcelExtractor.class, 200,
+        "Excel - xlsx", xlsx, XSSFExcelExtractor.class, 200,
+        "Excel - xltx", xltx, XSSFExcelExtractor.class, -1,
+        "Excel - xlsb", xlsb, XSSFBEventBasedExcelExtractor.class, -1,
+        "Word", doc, WordExtractor.class, 120,
+        "Word - docx", docx, XWPFWordExtractor.class, 120,
+        "Word - dotx", dotx, XWPFWordExtractor.class, -1,
+        "Word 6", doc6, Word6Extractor.class, 20,
+        "Word 95", doc95, Word6Extractor.class, 120,
+        "PowerPoint", ppt, SlideShowExtractor.class, 120,
+        "PowerPoint - pptx", pptx, SlideShowExtractor.class, 120,
+        "Visio", vsd, VisioTextExtractor.class, 50,
+        "Visio - vsdx", vsdx, XDGFVisioExtractor.class, 20,
+        "Publisher", pub, PublisherTextExtractor.class, 50,
+        "Outlook msg", msg, OutlookTextExtactor.class, 50,
 
         // TODO Support OOXML-Strict, see bug #57699
-        try {
-            /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
-            fail("OOXML-Strict isn't yet supported");
-        } catch (POIXMLException e) {
-            // Expected, for now
-        }
-//        extractor = ExtractorFactory.createExtractor(xlsxStrict);
-//        assertTrue(
-//                extractor
-//                instanceof XSSFExcelExtractor
-//        );
-//        extractor.close();
-//
-//        extractor = ExtractorFactory.createExtractor(xlsxStrict);
-//        assertTrue(
-//                extractor.getText().contains("test")
-//        );
-//        extractor.close();
-
-
-        // Word
-        extractor = ExtractorFactory.createExtractor(doc);
-        assertTrue(
-                extractor
-                instanceof WordExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(doc6);
-        assertTrue(
-                extractor
-                instanceof Word6Extractor
-        );
-        assertTrue(
-                extractor.getText().length() > 20
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(doc95);
-        assertTrue(
-                extractor
-                instanceof Word6Extractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(docx);
-        assertTrue(
-                extractor instanceof XWPFWordExtractor
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(docx);
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(dotx);
-        assertTrue(
-                extractor instanceof XWPFWordExtractor
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(dotx);
-        assertContains(extractor.getText(), "Test");
-        extractor.close();
-
-        // PowerPoint (PPT)
-        extractor = ExtractorFactory.createExtractor(ppt);
-        assertTrue(
-                extractor
-                instanceof PowerPointExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        // PowerPoint (PPTX)
-        extractor = ExtractorFactory.createExtractor(pptx);
-        assertTrue(
-                extractor
-                instanceof XSLFPowerPointExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
+        // xlsxStrict
+    };
 
-        // Visio - binary
-        extractor = ExtractorFactory.createExtractor(vsd);
-        assertTrue(
-                extractor
-                instanceof VisioTextExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 50
-        );
-        extractor.close();
+    /**
+     * Single-argument function which, unlike {@link java.util.function.Function}, is allowed
+     * to throw the checked exceptions declared below. The tests use it to pass
+     * extractor-creating lambdas around.
+     *
+     * @param <T> the type of the input
+     * @param <R> the type of the result
+     */
+    @FunctionalInterface
+    interface FunctionEx<T, R> {
+        R apply(T input) throws IOException, OpenXML4JException, XmlException;
+    }
 
-        // Visio - vsdx
-        extractor = ExtractorFactory.createExtractor(vsdx);
-        assertTrue(
-                extractor
-                instanceof XDGFVisioExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 20
-        );
-        extractor.close();
 
-        // Publisher
-        extractor = ExtractorFactory.createExtractor(pub);
-        assertTrue(
-                extractor
-                instanceof PublisherTextExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 50
-        );
-        extractor.close();
-
-        // Outlook msg
-        extractor = ExtractorFactory.createExtractor(msg);
-        assertTrue(
-                extractor
-                instanceof OutlookTextExtactor
-        );
-        assertTrue(
-                extractor.getText().length() > 50
-        );
-        extractor.close();
+    /**
+     * Creates an extractor directly from each sample {@code File} listed in {@code TEST_SET}
+     * and verifies its type and extracted text.
+     */
+    @Test
+    public void testFile() throws Exception {
+        // TEST_SET is a flat list of 4-tuples: description, file, extractor class, minimum text length
+        for (int idx = 0; idx < TEST_SET.length; idx += 4) {
+            final File sample = (File) TEST_SET[idx + 1];
+            try (POITextExtractor ext = ExtractorFactory.createExtractor(sample)) {
+                final String description = (String) TEST_SET[idx];
+                final Class<?> expectedType = (Class<?>) TEST_SET[idx + 2];
+                final Integer minLength = (Integer) TEST_SET[idx + 3];
+                testExtractor(ext, description, expectedType, minLength);
+            }
+        }
+    }
 
+    @Test(expected = IllegalArgumentException.class)
+    public void testFileInvalid() throws Exception {
         // Text
-        try {
-            ExtractorFactory.createExtractor(txt);
-            fail("expected IllegalArgumentException");
-        } catch(IllegalArgumentException e) {
-            // Good
-        }
+        try (POITextExtractor te = ExtractorFactory.createExtractor(txt)) {}
     }
 
     @Test
     public void testInputStream() throws Exception {
-        // Excel
-        POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
-        assertTrue(
-                extractor
-                instanceof ExcelExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 200
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
-        assertTrue(
-                extractor.getClass().getName(),
-                extractor
-                instanceof XSSFExcelExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 200
-        );
-        // TODO Support OOXML-Strict, see bug #57699
-//        assertTrue(
-//                ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
-//                instanceof XSSFExcelExtractor
-//        );
-//        assertTrue(
-//                ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
-//        );
-        extractor.close();
-
-        // Word
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
-        assertTrue(
-                extractor.getClass().getName(),
-                extractor
-                instanceof WordExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
-        assertTrue(
-                extractor.getClass().getName(),
-                extractor
-                instanceof Word6Extractor
-        );
-        assertTrue(
-                extractor.getText().length() > 20
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
-        assertTrue(
-                extractor.getClass().getName(),
-                extractor
-                instanceof Word6Extractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
-        assertTrue(
-                extractor
-                instanceof XWPFWordExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        // PowerPoint
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
-        assertTrue(
-                extractor
-                instanceof PowerPointExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
-        assertTrue(
-                extractor
-                instanceof XSLFPowerPointExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 120
-        );
-        extractor.close();
-
-        // Visio
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
-        assertTrue(
-                extractor
-                instanceof VisioTextExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 50
-        );
-        extractor.close();
-
-        // Visio - vsdx
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
-        assertTrue(
-                extractor
-                instanceof XDGFVisioExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 20
-        );
-        extractor.close();
-        
-        // Publisher
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
-        assertTrue(
-                extractor
-                instanceof PublisherTextExtractor
-        );
-        assertTrue(
-                extractor.getText().length() > 50
-        );
-        extractor.close();
-
-        // Outlook msg
-        extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
-        assertTrue(
-                extractor
-                instanceof OutlookTextExtactor
-        );
-        assertTrue(
-                extractor.getText().length() > 50
-        );
-        extractor.close();
+        testStream((f) -> ExtractorFactory.createExtractor(f), true);
+    }
 
-        // Text
-        try (FileInputStream stream = new FileInputStream(txt)) {
-            ExtractorFactory.createExtractor(stream);
-            fail("expected IllegalArgumentException");
-        } catch(IllegalArgumentException e) {
-            // Good
-        }
+    /**
+     * The plain text sample is handed to the stream based factory method, which is
+     * expected to reject it with an {@link IllegalArgumentException}.
+     */
+    @Test(expected = IllegalArgumentException.class)
+    public void testInputStreamInvalid() throws Exception {
+        testInvalid(stream -> ExtractorFactory.createExtractor(stream));
     }
 
     @Test
     public void testPOIFS() throws Exception {
-        // Excel
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
-                instanceof ExcelExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
-        );
-
-        // Word
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
-                instanceof WordExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
-        );
-
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
-                instanceof Word6Extractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
-        );
-
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
-                instanceof Word6Extractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
-        );
-
-        // PowerPoint
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
-                instanceof PowerPointExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
-        );
-
-        // Visio
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
-                instanceof VisioTextExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
-        );
-
-        // Publisher
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
-                instanceof PublisherTextExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
-        );
-
-        // Outlook msg
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
-                instanceof OutlookTextExtactor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
-        );
-
-        // Text
-        try {
-            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
-            fail("expected IllegalArgumentException");
-        } catch(IOException e) {
-            // Good
-        }
+        testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
     }
 
+    /**
+     * The plain text sample is wrapped into a {@code POIFSFileSystem} before it is handed
+     * to the factory; the attempt is expected to end with an {@link IOException}.
+     */
+    @Test(expected = IOException.class)
+    public void testPOIFSInvalid() throws Exception {
+        testInvalid(stream -> ExtractorFactory.createExtractor(new POIFSFileSystem(stream)));
+    }
 
     @Test
     public void testOPOIFS() throws Exception {
-        // Excel
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
-                        instanceof ExcelExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
-        );
-
-        // Word
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
-                        instanceof WordExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
-        );
-
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
-                        instanceof Word6Extractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
-        );
-
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
-                        instanceof Word6Extractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
-        );
+        testStream((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)), false);
+    }
 
-        // PowerPoint
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
-                        instanceof PowerPointExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
-        );
+    /**
+     * Same as the POIFS variant, but the plain text sample is wrapped into an
+     * {@code OPOIFSFileSystem}; the attempt is expected to end with an {@link IOException}.
+     */
+    @Test(expected = IOException.class)
+    public void testOPOIFSInvalid() throws Exception {
+        testInvalid(stream -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(stream)));
+    }
 
-        // Visio
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
-                        instanceof VisioTextExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
-        );
 
-        // Publisher
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
-                        instanceof PublisherTextExtractor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
-        );
+    /**
+     * Runs the extractor checks for the entries of {@code TEST_SET}, creating each extractor
+     * from a {@link FileInputStream} of the sample file via the given callback.
+     *
+     * @param poifsIS creates the extractor from the opened sample stream
+     * @param loadOOXML if {@code false}, samples whose file name ends with "x" or "xlsb" are skipped
+     * @throws IOException if a sample can't be opened, or if thrown by the callback
+     * @throws OpenXML4JException if thrown by the callback
+     * @throws XmlException if thrown by the callback
+     */
+    private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML)
+            throws IOException, OpenXML4JException, XmlException {
+        // TEST_SET is a flat list of 4-tuples: description, file, extractor class, minimum text length
+        for (int i = 0; i < TEST_SET.length; i += 4) {
+            final File testFile = (File) TEST_SET[i + 1];
+            final String fileName = testFile.getName();
+            if (!loadOOXML && (fileName.endsWith("x") || fileName.endsWith("xlsb"))) {
+                continue;
+            }
+            try (FileInputStream fis = new FileInputStream(testFile);
+                 POITextExtractor ext = poifsIS.apply(fis)) {
+                testExtractor(ext, (String) TEST_SET[i], (Class<?>) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
+            } catch (IllegalArgumentException e) {
+                // same error type and message as fail(), but the original exception is kept
+                // as cause so that the actual reason shows up in the test report
+                throw new AssertionError("failed to process " + testFile, e);
+            }
+        }
+    }
 
-        // Outlook msg
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
-                        instanceof OutlookTextExtactor
-        );
-        assertTrue(
-                ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
-        );
+    /**
+     * Verifies a single extractor: it has to be an instance of the expected class and
+     * its text has to pass the content check selected by {@code minLength}.
+     *
+     * @param ext the extractor under test
+     * @param testcase description of the sample, used in the failure messages
+     * @param extrClass the class the extractor is expected to be an instance of
+     * @param minLength the length the extracted text has to exceed, or {@code -1} to check
+     *        instead that the text contains "test" in any letter case
+     */
+    private void testExtractor(final POITextExtractor ext, final String testcase, final Class<?> extrClass, final Integer minLength) {
+        assertTrue("invalid extractor for " + testcase, extrClass.isInstance(ext));
+        final String actual = ext.getText();
+        if (minLength == -1) {
+            // template samples carry too little text for a length check, so look for a marker word
+            assertContains(actual.toLowerCase(Locale.ROOT), "test");
+        } else {
+            assertTrue("extracted content too short for " + testcase, actual.length() > minLength);
+        }
+    }
 
+    private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException {
         // Text
-        try {
-            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
-            fail("expected IllegalArgumentException");
-        } catch(IOException e) {
-            // Good
+        try (FileInputStream fis = new FileInputStream(txt);
+             POITextExtractor te = poifs.apply(fis)) {
         }
     }
 
     @Test
     public void testPackage() throws Exception {
-        // Excel
-        POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
-        assertTrue(extractor instanceof XSSFExcelExtractor);
-        extractor.close();
-        extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
-        assertTrue(extractor.getText().length() > 200);
-        extractor.close();
-
-        // Word
-        extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
-        assertTrue(extractor instanceof XWPFWordExtractor);
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
-        assertTrue(extractor.getText().length() > 120);
-        extractor.close();
-
-        // PowerPoint
-        extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
-        assertTrue(extractor instanceof XSLFPowerPointExtractor);
-        extractor.close();
+        for (int i = 0; i < TEST_SET.length; i += 4) {
+            final File testFile = (File) TEST_SET[i + 1];
+            if (!testFile.getName().endsWith("x")) {
+                continue;
+            }
 
-        extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
-        assertTrue(extractor.getText().length() > 120);
-        extractor.close();
-        
-        // Visio
-        extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
-        assertTrue(extractor instanceof XDGFVisioExtractor);
-        assertTrue(extractor.getText().length() > 20);
-        extractor.close();
+            try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
+                 final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
+                testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
+                pkg.revert();
+            }
+        }
+    }
 
+    @Test(expected = UnsupportedFileFormatException.class)
+    public void testPackageInvalid() throws Exception {
         // Text
-        try {
-            ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
-            fail("TestExtractorFactory.testPackage() failed on " + txt);
-        } catch(UnsupportedFileFormatException e) {
-            // Good
-        } catch (Exception e) {
-            LOG.log(POILogger.WARN, "TestExtractorFactory.testPackage() failed on " + txt);
-            throw e;
-        }
+        try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
+             final POITextExtractor te = ExtractorFactory.createExtractor(pkg)) {}
     }
 
     @Test
@@ -781,142 +325,49 @@ public class TestExtractorFactory {
      *  does poifs embedded, but will do ooxml ones
      *  at some point.
      */
-    @SuppressWarnings("deprecation")
     @Test
     public void testEmbedded() throws Exception {
-        POIOLE2TextExtractor ext;
-        POITextExtractor[] embeds;
-
-        // No embeddings
-        ext = (POIOLE2TextExtractor)
-                ExtractorFactory.createExtractor(xls);
-        embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-        assertEquals(0, embeds.length);
-        ext.close();
-
-        // No embeddings
-        ext = (POIOLE2TextExtractor)
-                ExtractorFactory.createExtractor(xls);
-        embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
-        assertEquals(0, embeds.length);
-        ext.close();
-
-        // Excel
-        ext = (POIOLE2TextExtractor)
-                ExtractorFactory.createExtractor(xlsEmb);
-        embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-        assertNotNull(embeds);
-        ext.close();
-
-        // Excel
-        ext = (POIOLE2TextExtractor)
-                ExtractorFactory.createExtractor(xlsEmb);
-        embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
-
-        assertEquals(6, embeds.length);
-        int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
-        for (POITextExtractor embed : embeds) {
-            assertTrue(embed.getText().length() > 20);
-
-            if (embed instanceof PowerPointExtractor) numPpt++;
-            else if (embed instanceof ExcelExtractor) numXls++;
-            else if (embed instanceof WordExtractor) numWord++;
-            else if (embed instanceof OutlookTextExtactor) numMsg++;
-        }
-        assertEquals(2, numPpt);
-        assertEquals(2, numXls);
-        assertEquals(2, numWord);
-        assertEquals(0, numMsg);
-        ext.close();
-
-        // Word
-        ext = (POIOLE2TextExtractor)
-                ExtractorFactory.createExtractor(docEmb);
-        embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
-        numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
-        assertEquals(4, embeds.length);
-        for (POITextExtractor embed : embeds) {
-            assertTrue(embed.getText().length() > 20);
-            if (embed instanceof PowerPointExtractor) numPpt++;
-            else if (embed instanceof ExcelExtractor) numXls++;
-            else if (embed instanceof WordExtractor) numWord++;
-            else if (embed instanceof OutlookTextExtactor) numMsg++;
-        }
-        assertEquals(1, numPpt);
-        assertEquals(2, numXls);
-        assertEquals(1, numWord);
-        assertEquals(0, numMsg);
-        ext.close();
-
-        // Word which contains an OOXML file
-        ext = (POIOLE2TextExtractor)
-                ExtractorFactory.createExtractor(docEmbOOXML);
-        embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
-        numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
-        assertEquals(3, embeds.length);
-        for (POITextExtractor embed : embeds) {
-            assertTrue(embed.getText().length() > 20);
-            if (embed instanceof PowerPointExtractor) numPpt++;
-            else if (embed instanceof ExcelExtractor) numXls++;
-            else if (embed instanceof WordExtractor) numWord++;
-            else if (embed instanceof OutlookTextExtactor) numMsg++;
-            else if (embed instanceof XWPFWordExtractor) numWordX++;
-        }
-        assertEquals(1, numPpt);
-        assertEquals(1, numXls);
-        assertEquals(0, numWord);
-        assertEquals(1, numWordX);
-        assertEquals(0, numMsg);
-        ext.close();
-
-        // Outlook
-        ext = (OutlookTextExtactor)
-                ExtractorFactory.createExtractor(msgEmb);
-        embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
-        numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
-        assertEquals(1, embeds.length);
-        for (POITextExtractor embed : embeds) {
-            assertTrue(embed.getText().length() > 20);
-            if (embed instanceof PowerPointExtractor) numPpt++;
-            else if (embed instanceof ExcelExtractor) numXls++;
-            else if (embed instanceof WordExtractor) numWord++;
-            else if (embed instanceof OutlookTextExtactor) numMsg++;
-        }
-        assertEquals(0, numPpt);
-        assertEquals(0, numXls);
-        assertEquals(1, numWord);
-        assertEquals(0, numMsg);
-        ext.close();
-
-        // Outlook with another outlook file in it
-        ext = (OutlookTextExtactor)
-                ExtractorFactory.createExtractor(msgEmbMsg);
-        embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
-        numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
-        assertEquals(1, embeds.length);
-        for (POITextExtractor embed : embeds) {
-            assertTrue(embed.getText().length() > 20);
-            if (embed instanceof PowerPointExtractor) numPpt++;
-            else if (embed instanceof ExcelExtractor) numXls++;
-            else if (embed instanceof WordExtractor) numWord++;
-            else if (embed instanceof OutlookTextExtactor) numMsg++;
+        final Object[] testObj = {
+            "No embeddings", xls, "0-0-0-0-0-0",
+            "Excel", xlsEmb, "6-2-2-2-0-0",
+            "Word", docEmb, "4-1-2-1-0-0",
+            "Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1",
+            "Outlook", msgEmb, "1-1-0-0-0-0",
+            "Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0",
+        };
+
+        for (int i=0; i<testObj.length; i+=3) {
+            try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
+                final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
+
+                int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
+                for (POITextExtractor embed : embeds) {
+                    assertTrue(embed.getText().length() > 20);
+                    if (embed instanceof SlideShowExtractor) {
+                        numPpt++;
+                    } else if (embed instanceof ExcelExtractor) {
+                        numXls++;
+                    } else if (embed instanceof WordExtractor) {
+                        numWord++;
+                    } else if (embed instanceof OutlookTextExtactor) {
+                        numMsg++;
+                    } else if (embed instanceof XWPFWordExtractor) {
+                        numWordX++;
+                    }
+                }
+
+                final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
+                final String expected = (String)testObj[i+2];
+                assertEquals("invalid number of embeddings - "+testObj[i], expected, actual);
+            }
         }
-        assertEquals(0, numPpt);
-        assertEquals(0, numXls);
-        assertEquals(0, numWord);
-        assertEquals(1, numMsg);
-        ext.close();
 
         // TODO - PowerPoint
         // TODO - Publisher
         // TODO - Visio
     }
 
-    private static final String[] EXPECTED_FAILURES = new String[] {
+    private static final String[] EXPECTED_FAILURES = {
         // password protected files
         "spreadsheet/password.xls",
         "spreadsheet/protected_passtika.xlsx",
@@ -1018,37 +469,26 @@ public class TestExtractorFactory {
      *  #59074 - Excel 95 files should give a helpful message, not just 
      *   "No supported documents found in the OLE2 stream"
      */
-    @Test
+    @Test(expected = OldExcelFormatException.class)
     public void bug59074() throws Exception {
-        try {
-            ExtractorFactory.createExtractor(
-                    POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
-            fail("Old excel formats not supported via ExtractorFactory");
-        } catch (OldExcelFormatException e) {
-            // expected here
-        }
+        ExtractorFactory.createExtractor(
+                POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
     }
 
     @SuppressWarnings("deprecation")
-    @Test
-    public void testGetEmbeddedFromXMLExtractor() {
-        try {
-            // currently not implemented
-            ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
-            fail("Unsupported currently");
-        } catch (IllegalStateException e) {
-            // expected here
-        }
+    @Test(expected = IllegalStateException.class)
+    public void testGetEmbedFromXMLExtractor() {
+        // currently not implemented
+        ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
+    }
 
-        try {
-            // currently not implemented
-            ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
-            fail("Unsupported currently");
-        } catch (IllegalStateException e) {
-            // expected here
-        }
+    @SuppressWarnings("deprecation")
+    @Test(expected = IllegalStateException.class)
+    public void testGetEmbeddedFromXMLExtractor() {
+        // currently not implemented
+        ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
     }
-    
+
     // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
     // When this happens, change this from @Test(expected=...) to @Test
     // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
index 68c8bfa054e88ce27e22087d1d54fc9a81ac28c5..7403ecc3d71066baf3d1b175fef83e20ad145648 100644 (file)
@@ -120,10 +120,10 @@ public class TestHxxFEncryption {
     public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
         Biff8EncryptionKey.setCurrentUserPassword(password);
         File f = sampleDir.getFile(file);
-        POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
+        POITextExtractor te1 = ExtractorFactory.createExtractor(f);
         Biff8EncryptionKey.setCurrentUserPassword(newPass);
         ByteArrayOutputStream bos = new ByteArrayOutputStream();
-        POIDocument doc = te1.getDocument();
+        POIDocument doc = (POIDocument)te1.getDocument();
         doc.write(bos);
         doc.close();
         te1.close();
@@ -140,25 +140,25 @@ public class TestHxxFEncryption {
         ByteArrayOutputStream bos = new ByteArrayOutputStream();
         Biff8EncryptionKey.setCurrentUserPassword(password);
         File f = sampleDir.getFile(file);
-        POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
+        POITextExtractor te1 = ExtractorFactory.createExtractor(f);
         // first remove encryption
         Biff8EncryptionKey.setCurrentUserPassword(null);
-        POIDocument doc = te1.getDocument();
+        POIDocument doc = (POIDocument)te1.getDocument();
         doc.write(bos);
         doc.close();
         te1.close();
         // then use default setting, which is cryptoapi
         String newPass = "newPass";
-        POIOLE2TextExtractor te2 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
+        POITextExtractor te2 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
         Biff8EncryptionKey.setCurrentUserPassword(newPass);
-        doc = te2.getDocument();
+        doc = (POIDocument)te2.getDocument();
         bos.reset();
         doc.write(bos);
         doc.close();
         te2.close();
         // and finally update cryptoapi setting
-        POIOLE2TextExtractor te3 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
-        doc = te3.getDocument();
+        POITextExtractor te3 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
+        doc = (POIDocument)te3.getDocument();
         // need to cache data (i.e. read all data) before changing the key size
         if (doc instanceof HSLFSlideShowImpl) {
             HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc;
@@ -175,8 +175,8 @@ public class TestHxxFEncryption {
         doc.close();
         te3.close();
         // check the setting
-        POIOLE2TextExtractor te4 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
-        doc = te4.getDocument();
+        POITextExtractor te4 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
+        doc = (POIDocument)te4.getDocument();
         ei = doc.getEncryptionInfo();
         assertNotNull(ei);
         assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader);
index 94e5a5077f97ddee498e46fd80c499e99ceb01cd..a6a6467f9e103f405e3db7a845efd1bc6945350e 100644 (file)
@@ -50,6 +50,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePartName;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.sl.draw.DrawPaint;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.sl.usermodel.PaintStyle;
 import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
 import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint;
@@ -221,28 +222,27 @@ public class TestXSLFBugs {
      *  rID2 -> slide3.xml
      */
     @Test
-    public void bug54916() throws Exception {
-        XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx");
-        XSLFSlide slide;
+    public void bug54916() throws IOException {
+        try (XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx")) {
+            XSLFSlide slide;
 
-        // Should find 4 slides
-        assertEquals(4, ss.getSlides().size());
+            // Should find 4 slides
+            assertEquals(4, ss.getSlides().size());
 
-        // Check the text, to see we got them in order
-        slide = ss.getSlides().get(0);
-        assertContains(getSlideText(slide), "POI cannot read this");
+            // Check the text, to see we got them in order
+            slide = ss.getSlides().get(0);
+            assertContains(getSlideText(ss, slide), "POI cannot read this");
 
-        slide = ss.getSlides().get(1);
-        assertContains(getSlideText(slide), "POI can read this");
-        assertContains(getSlideText(slide), "Has a relationship to another slide");
+            slide = ss.getSlides().get(1);
+            assertContains(getSlideText(ss, slide), "POI can read this");
+            assertContains(getSlideText(ss, slide), "Has a relationship to another slide");
 
-        slide = ss.getSlides().get(2);
-        assertContains(getSlideText(slide), "POI can read this");
+            slide = ss.getSlides().get(2);
+            assertContains(getSlideText(ss, slide), "POI can read this");
 
-        slide = ss.getSlides().get(3);
-        assertContains(getSlideText(slide), "POI can read this");
-
-        ss.close();
+            slide = ss.getSlides().get(3);
+            assertContains(getSlideText(ss, slide), "POI can read this");
+        }
     }
 
     /**
@@ -311,8 +311,15 @@ public class TestXSLFBugs {
         ss.close();
     }
 
-    protected String getSlideText(XSLFSlide slide) {
-        return XSLFPowerPointExtractor.getText(slide, true, false, false);
+    protected String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
+        try (SlideShowExtractor extr = new SlideShowExtractor(ppt)) {
+            // do not auto-close the slideshow
+            extr.setFilesystem(null);
+            extr.setSlidesByDefault(true);
+            extr.setNotesByDefault(false);
+            extr.setMasterByDefault(false);
+            return extr.getText(slide);
+        }
     }
 
     @Test
@@ -458,7 +465,7 @@ public class TestXSLFBugs {
 
         for (int i = 0; i < slideTexts.length; i++) {
             XSLFSlide slide = ss.getSlides().get(i);
-            assertContains(getSlideText(slide), slideTexts[i]);
+            assertContains(getSlideText(ss, slide), slideTexts[i]);
         }
     }
 
index 2b6a96ad8d761d93d78d7d279e110e670b45a898..353ce7cbd4da1a0cc099dcbcd85945e69b3846c2 100644 (file)
@@ -24,16 +24,17 @@ import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.poi.POIDataSamples;
-import org.apache.poi.POITextExtractor;
 import org.apache.poi.extractor.ExtractorFactory;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
-import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.xmlbeans.XmlException;
+import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -44,188 +45,189 @@ public class TestXSLFPowerPointExtractor {
 
        /**
         * Get text out of the simple file
-        * @throws XmlException
-        * @throws OpenXML4JException
         */
     @Test
-    public void testGetSimpleText()
-    throws IOException, XmlException, OpenXML4JException {
-        XMLSlideShow xmlA = openPPTX("sample.pptx");
-        @SuppressWarnings("resource")
-        OPCPackage pkg = xmlA.getPackage();
-
-           new XSLFPowerPointExtractor(xmlA).close();
-               new XSLFPowerPointExtractor(pkg).close();
-
-               XSLFPowerPointExtractor extractor =
-                       new XSLFPowerPointExtractor(xmlA);
-               extractor.getText();
-
-               String text = extractor.getText();
-               assertTrue(text.length() > 0);
-
-               // Check Basics
-               assertStartsWith(text, "Lorem ipsum dolor sit amet\n");
-               assertContains(text, "amet\n\n");
-
-               // Our placeholder master text
-               // This shouldn't show up in the output
-           // String masterText =
-           //     "Click to edit Master title style\n" +
-        //     "Click to edit Master subtitle style\n" +
-        //     "\n\n\n\n\n\n" +
-        //     "Click to edit Master title style\n" +
-        //     "Click to edit Master text styles\n" +
-        //     "Second level\n" +
-        //     "Third level\n" +
-        //     "Fourth level\n" +
-        //     "Fifth level\n";
-
-               // Just slides, no notes
-               text = extractor.getText(true, false, false);
-               String slideText =
-               "Lorem ipsum dolor sit amet\n" +
-            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
-            "\n" +
-            "Lorem ipsum dolor sit amet\n" +
-            "Lorem\n" +
-            "ipsum\n" +
-            "dolor\n" +
-            "sit\n" +
-            "amet\n" +
-            "\n";
-               assertEquals(slideText, text);
-
-               // Just notes, no slides
-               text = extractor.getText(false, true);
-               assertEquals("\n\n1\n\n\n2\n", text);
-
-               // Both
-               text = extractor.getText(true, true, false);
-               String bothText =
-               "Lorem ipsum dolor sit amet\n" +
-            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
-            "\n\n\n1\n" +
-            "Lorem ipsum dolor sit amet\n" +
-            "Lorem\n" +
-            "ipsum\n" +
-            "dolor\n" +
-            "sit\n" +
-            "amet\n" +
-            "\n\n\n2\n";
-        assertEquals(bothText, text);
-
-               // With Slides and Master Text
-        text = extractor.getText(true, false, true);
-        String smText =
-            "Lorem ipsum dolor sit amet\n" +
-            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
-            "\n" +
-            "Lorem ipsum dolor sit amet\n" +
-            "Lorem\n" +
-            "ipsum\n" +
-            "dolor\n" +
-            "sit\n" +
-            "amet\n" +
-            "\n";
-        assertEquals(smText, text);
-
-               // With Slides, Notes and Master Text
-        text = extractor.getText(true, true, true);
-        String snmText =
-            "Lorem ipsum dolor sit amet\n" +
-            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
-            "\n\n\n1\n" +
-            "Lorem ipsum dolor sit amet\n" +
-            "Lorem\n" +
-            "ipsum\n" +
-            "dolor\n" +
-            "sit\n" +
-            "amet\n" +
-            "\n\n\n2\n";
-        assertEquals(snmText, text);
-
-               // Via set defaults
-               extractor.setSlidesByDefault(false);
-               extractor.setNotesByDefault(true);
-               text = extractor.getText();
-               assertEquals("\n\n1\n\n\n2\n", text);
-
-               extractor.close();
-               xmlA.close();
+    public void testGetSimpleText() throws IOException {
+        try (XMLSlideShow xmlA = openPPTX("sample.pptx");
+             SlideShowExtractor extractor = new SlideShowExtractor(xmlA)) {
+
+            extractor.getText();
+
+            String text = extractor.getText();
+            assertTrue(text.length() > 0);
+
+            // Check Basics
+            assertStartsWith(text, "Lorem ipsum dolor sit amet\n");
+            assertContains(text, "amet\n\n");
+
+            // Our placeholder master text
+            // This shouldn't show up in the output
+            // String masterText =
+            //     "Click to edit Master title style\n" +
+            //     "Click to edit Master subtitle style\n" +
+            //     "\n\n\n\n\n\n" +
+            //     "Click to edit Master title style\n" +
+            //     "Click to edit Master text styles\n" +
+            //     "Second level\n" +
+            //     "Third level\n" +
+            //     "Fourth level\n" +
+            //     "Fifth level\n";
+
+            // Just slides, no notes
+            extractor.setSlidesByDefault(true);
+            extractor.setNotesByDefault(false);
+            extractor.setMasterByDefault(false);
+            text = extractor.getText();
+            String slideText =
+                "Lorem ipsum dolor sit amet\n" +
+                "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+                "\n" +
+                "Lorem ipsum dolor sit amet\n" +
+                "Lorem\n" +
+                "ipsum\n" +
+                "dolor\n" +
+                "sit\n" +
+                "amet\n" +
+                "\n";
+            assertEquals(slideText, text);
+
+            // Just notes, no slides
+            extractor.setSlidesByDefault(false);
+            extractor.setNotesByDefault(true);
+            text = extractor.getText();
+            assertEquals("\n\n1\n\n\n2\n", text);
+
+            // Both
+            extractor.setSlidesByDefault(true);
+            extractor.setNotesByDefault(true);
+            text = extractor.getText();
+            String bothText =
+                "Lorem ipsum dolor sit amet\n" +
+                "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+                "\n\n\n1\n" +
+                "Lorem ipsum dolor sit amet\n" +
+                "Lorem\n" +
+                "ipsum\n" +
+                "dolor\n" +
+                "sit\n" +
+                "amet\n" +
+                "\n\n\n2\n";
+            assertEquals(bothText, text);
+
+            // With Slides and Master Text
+            extractor.setSlidesByDefault(true);
+            extractor.setNotesByDefault(false);
+            extractor.setMasterByDefault(true);
+            text = extractor.getText();
+            String smText =
+                "Lorem ipsum dolor sit amet\n" +
+                "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+                "\n" +
+                "Lorem ipsum dolor sit amet\n" +
+                "Lorem\n" +
+                "ipsum\n" +
+                "dolor\n" +
+                "sit\n" +
+                "amet\n" +
+                "\n";
+            assertEquals(smText, text);
+
+            // With Slides, Notes and Master Text
+            extractor.setSlidesByDefault(true);
+            extractor.setNotesByDefault(true);
+            extractor.setMasterByDefault(true);
+            text = extractor.getText();
+            String snmText =
+                "Lorem ipsum dolor sit amet\n" +
+                "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+                "\n\n\n1\n" +
+                "Lorem ipsum dolor sit amet\n" +
+                "Lorem\n" +
+                "ipsum\n" +
+                "dolor\n" +
+                "sit\n" +
+                "amet\n" +
+                "\n\n\n2\n";
+            assertEquals(snmText, text);
+
+            // Via set defaults
+            extractor.setSlidesByDefault(false);
+            extractor.setNotesByDefault(true);
+            text = extractor.getText();
+            assertEquals("\n\n1\n\n\n2\n", text);
+        }
        }
 
+    @Test
     public void testGetComments() throws IOException {
-        XMLSlideShow xml = openPPTX("45545_Comment.pptx");
-        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-
-        String text = extractor.getText();
-        assertTrue(text.length() > 0);
+        try (XMLSlideShow xml = openPPTX("45545_Comment.pptx");
+            SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
+            extractor.setCommentsByDefault(true);
 
-        // Check comments are there
-        assertContains(text, "testdoc");
-        assertContains(text, "test phrase");
+            String text = extractor.getText();
+            assertTrue(text.length() > 0);
 
-        // Check the authors came through too
-        assertContains(text, "XPVMWARE01");
+            // Check comments are there
+            assertContains(text, "testdoc");
+            assertContains(text, "test phrase");
 
-        extractor.close();
-        xml.close();
+            // Check the authors came through too
+            assertContains(text, "XPVMWARE01");
+        }
     }
 
+    @Test
+    @Ignore("currently slidelayouts aren't yet supported")
        public void testGetMasterText() throws Exception {
-           XMLSlideShow xml = openPPTX("WithMaster.pptx");
-           XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-           extractor.setSlidesByDefault(true);
-           extractor.setNotesByDefault(false);
-           extractor.setMasterByDefault(true);
-
-           String text = extractor.getText();
-           assertTrue(text.length() > 0);
-
-           // Check master text is there
-           assertContains(text, "Footer from the master slide");
-
-           // Theme text shouldn't show up
-           // String themeText =
-        //     "Theme Master Title\n" +
-        //     "Theme Master first level\n" +
-        //     "And the 2nd level\n" +
-        //     "Our 3rd level goes here\n" +
-        //     "And onto the 4th, such fun....\n" +
-        //     "Finally is the Fifth level\n";
-
-           // Check the whole text
-           String wholeText =
-            "First page title\n" +
-            "First page subtitle\n" +
-            "This is the Master Title\n" +
-            "This text comes from the Master Slide\n" +
-            "\n" +
-            // TODO Detect we didn't have a title, and include the master one
-            "2nd page subtitle\n" +
-            "Footer from the master slide\n" +
-            "This is the Master Title\n" +
-            "This text comes from the Master Slide\n";
-           assertEquals(wholeText, text);
-
-               extractor.close();
-               xml.close();
+           try (XMLSlideShow xml = openPPTX("WithMaster.pptx");
+            SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
+            extractor.setSlidesByDefault(true);
+            extractor.setNotesByDefault(false);
+            extractor.setMasterByDefault(true);
+
+
+            String text = extractor.getText();
+            assertTrue(text.length() > 0);
+
+            // Check master text is there
+            assertContains(text, "Footer from the master slide");
+
+            // Theme text shouldn't show up
+            // String themeText =
+            //     "Theme Master Title\n" +
+            //     "Theme Master first level\n" +
+            //     "And the 2nd level\n" +
+            //     "Our 3rd level goes here\n" +
+            //     "And onto the 4th, such fun....\n" +
+            //     "Finally is the Fifth level\n";
+
+            // Check the whole text
+            String wholeText =
+                "First page title\n" +
+                "First page subtitle\n" +
+                "This is the Master Title\n" +
+                "This text comes from the Master Slide\n" +
+                "\n" +
+                // TODO Detect we didn't have a title, and include the master one
+                "2nd page subtitle\n" +
+                "Footer from the master slide\n" +
+                "This is the Master Title\n" +
+                "This text comes from the Master Slide\n";
+            assertEquals(wholeText, text);
+        }
        }
 
        @Test
        public void testTable() throws Exception {
-        XMLSlideShow xml = openPPTX("present1.pptx");
-        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-
-        String text = extractor.getText();
-        assertTrue(text.length() > 0);
+        try (XMLSlideShow xml = openPPTX("present1.pptx");
+            SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
 
-        // Check comments are there
-        assertContains(text, "TEST");
+            String text = extractor.getText();
+            assertTrue(text.length() > 0);
 
-               extractor.close();
-               xml.close();
+            // Check the table text is there
+            assertContains(text, "TEST");
+        }
     }
 
     /**
@@ -241,74 +243,76 @@ public class TestXSLFPowerPointExtractor {
         };
         for(String extension : extensions) {
             String filename = "testPPT." + extension;
-            XMLSlideShow xml = openPPTX(filename);
-            XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
 
-            String text = extractor.getText();
-            if (extension.equals("thmx")) {
-                // Theme file doesn't have any textual content
-                assertEquals(filename, 0, text.length());
-                continue;
+            try (XMLSlideShow xml = openPPTX(filename);
+                SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
+
+                String text = extractor.getText();
+                if (extension.equals("thmx")) {
+                    // Theme file doesn't have any textual content
+                    assertEquals(filename, 0, text.length());
+                    continue;
+                }
+
+                assertTrue(filename, text.length() > 0);
+                assertContains(filename, text, "Attachment Test");
+                assertContains(filename, text, "This is a test file data with the same content");
+                assertContains(filename, text, "content parsing");
+                assertContains(filename, text, "Different words to test against");
+                assertContains(filename, text, "Mystery");
             }
-
-         assertTrue(filename, text.length() > 0);
-         assertContains(filename, text, "Attachment Test");
-         assertContains(filename, text, "This is a test file data with the same content");
-         assertContains(filename, text, "content parsing");
-         assertContains(filename, text, "Different words to test against");
-         assertContains(filename, text, "Mystery");
-
-                extractor.close();
-                xml.close();
        }
     }
 
     @Test
-    public void test45541() throws Exception {
+    public void test45541() throws IOException, OpenXML4JException, XmlException {
         // extract text from a powerpoint that has a header in the notes-element
-        POITextExtractor extr = ExtractorFactory.createExtractor(
-            slTests.getFile("45541_Header.pptx"));
-        String text = extr.getText();
-        assertNotNull(text);
-        assertFalse("Had: " + text, text.contains("testdoc"));
-
-        text = ((XSLFPowerPointExtractor)extr).getText(false, true);
-        assertContains(text, "testdoc");
-        extr.close();
-       assertNotNull(text);
+        final File headerFile = slTests.getFile("45541_Header.pptx");
+        try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
+            String text = extr.getText();
+            assertNotNull(text);
+            assertFalse("Had: " + text, text.contains("testdoc"));
+
+            extr.setSlidesByDefault(false);
+            extr.setNotesByDefault(true);
+
+            text = extr.getText();
+            assertContains(text, "testdoc");
+            assertNotNull(text);
+        }
 
         // extract text from a powerpoint that has a footer in the master-slide
-        extr = ExtractorFactory.createExtractor(
-            slTests.getFile("45541_Footer.pptx"));
-        text = extr.getText();
-        assertNotContained(text, "testdoc");
-
-        text = ((XSLFPowerPointExtractor)extr).getText(false, true);
-        assertNotContained(text, "testdoc");
-
-        text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
-        assertNotContained(text, "testdoc");
-
-        extr.close();
+        final File footerFile = slTests.getFile("45541_Footer.pptx");
+        try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
+            String text = extr.getText();
+            assertNotContained(text, "testdoc");
+
+            extr.setSlidesByDefault(false);
+            extr.setNotesByDefault(true);
+            text = extr.getText();
+            assertNotContained(text, "testdoc");
+
+            extr.setSlidesByDefault(false);
+            extr.setNotesByDefault(false);
+            extr.setMasterByDefault(true);
+            text = extr.getText();
+            assertNotContained(text, "testdoc");
+        }
     }
 
 
     @Test
     public void bug54570() throws IOException {
-        XMLSlideShow xml = openPPTX("bug54570.pptx");
-        XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-        String text = extractor.getText();
-        assertNotNull(text);
-        extractor.close();
-        xml.close();
+        try (XMLSlideShow xml = openPPTX("bug54570.pptx");
+            SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
+            String text = extractor.getText();
+            assertNotNull(text);
+        }
     }
 
     private XMLSlideShow openPPTX(String file) throws IOException {
-        InputStream is = slTests.openResourceAsStream(file);
-        try {
+        try (InputStream is = slTests.openResourceAsStream(file)) {
             return new XMLSlideShow(is);
-        } finally {
-            is.close();
         }
     }
 }
index bf7cc57d4fa7aeba45f04e552ec58fd46fd3a373..f77d0834f9633d6871e0406175b32479f37761d7 100644 (file)
@@ -38,6 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
+import org.apache.poi.sl.usermodel.SlideShowFactory;
 
 /**
  * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
@@ -65,7 +67,7 @@ public class OLE2ScratchpadExtractorFactory {
         }
 
         if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
-            return new PowerPointExtractor(poifsDir);
+            return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
         }
 
         if (poifsDir.hasEntry("VisioDocument")) {
index 156747eace42cb09c109288b5fab253237a1e38e..efbefd6ceb03a7d1e50b66b804d4a53573809750 100644 (file)
@@ -34,6 +34,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.sl.usermodel.SlideShowFactory;
+import org.apache.poi.util.Removal;
 
 /**
  * This class can be used to extract text from a PowerPoint file. Can optionally
@@ -43,6 +44,7 @@ import org.apache.poi.sl.usermodel.SlideShowFactory;
  */
 @SuppressWarnings("WeakerAccess")
 @Deprecated
+@Removal(version="5.0.0")
 public final class PowerPointExtractor extends POIOLE2TextExtractor {
        private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
 
index ab6cc818d0b0956b59eee39d2c81a6d27f8191cf..16df6f384fcb7bad661bae62225bf28d1d7da30e 100644 (file)
@@ -1139,4 +1139,9 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
     public void close() throws IOException {
         _hslfSlideShow.close();
     }
+
+       @Override
+       public Object getPersistDocument() {
+               return getSlideShowImpl();
+       }
 }
index fb216f1af4868021d3a8993cda017e7ced32d271..b5adaf698d100895dbe65a80d537e8ff23412e11 100644 (file)
@@ -19,8 +19,8 @@ package org.apache.poi.hslf.usermodel;
 
 import java.io.IOException;
 
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
-import org.apache.poi.sl.usermodel.SlideShow;
 import org.apache.poi.sl.usermodel.SlideShowFactory;
 import org.apache.poi.util.Internal;
 
@@ -31,12 +31,20 @@ import org.apache.poi.util.Internal;
 @Internal
 public class HSLFSlideShowFactory extends SlideShowFactory {
     /**
-     * Creates a HSLFSlideShow from the given NPOIFSFileSystem
-     * <p>Note that in order to properly release resources the
-     *  SlideShow should be closed after use.
+     * Creates an HSLFSlideShow from the given NPOIFSFileSystem.<p>
+     * Note that in order to properly release resources the
+     * SlideShow should be closed after use.
      */
-    public static SlideShow<?,?> createSlideShow(NPOIFSFileSystem fs) throws IOException {
+    public static HSLFSlideShow createSlideShow(final NPOIFSFileSystem fs) throws IOException {
         return new HSLFSlideShow(fs);
     }
 
+    /**
+     * Creates an HSLFSlideShow from the given DirectoryNode.<p>
+     * Note that in order to properly release resources the
+     * SlideShow should be closed after use.
+     */
+    public static HSLFSlideShow createSlideShow(final DirectoryNode root) throws IOException {
+        return new HSLFSlideShow(root);
+    }
 }
index 248fb3b2c5448dddd75d1e73c1162bac5f621b59..c0676f22641e03c32a2d06da658dc4aadcbf1bc8 100644 (file)
@@ -846,9 +846,13 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
 
     @Override
     public void close() throws IOException {
-        NPOIFSFileSystem fs = getDirectory().getFileSystem();
-        if (fs != null) {
-            fs.close();
+        // only close the filesystem, if we are based on the root node.
+        // embedded documents/slideshows shouldn't close the parent container
+        if (getDirectory().getParent() == null) {
+            NPOIFSFileSystem fs = getDirectory().getFileSystem();
+            if (fs != null) {
+                fs.close();
+            }
         }
     }
 
index 6a34d1af3fca5efaf4da54c41651134ff9197c83..9a420a6e267722f5142f4d124251b40baedcb037 100644 (file)
@@ -42,6 +42,10 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
+import org.apache.poi.sl.usermodel.ObjectShape;
+import org.apache.poi.sl.usermodel.SlideShow;
+import org.apache.poi.sl.usermodel.SlideShowFactory;
 import org.apache.poi.util.IOUtils;
 import org.junit.Test;
 
@@ -76,43 +80,46 @@ public final class TestExtractor {
 //        ppe.close();
 //    }
 
-    private PowerPointExtractor openExtractor(String fileName) throws IOException {
-        InputStream is = slTests.openResourceAsStream(fileName);
-        try {
-            return new PowerPointExtractor(is);
-        } finally {
-            is.close();
+    private SlideShowExtractor<?,?> openExtractor(String fileName) throws IOException {
+        try (InputStream is = slTests.openResourceAsStream(fileName)) {
+            return new SlideShowExtractor(SlideShowFactory.create(is));
         }
     }
     
     @Test
     public void testReadSheetText() throws IOException {
         // Basic 2 page example
-        PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
-        assertEquals(expectText, ppe.getText());
-        ppe.close();
+        try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
+            assertEquals(expectText, ppe.getText());
+        }
 
         // 1 page example with text boxes
-        PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
-        assertEquals(expectText2, ppe2.getText());
-        ppe2.close();
+        try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
+            assertEquals(expectText2, ppe.getText());
+        }
     }
 
     @Test
     public void testReadNoteText() throws IOException {
         // Basic 2 page example
-        PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
-        String notesText = ppe.getNotes();
-        String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
-        assertEquals(expText, notesText);
-        ppe.close();
+        try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
+            ppe.setNotesByDefault(true);
+            ppe.setSlidesByDefault(false);
+            ppe.setMasterByDefault(false);
+            String notesText = ppe.getText();
+            String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
+            assertEquals(expText, notesText);
+        }
 
         // Other one doesn't have notes
-        PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
-        notesText = ppe2.getNotes();
-        expText = "";
-        assertEquals(expText, notesText);
-        ppe2.close();
+        try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
+            ppe.setNotesByDefault(true);
+            ppe.setSlidesByDefault(false);
+            ppe.setMasterByDefault(false);
+            String notesText = ppe.getText();
+            String expText = "";
+            assertEquals(expText, notesText);
+        }
     }
 
     @Test
@@ -126,19 +133,19 @@ public final class TestExtractor {
                 "\nThese are the notes on page two, again lacking formatting\n"
         };
 
-        PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
-        ppe.setSlidesByDefault(true);
-        ppe.setNotesByDefault(false);
-        assertEquals(slText[0] + slText[1], ppe.getText());
+        try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
+            ppe.setSlidesByDefault(true);
+            ppe.setNotesByDefault(false);
+            assertEquals(slText[0] + slText[1], ppe.getText());
 
-        ppe.setSlidesByDefault(false);
-        ppe.setNotesByDefault(true);
-        assertEquals(ntText[0] + ntText[1], ppe.getText());
+            ppe.setSlidesByDefault(false);
+            ppe.setNotesByDefault(true);
+            assertEquals(ntText[0] + ntText[1], ppe.getText());
 
-        ppe.setSlidesByDefault(true);
-        ppe.setNotesByDefault(true);
-        assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
-        ppe.close();
+            ppe.setSlidesByDefault(true);
+            ppe.setNotesByDefault(true);
+            assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
+        }
     }
 
     /**
@@ -149,45 +156,46 @@ public final class TestExtractor {
      */
     @Test
     public void testMissingCoreRecords() throws IOException {
-        PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
-
-        String text = ppe.getText(true, false);
-        String nText = ppe.getNotes();
-
-        assertNotNull(text);
-        assertNotNull(nText);
-
-        // Notes record were corrupt, so don't expect any
-        assertEquals(nText.length(), 0);
-
-        // Slide records were fine
-        assertContains(text, "Using Disease Surveillance and Response");
-        
-        ppe.close();
+        try (SlideShowExtractor<?,?> ppe = openExtractor("missing_core_records.ppt")) {
+            ppe.setSlidesByDefault(true);
+            ppe.setNotesByDefault(false);
+            String text = ppe.getText();
+            ppe.setSlidesByDefault(false);
+            ppe.setNotesByDefault(true);
+            String nText = ppe.getText();
+
+            assertNotNull(text);
+            assertNotNull(nText);
+
+            // Notes record were corrupt, so don't expect any
+            assertEquals(nText.length(), 0);
+
+            // Slide records were fine
+            assertContains(text, "Using Disease Surveillance and Response");
+        }
     }
 
     @Test
     public void testExtractFromEmbeded() throws IOException {
-        InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
-        POIFSFileSystem fs = new POIFSFileSystem(is);
-        DirectoryNode root = fs.getRoot();
-        PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
-        PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
-        ppe2.close();
-        ppe1.close();
-        fs.close();
-    }
-    
-    private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
-    throws IOException {
-        DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
-        assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
-
-        // Check the first file
-        HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
-        PowerPointExtractor ppe = new PowerPointExtractor(ppt);
-        assertEquals(expected, ppe.getText(true, false));
-        return ppe;
+        try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
+            final POIFSFileSystem fs = new POIFSFileSystem(is)) {
+            final DirectoryNode root = fs.getRoot();
+
+            final String[] TEST_SET = {
+                "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
+                "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n"
+            };
+
+            for (int i=0; i<TEST_SET.length; i+=2) {
+                DirectoryNode dir = (DirectoryNode)root.getEntry(TEST_SET[i]);
+                assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
+
+                try (final SlideShow<?,?> ppt = SlideShowFactory.create(dir);
+                     final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
+                    assertEquals(TEST_SET[i+1], ppe.getText());
+                }
+            }
+        }
     }
 
     /**
@@ -195,32 +203,32 @@ public final class TestExtractor {
      */
     @Test
     public void testExtractFromOwnEmbeded() throws IOException {
-        PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
-        List<HSLFObjectShape> shapes = ppe.getOLEShapes();
-        assertEquals("Expected 6 ole shapes", 6, shapes.size());
-        int num_ppt = 0, num_doc = 0, num_xls = 0;
-        for (HSLFObjectShape ole : shapes) {
-            String name = ole.getInstanceName();
-            InputStream data = ole.getObjectData().getInputStream();
-            if ("Worksheet".equals(name)) {
-                HSSFWorkbook wb = new HSSFWorkbook(data);
-                num_xls++;
-                wb.close();
-            } else if ("Document".equals(name)) {
-                HWPFDocument doc = new HWPFDocument(data);
-                num_doc++;
-                doc.close();
-            } else if ("Presentation".equals(name)) {
-                num_ppt++;
-                HSLFSlideShow ppt = new HSLFSlideShow(data);
-                ppt.close();
+        try (SlideShowExtractor<?,?> ppe = openExtractor("ppt_with_embeded.ppt")) {
+            List<? extends ObjectShape> shapes = ppe.getOLEShapes();
+            assertEquals("Expected 6 ole shapes", 6, shapes.size());
+            int num_ppt = 0, num_doc = 0, num_xls = 0;
+            for (ObjectShape ole : shapes) {
+                String name = ((HSLFObjectShape)ole).getInstanceName();
+                InputStream data = ole.getObjectData().getInputStream();
+                if ("Worksheet".equals(name)) {
+                    HSSFWorkbook wb = new HSSFWorkbook(data);
+                    num_xls++;
+                    wb.close();
+                } else if ("Document".equals(name)) {
+                    HWPFDocument doc = new HWPFDocument(data);
+                    num_doc++;
+                    doc.close();
+                } else if ("Presentation".equals(name)) {
+                    num_ppt++;
+                    HSLFSlideShow ppt = new HSLFSlideShow(data);
+                    ppt.close();
+                }
+                data.close();
             }
-            data.close();
+            assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
+            assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
+            assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
         }
-        assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
-        assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
-        assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
-        ppe.close();
     }
 
     /**
@@ -228,11 +236,11 @@ public final class TestExtractor {
      */
     @Test
     public void test52991() throws IOException {
-        PowerPointExtractor ppe = openExtractor("badzip.ppt");
-        for (HSLFObjectShape shape : ppe.getOLEShapes()) {
-            IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
+        try (SlideShowExtractor<?,?> ppe = openExtractor("badzip.ppt")) {
+            for (ObjectShape shape : ppe.getOLEShapes()) {
+                IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
+            }
         }
-        ppe.close();
     }
 
     /**
@@ -240,27 +248,27 @@ public final class TestExtractor {
      */
     @Test
     public void testWithComments() throws IOException {
-        PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
-        String text = ppe1.getText();
-        assertFalse("Comments not in by default", text.contains("This is a test comment"));
+        try (final SlideShowExtractor ppe = openExtractor("WithComments.ppt")) {
+            String text = ppe.getText();
+            assertFalse("Comments not in by default", text.contains("This is a test comment"));
 
-        ppe1.setCommentsByDefault(true);
+            ppe.setCommentsByDefault(true);
 
-        text = ppe1.getText();
-        assertContains(text, "This is a test comment");
-        ppe1.close();
+            text = ppe.getText();
+            assertContains(text, "This is a test comment");
+        }
 
 
         // And another file
-        PowerPointExtractor ppe2 = openExtractor("45543.ppt");
-        text = ppe2.getText();
-        assertFalse("Comments not in by default", text.contains("testdoc"));
+        try (SlideShowExtractor ppe = openExtractor("45543.ppt")) {
+            String text = ppe.getText();
+            assertFalse("Comments not in by default", text.contains("testdoc"));
 
-        ppe2.setCommentsByDefault(true);
+            ppe.setCommentsByDefault(true);
 
-        text = ppe2.getText();
-        assertContains(text, "testdoc");
-        ppe2.close();
+            text = ppe.getText();
+            assertContains(text, "testdoc");
+        }
     }
 
     /**
@@ -268,48 +276,37 @@ public final class TestExtractor {
      */
     @Test
     public void testHeaderFooter() throws IOException {
-        String text;
-
         // With a header on the notes
-        InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
-        HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
-        is1.close();
-        assertNotNull(ppt1.getNotesHeadersFooters());
-        assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
-
-        PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
+        try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt");
+            HSLFSlideShow ppt = new HSLFSlideShow(is)) {
 
-        text = ppe1.getText();
-        assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
-        assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
+            assertNotNull(ppt.getNotesHeadersFooters());
+            assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText());
 
-        ppe1.setNotesByDefault(true);
-        text = ppe1.getText();
-        assertContains(text, "testdoc");
-        assertContains(text, "test phrase");
-        ppe1.close();
-        ppt1.close();
+            testHeaderFooterInner(ppt);
+        }
 
         // And with a footer, also on notes
-        InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
-        HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
-        is2.close();
-        
-        assertNotNull(ppt2.getNotesHeadersFooters());
-        assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
-        ppt2.close();
-
-        PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
-
-        text = ppe2.getText();
-        assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
-        assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
-
-        ppe2.setNotesByDefault(true);
-        text = ppe2.getText();
-        assertContains(text, "testdoc");
-        assertContains(text, "test phrase");
-        ppe2.close();
+        try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt");
+            final HSLFSlideShow ppt = new HSLFSlideShow(is)) {
+            assertNotNull(ppt.getNotesHeadersFooters());
+            assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText());
+
+            testHeaderFooterInner(ppt);
+        }
+    }
+
+    private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException {
+        try (final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
+            String text = ppe.getText();
+            assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
+            assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
+
+            ppe.setNotesByDefault(true);
+            text = ppe.getText();
+            assertContains(text, "testdoc");
+            assertContains(text, "test phrase");
+        }
     }
 
     @SuppressWarnings("unused")
@@ -318,41 +315,40 @@ public final class TestExtractor {
         String masterTitleText = "This is the Master Title";
         String masterRandomText = "This text comes from the Master Slide";
         String masterFooterText = "Footer from the master slide";
-        PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
-        ppe.setMasterByDefault(true);
+        try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
+            ppe.setMasterByDefault(true);
 
-        String text = ppe.getText();
-        assertContains(text, masterRandomText);
-        assertContains(text, masterFooterText);
-        ppe.close();
+            String text = ppe.getText();
+            assertContains(text, masterRandomText);
+            assertContains(text, masterFooterText);
+        }
     }
 
     @Test
     public void testMasterText() throws IOException {
-        PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
-
-        // Initially not there
-        String text = ppe1.getText();
-        assertFalse(text.contains("Text that I added to the master slide"));
-
-        // Enable, shows up
-        ppe1.setMasterByDefault(true);
-        text = ppe1.getText();
-        assertContains(text, "Text that I added to the master slide");
-
-        // Make sure placeholder text does not come out
-        assertNotContained(text, "Click to edit Master");
-        ppe1.close();
+        try (final SlideShowExtractor ppe = openExtractor("master_text.ppt")) {
+            // Initially not there
+            String text = ppe.getText();
+            assertFalse(text.contains("Text that I added to the master slide"));
+
+            // Enable, shows up
+            ppe.setMasterByDefault(true);
+            text = ppe.getText();
+            assertContains(text, "Text that I added to the master slide");
+
+            // Make sure placeholder text does not come out
+            assertNotContained(text, "Click to edit Master");
+        }
 
         // Now with another file only containing master text
         // Will always show up
-        PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
-        String masterText = "Footer from the master slide";
+        try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
+            String masterText = "Footer from the master slide";
 
-        text = ppe2.getText();
-        assertContainsIgnoreCase(text, "master");
-        assertContains(text, masterText);
-        ppe2.close();
+            String text = ppe.getText();
+            assertContainsIgnoreCase(text, "master");
+            assertContains(text, masterText);
+        }
     }
 
     /**
@@ -360,22 +356,21 @@ public final class TestExtractor {
      */
     @Test
     public void testChineseText() throws IOException {
-        PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
-
-        String text = ppe.getText();
+        try (final SlideShowExtractor ppe = openExtractor("54880_chinese.ppt")) {
+            String text = ppe.getText();
 
-        // Check for the english text line
-        assertContains(text, "Single byte");
+            // Check for the english text line
+            assertContains(text, "Single byte");
 
-        // Check for the english text in the mixed line
-        assertContains(text, "Mix");
+            // Check for the english text in the mixed line
+            assertContains(text, "Mix");
 
-        // Check for the chinese text in the mixed line
-        assertContains(text, "\u8868");
+            // Check for the chinese text in the mixed line
+            assertContains(text, "\u8868");
 
-        // Check for the chinese only text line
-        assertContains(text, "\uff8a\uff9d\uff76\uff78");
-        ppe.close();
+            // Check for the chinese only text line
+            assertContains(text, "\uff8a\uff9d\uff76\uff78");
+        }
     }
 
     /**
@@ -387,67 +382,59 @@ public final class TestExtractor {
     public void testDifferentPOIFS() throws IOException {
         // Open the two filesystems
         File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
-        InputStream is1 = new FileInputStream(pptFile);
-        OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
-        is1.close();
-        NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
-        
-        DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
-
-        // Open directly
-        for (DirectoryNode dir : files) {
-            PowerPointExtractor extractor = new PowerPointExtractor(dir);
-            assertEquals(expectText, extractor.getText());
-        }
+        try (final InputStream is1 = new FileInputStream(pptFile);
+            final NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile)) {
 
-        // Open via a HSLFSlideShow
-        for (DirectoryNode dir : files) {
-            HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
-            PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
-            assertEquals(expectText, extractor.getText());
-            extractor.close();
-            slideshow.close();
-        }
+            final OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
 
-        npoifs.close();
+            DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()};
+
+            // Open directly
+            for (DirectoryNode dir : files) {
+                try (SlideShow<?,?> ppt = SlideShowFactory.create(dir);
+                    SlideShowExtractor<?,?> extractor = new SlideShowExtractor(ppt)) {
+                    assertEquals(expectText, extractor.getText());
+                }
+            }
+        }
     }
 
     @Test
     public void testTable() throws Exception {
-        PowerPointExtractor ppe1 = openExtractor("54111.ppt");
-        String text1 = ppe1.getText();
-        String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
-                         "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+   
-                         "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
-                         "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
-                         "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+ 
-                         "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
-        assertContains(text1, target1);
-        ppe1.close();
-
-        PowerPointExtractor ppe2 = openExtractor("54722.ppt");
-        String text2 = ppe2.getText();
-
-        String target2 = "this\tText\tis\twithin\ta\n" +
-                "table\t1\t2\t3\t4";
-        assertContains(text2, target2);
-        ppe2.close();
+        try (SlideShowExtractor ppe = openExtractor("54111.ppt")) {
+            String text = ppe.getText();
+            String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" +
+                    "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" +
+                    "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" +
+                    "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" +
+                    "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" +
+                    "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
+            assertContains(text, target);
+        }
+
+        try (SlideShowExtractor ppe = openExtractor("54722.ppt")) {
+            String text = ppe.getText();
+
+            String target = "this\tText\tis\twithin\ta\n" +
+                    "table\t1\t2\t3\t4";
+            assertContains(text, target);
+        }
     }
 
     // bug 60003
     @Test
     public void testExtractMasterSlideFooterText() throws Exception {
-        PowerPointExtractor ppe = openExtractor("60003.ppt");
-        ppe.setMasterByDefault(true);
+        try (SlideShowExtractor ppe = openExtractor("60003.ppt")) {
+            ppe.setMasterByDefault(true);
 
-        String text = ppe.getText();
-        assertContains(text, "Prague");
-        ppe.close();
+            String text = ppe.getText();
+            assertContains(text, "Prague");
+        }
     }
 
     @Test
     public void testExtractGroupedShapeText() throws Exception {
-        try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
+        try (final SlideShowExtractor ppe = openExtractor("bug62092.ppt")) {
             final String text = ppe.getText();
 
             //this tests that we're ignoring text shapes at depth=0
index df4421609390c2527351d354cb7de10b0f9ccc0b..aee64bd6f9c4545929b26b00c149dee623431013 100644 (file)
@@ -73,6 +73,7 @@ import org.apache.poi.poifs.macros.VBAMacroReader;
 import org.apache.poi.sl.draw.DrawFactory;
 import org.apache.poi.sl.draw.DrawPaint;
 import org.apache.poi.sl.draw.DrawTextParagraph;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
 import org.apache.poi.sl.usermodel.ColorStyle;
 import org.apache.poi.sl.usermodel.PaintStyle;
 import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
@@ -800,18 +801,18 @@ public final class TestBugs {
         String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt",  }; 
         for (String f : files) {
             File sample = HSLFTestDataSamples.getSampleFile(f);
-            PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
-            assertNotNull(ex.getText());
-            ex.close();
+            try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
+                 assertNotNull(ex.getText());
+             }
         }
     }
 
     @Test
     public void bug58733() throws IOException {
         File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt");
-        PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
-        assertNotNull(ex.getText());
-        ex.close();
+        try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
+            assertNotNull(ex.getText());
+        }
     }
 
     @Test
index e5237f4c15dd7adf33bd79a7416f54c9ca6751ef..df3a26debdc27989bddd8257a278727974a75424 100644 (file)
Binary files a/test-data/slideshow/SampleShow.pptx and b/test-data/slideshow/SampleShow.pptx differ