From: Andreas Beeker Date: Fri, 20 Apr 2018 12:52:59 +0000 (+0000) Subject: #62319 - Decommission XSLF-/PowerPointExtractor X-Git-Tag: REL_4_0_0_FINAL~191 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=e81613175907a4f6f7d135c57cb412c78ea1db72;p=poi.git #62319 - Decommission XSLF-/PowerPointExtractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1829653 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java index 333a8ebcdc..8721c50948 100644 --- a/src/integrationtest/org/apache/poi/TestAllFiles.java +++ b/src/integrationtest/org/apache/poi/TestAllFiles.java @@ -330,8 +330,6 @@ public class TestAllFiles { ); private static final Set IGNORED = unmodifiableHashSet( - // need JDK8+ - https://bugs.openjdk.java.net/browse/JDK-8038081 - "slideshow/42474-2.ppt", // OPC handler works / XSSF handler fails "spreadsheet/57181.xlsm", "spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop diff --git a/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java b/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java index 7aab657650..80fe858249 100644 --- a/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java @@ -24,6 +24,7 @@ import java.io.FileInputStream; import java.io.InputStream; import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFSlideShow; @@ -53,12 +54,19 @@ public class XSLFFileHandler extends SlideShowHandler { // additionally try the other getText() methods - try (XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file)) { + try (SlideShowExtractor extractor = 
ExtractorFactory.createExtractor(file)) { assertNotNull(extractor); + extractor.setSlidesByDefault(true); + extractor.setNotesByDefault(true); + extractor.setMasterByDefault(true); - assertNotNull(extractor.getText(true, true, true)); - assertEquals("With all options disabled we should not get text", - "", extractor.getText(false, false, false)); + assertNotNull(extractor.getText()); + + extractor.setSlidesByDefault(false); + extractor.setNotesByDefault(false); + extractor.setMasterByDefault(false); + + assertEquals("With all options disabled we should not get text", "", extractor.getText()); } } diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java index 05c7781057..0fccf71c4a 100644 --- a/src/java/org/apache/poi/POIOLE2TextExtractor.java +++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java @@ -105,6 +105,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { * * @return the underlying POIDocument */ + @Override public POIDocument getDocument() { return document; } diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java index 58cf5e6ddb..55d0832f16 100644 --- a/src/java/org/apache/poi/POITextExtractor.java +++ b/src/java/org/apache/poi/POITextExtractor.java @@ -74,4 +74,9 @@ public abstract class POITextExtractor implements Closeable { fsToClose.close(); } } + + /** + * @return the processed document + */ + public abstract Object getDocument(); } diff --git a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java index 5f895dc375..5e52f9d6c2 100644 --- a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java +++ b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java @@ -115,26 +115,23 @@ public class OLE2ExtractorFactory { return threadPreferEventExtractors.get(); } - public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws 
IOException { - // Only ever an OLE2 one from the root of the FS - return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); + public static T createExtractor(POIFSFileSystem fs) throws IOException { + return (T)createExtractor(fs.getRoot()); } - public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException { - // Only ever an OLE2 one from the root of the FS - return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); + public static T createExtractor(NPOIFSFileSystem fs) throws IOException { + return (T)createExtractor(fs.getRoot()); } - public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException { - // Only ever an OLE2 one from the root of the FS - return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); + public static T createExtractor(OPOIFSFileSystem fs) throws IOException { + return (T)createExtractor(fs.getRoot()); } - public static POITextExtractor createExtractor(InputStream input) throws IOException { + public static T createExtractor(InputStream input) throws IOException { Class cls = getOOXMLClass(); if (cls != null) { // Use Reflection to get us the full OOXML-enabled version try { Method m = cls.getDeclaredMethod("createExtractor", InputStream.class); - return (POITextExtractor)m.invoke(null, input); + return (T)m.invoke(null, input); } catch (IllegalArgumentException iae) { throw iae; } catch (Exception e) { diff --git a/src/java/org/apache/poi/poifs/filesystem/DocumentFactoryHelper.java b/src/java/org/apache/poi/poifs/filesystem/DocumentFactoryHelper.java index 0657b9bb63..57b57a1757 100644 --- a/src/java/org/apache/poi/poifs/filesystem/DocumentFactoryHelper.java +++ b/src/java/org/apache/poi/poifs/filesystem/DocumentFactoryHelper.java @@ -44,8 +44,30 @@ public class DocumentFactoryHelper { * @throws IOException If an error occurs while decrypting or if the password does not match */ public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password) + throws 
IOException { + // wrap the stream in a FilterInputStream to close the NPOIFSFileSystem + // as well when the resulting OPCPackage is closed + return new FilterInputStream(getDecryptedStream(fs.getRoot(), password)) { + @Override + public void close() throws IOException { + fs.close(); + super.close(); + } + }; + } + + /** + * Wrap the OLE2 data of the DirectoryNode into a decrypted stream by using + * the given password. + * + * @param root The OLE2 directory node for the document + * @param password The password, null if the default password should be used + * @return A stream for reading the decrypted data + * @throws IOException If an error occurs while decrypting or if the password does not match + */ + public static InputStream getDecryptedStream(final DirectoryNode root, String password) throws IOException { - EncryptionInfo info = new EncryptionInfo(fs); + EncryptionInfo info = new EncryptionInfo(root); Decryptor d = Decryptor.getInstance(info); try { @@ -58,21 +80,11 @@ public class DocumentFactoryHelper { } if (passwordCorrect) { - // wrap the stream in a FilterInputStream to close the NPOIFSFileSystem - // as well when the resulting OPCPackage is closed - return new FilterInputStream(d.getDataStream(fs.getRoot())) { - @Override - public void close() throws IOException { - fs.close(); - - super.close(); - } - }; + return d.getDataStream(root); + } else if (password != null) { + throw new EncryptedDocumentException("Password incorrect"); } else { - if (password != null) - throw new EncryptedDocumentException("Password incorrect"); - else - throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied"); + throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied"); } } catch (GeneralSecurityException e) { throw new IOException(e); diff --git a/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java b/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java 
index bf234f9d63..a7283acbee 100644 --- a/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java +++ b/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java @@ -1,3 +1,20 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + package org.apache.poi.sl.extractor; import java.util.ArrayList; @@ -48,6 +65,16 @@ public class SlideShowExtractor< this.slideshow = slideshow; } + /** + * Returns opened document + * + * @return the opened document + */ + @Override + public final Object getDocument() { + return slideshow.getPersistDocument(); + } + /** * Should a call to getText() return slide text? 
Default is yes */ @@ -219,7 +246,6 @@ public class SlideShowExtractor< return; } for (final P para : paraList) { - final int oldLen = sb.length(); for (final TextRun tr : para) { final String str = tr.getRawText().replace("\r", ""); final String newStr; diff --git a/src/java/org/apache/poi/sl/usermodel/SlideShow.java b/src/java/org/apache/poi/sl/usermodel/SlideShow.java index 8ddd96217c..9894e687d8 100644 --- a/src/java/org/apache/poi/sl/usermodel/SlideShow.java +++ b/src/java/org/apache/poi/sl/usermodel/SlideShow.java @@ -126,4 +126,13 @@ public interface SlideShow< * @since POI 4.0.0 */ POITextExtractor getMetadataTextExtractor(); + + /** + * @return the instance which handles the persisting of the slideshow, + * which is either a subclass of {@link org.apache.poi.POIDocument} + * or {@link org.apache.poi.POIXMLDocument} + * + * @since POI 4.0.0 + */ + Object getPersistDocument(); } diff --git a/src/java/org/apache/poi/sl/usermodel/SlideShowFactory.java b/src/java/org/apache/poi/sl/usermodel/SlideShowFactory.java index cdee782c32..02e9fcb146 100644 --- a/src/java/org/apache/poi/sl/usermodel/SlideShowFactory.java +++ b/src/java/org/apache/poi/sl/usermodel/SlideShowFactory.java @@ -60,13 +60,40 @@ public class SlideShowFactory { * @throws IOException if an error occurs while reading the data */ public static SlideShow create(final NPOIFSFileSystem fs, String password) throws IOException { - DirectoryNode root = fs.getRoot(); + return create(fs.getRoot(), password); + } + + /** + * Creates a SlideShow from the given NPOIFSFileSystem. 
+ * + * @param root The {@link DirectoryNode} to start reading the document from + * + * @return The created SlideShow + * + * @throws IOException if an error occurs while reading the data + */ + public static SlideShow create(final DirectoryNode root) throws IOException { + return create(root, null); + } + + /** + * Creates a SlideShow from the given NPOIFSFileSystem, which may + * be password protected + * + * @param root The {@link DirectoryNode} to start reading the document from + * @param password The password that should be used or null if no password is necessary. + * + * @return The created SlideShow + * + * @throws IOException if an error occurs while reading the data + */ + public static SlideShow create(final DirectoryNode root, String password) throws IOException { // Encrypted OOXML files go inside OLE2 containers, is this one? if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { InputStream stream = null; try { - stream = DocumentFactoryHelper.getDecryptedStream(fs, password); + stream = DocumentFactoryHelper.getDecryptedStream(root, password); return createXSLFSlideShow(stream); } finally { @@ -82,7 +109,7 @@ public class SlideShowFactory { passwordSet = true; } try { - return createHSLFSlideShow(fs); + return createHSLFSlideShow(root); } finally { if (passwordSet) { Biff8EncryptionKey.setCurrentUserPassword(null); diff --git a/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java b/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java index 26b2cd84c2..003fe353f1 100644 --- a/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java +++ b/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java @@ -68,6 +68,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor { * * @return the opened document */ + @Override public final POIXMLDocument getDocument() { return _document; } diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 
910baf6a64..9a7765af0d 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -51,6 +51,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.OfficeXmlFileException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.util.IOUtils; import org.apache.poi.util.NotImplemented; import org.apache.poi.util.POILogFactory; @@ -58,6 +59,7 @@ import org.apache.poi.util.POILogger; import org.apache.poi.util.Removal; import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFRelation; import org.apache.poi.xslf.usermodel.XSLFSlideShow; import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; @@ -127,20 +129,20 @@ public class ExtractorFactory { return OLE2ExtractorFactory.getPreferEventExtractor(); } - public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException { + public static T createExtractor(File f) throws IOException, OpenXML4JException, XmlException { NPOIFSFileSystem fs = null; try { fs = new NPOIFSFileSystem(f); if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { - return createEncryptedOOXMLExtractor(fs); + return (T)createEncryptedOOXMLExtractor(fs); } - POIOLE2TextExtractor extractor = createExtractor(fs); + POITextExtractor extractor = createExtractor(fs); extractor.setFilesystem(fs); - return extractor; + return (T)extractor; } catch (OfficeXmlFileException e) { // ensure file-handle release IOUtils.closeQuietly(fs); - return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); + return (T)createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); 
} catch (NotOLE2FileException ne) { // ensure file-handle release IOUtils.closeQuietly(fs); @@ -179,7 +181,7 @@ public class ExtractorFactory { * @throws XmlException If an XML parsing error occurs. * @throws IllegalArgumentException If no matching file type could be found. */ - public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { + public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { try { // Check for the normal Office core document PackageRelationshipCollection core; @@ -226,13 +228,13 @@ public class ExtractorFactory { // Is it XSLF? for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { if ( rel.getContentType().equals( contentType ) ) { - return new XSLFPowerPointExtractor(pkg); + return new SlideShowExtractor(new XMLSlideShow(pkg)); } } // special handling for SlideShow-Theme-files, if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) { - return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); + return new SlideShowExtractor(new XMLSlideShow(pkg)); } // How about xlsb? 
@@ -252,28 +254,28 @@ public class ExtractorFactory { } } - public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { - return OLE2ExtractorFactory.createExtractor(fs); + public static T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { + return createExtractor(fs.getRoot()); } - public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { - return OLE2ExtractorFactory.createExtractor(fs); + public static T createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { + return createExtractor(fs.getRoot()); } - public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { - return OLE2ExtractorFactory.createExtractor(fs); + public static T createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { + return createExtractor(fs.getRoot()); } - public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException + public static T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException { // First, check for OOXML for (String entryName : poifsDir.getEntryNames()) { if (entryName.equals("Package")) { OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package")); - return createExtractor(pkg); + return (T)createExtractor(pkg); } } // If not, ask the OLE2 code to check, with Scratchpad if possible - return OLE2ExtractorFactory.createExtractor(poifsDir); + return (T)OLE2ExtractorFactory.createExtractor(poifsDir); } /** @@ -403,7 +405,7 @@ public class ExtractorFactory { throw new IllegalStateException("Not yet supported"); } - private static POIXMLTextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs) + private static POITextExtractor 
createEncryptedOOXMLExtractor(NPOIFSFileSystem fs) throws IOException { String pass = Biff8EncryptionKey.getCurrentUserPassword(); if (pass == null) { diff --git a/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java index ef9097f15e..b3cc7c5879 100644 --- a/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java +++ b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java @@ -37,7 +37,7 @@ import org.apache.xmlbeans.XmlException; * @deprecated use {@link SlideShowExtractor} */ @Deprecated -@Removal(version="4.2.0") +@Removal(version="5.0.0") public class XSLFPowerPointExtractor extends POIXMLTextExtractor { public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{ XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE, diff --git a/src/ooxml/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java b/src/ooxml/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java index 8734f3603e..ffc6f0d86b 100644 --- a/src/ooxml/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java +++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java @@ -631,4 +631,9 @@ public class XMLSlideShow extends POIXMLDocument public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { return new POIXMLPropertiesTextExtractor(this); } + + @Override + public Object getPersistDocument() { + return this; + } } diff --git a/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFPlaceholderDetails.java b/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFPlaceholderDetails.java index 25fbd63443..e5d321bc0d 100644 --- a/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFPlaceholderDetails.java +++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFPlaceholderDetails.java @@ -1,3 +1,20 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor 
license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + package org.apache.poi.xslf.usermodel; import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS; diff --git a/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFSlide.java b/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFSlide.java index ce650c152d..7ba272ed63 100644 --- a/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFSlide.java +++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFSlide.java @@ -182,12 +182,20 @@ implements Slide { */ public XSLFCommentAuthors getCommentAuthorsPart() { if(_commentAuthors == null) { + // first scan the slide relations for (POIXMLDocumentPart p : getRelations()) { if (p instanceof XSLFCommentAuthors) { _commentAuthors = (XSLFCommentAuthors)p; return _commentAuthors; } } + // then scan the presentation relations + for (POIXMLDocumentPart p : getSlideShow().getRelations()) { + if (p instanceof XSLFCommentAuthors) { + _commentAuthors = (XSLFCommentAuthors)p; + return _commentAuthors; + } + } } return null; diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index 3670f6fc7a..0a885377a6 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ 
b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -27,16 +27,15 @@ import static org.junit.Assert.fail; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.util.Locale; import org.apache.poi.POIDataSamples; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POITextExtractor; -import org.apache.poi.POIXMLException; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.UnsupportedFileFormatException; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor; -import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.OldExcelFormatException; @@ -44,18 +43,20 @@ import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.extractor.WordExtractor; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; -import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; -import org.junit.BeforeClass; +import org.apache.xmlbeans.XmlException; import org.junit.Test; /** @@ -65,34 +66,39 @@ public class 
TestExtractorFactory { private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class); - private static File txt; - - private static File xls; - private static File xlsx; - private static File xlsxStrict; - private static File xltx; - private static File xlsEmb; - private static File xlsb; - - private static File doc; - private static File doc6; - private static File doc95; - private static File docx; - private static File dotx; - private static File docEmb; - private static File docEmbOOXML; - - private static File ppt; - private static File pptx; - - private static File msg; - private static File msgEmb; - private static File msgEmbMsg; - - private static File vsd; - private static File vsdx; - - private static File pub; + private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); + private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls"); + private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx"); + private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx"); + private static final File xltx = getFileAndCheck(ssTests, "test.xltx"); + private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls"); + private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb"); + + private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); + private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc"); + private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc"); + private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc"); + private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx"); + private static final File dotx = getFileAndCheck(wpTests, "test.dotx"); + private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc"); + private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc"); + + 
private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); + private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt"); + private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx"); + private static final File txt = getFileAndCheck(slTests, "SampleShow.txt"); + + private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance(); + private static final File msg = getFileAndCheck(olTests, "quick.msg"); + private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg"); + private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg"); + + private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); + private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd"); + private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx"); + + private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); + private static File pub = getFileAndCheck(pubTests, "Simple.pub"); private static File getFileAndCheck(POIDataSamples samples, String name) { File file = samples.getFile(name); @@ -104,595 +110,133 @@ public class TestExtractorFactory { return file; } - @BeforeClass - public static void setUp() throws Exception { - - POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); - xls = getFileAndCheck(ssTests, "SampleSS.xls"); - xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx"); - xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx"); - xltx = getFileAndCheck(ssTests, "test.xltx"); - xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls"); - xlsb = getFileAndCheck(ssTests, "testVarious.xlsb"); - - POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); - doc = getFileAndCheck(wpTests, "SampleDoc.doc"); - doc6 = getFileAndCheck(wpTests, "Word6.doc"); - doc95 = getFileAndCheck(wpTests, "Word95.doc"); - docx = getFileAndCheck(wpTests, "SampleDoc.docx"); - dotx = 
getFileAndCheck(wpTests, "test.dotx"); - docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc"); - docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc"); - - POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); - ppt = getFileAndCheck(slTests, "SampleShow.ppt"); - pptx = getFileAndCheck(slTests, "SampleShow.pptx"); - txt = getFileAndCheck(slTests, "SampleShow.txt"); - - POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); - vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd"); - vsdx = getFileAndCheck(dgTests, "test.vsdx"); - - POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); - pub = getFileAndCheck(pubTests, "Simple.pub"); - - POIDataSamples olTests = POIDataSamples.getHSMFInstance(); - msg = getFileAndCheck(olTests, "quick.msg"); - msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg"); - msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg"); - } - - @Test - public void testFile() throws Exception { - // Excel - POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls); - assertNotNull("Had empty extractor for " + xls, xlsExtractor); - assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(), - xlsExtractor - instanceof ExcelExtractor - ); - assertTrue( - xlsExtractor.getText().length() > 200 - ); - xlsExtractor.close(); - - POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx); - assertTrue( - extractor.getClass().getName(), - extractor - instanceof XSSFExcelExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(xlsx); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(xltx); - assertTrue( - extractor.getClass().getName(), - extractor - instanceof XSSFExcelExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(xlsb); - assertContains(extractor.getText(), "test"); - extractor.close(); - - - 
extractor = ExtractorFactory.createExtractor(xltx); - assertContains(extractor.getText(), "test"); - extractor.close(); + private static final Object[] TEST_SET = { + "Excel", xls, ExcelExtractor.class, 200, + "Excel - xlsx", xlsx, XSSFExcelExtractor.class, 200, + "Excel - xltx", xltx, XSSFExcelExtractor.class, -1, + "Excel - xlsb", xlsb, XSSFBEventBasedExcelExtractor.class, -1, + "Word", doc, WordExtractor.class, 120, + "Word - docx", docx, XWPFWordExtractor.class, 120, + "Word - dotx", dotx, XWPFWordExtractor.class, -1, + "Word 6", doc6, Word6Extractor.class, 20, + "Word 95", doc95, Word6Extractor.class, 120, + "PowerPoint", ppt, SlideShowExtractor.class, 120, + "PowerPoint - pptx", pptx, SlideShowExtractor.class, 120, + "Visio", vsd, VisioTextExtractor.class, 50, + "Visio - vsdx", vsdx, XDGFVisioExtractor.class, 20, + "Publisher", pub, PublisherTextExtractor.class, 50, + "Outlook msg", msg, OutlookTextExtactor.class, 50, // TODO Support OOXML-Strict, see bug #57699 - try { - /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict); - fail("OOXML-Strict isn't yet supported"); - } catch (POIXMLException e) { - // Expected, for now - } -// extractor = ExtractorFactory.createExtractor(xlsxStrict); -// assertTrue( -// extractor -// instanceof XSSFExcelExtractor -// ); -// extractor.close(); -// -// extractor = ExtractorFactory.createExtractor(xlsxStrict); -// assertTrue( -// extractor.getText().contains("test") -// ); -// extractor.close(); - - - // Word - extractor = ExtractorFactory.createExtractor(doc); - assertTrue( - extractor - instanceof WordExtractor - ); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(doc6); - assertTrue( - extractor - instanceof Word6Extractor - ); - assertTrue( - extractor.getText().length() > 20 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(doc95); - assertTrue( - extractor - instanceof Word6Extractor - ); - assertTrue( - 
extractor.getText().length() > 120 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(docx); - assertTrue( - extractor instanceof XWPFWordExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(docx); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(dotx); - assertTrue( - extractor instanceof XWPFWordExtractor - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(dotx); - assertContains(extractor.getText(), "Test"); - extractor.close(); - - // PowerPoint (PPT) - extractor = ExtractorFactory.createExtractor(ppt); - assertTrue( - extractor - instanceof PowerPointExtractor - ); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - // PowerPoint (PPTX) - extractor = ExtractorFactory.createExtractor(pptx); - assertTrue( - extractor - instanceof XSLFPowerPointExtractor - ); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); + // xlsxStrict + }; - // Visio - binary - extractor = ExtractorFactory.createExtractor(vsd); - assertTrue( - extractor - instanceof VisioTextExtractor - ); - assertTrue( - extractor.getText().length() > 50 - ); - extractor.close(); + @FunctionalInterface + interface FunctionEx<T, R> { + R apply(T t) throws IOException, OpenXML4JException, XmlException; + } - // Visio - vsdx - extractor = ExtractorFactory.createExtractor(vsdx); - assertTrue( - extractor - instanceof XDGFVisioExtractor - ); - assertTrue( - extractor.getText().length() > 20 - ); - extractor.close(); - // Publisher - extractor = ExtractorFactory.createExtractor(pub); - assertTrue( - extractor - instanceof PublisherTextExtractor - ); - assertTrue( - extractor.getText().length() > 50 - ); - extractor.close(); - - // Outlook msg - extractor = ExtractorFactory.createExtractor(msg); - assertTrue( - extractor - instanceof OutlookTextExtactor - ); - assertTrue( - extractor.getText().length() > 50 -
); - extractor.close(); + @Test + public void testFile() throws Exception { + for (int i = 0; i < TEST_SET.length; i += 4) { + try (POITextExtractor ext = ExtractorFactory.createExtractor((File) TEST_SET[i + 1])) { + testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]); + } + } + } + @Test(expected = IllegalArgumentException.class) + public void testFileInvalid() throws Exception { // Text - try { - ExtractorFactory.createExtractor(txt); - fail("expected IllegalArgumentException"); - } catch(IllegalArgumentException e) { - // Good - } + try (POITextExtractor te = ExtractorFactory.createExtractor(txt)) {} } @Test public void testInputStream() throws Exception { - // Excel - POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls)); - assertTrue( - extractor - instanceof ExcelExtractor - ); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx)); - assertTrue( - extractor.getClass().getName(), - extractor - instanceof XSSFExcelExtractor - ); - assertTrue( - extractor.getText().length() > 200 - ); - // TODO Support OOXML-Strict, see bug #57699 -// assertTrue( -// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)) -// instanceof XSSFExcelExtractor -// ); -// assertTrue( -// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200 -// ); - extractor.close(); - - // Word - extractor = ExtractorFactory.createExtractor(new FileInputStream(doc)); - assertTrue( - extractor.getClass().getName(), - extractor - instanceof WordExtractor - ); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6)); - assertTrue( - extractor.getClass().getName(), - extractor - instanceof Word6Extractor - ); - assertTrue( - extractor.getText().length() > 20 - ); - extractor.close(); - - 
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95)); - assertTrue( - extractor.getClass().getName(), - extractor - instanceof Word6Extractor - ); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(new FileInputStream(docx)); - assertTrue( - extractor - instanceof XWPFWordExtractor - ); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - // PowerPoint - extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt)); - assertTrue( - extractor - instanceof PowerPointExtractor - ); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx)); - assertTrue( - extractor - instanceof XSLFPowerPointExtractor - ); - assertTrue( - extractor.getText().length() > 120 - ); - extractor.close(); - - // Visio - extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd)); - assertTrue( - extractor - instanceof VisioTextExtractor - ); - assertTrue( - extractor.getText().length() > 50 - ); - extractor.close(); - - // Visio - vsdx - extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx)); - assertTrue( - extractor - instanceof XDGFVisioExtractor - ); - assertTrue( - extractor.getText().length() > 20 - ); - extractor.close(); - - // Publisher - extractor = ExtractorFactory.createExtractor(new FileInputStream(pub)); - assertTrue( - extractor - instanceof PublisherTextExtractor - ); - assertTrue( - extractor.getText().length() > 50 - ); - extractor.close(); - - // Outlook msg - extractor = ExtractorFactory.createExtractor(new FileInputStream(msg)); - assertTrue( - extractor - instanceof OutlookTextExtactor - ); - assertTrue( - extractor.getText().length() > 50 - ); - extractor.close(); + testStream((f) -> ExtractorFactory.createExtractor(f), true); + } - // Text - try (FileInputStream stream = new FileInputStream(txt)) { - 
ExtractorFactory.createExtractor(stream); - fail("expected IllegalArgumentException"); - } catch(IllegalArgumentException e) { - // Good - } + @Test(expected = IllegalArgumentException.class) + public void testInputStreamInvalid() throws Exception { + testInvalid((f) -> ExtractorFactory.createExtractor(f)); } @Test public void testPOIFS() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))) - instanceof ExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 - ); - - // Word - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))) - instanceof Word6Extractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20 - ); - - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))) - instanceof Word6Extractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120 - ); - - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 - ); - - // Visio - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 - ); - - // Publisher - assertTrue( - 
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))) - instanceof PublisherTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50 - ); - - // Outlook msg - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))) - instanceof OutlookTextExtactor - ); - assertTrue( - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50 - ); - - // Text - try { - ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt))); - fail("expected IllegalArgumentException"); - } catch(IOException e) { - // Good - } + testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false); } + @Test(expected = IOException.class) + public void testPOIFSInvalid() throws Exception { + testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f))); + } @Test public void testOPOIFS() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))) - instanceof ExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200 - ); - - // Word - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))) - instanceof Word6Extractor - ); - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20 - ); - - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))) - instanceof Word6Extractor - ); - assertTrue( - 
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120 - ); + testStream((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)), false); + } - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120 - ); + @Test(expected = IOException.class) + public void testOPOIFSInvalid() throws Exception { + testInvalid((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f))); + } - // Visio - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 - ); - // Publisher - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))) - instanceof PublisherTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50 - ); + private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML) + throws IOException, OpenXML4JException, XmlException { + for (int i = 0; i < TEST_SET.length; i += 4) { + File testFile = (File) TEST_SET[i + 1]; + if (!loadOOXML && (testFile.getName().endsWith("x") || testFile.getName().endsWith("xlsb"))) { + continue; + } + try (FileInputStream fis = new FileInputStream(testFile); + POITextExtractor ext = poifsIS.apply(fis)) { + testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]); + } catch (IllegalArgumentException e) { + fail("failed to process "+testFile); + } + } + } - // Outlook msg - assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))) - instanceof OutlookTextExtactor - ); -
assertTrue( - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50 - ); + private void testExtractor(final POITextExtractor ext, final String testcase, final Class extrClass, final Integer minLength) { + assertTrue("invalid extractor for " + testcase, extrClass.isInstance(ext)); + final String actual = ext.getText(); + if (minLength == -1) { + assertContains(actual.toLowerCase(Locale.ROOT), "test"); + } else { + assertTrue("extracted content too short for " + testcase, actual.length() > minLength); + } + } + private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException { // Text - try { - ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt))); - fail("expected IllegalArgumentException"); - } catch(IOException e) { - // Good + try (FileInputStream fis = new FileInputStream(txt); + POITextExtractor te = poifs.apply(fis)) { } } @Test public void testPackage() throws Exception { - // Excel - POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); - assertTrue(extractor instanceof XSSFExcelExtractor); - extractor.close(); - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); - assertTrue(extractor.getText().length() > 200); - extractor.close(); - - // Word - extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString())); - assertTrue(extractor instanceof XWPFWordExtractor); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString())); - assertTrue(extractor.getText().length() > 120); - extractor.close(); - - // PowerPoint - extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString())); - assertTrue(extractor instanceof XSLFPowerPointExtractor); - extractor.close(); + for (int i = 0; i < TEST_SET.length; i += 4) { + final File testFile = (File) TEST_SET[i + 1]; + if
(!testFile.getName().endsWith("x")) { + continue; + } - extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString())); - assertTrue(extractor.getText().length() > 120); - extractor.close(); - - // Visio - extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString())); - assertTrue(extractor instanceof XDGFVisioExtractor); - assertTrue(extractor.getText().length() > 20); - extractor.close(); + try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ); + final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) { + testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]); + pkg.revert(); + } + } + } + @Test(expected = UnsupportedFileFormatException.class) + public void testPackageInvalid() throws Exception { // Text - try { - ExtractorFactory.createExtractor(OPCPackage.open(txt.toString())); - fail("TestExtractorFactory.testPackage() failed on " + txt); - } catch(UnsupportedFileFormatException e) { - // Good - } catch (Exception e) { - LOG.log(POILogger.WARN, "TestExtractorFactory.testPackage() failed on " + txt); - throw e; - } + try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ); + final POITextExtractor te = ExtractorFactory.createExtractor(pkg)) {} } @Test @@ -781,142 +325,49 @@ public class TestExtractorFactory { * does poifs embedded, but will do ooxml ones * at some point. 
*/ - @SuppressWarnings("deprecation") @Test public void testEmbedded() throws Exception { - POIOLE2TextExtractor ext; - POITextExtractor[] embeds; - - // No embeddings - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(xls); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - assertEquals(0, embeds.length); - ext.close(); - - // No embeddings - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(xls); - embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext); - assertEquals(0, embeds.length); - ext.close(); - - // Excel - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(xlsEmb); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - assertNotNull(embeds); - ext.close(); - - // Excel - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(xlsEmb); - embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext); - - assertEquals(6, embeds.length); - int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX; - for (POITextExtractor embed : embeds) { - assertTrue(embed.getText().length() > 20); - - if (embed instanceof PowerPointExtractor) numPpt++; - else if (embed instanceof ExcelExtractor) numXls++; - else if (embed instanceof WordExtractor) numWord++; - else if (embed instanceof OutlookTextExtactor) numMsg++; - } - assertEquals(2, numPpt); - assertEquals(2, numXls); - assertEquals(2, numWord); - assertEquals(0, numMsg); - ext.close(); - - // Word - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(docEmb); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; - assertEquals(4, embeds.length); - for (POITextExtractor embed : embeds) { - assertTrue(embed.getText().length() > 20); - if (embed instanceof PowerPointExtractor) numPpt++; - else if (embed instanceof ExcelExtractor) numXls++; - else if (embed instanceof WordExtractor) numWord++; - else if (embed instanceof OutlookTextExtactor) numMsg++; - } - 
assertEquals(1, numPpt); - assertEquals(2, numXls); - assertEquals(1, numWord); - assertEquals(0, numMsg); - ext.close(); - - // Word which contains an OOXML file - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(docEmbOOXML); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0; - assertEquals(3, embeds.length); - for (POITextExtractor embed : embeds) { - assertTrue(embed.getText().length() > 20); - if (embed instanceof PowerPointExtractor) numPpt++; - else if (embed instanceof ExcelExtractor) numXls++; - else if (embed instanceof WordExtractor) numWord++; - else if (embed instanceof OutlookTextExtactor) numMsg++; - else if (embed instanceof XWPFWordExtractor) numWordX++; - } - assertEquals(1, numPpt); - assertEquals(1, numXls); - assertEquals(0, numWord); - assertEquals(1, numWordX); - assertEquals(0, numMsg); - ext.close(); - - // Outlook - ext = (OutlookTextExtactor) - ExtractorFactory.createExtractor(msgEmb); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; - assertEquals(1, embeds.length); - for (POITextExtractor embed : embeds) { - assertTrue(embed.getText().length() > 20); - if (embed instanceof PowerPointExtractor) numPpt++; - else if (embed instanceof ExcelExtractor) numXls++; - else if (embed instanceof WordExtractor) numWord++; - else if (embed instanceof OutlookTextExtactor) numMsg++; - } - assertEquals(0, numPpt); - assertEquals(0, numXls); - assertEquals(1, numWord); - assertEquals(0, numMsg); - ext.close(); - - // Outlook with another outlook file in it - ext = (OutlookTextExtactor) - ExtractorFactory.createExtractor(msgEmbMsg); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; - assertEquals(1, embeds.length); - for (POITextExtractor embed : embeds) { - assertTrue(embed.getText().length() > 20); - if (embed instanceof 
PowerPointExtractor) numPpt++; - else if (embed instanceof ExcelExtractor) numXls++; - else if (embed instanceof WordExtractor) numWord++; - else if (embed instanceof OutlookTextExtactor) numMsg++; + final Object[] testObj = { + "No embeddings", xls, "0-0-0-0-0-0", + "Excel", xlsEmb, "6-2-2-2-0-0", + "Word", docEmb, "4-1-2-1-0-0", + "Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1", + "Outlook", msgEmb, "1-1-0-0-0-0", + "Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0", + }; + + for (int i=0; i<testObj.length; i+=3) { + try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) { + final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext); + + int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0; + for (POITextExtractor embed : embeds) { + assertTrue(embed.getText().length() > 20); + if (embed instanceof SlideShowExtractor) { + numPpt++; + } else if (embed instanceof ExcelExtractor) { + numXls++; + } else if (embed instanceof WordExtractor) { + numWord++; + } else if (embed instanceof OutlookTextExtactor) { + numMsg++; + } else if (embed instanceof XWPFWordExtractor) { + numWordX++; + } + } + + final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX; + final String expected = (String)testObj[i+2]; + assertEquals("invalid number of embeddings - "+testObj[i], expected, actual); + } } - assertEquals(0, numPpt); - assertEquals(0, numXls); - assertEquals(0, numWord); - assertEquals(1, numMsg); - ext.close(); // TODO - PowerPoint // TODO - Publisher // TODO - Visio } - private static final String[] EXPECTED_FAILURES = new String[] { + private static final String[] EXPECTED_FAILURES = { // password protected files "spreadsheet/password.xls", "spreadsheet/protected_passtika.xlsx", @@ -1018,37 +469,26 @@ public class TestExtractorFactory { * #59074 - Excel 95 files should give a helpful message, not just * "No supported documents found in the OLE2 stream" */ - @Test + @Test(expected = OldExcelFormatException.class) public void bug59074() throws Exception { - try { - ExtractorFactory.createExtractor( - POIDataSamples.getSpreadSheetInstance().getFile("59074.xls")); - fail("Old excel formats not supported via ExtractorFactory"); - } catch (OldExcelFormatException e) { - // expected here - }
+ ExtractorFactory.createExtractor( + POIDataSamples.getSpreadSheetInstance().getFile("59074.xls")); } @SuppressWarnings("deprecation") - @Test - public void testGetEmbeddedFromXMLExtractor() { - try { - // currently not implemented - ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null); - fail("Unsupported currently"); - } catch (IllegalStateException e) { - // expected here - } + @Test(expected = IllegalStateException.class) + public void testGetEmbedFromXMLExtractor() { + // currently not implemented + ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null); + } - try { - // currently not implemented - ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null); - fail("Unsupported currently"); - } catch (IllegalStateException e) { - // expected here - } + @SuppressWarnings("deprecation") + @Test(expected = IllegalStateException.class) + public void testGetEmbeddedFromXMLExtractor() { + // currently not implemented + ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null); } - + // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed. // When this happens, change this from @Test(expected=...) 
to @Test // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor diff --git a/src/ooxml/testcases/org/apache/poi/poifs/crypt/TestHxxFEncryption.java b/src/ooxml/testcases/org/apache/poi/poifs/crypt/TestHxxFEncryption.java index 68c8bfa054..7403ecc3d7 100644 --- a/src/ooxml/testcases/org/apache/poi/poifs/crypt/TestHxxFEncryption.java +++ b/src/ooxml/testcases/org/apache/poi/poifs/crypt/TestHxxFEncryption.java @@ -120,10 +120,10 @@ public class TestHxxFEncryption { public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException { Biff8EncryptionKey.setCurrentUserPassword(password); File f = sampleDir.getFile(file); - POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f); + POITextExtractor te1 = ExtractorFactory.createExtractor(f); Biff8EncryptionKey.setCurrentUserPassword(newPass); ByteArrayOutputStream bos = new ByteArrayOutputStream(); - POIDocument doc = te1.getDocument(); + POIDocument doc = (POIDocument)te1.getDocument(); doc.write(bos); doc.close(); te1.close(); @@ -140,25 +140,25 @@ public class TestHxxFEncryption { ByteArrayOutputStream bos = new ByteArrayOutputStream(); Biff8EncryptionKey.setCurrentUserPassword(password); File f = sampleDir.getFile(file); - POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f); + POITextExtractor te1 = ExtractorFactory.createExtractor(f); // first remove encryption Biff8EncryptionKey.setCurrentUserPassword(null); - POIDocument doc = te1.getDocument(); + POIDocument doc = (POIDocument)te1.getDocument(); doc.write(bos); doc.close(); te1.close(); // then use default setting, which is cryptoapi String newPass = "newPass"; - POIOLE2TextExtractor te2 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); + POITextExtractor te2 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); Biff8EncryptionKey.setCurrentUserPassword(newPass); - doc = 
te2.getDocument(); + doc = (POIDocument)te2.getDocument(); bos.reset(); doc.write(bos); doc.close(); te2.close(); // and finally update cryptoapi setting - POIOLE2TextExtractor te3 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); - doc = te3.getDocument(); + POITextExtractor te3 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); + doc = (POIDocument)te3.getDocument(); // need to cache data (i.e. read all data) before changing the key size if (doc instanceof HSLFSlideShowImpl) { HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc; @@ -175,8 +175,8 @@ public class TestHxxFEncryption { doc.close(); te3.close(); // check the setting - POIOLE2TextExtractor te4 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); - doc = te4.getDocument(); + POITextExtractor te4 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray())); + doc = (POIDocument)te4.getDocument(); ei = doc.getEncryptionInfo(); assertNotNull(ei); assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader); diff --git a/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java b/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java index 94e5a5077f..a6a6467f9e 100644 --- a/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java +++ b/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java @@ -50,6 +50,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackagePartName; import org.apache.poi.openxml4j.opc.PackagingURIHelper; import org.apache.poi.sl.draw.DrawPaint; +import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.usermodel.PaintStyle; import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint; import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint; @@ -221,28 +222,27 @@ public class TestXSLFBugs { * rID2 -> slide3.xml */ @Test - public void bug54916() throws Exception { - 
XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx"); - XSLFSlide slide; + public void bug54916() throws IOException { + try (XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx")) { + XSLFSlide slide; - // Should find 4 slides - assertEquals(4, ss.getSlides().size()); + // Should find 4 slides + assertEquals(4, ss.getSlides().size()); - // Check the text, to see we got them in order - slide = ss.getSlides().get(0); - assertContains(getSlideText(slide), "POI cannot read this"); + // Check the text, to see we got them in order + slide = ss.getSlides().get(0); + assertContains(getSlideText(ss, slide), "POI cannot read this"); - slide = ss.getSlides().get(1); - assertContains(getSlideText(slide), "POI can read this"); - assertContains(getSlideText(slide), "Has a relationship to another slide"); + slide = ss.getSlides().get(1); + assertContains(getSlideText(ss, slide), "POI can read this"); + assertContains(getSlideText(ss, slide), "Has a relationship to another slide"); - slide = ss.getSlides().get(2); - assertContains(getSlideText(slide), "POI can read this"); + slide = ss.getSlides().get(2); + assertContains(getSlideText(ss, slide), "POI can read this"); - slide = ss.getSlides().get(3); - assertContains(getSlideText(slide), "POI can read this"); - - ss.close(); + slide = ss.getSlides().get(3); + assertContains(getSlideText(ss, slide), "POI can read this"); + } } /** @@ -311,8 +311,15 @@ public class TestXSLFBugs { ss.close(); } - protected String getSlideText(XSLFSlide slide) { - return XSLFPowerPointExtractor.getText(slide, true, false, false); + protected String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException { + try (SlideShowExtractor extr = new SlideShowExtractor(ppt)) { + // do not auto-close the slideshow + extr.setFilesystem(null); + extr.setSlidesByDefault(true); + extr.setNotesByDefault(false); + extr.setMasterByDefault(false); + return extr.getText(slide); + } } @Test @@ 
-458,7 +465,7 @@ public class TestXSLFBugs { for (int i = 0; i < slideTexts.length; i++) { XSLFSlide slide = ss.getSlides().get(i); - assertContains(getSlideText(slide), slideTexts[i]); + assertContains(getSlideText(ss, slide), slideTexts[i]); } } diff --git a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java index 2b6a96ad8d..353ce7cbd4 100644 --- a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java @@ -24,16 +24,17 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import java.io.File; import java.io.IOException; import java.io.InputStream; import org.apache.poi.POIDataSamples; -import org.apache.poi.POITextExtractor; import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; -import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.xmlbeans.XmlException; +import org.junit.Ignore; import org.junit.Test; /** @@ -44,188 +45,189 @@ public class TestXSLFPowerPointExtractor { /** * Get text out of the simple file - * @throws XmlException - * @throws OpenXML4JException */ @Test - public void testGetSimpleText() - throws IOException, XmlException, OpenXML4JException { - XMLSlideShow xmlA = openPPTX("sample.pptx"); - @SuppressWarnings("resource") - OPCPackage pkg = xmlA.getPackage(); - - new XSLFPowerPointExtractor(xmlA).close(); - new XSLFPowerPointExtractor(pkg).close(); - - XSLFPowerPointExtractor extractor = - new XSLFPowerPointExtractor(xmlA); - extractor.getText(); - - String text = extractor.getText(); - assertTrue(text.length() > 0); - - // Check Basics - assertStartsWith(text, 
"Lorem ipsum dolor sit amet\n"); - assertContains(text, "amet\n\n"); - - // Our placeholder master text - // This shouldn't show up in the output - // String masterText = - // "Click to edit Master title style\n" + - // "Click to edit Master subtitle style\n" + - // "\n\n\n\n\n\n" + - // "Click to edit Master title style\n" + - // "Click to edit Master text styles\n" + - // "Second level\n" + - // "Third level\n" + - // "Fourth level\n" + - // "Fifth level\n"; - - // Just slides, no notes - text = extractor.getText(true, false, false); - String slideText = - "Lorem ipsum dolor sit amet\n" + - "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + - "\n" + - "Lorem ipsum dolor sit amet\n" + - "Lorem\n" + - "ipsum\n" + - "dolor\n" + - "sit\n" + - "amet\n" + - "\n"; - assertEquals(slideText, text); - - // Just notes, no slides - text = extractor.getText(false, true); - assertEquals("\n\n1\n\n\n2\n", text); - - // Both - text = extractor.getText(true, true, false); - String bothText = - "Lorem ipsum dolor sit amet\n" + - "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + - "\n\n\n1\n" + - "Lorem ipsum dolor sit amet\n" + - "Lorem\n" + - "ipsum\n" + - "dolor\n" + - "sit\n" + - "amet\n" + - "\n\n\n2\n"; - assertEquals(bothText, text); - - // With Slides and Master Text - text = extractor.getText(true, false, true); - String smText = - "Lorem ipsum dolor sit amet\n" + - "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + - "\n" + - "Lorem ipsum dolor sit amet\n" + - "Lorem\n" + - "ipsum\n" + - "dolor\n" + - "sit\n" + - "amet\n" + - "\n"; - assertEquals(smText, text); - - // With Slides, Notes and Master Text - text = extractor.getText(true, true, true); - String snmText = - "Lorem ipsum dolor sit amet\n" + - "Nunc at risus vel erat tempus posuere. 
Aenean non ante.\n" + - "\n\n\n1\n" + - "Lorem ipsum dolor sit amet\n" + - "Lorem\n" + - "ipsum\n" + - "dolor\n" + - "sit\n" + - "amet\n" + - "\n\n\n2\n"; - assertEquals(snmText, text); - - // Via set defaults - extractor.setSlidesByDefault(false); - extractor.setNotesByDefault(true); - text = extractor.getText(); - assertEquals("\n\n1\n\n\n2\n", text); - - extractor.close(); - xmlA.close(); + public void testGetSimpleText() throws IOException { + try (XMLSlideShow xmlA = openPPTX("sample.pptx"); + SlideShowExtractor extractor = new SlideShowExtractor(xmlA)) { + + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Check Basics + assertStartsWith(text, "Lorem ipsum dolor sit amet\n"); + assertContains(text, "amet\n\n"); + + // Our placeholder master text + // This shouldn't show up in the output + // String masterText = + // "Click to edit Master title style\n" + + // "Click to edit Master subtitle style\n" + + // "\n\n\n\n\n\n" + + // "Click to edit Master title style\n" + + // "Click to edit Master text styles\n" + + // "Second level\n" + + // "Third level\n" + + // "Fourth level\n" + + // "Fifth level\n"; + + // Just slides, no notes + extractor.setSlidesByDefault(true); + extractor.setNotesByDefault(false); + extractor.setMasterByDefault(false); + text = extractor.getText(); + String slideText = + "Lorem ipsum dolor sit amet\n" + + "Nunc at risus vel erat tempus posuere. 
Aenean non ante.\n" + + "\n" + + "Lorem ipsum dolor sit amet\n" + + "Lorem\n" + + "ipsum\n" + + "dolor\n" + + "sit\n" + + "amet\n" + + "\n"; + assertEquals(slideText, text); + + // Just notes, no slides + extractor.setSlidesByDefault(false); + extractor.setNotesByDefault(true); + text = extractor.getText(); + assertEquals("\n\n1\n\n\n2\n", text); + + // Both + extractor.setSlidesByDefault(true); + extractor.setNotesByDefault(true); + text = extractor.getText(); + String bothText = + "Lorem ipsum dolor sit amet\n" + + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + + "\n\n\n1\n" + + "Lorem ipsum dolor sit amet\n" + + "Lorem\n" + + "ipsum\n" + + "dolor\n" + + "sit\n" + + "amet\n" + + "\n\n\n2\n"; + assertEquals(bothText, text); + + // With Slides and Master Text + extractor.setSlidesByDefault(true); + extractor.setNotesByDefault(false); + extractor.setMasterByDefault(true); + text = extractor.getText(); + String smText = + "Lorem ipsum dolor sit amet\n" + + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + + "\n" + + "Lorem ipsum dolor sit amet\n" + + "Lorem\n" + + "ipsum\n" + + "dolor\n" + + "sit\n" + + "amet\n" + + "\n"; + assertEquals(smText, text); + + // With Slides, Notes and Master Text + extractor.setSlidesByDefault(true); + extractor.setNotesByDefault(true); + extractor.setMasterByDefault(true); + text = extractor.getText(); + String snmText = + "Lorem ipsum dolor sit amet\n" + + "Nunc at risus vel erat tempus posuere. 
Aenean non ante.\n" + + "\n\n\n1\n" + + "Lorem ipsum dolor sit amet\n" + + "Lorem\n" + + "ipsum\n" + + "dolor\n" + + "sit\n" + + "amet\n" + + "\n\n\n2\n"; + assertEquals(snmText, text); + + // Via set defaults + extractor.setSlidesByDefault(false); + extractor.setNotesByDefault(true); + text = extractor.getText(); + assertEquals("\n\n1\n\n\n2\n", text); + } } + @Test public void testGetComments() throws IOException { - XMLSlideShow xml = openPPTX("45545_Comment.pptx"); - XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); - - String text = extractor.getText(); - assertTrue(text.length() > 0); + try (XMLSlideShow xml = openPPTX("45545_Comment.pptx"); + SlideShowExtractor extractor = new SlideShowExtractor(xml)) { + extractor.setCommentsByDefault(true); - // Check comments are there - assertContains(text, "testdoc"); - assertContains(text, "test phrase"); + String text = extractor.getText(); + assertTrue(text.length() > 0); - // Check the authors came through too - assertContains(text, "XPVMWARE01"); + // Check comments are there + assertContains(text, "testdoc"); + assertContains(text, "test phrase"); - extractor.close(); - xml.close(); + // Check the authors came through too + assertContains(text, "XPVMWARE01"); + } } + @Test + @Ignore("currently slidelayouts aren't yet supported") public void testGetMasterText() throws Exception { - XMLSlideShow xml = openPPTX("WithMaster.pptx"); - XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); - extractor.setSlidesByDefault(true); - extractor.setNotesByDefault(false); - extractor.setMasterByDefault(true); - - String text = extractor.getText(); - assertTrue(text.length() > 0); - - // Check master text is there - assertContains(text, "Footer from the master slide"); - - // Theme text shouldn't show up - // String themeText = - // "Theme Master Title\n" + - // "Theme Master first level\n" + - // "And the 2nd level\n" + - // "Our 3rd level goes here\n" + - // "And onto the 4th, such fun....\n" 
+ - // "Finally is the Fifth level\n"; - - // Check the whole text - String wholeText = - "First page title\n" + - "First page subtitle\n" + - "This is the Master Title\n" + - "This text comes from the Master Slide\n" + - "\n" + - // TODO Detect we didn't have a title, and include the master one - "2nd page subtitle\n" + - "Footer from the master slide\n" + - "This is the Master Title\n" + - "This text comes from the Master Slide\n"; - assertEquals(wholeText, text); - - extractor.close(); - xml.close(); + try (XMLSlideShow xml = openPPTX("WithMaster.pptx"); + SlideShowExtractor extractor = new SlideShowExtractor(xml)) { + extractor.setSlidesByDefault(true); + extractor.setNotesByDefault(false); + extractor.setMasterByDefault(true); + + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Check master text is there + assertContains(text, "Footer from the master slide"); + + // Theme text shouldn't show up + // String themeText = + // "Theme Master Title\n" + + // "Theme Master first level\n" + + // "And the 2nd level\n" + + // "Our 3rd level goes here\n" + + // "And onto the 4th, such fun....\n" + + // "Finally is the Fifth level\n"; + + // Check the whole text + String wholeText = + "First page title\n" + + "First page subtitle\n" + + "This is the Master Title\n" + + "This text comes from the Master Slide\n" + + "\n" + + // TODO Detect we didn't have a title, and include the master one + "2nd page subtitle\n" + + "Footer from the master slide\n" + + "This is the Master Title\n" + + "This text comes from the Master Slide\n"; + assertEquals(wholeText, text); + } } @Test public void testTable() throws Exception { - XMLSlideShow xml = openPPTX("present1.pptx"); - XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); - - String text = extractor.getText(); - assertTrue(text.length() > 0); + try (XMLSlideShow xml = openPPTX("present1.pptx"); + SlideShowExtractor extractor = new SlideShowExtractor(xml)) { - // Check comments are 
there - assertContains(text, "TEST"); + String text = extractor.getText(); + assertTrue(text.length() > 0); - extractor.close(); - xml.close(); + // Check comments are there + assertContains(text, "TEST"); + } } /** @@ -241,74 +243,76 @@ public class TestXSLFPowerPointExtractor { }; for(String extension : extensions) { String filename = "testPPT." + extension; - XMLSlideShow xml = openPPTX(filename); - XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); - String text = extractor.getText(); - if (extension.equals("thmx")) { - // Theme file doesn't have any textual content - assertEquals(filename, 0, text.length()); - continue; + try (XMLSlideShow xml = openPPTX(filename); + SlideShowExtractor extractor = new SlideShowExtractor(xml)) { + + String text = extractor.getText(); + if (extension.equals("thmx")) { + // Theme file doesn't have any textual content + assertEquals(filename, 0, text.length()); + continue; + } + + assertTrue(filename, text.length() > 0); + assertContains(filename, text, "Attachment Test"); + assertContains(filename, text, "This is a test file data with the same content"); + assertContains(filename, text, "content parsing"); + assertContains(filename, text, "Different words to test against"); + assertContains(filename, text, "Mystery"); } - - assertTrue(filename, text.length() > 0); - assertContains(filename, text, "Attachment Test"); - assertContains(filename, text, "This is a test file data with the same content"); - assertContains(filename, text, "content parsing"); - assertContains(filename, text, "Different words to test against"); - assertContains(filename, text, "Mystery"); - - extractor.close(); - xml.close(); } } @Test - public void test45541() throws Exception { + public void test45541() throws IOException, OpenXML4JException, XmlException { // extract text from a powerpoint that has a header in the notes-element - POITextExtractor extr = ExtractorFactory.createExtractor( - slTests.getFile("45541_Header.pptx")); - 
String text = extr.getText(); - assertNotNull(text); - assertFalse("Had: " + text, text.contains("testdoc")); - - text = ((XSLFPowerPointExtractor)extr).getText(false, true); - assertContains(text, "testdoc"); - extr.close(); - assertNotNull(text); + final File headerFile = slTests.getFile("45541_Header.pptx"); + try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) { + String text = extr.getText(); + assertNotNull(text); + assertFalse("Had: " + text, text.contains("testdoc")); + + extr.setSlidesByDefault(false); + extr.setNotesByDefault(true); + + text = extr.getText(); + assertContains(text, "testdoc"); + assertNotNull(text); + } // extract text from a powerpoint that has a footer in the master-slide - extr = ExtractorFactory.createExtractor( - slTests.getFile("45541_Footer.pptx")); - text = extr.getText(); - assertNotContained(text, "testdoc"); - - text = ((XSLFPowerPointExtractor)extr).getText(false, true); - assertNotContained(text, "testdoc"); - - text = ((XSLFPowerPointExtractor)extr).getText(false, false, true); - assertNotContained(text, "testdoc"); - - extr.close(); + final File footerFile = slTests.getFile("45541_Footer.pptx"); + try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) { + String text = extr.getText(); + assertNotContained(text, "testdoc"); + + extr.setSlidesByDefault(false); + extr.setNotesByDefault(true); + text = extr.getText(); + assertNotContained(text, "testdoc"); + + extr.setSlidesByDefault(false); + extr.setNotesByDefault(false); + extr.setMasterByDefault(true); + text = extr.getText(); + assertNotContained(text, "testdoc"); + } } @Test public void bug54570() throws IOException { - XMLSlideShow xml = openPPTX("bug54570.pptx"); - XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); - String text = extractor.getText(); - assertNotNull(text); - extractor.close(); - xml.close(); + try (XMLSlideShow xml = openPPTX("bug54570.pptx"); + SlideShowExtractor extractor = new 
SlideShowExtractor(xml)) { + String text = extractor.getText(); + assertNotNull(text); + } } private XMLSlideShow openPPTX(String file) throws IOException { - InputStream is = slTests.openResourceAsStream(file); - try { + try (InputStream is = slTests.openResourceAsStream(file)) { return new XMLSlideShow(is); - } finally { - is.close(); } } } diff --git a/src/scratchpad/src/org/apache/poi/extractor/OLE2ScratchpadExtractorFactory.java b/src/scratchpad/src/org/apache/poi/extractor/OLE2ScratchpadExtractorFactory.java index bf7cc57d4f..f77d0834f9 100644 --- a/src/scratchpad/src/org/apache/poi/extractor/OLE2ScratchpadExtractorFactory.java +++ b/src/scratchpad/src/org/apache/poi/extractor/OLE2ScratchpadExtractorFactory.java @@ -38,6 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.sl.extractor.SlideShowExtractor; +import org.apache.poi.sl.usermodel.SlideShowFactory; /** * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and @@ -65,7 +67,7 @@ public class OLE2ScratchpadExtractorFactory { } if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { - return new PowerPointExtractor(poifsDir); + return new SlideShowExtractor(SlideShowFactory.create(poifsDir)); } if (poifsDir.hasEntry("VisioDocument")) { diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java index 156747eace..efbefd6ceb 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java @@ -34,6 +34,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.sl.extractor.SlideShowExtractor; import 
org.apache.poi.sl.usermodel.SlideShowFactory; +import org.apache.poi.util.Removal; /** * This class can be used to extract text from a PowerPoint file. Can optionally @@ -43,6 +44,7 @@ import org.apache.poi.sl.usermodel.SlideShowFactory; */ @SuppressWarnings("WeakerAccess") @Deprecated +@Removal(version="5.0.0") public final class PowerPointExtractor extends POIOLE2TextExtractor { private final SlideShowExtractor delegate; diff --git a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java index ab6cc818d0..16df6f384f 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java +++ b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java @@ -1139,4 +1139,9 @@ public final class HSLFSlideShow implements SlideShowNote that in order to properly release resources the - * SlideShow should be closed after use. + * Creates a HSLFSlideShow from the given NPOIFSFileSystem

+ * Note that in order to properly release resources the + * SlideShow should be closed after use. */ - public static SlideShow createSlideShow(NPOIFSFileSystem fs) throws IOException { + public static HSLFSlideShow createSlideShow(final NPOIFSFileSystem fs) throws IOException { return new HSLFSlideShow(fs); } + /** + * Creates a HSLFSlideShow from the given DirectoryNode

+ * Note that in order to properly release resources the + * SlideShow should be closed after use. + */ + public static HSLFSlideShow createSlideShow(final DirectoryNode root) throws IOException { + return new HSLFSlideShow(root); + } } diff --git a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java index 248fb3b2c5..c0676f2264 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java +++ b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java @@ -846,9 +846,13 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable { @Override public void close() throws IOException { - NPOIFSFileSystem fs = getDirectory().getFileSystem(); - if (fs != null) { - fs.close(); + // only close the filesystem, if we are based on the root node. + // embedded documents/slideshows shouldn't close the parent container + if (getDirectory().getParent() == null) { + NPOIFSFileSystem fs = getDirectory().getFileSystem(); + if (fs != null) { + fs.close(); + } } } diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java index 6a34d1af3f..9a420a6e26 100644 --- a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java @@ -42,6 +42,10 @@ import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.sl.extractor.SlideShowExtractor; +import org.apache.poi.sl.usermodel.ObjectShape; +import org.apache.poi.sl.usermodel.SlideShow; +import org.apache.poi.sl.usermodel.SlideShowFactory; import org.apache.poi.util.IOUtils; import org.junit.Test; @@ -76,43 +80,46 @@ 
public final class TestExtractor { // ppe.close(); // } - private PowerPointExtractor openExtractor(String fileName) throws IOException { - InputStream is = slTests.openResourceAsStream(fileName); - try { - return new PowerPointExtractor(is); - } finally { - is.close(); + private SlideShowExtractor openExtractor(String fileName) throws IOException { + try (InputStream is = slTests.openResourceAsStream(fileName)) { + return new SlideShowExtractor(SlideShowFactory.create(is)); } } @Test public void testReadSheetText() throws IOException { // Basic 2 page example - PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt"); - assertEquals(expectText, ppe.getText()); - ppe.close(); + try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) { + assertEquals(expectText, ppe.getText()); + } // 1 page example with text boxes - PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt"); - assertEquals(expectText2, ppe2.getText()); - ppe2.close(); + try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) { + assertEquals(expectText2, ppe.getText()); + } } @Test public void testReadNoteText() throws IOException { // Basic 2 page example - PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt"); - String notesText = ppe.getNotes(); - String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n"; - assertEquals(expText, notesText); - ppe.close(); + try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) { + ppe.setNotesByDefault(true); + ppe.setSlidesByDefault(false); + ppe.setMasterByDefault(false); + String notesText = ppe.getText(); + String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n"; + assertEquals(expText, notesText); + } // Other one doesn't have notes - PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt"); - notesText = ppe2.getNotes(); - expText = ""; - assertEquals(expText, notesText); 
- ppe2.close(); + try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) { + ppe.setNotesByDefault(true); + ppe.setSlidesByDefault(false); + ppe.setMasterByDefault(false); + String notesText = ppe.getText(); + String expText = ""; + assertEquals(expText, notesText); + } } @Test @@ -126,19 +133,19 @@ public final class TestExtractor { "\nThese are the notes on page two, again lacking formatting\n" }; - PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt"); - ppe.setSlidesByDefault(true); - ppe.setNotesByDefault(false); - assertEquals(slText[0] + slText[1], ppe.getText()); + try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) { + ppe.setSlidesByDefault(true); + ppe.setNotesByDefault(false); + assertEquals(slText[0] + slText[1], ppe.getText()); - ppe.setSlidesByDefault(false); - ppe.setNotesByDefault(true); - assertEquals(ntText[0] + ntText[1], ppe.getText()); + ppe.setSlidesByDefault(false); + ppe.setNotesByDefault(true); + assertEquals(ntText[0] + ntText[1], ppe.getText()); - ppe.setSlidesByDefault(true); - ppe.setNotesByDefault(true); - assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText()); - ppe.close(); + ppe.setSlidesByDefault(true); + ppe.setNotesByDefault(true); + assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText()); + } } /** @@ -149,45 +156,46 @@ public final class TestExtractor { */ @Test public void testMissingCoreRecords() throws IOException { - PowerPointExtractor ppe = openExtractor("missing_core_records.ppt"); - - String text = ppe.getText(true, false); - String nText = ppe.getNotes(); - - assertNotNull(text); - assertNotNull(nText); - - // Notes record were corrupt, so don't expect any - assertEquals(nText.length(), 0); - - // Slide records were fine - assertContains(text, "Using Disease Surveillance and Response"); - - ppe.close(); + try (SlideShowExtractor ppe = openExtractor("missing_core_records.ppt")) { + ppe.setSlidesByDefault(true); + 
ppe.setNotesByDefault(false); + String text = ppe.getText(); + ppe.setSlidesByDefault(false); + ppe.setNotesByDefault(true); + String nText = ppe.getText(); + + assertNotNull(text); + assertNotNull(nText); + + // Notes record were corrupt, so don't expect any + assertEquals(nText.length(), 0); + + // Slide records were fine + assertContains(text, "Using Disease Surveillance and Response"); + } } @Test public void testExtractFromEmbeded() throws IOException { - InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls"); - POIFSFileSystem fs = new POIFSFileSystem(is); - DirectoryNode root = fs.getRoot(); - PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n"); - PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n"); - ppe2.close(); - ppe1.close(); - fs.close(); - } - - private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected) - throws IOException { - DirectoryNode dir = (DirectoryNode)root.getEntry(entryName); - assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)); - - // Check the first file - HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir); - PowerPointExtractor ppe = new PowerPointExtractor(ppt); - assertEquals(expected, ppe.getText(true, false)); - return ppe; + try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls"); + final POIFSFileSystem fs = new POIFSFileSystem(is)) { + final DirectoryNode root = fs.getRoot(); + + final String[] TEST_SET = { + "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n", + "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n" + }; + + for (int i=0; i ppt = SlideShowFactory.create(dir); + final SlideShowExtractor ppe = new 
SlideShowExtractor(ppt)) { + assertEquals(TEST_SET[i+1], ppe.getText()); + } + } + } } /** @@ -195,32 +203,32 @@ public final class TestExtractor { */ @Test public void testExtractFromOwnEmbeded() throws IOException { - PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt"); - List shapes = ppe.getOLEShapes(); - assertEquals("Expected 6 ole shapes", 6, shapes.size()); - int num_ppt = 0, num_doc = 0, num_xls = 0; - for (HSLFObjectShape ole : shapes) { - String name = ole.getInstanceName(); - InputStream data = ole.getObjectData().getInputStream(); - if ("Worksheet".equals(name)) { - HSSFWorkbook wb = new HSSFWorkbook(data); - num_xls++; - wb.close(); - } else if ("Document".equals(name)) { - HWPFDocument doc = new HWPFDocument(data); - num_doc++; - doc.close(); - } else if ("Presentation".equals(name)) { - num_ppt++; - HSLFSlideShow ppt = new HSLFSlideShow(data); - ppt.close(); + try (SlideShowExtractor ppe = openExtractor("ppt_with_embeded.ppt")) { + List shapes = ppe.getOLEShapes(); + assertEquals("Expected 6 ole shapes", 6, shapes.size()); + int num_ppt = 0, num_doc = 0, num_xls = 0; + for (ObjectShape ole : shapes) { + String name = ((HSLFObjectShape)ole).getInstanceName(); + InputStream data = ole.getObjectData().getInputStream(); + if ("Worksheet".equals(name)) { + HSSFWorkbook wb = new HSSFWorkbook(data); + num_xls++; + wb.close(); + } else if ("Document".equals(name)) { + HWPFDocument doc = new HWPFDocument(data); + num_doc++; + doc.close(); + } else if ("Presentation".equals(name)) { + num_ppt++; + HSLFSlideShow ppt = new HSLFSlideShow(data); + ppt.close(); + } + data.close(); } - data.close(); + assertEquals("Expected 2 embedded Word Documents", 2, num_doc); + assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls); + assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt); } - assertEquals("Expected 2 embedded Word Documents", 2, num_doc); - assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls); - 
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt); - ppe.close(); } /** @@ -228,11 +236,11 @@ public final class TestExtractor { */ @Test public void test52991() throws IOException { - PowerPointExtractor ppe = openExtractor("badzip.ppt"); - for (HSLFObjectShape shape : ppe.getOLEShapes()) { - IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream()); + try (SlideShowExtractor ppe = openExtractor("badzip.ppt")) { + for (ObjectShape shape : ppe.getOLEShapes()) { + IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream()); + } } - ppe.close(); } /** @@ -240,27 +248,27 @@ public final class TestExtractor { */ @Test public void testWithComments() throws IOException { - PowerPointExtractor ppe1 = openExtractor("WithComments.ppt"); - String text = ppe1.getText(); - assertFalse("Comments not in by default", text.contains("This is a test comment")); + try (final SlideShowExtractor ppe = openExtractor("WithComments.ppt")) { + String text = ppe.getText(); + assertFalse("Comments not in by default", text.contains("This is a test comment")); - ppe1.setCommentsByDefault(true); + ppe.setCommentsByDefault(true); - text = ppe1.getText(); - assertContains(text, "This is a test comment"); - ppe1.close(); + text = ppe.getText(); + assertContains(text, "This is a test comment"); + } // And another file - PowerPointExtractor ppe2 = openExtractor("45543.ppt"); - text = ppe2.getText(); - assertFalse("Comments not in by default", text.contains("testdoc")); + try (SlideShowExtractor ppe = openExtractor("45543.ppt")) { + String text = ppe.getText(); + assertFalse("Comments not in by default", text.contains("testdoc")); - ppe2.setCommentsByDefault(true); + ppe.setCommentsByDefault(true); - text = ppe2.getText(); - assertContains(text, "testdoc"); - ppe2.close(); + text = ppe.getText(); + assertContains(text, "testdoc"); + } } /** @@ -268,48 +276,37 @@ public final class TestExtractor { */ @Test public void 
testHeaderFooter() throws IOException { - String text; - // With a header on the notes - InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt"); - HSLFSlideShow ppt1 = new HSLFSlideShow(is1); - is1.close(); - assertNotNull(ppt1.getNotesHeadersFooters()); - assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText()); - - PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl()); + try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt"); + HSLFSlideShow ppt = new HSLFSlideShow(is)) { - text = ppe1.getText(); - assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc")); - assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase")); + assertNotNull(ppt.getNotesHeadersFooters()); + assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText()); - ppe1.setNotesByDefault(true); - text = ppe1.getText(); - assertContains(text, "testdoc"); - assertContains(text, "test phrase"); - ppe1.close(); - ppt1.close(); + testHeaderFooterInner(ppt); + } // And with a footer, also on notes - InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt"); - HSLFSlideShow ppt2 = new HSLFSlideShow(is2); - is2.close(); - - assertNotNull(ppt2.getNotesHeadersFooters()); - assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText()); - ppt2.close(); - - PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt"); - - text = ppe2.getText(); - assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc")); - assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase")); - - ppe2.setNotesByDefault(true); - text = ppe2.getText(); - assertContains(text, "testdoc"); - assertContains(text, "test phrase"); - ppe2.close(); + try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt"); + final HSLFSlideShow ppt = new HSLFSlideShow(is)) { + 
assertNotNull(ppt.getNotesHeadersFooters()); + assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText()); + + testHeaderFooterInner(ppt); + } + } + + private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException { + try (final SlideShowExtractor ppe = new SlideShowExtractor(ppt)) { + String text = ppe.getText(); + assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc")); + assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase")); + + ppe.setNotesByDefault(true); + text = ppe.getText(); + assertContains(text, "testdoc"); + assertContains(text, "test phrase"); + } } @SuppressWarnings("unused") @@ -318,41 +315,40 @@ public final class TestExtractor { String masterTitleText = "This is the Master Title"; String masterRandomText = "This text comes from the Master Slide"; String masterFooterText = "Footer from the master slide"; - PowerPointExtractor ppe = openExtractor("WithMaster.ppt"); - ppe.setMasterByDefault(true); + try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) { + ppe.setMasterByDefault(true); - String text = ppe.getText(); - assertContains(text, masterRandomText); - assertContains(text, masterFooterText); - ppe.close(); + String text = ppe.getText(); + assertContains(text, masterRandomText); + assertContains(text, masterFooterText); + } } @Test public void testMasterText() throws IOException { - PowerPointExtractor ppe1 = openExtractor("master_text.ppt"); - - // Initially not there - String text = ppe1.getText(); - assertFalse(text.contains("Text that I added to the master slide")); - - // Enable, shows up - ppe1.setMasterByDefault(true); - text = ppe1.getText(); - assertContains(text, "Text that I added to the master slide"); - - // Make sure placeholder text does not come out - assertNotContained(text, "Click to edit Master"); - ppe1.close(); + try (final SlideShowExtractor ppe = openExtractor("master_text.ppt")) { + // Initially not 
there + String text = ppe.getText(); + assertFalse(text.contains("Text that I added to the master slide")); + + // Enable, shows up + ppe.setMasterByDefault(true); + text = ppe.getText(); + assertContains(text, "Text that I added to the master slide"); + + // Make sure placeholder text does not come out + assertNotContained(text, "Click to edit Master"); + } // Now with another file only containing master text // Will always show up - PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt"); - String masterText = "Footer from the master slide"; + try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) { + String masterText = "Footer from the master slide"; - text = ppe2.getText(); - assertContainsIgnoreCase(text, "master"); - assertContains(text, masterText); - ppe2.close(); + String text = ppe.getText(); + assertContainsIgnoreCase(text, "master"); + assertContains(text, masterText); + } } /** @@ -360,22 +356,21 @@ public final class TestExtractor { */ @Test public void testChineseText() throws IOException { - PowerPointExtractor ppe = openExtractor("54880_chinese.ppt"); - - String text = ppe.getText(); + try (final SlideShowExtractor ppe = openExtractor("54880_chinese.ppt")) { + String text = ppe.getText(); - // Check for the english text line - assertContains(text, "Single byte"); + // Check for the english text line + assertContains(text, "Single byte"); - // Check for the english text in the mixed line - assertContains(text, "Mix"); + // Check for the english text in the mixed line + assertContains(text, "Mix"); - // Check for the chinese text in the mixed line - assertContains(text, "\u8868"); + // Check for the chinese text in the mixed line + assertContains(text, "\u8868"); - // Check for the chinese only text line - assertContains(text, "\uff8a\uff9d\uff76\uff78"); - ppe.close(); + // Check for the chinese only text line + assertContains(text, "\uff8a\uff9d\uff76\uff78"); + } } /** @@ -387,67 +382,59 @@ public final class TestExtractor { 
public void testDifferentPOIFS() throws IOException { // Open the two filesystems File pptFile = slTests.getFile("basic_test_ppt_file.ppt"); - InputStream is1 = new FileInputStream(pptFile); - OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1); - is1.close(); - NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile); - - DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() }; - - // Open directly - for (DirectoryNode dir : files) { - PowerPointExtractor extractor = new PowerPointExtractor(dir); - assertEquals(expectText, extractor.getText()); - } + try (final InputStream is1 = new FileInputStream(pptFile); + final NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile)) { - // Open via a HSLFSlideShow - for (DirectoryNode dir : files) { - HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir); - PowerPointExtractor extractor = new PowerPointExtractor(slideshow); - assertEquals(expectText, extractor.getText()); - extractor.close(); - slideshow.close(); - } + final OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1); - npoifs.close(); + DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()}; + + // Open directly + for (DirectoryNode dir : files) { + try (SlideShow ppt = SlideShowFactory.create(dir); + SlideShowExtractor extractor = new SlideShowExtractor(ppt)) { + assertEquals(expectText, extractor.getText()); + } + } + } } @Test public void testTable() throws Exception { - PowerPointExtractor ppe1 = openExtractor("54111.ppt"); - String text1 = ppe1.getText(); - String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+ - "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+ - "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+ - "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+ - "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+ - "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n"; - assertContains(text1, target1); - ppe1.close(); - - PowerPointExtractor ppe2 = 
openExtractor("54722.ppt"); - String text2 = ppe2.getText(); - - String target2 = "this\tText\tis\twithin\ta\n" + - "table\t1\t2\t3\t4"; - assertContains(text2, target2); - ppe2.close(); + try (SlideShowExtractor ppe = openExtractor("54111.ppt")) { + String text = ppe.getText(); + String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" + + "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" + + "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" + + "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" + + "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" + + "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n"; + assertContains(text, target); + } + + try (SlideShowExtractor ppe = openExtractor("54722.ppt")) { + String text = ppe.getText(); + + String target = "this\tText\tis\twithin\ta\n" + + "table\t1\t2\t3\t4"; + assertContains(text, target); + } } // bug 60003 @Test public void testExtractMasterSlideFooterText() throws Exception { - PowerPointExtractor ppe = openExtractor("60003.ppt"); - ppe.setMasterByDefault(true); + try (SlideShowExtractor ppe = openExtractor("60003.ppt")) { + ppe.setMasterByDefault(true); - String text = ppe.getText(); - assertContains(text, "Prague"); - ppe.close(); + String text = ppe.getText(); + assertContains(text, "Prague"); + } } @Test public void testExtractGroupedShapeText() throws Exception { - try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) { + try (final SlideShowExtractor ppe = openExtractor("bug62092.ppt")) { final String text = ppe.getText(); //this tests that we're ignoring text shapes at depth=0 diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java index df44216093..aee64bd6f9 100644 --- a/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java @@ -73,6 +73,7 
@@ import org.apache.poi.poifs.macros.VBAMacroReader; import org.apache.poi.sl.draw.DrawFactory; import org.apache.poi.sl.draw.DrawPaint; import org.apache.poi.sl.draw.DrawTextParagraph; +import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.usermodel.ColorStyle; import org.apache.poi.sl.usermodel.PaintStyle; import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint; @@ -800,18 +801,18 @@ public final class TestBugs { String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", }; for (String f : files) { File sample = HSLFTestDataSamples.getSampleFile(f); - PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath()); - assertNotNull(ex.getText()); - ex.close(); + try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) { + assertNotNull(ex.getText()); + } } } @Test public void bug58733() throws IOException { File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt"); - PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath()); - assertNotNull(ex.getText()); - ex.close(); + try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) { + assertNotNull(ex.getText()); + } } @Test diff --git a/test-data/slideshow/SampleShow.pptx b/test-data/slideshow/SampleShow.pptx index e5237f4c15..df3a26debd 100644 Binary files a/test-data/slideshow/SampleShow.pptx and b/test-data/slideshow/SampleShow.pptx differ