);
private static final Set<String> IGNORED = unmodifiableHashSet(
- // need JDK8+ - https://bugs.openjdk.java.net/browse/JDK-8038081
- "slideshow/42474-2.ppt",
// OPC handler works / XSSF handler fails
"spreadsheet/57181.xlsm",
"spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop
import java.io.InputStream;
import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
// additionally try the other getText() methods
- try (XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file)) {
+ try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
assertNotNull(extractor);
+ extractor.setSlidesByDefault(true);
+ extractor.setNotesByDefault(true);
+ extractor.setMasterByDefault(true);
- assertNotNull(extractor.getText(true, true, true));
- assertEquals("With all options disabled we should not get text",
- "", extractor.getText(false, false, false));
+ assertNotNull(extractor.getText());
+
+ extractor.setSlidesByDefault(false);
+ extractor.setNotesByDefault(false);
+ extractor.setMasterByDefault(false);
+
+ assertEquals("With all options disabled we should not get text", "", extractor.getText());
}
}
*
* @return the underlying POIDocument
*/
+ @Override
public POIDocument getDocument() {
return document;
}
fsToClose.close();
}
}
+
+ /**
+ * Returns the document this extractor works on. The method is declared
+ * with {@code Object} as return type; overriding methods may declare a
+ * more specific document type (covariant return).
+ *
+ * @return the processed document
+ */
+ public abstract Object getDocument();
}
return threadPreferEventExtractors.get();
}
- public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
- // Only ever an OLE2 one from the root of the FS
- return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+ public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
+ return (T)createExtractor(fs.getRoot()); // unchecked cast: a T that does not match the actual extractor fails with a ClassCastException at the caller
}
- public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException {
- // Only ever an OLE2 one from the root of the FS
- return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+ public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException {
+ return (T)createExtractor(fs.getRoot()); // unchecked cast: a T that does not match the actual extractor fails with a ClassCastException at the caller
}
- public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException {
- // Only ever an OLE2 one from the root of the FS
- return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+ public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException {
+ return (T)createExtractor(fs.getRoot()); // unchecked cast: a T that does not match the actual extractor fails with a ClassCastException at the caller
}
- public static POITextExtractor createExtractor(InputStream input) throws IOException {
+ public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
Class<?> cls = getOOXMLClass();
if (cls != null) {
// Use Reflection to get us the full OOXML-enabled version
try {
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
- return (POITextExtractor)m.invoke(null, input);
+ return (T)m.invoke(null, input);
} catch (IllegalArgumentException iae) {
throw iae;
} catch (Exception e) {
* @throws IOException If an error occurs while decrypting or if the password does not match
*/
public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password)
+ throws IOException {
+ // wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
+ // as well when the resulting OPCPackage is closed
+ return new FilterInputStream(getDecryptedStream(fs.getRoot(), password)) {
+ @Override
+ public void close() throws IOException {
+ // always release the decrypted stream, even if closing the
+ // filesystem throws - otherwise the wrapped stream would leak
+ try {
+ fs.close();
+ } finally {
+ super.close();
+ }
+ }
+ };
+ }
+
+ /**
+ * Wrap the OLE2 data of the DirectoryNode into a decrypted stream by using
+ * the given password.
+ *
+ * @param root The OLE2 directory node for the document
+ * @param password The password, null if the default password should be used
+ * @return A stream for reading the decrypted data
+ * @throws IOException If an error occurs while decrypting or if the password does not match
+ */
+ public static InputStream getDecryptedStream(final DirectoryNode root, String password)
throws IOException {
- EncryptionInfo info = new EncryptionInfo(fs);
+ EncryptionInfo info = new EncryptionInfo(root);
Decryptor d = Decryptor.getInstance(info);
try {
}
if (passwordCorrect) {
- // wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
- // as well when the resulting OPCPackage is closed
- return new FilterInputStream(d.getDataStream(fs.getRoot())) {
- @Override
- public void close() throws IOException {
- fs.close();
-
- super.close();
- }
- };
+ return d.getDataStream(root);
+ } else if (password != null) {
+ throw new EncryptedDocumentException("Password incorrect");
} else {
- if (password != null)
- throw new EncryptedDocumentException("Password incorrect");
- else
- throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
+ throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
}
} catch (GeneralSecurityException e) {
throw new IOException(e);
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
package org.apache.poi.sl.extractor;
import java.util.ArrayList;
this.slideshow = slideshow;
}
+ /**
+ * Returns the opened document, i.e. the object which the wrapped
+ * slideshow reports via {@code getPersistDocument()}.
+ *
+ * @return the opened document
+ */
+ @Override
+ public final Object getDocument() {
+ return slideshow.getPersistDocument();
+ }
+
/**
* Should a call to getText() return slide text? Default is yes
*/
return;
}
for (final P para : paraList) {
- final int oldLen = sb.length();
for (final TextRun tr : para) {
final String str = tr.getRawText().replace("\r", "");
final String newStr;
* @since POI 4.0.0
*/
POITextExtractor getMetadataTextExtractor();
+
+ /**
+ * @return the instance which handles the persisting of the slideshow,
+ * which is either a subclass of {@link org.apache.poi.POIDocument}
+ * or {@link org.apache.poi.POIXMLDocument}
+ *
+ * @since POI 4.0.0
+ */
+ Object getPersistDocument();
}
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException {
- DirectoryNode root = fs.getRoot();
+ return create(fs.getRoot(), password);
+ }
+
+ /**
+ * Creates a SlideShow from the given {@link DirectoryNode}, without a
+ * password (delegates to {@link #create(DirectoryNode, String)} with a
+ * {@code null} password).
+ *
+ * @param root The {@link DirectoryNode} to start reading the document from
+ *
+ * @return The created SlideShow
+ *
+ * @throws IOException if an error occurs while reading the data
+ */
+ public static SlideShow<?,?> create(final DirectoryNode root) throws IOException {
+ return create(root, null);
+ }
+
+ /**
+ * Creates a SlideShow from the given {@link DirectoryNode}, which may
+ * be password protected
+ *
+ * @param root The {@link DirectoryNode} to start reading the document from
+ * @param password The password that should be used or null if no password is necessary.
+ *
+ * @return The created SlideShow
+ *
+ * @throws IOException if an error occurs while reading the data
+ */
+ public static SlideShow<?,?> create(final DirectoryNode root, String password) throws IOException {
// Encrypted OOXML files go inside OLE2 containers, is this one?
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
InputStream stream = null;
try {
- stream = DocumentFactoryHelper.getDecryptedStream(fs, password);
+ stream = DocumentFactoryHelper.getDecryptedStream(root, password);
return createXSLFSlideShow(stream);
} finally {
passwordSet = true;
}
try {
- return createHSLFSlideShow(fs);
+ return createHSLFSlideShow(root);
} finally {
if (passwordSet) {
Biff8EncryptionKey.setCurrentUserPassword(null);
*
* @return the opened document
*/
+ @Override
public final POIXMLDocument getDocument() {
return _document;
}
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.NotImplemented;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.Removal;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
return OLE2ExtractorFactory.getPreferEventExtractor();
}
- public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
+ public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
NPOIFSFileSystem fs = null;
try {
fs = new NPOIFSFileSystem(f);
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
- return createEncryptedOOXMLExtractor(fs);
+ return (T)createEncryptedOOXMLExtractor(fs);
}
- POIOLE2TextExtractor extractor = createExtractor(fs);
+ POITextExtractor extractor = createExtractor(fs);
extractor.setFilesystem(fs);
- return extractor;
+ return (T)extractor;
} catch (OfficeXmlFileException e) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
- return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
+ return (T)createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
} catch (NotOLE2FileException ne) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
* @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
- public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
+ public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
try {
// Check for the normal Office core document
PackageRelationshipCollection core;
// Is it XSLF?
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
if ( rel.getContentType().equals( contentType ) ) {
- return new XSLFPowerPointExtractor(pkg);
+ return new SlideShowExtractor(new XMLSlideShow(pkg));
}
}
// special handling for SlideShow-Theme-files,
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
- return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ return new SlideShowExtractor(new XMLSlideShow(pkg));
}
// How about xlsb?
}
}
- public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return OLE2ExtractorFactory.createExtractor(fs);
+ public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+ return createExtractor(fs.getRoot()); // goes through the root-directory overload; T is inferred from the call site and not verified here
}
- public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return OLE2ExtractorFactory.createExtractor(fs);
+ public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+ return createExtractor(fs.getRoot()); // goes through the root-directory overload; T is inferred from the call site and not verified here
}
- public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return OLE2ExtractorFactory.createExtractor(fs);
+ public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+ return createExtractor(fs.getRoot()); // goes through the root-directory overload; T is inferred from the call site and not verified here
}
- public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
+ public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
{
// First, check for OOXML
for (String entryName : poifsDir.getEntryNames()) {
if (entryName.equals("Package")) {
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
- return createExtractor(pkg);
+ return (T)createExtractor(pkg); // unchecked cast: the OPCPackage overload picks the concrete extractor, a mismatching T fails at the caller
}
}
// If not, ask the OLE2 code to check, with Scratchpad if possible
- return OLE2ExtractorFactory.createExtractor(poifsDir);
+ return (T)OLE2ExtractorFactory.createExtractor(poifsDir); // unchecked cast, same caveat as for the OOXML branch
}
/**
throw new IllegalStateException("Not yet supported");
}
- private static POIXMLTextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
+ private static POITextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
throws IOException {
String pass = Biff8EncryptionKey.getCurrentUserPassword();
if (pass == null) {
* @deprecated use {@link SlideShowExtractor}
*/
@Deprecated
-@Removal(version="4.2.0")
+@Removal(version="5.0.0")
public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
return new POIXMLPropertiesTextExtractor(this);
}
+
+ @Override
+ public Object getPersistDocument() {
+ return this; // this object acts as its own persist document (presumably it is the POIXMLDocument subclass - confirm)
+ }
}
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
package org.apache.poi.xslf.usermodel;
import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS;
*/
public XSLFCommentAuthors getCommentAuthorsPart() {
if(_commentAuthors == null) {
+ // first scan the slide relations
for (POIXMLDocumentPart p : getRelations()) {
if (p instanceof XSLFCommentAuthors) {
_commentAuthors = (XSLFCommentAuthors)p;
return _commentAuthors;
}
}
+ // then scan the presentation relations
+ for (POIXMLDocumentPart p : getSlideShow().getRelations()) {
+ if (p instanceof XSLFCommentAuthors) {
+ _commentAuthors = (XSLFCommentAuthors)p;
+ return _commentAuthors;
+ }
+ }
}
- return null;
+ // hand back the cached part on repeated calls; this is still null when
+ // neither the slide nor the presentation relations contain one
+ return _commentAuthors;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.util.Locale;
import org.apache.poi.POIDataSamples;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
-import org.apache.poi.POIXMLException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.UnsupportedFileFormatException;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
-import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
-import org.junit.BeforeClass;
+import org.apache.xmlbeans.XmlException;
import org.junit.Test;
/**
private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class);
- private static File txt;
-
- private static File xls;
- private static File xlsx;
- private static File xlsxStrict;
- private static File xltx;
- private static File xlsEmb;
- private static File xlsb;
-
- private static File doc;
- private static File doc6;
- private static File doc95;
- private static File docx;
- private static File dotx;
- private static File docEmb;
- private static File docEmbOOXML;
-
- private static File ppt;
- private static File pptx;
-
- private static File msg;
- private static File msgEmb;
- private static File msgEmbMsg;
-
- private static File vsd;
- private static File vsdx;
-
- private static File pub;
+ private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
+ private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
+ private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
+ private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
+ private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
+ private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
+ private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
+
+ private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
+ private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
+ private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
+ private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
+ private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
+ private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
+ private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
+ private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
+
+ private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
+ private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
+ private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
+ private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
+
+ private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
+ private static final File msg = getFileAndCheck(olTests, "quick.msg");
+ private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
+ private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
+
+ private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
+ private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
+ private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
+
+ private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
+ private static File pub = getFileAndCheck(pubTests, "Simple.pub");
private static File getFileAndCheck(POIDataSamples samples, String name) {
File file = samples.getFile(name);
return file;
}
- @BeforeClass
- public static void setUp() throws Exception {
-
- POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
- xls = getFileAndCheck(ssTests, "SampleSS.xls");
- xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
- xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
- xltx = getFileAndCheck(ssTests, "test.xltx");
- xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
- xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
-
- POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
- doc = getFileAndCheck(wpTests, "SampleDoc.doc");
- doc6 = getFileAndCheck(wpTests, "Word6.doc");
- doc95 = getFileAndCheck(wpTests, "Word95.doc");
- docx = getFileAndCheck(wpTests, "SampleDoc.docx");
- dotx = getFileAndCheck(wpTests, "test.dotx");
- docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
- docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
-
- POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
- ppt = getFileAndCheck(slTests, "SampleShow.ppt");
- pptx = getFileAndCheck(slTests, "SampleShow.pptx");
- txt = getFileAndCheck(slTests, "SampleShow.txt");
-
- POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
- vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
- vsdx = getFileAndCheck(dgTests, "test.vsdx");
-
- POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
- pub = getFileAndCheck(pubTests, "Simple.pub");
-
- POIDataSamples olTests = POIDataSamples.getHSMFInstance();
- msg = getFileAndCheck(olTests, "quick.msg");
- msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
- msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
- }
-
- @Test
- public void testFile() throws Exception {
- // Excel
- POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
- assertNotNull("Had empty extractor for " + xls, xlsExtractor);
- assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
- xlsExtractor
- instanceof ExcelExtractor
- );
- assertTrue(
- xlsExtractor.getText().length() > 200
- );
- xlsExtractor.close();
-
- POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xlsx);
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xltx);
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xlsb);
- assertContains(extractor.getText(), "test");
- extractor.close();
-
-
- extractor = ExtractorFactory.createExtractor(xltx);
- assertContains(extractor.getText(), "test");
- extractor.close();
+ private static final Object[] TEST_SET = {
+ "Excel", xls, ExcelExtractor.class, 200,
+ "Excel - xlsx", xlsx, XSSFExcelExtractor.class, 200,
+ "Excel - xltx", xltx, XSSFExcelExtractor.class, -1,
+ "Excel - xlsb", xlsb, XSSFBEventBasedExcelExtractor.class, -1,
+ "Word", doc, WordExtractor.class, 120,
+ "Word - docx", docx, XWPFWordExtractor.class, 120,
+ "Word - dotx", dotx, XWPFWordExtractor.class, -1,
+ "Word 6", doc6, Word6Extractor.class, 20,
+ "Word 95", doc95, Word6Extractor.class, 120,
+ "PowerPoint", ppt, SlideShowExtractor.class, 120,
+ "PowerPoint - pptx", pptx, SlideShowExtractor.class, 120,
+ "Visio", vsd, VisioTextExtractor.class, 50,
+ "Visio - vsdx", vsdx, XDGFVisioExtractor.class, 20,
+ "Publisher", pub, PublisherTextExtractor.class, 50,
+ "Outlook msg", msg, OutlookTextExtactor.class, 50,
// TODO Support OOXML-Strict, see bug #57699
- try {
- /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
- fail("OOXML-Strict isn't yet supported");
- } catch (POIXMLException e) {
- // Expected, for now
- }
-// extractor = ExtractorFactory.createExtractor(xlsxStrict);
-// assertTrue(
-// extractor
-// instanceof XSSFExcelExtractor
-// );
-// extractor.close();
-//
-// extractor = ExtractorFactory.createExtractor(xlsxStrict);
-// assertTrue(
-// extractor.getText().contains("test")
-// );
-// extractor.close();
-
-
- // Word
- extractor = ExtractorFactory.createExtractor(doc);
- assertTrue(
- extractor
- instanceof WordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(doc6);
- assertTrue(
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(doc95);
- assertTrue(
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(docx);
- assertTrue(
- extractor instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(docx);
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(dotx);
- assertTrue(
- extractor instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(dotx);
- assertContains(extractor.getText(), "Test");
- extractor.close();
-
- // PowerPoint (PPT)
- extractor = ExtractorFactory.createExtractor(ppt);
- assertTrue(
- extractor
- instanceof PowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // PowerPoint (PPTX)
- extractor = ExtractorFactory.createExtractor(pptx);
- assertTrue(
- extractor
- instanceof XSLFPowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
+ // xlsxStrict
+ };
- // Visio - binary
- extractor = ExtractorFactory.createExtractor(vsd);
- assertTrue(
- extractor
- instanceof VisioTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
+ /**
+ * Single-argument function whose {@code apply} may throw the checked
+ * exceptions {@link IOException}, {@link OpenXML4JException} and
+ * {@link XmlException}.
+ *
+ * @param <T> the argument type
+ * @param <R> the result type
+ */
+ @FunctionalInterface
+ interface FunctionEx<T, R> {
+ R apply(T input) throws IOException, OpenXML4JException, XmlException;
+ }
- // Visio - vsdx
- extractor = ExtractorFactory.createExtractor(vsdx);
- assertTrue(
- extractor
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
- // Publisher
- extractor = ExtractorFactory.createExtractor(pub);
- assertTrue(
- extractor
- instanceof PublisherTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Outlook msg
- extractor = ExtractorFactory.createExtractor(msg);
- assertTrue(
- extractor
- instanceof OutlookTextExtactor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
+ /**
+ * Walks {@code TEST_SET} in rows of four entries (label, file, expected
+ * extractor class, length value), opens each file via the {@code File}
+ * based factory method and hands the extractor to {@code testExtractor}.
+ */
+ @Test
+ public void testFile() throws Exception {
+ for (int row = 0; row < TEST_SET.length; row += 4) {
+ final String label = (String) TEST_SET[row];
+ final File input = (File) TEST_SET[row + 1];
+ final Class expectedType = (Class) TEST_SET[row + 2];
+ final Integer lengthArg = (Integer) TEST_SET[row + 3];
+ try (POITextExtractor ext = ExtractorFactory.createExtractor(input)) {
+ testExtractor(ext, label, expectedType, lengthArg);
+ }
+ }
+ }
+ @Test(expected = IllegalArgumentException.class)
+ public void testFileInvalid() throws Exception {
// Text
- try {
- ExtractorFactory.createExtractor(txt);
- fail("expected IllegalArgumentException");
- } catch(IllegalArgumentException e) {
- // Good
- }
+ try (POITextExtractor te = ExtractorFactory.createExtractor(txt)) {} // createExtractor is expected to throw (see @Test); try-with-resources only closes the extractor if it unexpectedly succeeds
}
@Test
public void testInputStream() throws Exception {
- // Excel
- POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
- assertTrue(
- extractor
- instanceof ExcelExtractor
- );
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- assertTrue(
- extractor.getText().length() > 200
- );
- // TODO Support OOXML-Strict, see bug #57699
-// assertTrue(
-// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
-// instanceof XSSFExcelExtractor
-// );
-// assertTrue(
-// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
-// );
- extractor.close();
-
- // Word
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof WordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
- assertTrue(
- extractor
- instanceof XWPFWordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // PowerPoint
- extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
- assertTrue(
- extractor
- instanceof PowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
- assertTrue(
- extractor
- instanceof XSLFPowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // Visio
- extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
- assertTrue(
- extractor
- instanceof VisioTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Visio - vsdx
- extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
- assertTrue(
- extractor
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- // Publisher
- extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
- assertTrue(
- extractor
- instanceof PublisherTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Outlook msg
- extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
- assertTrue(
- extractor
- instanceof OutlookTextExtactor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
+ testStream((f) -> ExtractorFactory.createExtractor(f), true);
+ }
- // Text
- try (FileInputStream stream = new FileInputStream(txt)) {
- ExtractorFactory.createExtractor(stream);
- fail("expected IllegalArgumentException");
- } catch(IllegalArgumentException e) {
- // Good
- }
+ @Test(expected = IllegalArgumentException.class)
+ public void testInputStreamInvalid() throws Exception {
+ testInvalid((f) -> ExtractorFactory.createExtractor(f)); // the expected IllegalArgumentException has to surface from testInvalid (helper defined outside this hunk)
}
@Test
public void testPOIFS() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
- );
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
- );
-
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
- );
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
- );
-
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
- );
-
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
- fail("expected IllegalArgumentException");
- } catch(IOException e) {
- // Good
- }
+ testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
}
+ @Test(expected = IOException.class)
+ public void testPOIFSInvalid() throws Exception {
+ testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
+ }
@Test
public void testOPOIFS() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
- );
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
- );
+ testStream((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)), false);
+ }
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
- );
+ @Test(expected = IOException.class)
+ public void testOPOIFSInvalid() throws Exception {
+ testInvalid((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)));
+ }
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
- );
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
- );
+ /**
+  * Runs every applicable TEST_SET entry through the given extractor factory.
+  * TEST_SET is read in groups of four: test case name, sample file,
+  * expected extractor class and minimum text length.
+  *
+  * @param poifsIS creates the extractor from an opened file stream
+  * @param loadOOXML if false, files whose name ends with "x" or "xlsb" are skipped
+  */
+ private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML)
+         throws IOException, OpenXML4JException, XmlException {
+     for (int i = 0; i < TEST_SET.length; i += 4) {
+         final File testFile = (File) TEST_SET[i + 1];
+         final String fileName = testFile.getName();
+         if (!loadOOXML && (fileName.endsWith("x") || fileName.endsWith("xlsb"))) {
+             continue;
+         }
+         try (FileInputStream fis = new FileInputStream(testFile);
+              POITextExtractor ext = poifsIS.apply(fis)) {
+             testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
+         } catch (IllegalArgumentException e) {
+             // keep the original exception as cause, otherwise the reason for the failure is lost
+             throw new AssertionError("failed to process " + testFile, e);
+         }
+     }
+ }
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
- );
+ /**
+  * Checks the extractor type and the extracted text of a single test case.
+  *
+  * @param ext the extractor under test
+  * @param testcase name of the test case, used in the failure messages
+  * @param extrClass the class the extractor is expected to be an instance of
+  * @param minLength the text must be longer than this; -1 means only check that
+  *                  the text contains "test" (case-insensitive)
+  */
+ private void testExtractor(final POITextExtractor ext, final String testcase, final Class<?> extrClass, final Integer minLength) {
+     assertTrue("invalid extractor for " + testcase, extrClass.isInstance(ext));
+     final String actual = ext.getText();
+     if (minLength == -1) {
+         // name the test case on failure, like the other assertions in this method
+         assertTrue("extracted content of " + testcase + " does not contain 'test'",
+             actual.toLowerCase(Locale.ROOT).contains("test"));
+     } else {
+         assertTrue("extracted content too short for " + testcase, actual.length() > minLength);
+     }
+ }
+ private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException {
// Text
- try {
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
- fail("expected IllegalArgumentException");
- } catch(IOException e) {
- // Good
+ try (FileInputStream fis = new FileInputStream(txt);
+ POITextExtractor te = poifs.apply(fis)) {
}
}
@Test
public void testPackage() throws Exception {
- // Excel
- POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(extractor instanceof XSSFExcelExtractor);
- extractor.close();
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
- assertTrue(extractor.getText().length() > 200);
- extractor.close();
-
- // Word
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
- assertTrue(extractor instanceof XWPFWordExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
- assertTrue(extractor.getText().length() > 120);
- extractor.close();
-
- // PowerPoint
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
- assertTrue(extractor instanceof XSLFPowerPointExtractor);
- extractor.close();
+ for (int i = 0; i < TEST_SET.length; i += 4) {
+ final File testFile = (File) TEST_SET[i + 1];
+ if (!testFile.getName().endsWith("x")) {
+ continue;
+ }
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
- assertTrue(extractor.getText().length() > 120);
- extractor.close();
-
- // Visio
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
- assertTrue(extractor instanceof XDGFVisioExtractor);
- assertTrue(extractor.getText().length() > 20);
- extractor.close();
+ try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
+ final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
+ testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
+ pkg.revert();
+ }
+ }
+ }
+ @Test(expected = UnsupportedFileFormatException.class)
+ public void testPackageInvalid() throws Exception {
// Text
- try {
- ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
- fail("TestExtractorFactory.testPackage() failed on " + txt);
- } catch(UnsupportedFileFormatException e) {
- // Good
- } catch (Exception e) {
- LOG.log(POILogger.WARN, "TestExtractorFactory.testPackage() failed on " + txt);
- throw e;
- }
+ try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
+ final POITextExtractor te = ExtractorFactory.createExtractor(pkg)) {}
}
@Test
* does poifs embedded, but will do ooxml ones
* at some point.
*/
- @SuppressWarnings("deprecation")
@Test
public void testEmbedded() throws Exception {
- POIOLE2TextExtractor ext;
- POITextExtractor[] embeds;
-
- // No embeddings
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xls);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
- assertEquals(0, embeds.length);
- ext.close();
-
- // No embeddings
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xls);
- embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
- assertEquals(0, embeds.length);
- ext.close();
-
- // Excel
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xlsEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
- assertNotNull(embeds);
- ext.close();
-
- // Excel
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xlsEmb);
- embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
-
- assertEquals(6, embeds.length);
- int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
-
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(2, numPpt);
- assertEquals(2, numXls);
- assertEquals(2, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Word
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(4, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(1, numPpt);
- assertEquals(2, numXls);
- assertEquals(1, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Word which contains an OOXML file
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmbOOXML);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
- assertEquals(3, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- else if (embed instanceof XWPFWordExtractor) numWordX++;
- }
- assertEquals(1, numPpt);
- assertEquals(1, numXls);
- assertEquals(0, numWord);
- assertEquals(1, numWordX);
- assertEquals(0, numMsg);
- ext.close();
-
- // Outlook
- ext = (OutlookTextExtactor)
- ExtractorFactory.createExtractor(msgEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(1, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(0, numPpt);
- assertEquals(0, numXls);
- assertEquals(1, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Outlook with another outlook file in it
- ext = (OutlookTextExtactor)
- ExtractorFactory.createExtractor(msgEmbMsg);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(1, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
+ final Object[] testObj = {
+ "No embeddings", xls, "0-0-0-0-0-0",
+ "Excel", xlsEmb, "6-2-2-2-0-0",
+ "Word", docEmb, "4-1-2-1-0-0",
+ "Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1",
+ "Outlook", msgEmb, "1-1-0-0-0-0",
+ "Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0",
+ };
+
+ for (int i=0; i<testObj.length; i+=3) {
+ try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
+ final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
+
+ int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
+ for (POITextExtractor embed : embeds) {
+ assertTrue(embed.getText().length() > 20);
+ if (embed instanceof SlideShowExtractor) {
+ numPpt++;
+ } else if (embed instanceof ExcelExtractor) {
+ numXls++;
+ } else if (embed instanceof WordExtractor) {
+ numWord++;
+ } else if (embed instanceof OutlookTextExtactor) {
+ numMsg++;
+ } else if (embed instanceof XWPFWordExtractor) {
+ numWordX++;
+ }
+ }
+
+ final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
+ final String expected = (String)testObj[i+2];
+ assertEquals("invalid number of embeddings - "+testObj[i], expected, actual);
+ }
}
- assertEquals(0, numPpt);
- assertEquals(0, numXls);
- assertEquals(0, numWord);
- assertEquals(1, numMsg);
- ext.close();
// TODO - PowerPoint
// TODO - Publisher
// TODO - Visio
}
- private static final String[] EXPECTED_FAILURES = new String[] {
+ private static final String[] EXPECTED_FAILURES = {
// password protected files
"spreadsheet/password.xls",
"spreadsheet/protected_passtika.xlsx",
* #59074 - Excel 95 files should give a helpful message, not just
* "No supported documents found in the OLE2 stream"
*/
- @Test
+ @Test(expected = OldExcelFormatException.class)
public void bug59074() throws Exception {
- try {
- ExtractorFactory.createExtractor(
- POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
- fail("Old excel formats not supported via ExtractorFactory");
- } catch (OldExcelFormatException e) {
- // expected here
- }
+ ExtractorFactory.createExtractor(
+ POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
}
@SuppressWarnings("deprecation")
- @Test
- public void testGetEmbeddedFromXMLExtractor() {
- try {
- // currently not implemented
- ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
- fail("Unsupported currently");
- } catch (IllegalStateException e) {
- // expected here
- }
+ @Test(expected = IllegalStateException.class)
+ public void testGetEmbedFromXMLExtractor() {
+ // currently not implemented
+ ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
+ }
- try {
- // currently not implemented
- ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
- fail("Unsupported currently");
- } catch (IllegalStateException e) {
- // expected here
- }
+ @SuppressWarnings("deprecation")
+ @Test(expected = IllegalStateException.class)
+ public void testGetEmbeddedFromXMLExtractor() {
+ // currently not implemented
+ ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
}
-
+
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
// When this happens, change this from @Test(expected=...) to @Test
// bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
Biff8EncryptionKey.setCurrentUserPassword(password);
File f = sampleDir.getFile(file);
- POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
+ POITextExtractor te1 = ExtractorFactory.createExtractor(f);
Biff8EncryptionKey.setCurrentUserPassword(newPass);
ByteArrayOutputStream bos = new ByteArrayOutputStream();
- POIDocument doc = te1.getDocument();
+ POIDocument doc = (POIDocument)te1.getDocument();
doc.write(bos);
doc.close();
te1.close();
ByteArrayOutputStream bos = new ByteArrayOutputStream();
Biff8EncryptionKey.setCurrentUserPassword(password);
File f = sampleDir.getFile(file);
- POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
+ POITextExtractor te1 = ExtractorFactory.createExtractor(f);
// first remove encryption
Biff8EncryptionKey.setCurrentUserPassword(null);
- POIDocument doc = te1.getDocument();
+ POIDocument doc = (POIDocument)te1.getDocument();
doc.write(bos);
doc.close();
te1.close();
// then use default setting, which is cryptoapi
String newPass = "newPass";
- POIOLE2TextExtractor te2 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
+ POITextExtractor te2 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
Biff8EncryptionKey.setCurrentUserPassword(newPass);
- doc = te2.getDocument();
+ doc = (POIDocument)te2.getDocument();
bos.reset();
doc.write(bos);
doc.close();
te2.close();
// and finally update cryptoapi setting
- POIOLE2TextExtractor te3 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
- doc = te3.getDocument();
+ POITextExtractor te3 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
+ doc = (POIDocument)te3.getDocument();
// need to cache data (i.e. read all data) before changing the key size
if (doc instanceof HSLFSlideShowImpl) {
HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc;
doc.close();
te3.close();
// check the setting
- POIOLE2TextExtractor te4 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
- doc = te4.getDocument();
+ POITextExtractor te4 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
+ doc = (POIDocument)te4.getDocument();
ei = doc.getEncryptionInfo();
assertNotNull(ei);
assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader);
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.sl.draw.DrawPaint;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.PaintStyle;
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint;
* rID2 -> slide3.xml
*/
@Test
- public void bug54916() throws Exception {
- XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx");
- XSLFSlide slide;
+ public void bug54916() throws IOException {
+ try (XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx")) {
+ XSLFSlide slide;
- // Should find 4 slides
- assertEquals(4, ss.getSlides().size());
+ // Should find 4 slides
+ assertEquals(4, ss.getSlides().size());
- // Check the text, to see we got them in order
- slide = ss.getSlides().get(0);
- assertContains(getSlideText(slide), "POI cannot read this");
+ // Check the text, to see we got them in order
+ slide = ss.getSlides().get(0);
+ assertContains(getSlideText(ss, slide), "POI cannot read this");
- slide = ss.getSlides().get(1);
- assertContains(getSlideText(slide), "POI can read this");
- assertContains(getSlideText(slide), "Has a relationship to another slide");
+ slide = ss.getSlides().get(1);
+ assertContains(getSlideText(ss, slide), "POI can read this");
+ assertContains(getSlideText(ss, slide), "Has a relationship to another slide");
- slide = ss.getSlides().get(2);
- assertContains(getSlideText(slide), "POI can read this");
+ slide = ss.getSlides().get(2);
+ assertContains(getSlideText(ss, slide), "POI can read this");
- slide = ss.getSlides().get(3);
- assertContains(getSlideText(slide), "POI can read this");
-
- ss.close();
+ slide = ss.getSlides().get(3);
+ assertContains(getSlideText(ss, slide), "POI can read this");
+ }
}
/**
ss.close();
}
- protected String getSlideText(XSLFSlide slide) {
- return XSLFPowerPointExtractor.getText(slide, true, false, false);
+ protected String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
+ try (SlideShowExtractor extr = new SlideShowExtractor(ppt)) {
+ // do not auto-close the slideshow
+ extr.setFilesystem(null);
+ extr.setSlidesByDefault(true);
+ extr.setNotesByDefault(false);
+ extr.setMasterByDefault(false);
+ return extr.getText(slide);
+ }
}
@Test
for (int i = 0; i < slideTexts.length; i++) {
XSLFSlide slide = ss.getSlides().get(i);
- assertContains(getSlideText(slide), slideTexts[i]);
+ assertContains(getSlideText(ss, slide), slideTexts[i]);
}
}
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.POIDataSamples;
-import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
-import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.xmlbeans.XmlException;
+import org.junit.Ignore;
import org.junit.Test;
/**
/**
* Get text out of the simple file
- * @throws XmlException
- * @throws OpenXML4JException
*/
@Test
- public void testGetSimpleText()
- throws IOException, XmlException, OpenXML4JException {
- XMLSlideShow xmlA = openPPTX("sample.pptx");
- @SuppressWarnings("resource")
- OPCPackage pkg = xmlA.getPackage();
-
- new XSLFPowerPointExtractor(xmlA).close();
- new XSLFPowerPointExtractor(pkg).close();
-
- XSLFPowerPointExtractor extractor =
- new XSLFPowerPointExtractor(xmlA);
- extractor.getText();
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check Basics
- assertStartsWith(text, "Lorem ipsum dolor sit amet\n");
- assertContains(text, "amet\n\n");
-
- // Our placeholder master text
- // This shouldn't show up in the output
- // String masterText =
- // "Click to edit Master title style\n" +
- // "Click to edit Master subtitle style\n" +
- // "\n\n\n\n\n\n" +
- // "Click to edit Master title style\n" +
- // "Click to edit Master text styles\n" +
- // "Second level\n" +
- // "Third level\n" +
- // "Fourth level\n" +
- // "Fifth level\n";
-
- // Just slides, no notes
- text = extractor.getText(true, false, false);
- String slideText =
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n";
- assertEquals(slideText, text);
-
- // Just notes, no slides
- text = extractor.getText(false, true);
- assertEquals("\n\n1\n\n\n2\n", text);
-
- // Both
- text = extractor.getText(true, true, false);
- String bothText =
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n\n\n1\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n\n\n2\n";
- assertEquals(bothText, text);
-
- // With Slides and Master Text
- text = extractor.getText(true, false, true);
- String smText =
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n";
- assertEquals(smText, text);
-
- // With Slides, Notes and Master Text
- text = extractor.getText(true, true, true);
- String snmText =
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n\n\n1\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n\n\n2\n";
- assertEquals(snmText, text);
-
- // Via set defaults
- extractor.setSlidesByDefault(false);
- extractor.setNotesByDefault(true);
- text = extractor.getText();
- assertEquals("\n\n1\n\n\n2\n", text);
-
- extractor.close();
- xmlA.close();
+ public void testGetSimpleText() throws IOException {
+ try (XMLSlideShow xmlA = openPPTX("sample.pptx");
+ SlideShowExtractor extractor = new SlideShowExtractor(xmlA)) {
+
+ extractor.getText();
+
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
+
+ // Check Basics
+ assertStartsWith(text, "Lorem ipsum dolor sit amet\n");
+ assertContains(text, "amet\n\n");
+
+ // Our placeholder master text
+ // This shouldn't show up in the output
+ // String masterText =
+ // "Click to edit Master title style\n" +
+ // "Click to edit Master subtitle style\n" +
+ // "\n\n\n\n\n\n" +
+ // "Click to edit Master title style\n" +
+ // "Click to edit Master text styles\n" +
+ // "Second level\n" +
+ // "Third level\n" +
+ // "Fourth level\n" +
+ // "Fifth level\n";
+
+ // Just slides, no notes
+ extractor.setSlidesByDefault(true);
+ extractor.setNotesByDefault(false);
+ extractor.setMasterByDefault(false);
+ text = extractor.getText();
+ String slideText =
+ "Lorem ipsum dolor sit amet\n" +
+ "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+ "\n" +
+ "Lorem ipsum dolor sit amet\n" +
+ "Lorem\n" +
+ "ipsum\n" +
+ "dolor\n" +
+ "sit\n" +
+ "amet\n" +
+ "\n";
+ assertEquals(slideText, text);
+
+ // Just notes, no slides
+ extractor.setSlidesByDefault(false);
+ extractor.setNotesByDefault(true);
+ text = extractor.getText();
+ assertEquals("\n\n1\n\n\n2\n", text);
+
+ // Both
+ extractor.setSlidesByDefault(true);
+ extractor.setNotesByDefault(true);
+ text = extractor.getText();
+ String bothText =
+ "Lorem ipsum dolor sit amet\n" +
+ "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+ "\n\n\n1\n" +
+ "Lorem ipsum dolor sit amet\n" +
+ "Lorem\n" +
+ "ipsum\n" +
+ "dolor\n" +
+ "sit\n" +
+ "amet\n" +
+ "\n\n\n2\n";
+ assertEquals(bothText, text);
+
+ // With Slides and Master Text
+ extractor.setSlidesByDefault(true);
+ extractor.setNotesByDefault(false);
+ extractor.setMasterByDefault(true);
+ text = extractor.getText();
+ String smText =
+ "Lorem ipsum dolor sit amet\n" +
+ "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+ "\n" +
+ "Lorem ipsum dolor sit amet\n" +
+ "Lorem\n" +
+ "ipsum\n" +
+ "dolor\n" +
+ "sit\n" +
+ "amet\n" +
+ "\n";
+ assertEquals(smText, text);
+
+ // With Slides, Notes and Master Text
+ extractor.setSlidesByDefault(true);
+ extractor.setNotesByDefault(true);
+ extractor.setMasterByDefault(true);
+ text = extractor.getText();
+ String snmText =
+ "Lorem ipsum dolor sit amet\n" +
+ "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+ "\n\n\n1\n" +
+ "Lorem ipsum dolor sit amet\n" +
+ "Lorem\n" +
+ "ipsum\n" +
+ "dolor\n" +
+ "sit\n" +
+ "amet\n" +
+ "\n\n\n2\n";
+ assertEquals(snmText, text);
+
+ // Notes only once more; the master flag is still enabled from the previous step
+ extractor.setSlidesByDefault(false);
+ extractor.setNotesByDefault(true);
+ text = extractor.getText();
+ assertEquals("\n\n1\n\n\n2\n", text);
+ }
}
+ @Test
public void testGetComments() throws IOException {
- XMLSlideShow xml = openPPTX("45545_Comment.pptx");
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
+ try (XMLSlideShow xml = openPPTX("45545_Comment.pptx");
+ SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
+ extractor.setCommentsByDefault(true);
- // Check comments are there
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
- // Check the authors came through too
- assertContains(text, "XPVMWARE01");
+ // Check comments are there
+ assertContains(text, "testdoc");
+ assertContains(text, "test phrase");
- extractor.close();
- xml.close();
+ // Check the authors came through too
+ assertContains(text, "XPVMWARE01");
+ }
}
+ @Test
+ @Ignore("currently slidelayouts aren't yet supported")
public void testGetMasterText() throws Exception {
- XMLSlideShow xml = openPPTX("WithMaster.pptx");
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
- extractor.setSlidesByDefault(true);
- extractor.setNotesByDefault(false);
- extractor.setMasterByDefault(true);
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check master text is there
- assertContains(text, "Footer from the master slide");
-
- // Theme text shouldn't show up
- // String themeText =
- // "Theme Master Title\n" +
- // "Theme Master first level\n" +
- // "And the 2nd level\n" +
- // "Our 3rd level goes here\n" +
- // "And onto the 4th, such fun....\n" +
- // "Finally is the Fifth level\n";
-
- // Check the whole text
- String wholeText =
- "First page title\n" +
- "First page subtitle\n" +
- "This is the Master Title\n" +
- "This text comes from the Master Slide\n" +
- "\n" +
- // TODO Detect we didn't have a title, and include the master one
- "2nd page subtitle\n" +
- "Footer from the master slide\n" +
- "This is the Master Title\n" +
- "This text comes from the Master Slide\n";
- assertEquals(wholeText, text);
-
- extractor.close();
- xml.close();
+ try (XMLSlideShow xml = openPPTX("WithMaster.pptx");
+ SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
+ extractor.setSlidesByDefault(true);
+ extractor.setNotesByDefault(false);
+ extractor.setMasterByDefault(true);
+
+
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
+
+ // Check master text is there
+ assertContains(text, "Footer from the master slide");
+
+ // Theme text shouldn't show up
+ // String themeText =
+ // "Theme Master Title\n" +
+ // "Theme Master first level\n" +
+ // "And the 2nd level\n" +
+ // "Our 3rd level goes here\n" +
+ // "And onto the 4th, such fun....\n" +
+ // "Finally is the Fifth level\n";
+
+ // Check the whole text
+ String wholeText =
+ "First page title\n" +
+ "First page subtitle\n" +
+ "This is the Master Title\n" +
+ "This text comes from the Master Slide\n" +
+ "\n" +
+ // TODO Detect we didn't have a title, and include the master one
+ "2nd page subtitle\n" +
+ "Footer from the master slide\n" +
+ "This is the Master Title\n" +
+ "This text comes from the Master Slide\n";
+ assertEquals(wholeText, text);
+ }
}
+ /**
+ * Extracts the text of present1.pptx through SlideShowExtractor and checks
+ * that it is non-empty and contains the string "TEST".
+ */
@Test
public void testTable() throws Exception {
- XMLSlideShow xml = openPPTX("present1.pptx");
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
+ // try-with-resources closes the extractor and the slide show even when an assertion fails
+ try (XMLSlideShow xml = openPPTX("present1.pptx");
+ SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
- // Check comments are there
- assertContains(text, "TEST");
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
- extractor.close();
- xml.close();
+ // Check comments are there
+ assertContains(text, "TEST");
+ }
}
/**
};
for(String extension : extensions) {
String filename = "testPPT." + extension;
- XMLSlideShow xml = openPPTX(filename);
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
- String text = extractor.getText();
- if (extension.equals("thmx")) {
- // Theme file doesn't have any textual content
- assertEquals(filename, 0, text.length());
- continue;
+ try (XMLSlideShow xml = openPPTX(filename);
+ SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
+
+ String text = extractor.getText();
+ if (extension.equals("thmx")) {
+ // Theme file doesn't have any textual content
+ assertEquals(filename, 0, text.length());
+ continue;
+ }
+
+ assertTrue(filename, text.length() > 0);
+ assertContains(filename, text, "Attachment Test");
+ assertContains(filename, text, "This is a test file data with the same content");
+ assertContains(filename, text, "content parsing");
+ assertContains(filename, text, "Different words to test against");
+ assertContains(filename, text, "Mystery");
}
-
- assertTrue(filename, text.length() > 0);
- assertContains(filename, text, "Attachment Test");
- assertContains(filename, text, "This is a test file data with the same content");
- assertContains(filename, text, "content parsing");
- assertContains(filename, text, "Different words to test against");
- assertContains(filename, text, "Mystery");
-
- extractor.close();
- xml.close();
}
}
@Test
- public void test45541() throws Exception {
+ public void test45541() throws IOException, OpenXML4JException, XmlException {
// extract text from a powerpoint that has a header in the notes-element
- POITextExtractor extr = ExtractorFactory.createExtractor(
- slTests.getFile("45541_Header.pptx"));
- String text = extr.getText();
- assertNotNull(text);
- assertFalse("Had: " + text, text.contains("testdoc"));
-
- text = ((XSLFPowerPointExtractor)extr).getText(false, true);
- assertContains(text, "testdoc");
- extr.close();
- assertNotNull(text);
+ final File headerFile = slTests.getFile("45541_Header.pptx");
+ try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
+ // with the extractor's initial settings the notes header must not show up
+ String text = extr.getText();
+ assertNotNull(text);
+ assertFalse("Had: " + text, text.contains("testdoc"));
+
+ // notes only: now the header text is expected
+ extr.setSlidesByDefault(false);
+ extr.setNotesByDefault(true);
+
+ text = extr.getText();
+ // null check first, so a null result fails here instead of inside assertContains
+ assertNotNull(text);
+ assertContains(text, "testdoc");
+ }
// extract text from a powerpoint that has a footer in the master-slide
- extr = ExtractorFactory.createExtractor(
- slTests.getFile("45541_Footer.pptx"));
- text = extr.getText();
- assertNotContained(text, "testdoc");
-
- text = ((XSLFPowerPointExtractor)extr).getText(false, true);
- assertNotContained(text, "testdoc");
-
- text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
- assertNotContained(text, "testdoc");
-
- extr.close();
+ final File footerFile = slTests.getFile("45541_Footer.pptx");
+ try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
+ // initial settings
+ String text = extr.getText();
+ assertNotContained(text, "testdoc");
+
+ // notes only
+ extr.setSlidesByDefault(false);
+ extr.setNotesByDefault(true);
+ text = extr.getText();
+ assertNotContained(text, "testdoc");
+
+ // master only
+ extr.setSlidesByDefault(false);
+ extr.setNotesByDefault(false);
+ extr.setMasterByDefault(true);
+ text = extr.getText();
+ assertNotContained(text, "testdoc");
+ }
}
+ /**
+ * Smoke test: text extraction from bug54570.pptx must complete and return a non-null string.
+ */
@Test
public void bug54570() throws IOException {
- XMLSlideShow xml = openPPTX("bug54570.pptx");
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
- String text = extractor.getText();
- assertNotNull(text);
- extractor.close();
- xml.close();
+ // both resources are released by try-with-resources
+ try (XMLSlideShow xml = openPPTX("bug54570.pptx");
+ SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
+ String text = extractor.getText();
+ assertNotNull(text);
+ }
}
+ /**
+ * Opens the named sample through slTests and loads it as an XMLSlideShow.
+ * The resource stream is closed before this method returns; the caller
+ * is responsible for closing the returned slide show.
+ */
private XMLSlideShow openPPTX(String file) throws IOException {
- InputStream is = slTests.openResourceAsStream(file);
- try {
+ try (InputStream is = slTests.openResourceAsStream(file)) {
return new XMLSlideShow(is);
- } finally {
- is.close();
}
}
}
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
+import org.apache.poi.sl.usermodel.SlideShowFactory;
/**
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
}
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
- return new PowerPointExtractor(poifsDir);
+ return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
}
if (poifsDir.hasEntry("VisioDocument")) {
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory;
+import org.apache.poi.util.Removal;
/**
* This class can be used to extract text from a PowerPoint file. Can optionally
*/
@SuppressWarnings("WeakerAccess")
@Deprecated
+@Removal(version="5.0.0")
public final class PowerPointExtractor extends POIOLE2TextExtractor {
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
public void close() throws IOException {
_hslfSlideShow.close();
}
+
+ /**
+ * @return whatever {@code getSlideShowImpl()} returns
+ * (presumably the low-level slide show this extractor reads from - confirm)
+ */
+ @Override
+ public Object getPersistDocument() {
+ return getSlideShowImpl();
+ }
}
import java.io.IOException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
-import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.Internal;
@Internal
public class HSLFSlideShowFactory extends SlideShowFactory {
/**
- * Creates a HSLFSlideShow from the given NPOIFSFileSystem
- * <p>Note that in order to properly release resources the
- * SlideShow should be closed after use.
+ * Creates a HSLFSlideShow from the given NPOIFSFileSystem<p>
+ * Note that in order to properly release resources the
+ * SlideShow should be closed after use.
+ *
+ * @param fs the filesystem handed to the HSLFSlideShow constructor
+ * @return the newly constructed HSLFSlideShow
+ * @throws IOException propagated from the HSLFSlideShow constructor
*/
- public static SlideShow<?,?> createSlideShow(NPOIFSFileSystem fs) throws IOException {
+ public static HSLFSlideShow createSlideShow(final NPOIFSFileSystem fs) throws IOException {
return new HSLFSlideShow(fs);
}
+ /**
+ * Creates a HSLFSlideShow from the given DirectoryNode<p>
+ * Note that in order to properly release resources the
+ * SlideShow should be closed after use.
+ *
+ * @param root the directory node handed to the HSLFSlideShow constructor
+ * @return the newly constructed HSLFSlideShow
+ * @throws IOException propagated from the HSLFSlideShow constructor
+ */
+ public static HSLFSlideShow createSlideShow(final DirectoryNode root) throws IOException {
+ return new HSLFSlideShow(root);
+ }
}
+ /**
+ * Closes the filesystem behind {@code getDirectory()}, but only when that
+ * directory has no parent.
+ */
@Override
public void close() throws IOException {
- NPOIFSFileSystem fs = getDirectory().getFileSystem();
- if (fs != null) {
- fs.close();
+ // only close the filesystem, if we are based on the root node.
+ // embedded documents/slideshows shouldn't close the parent container
+ if (getDirectory().getParent() == null) {
+ NPOIFSFileSystem fs = getDirectory().getFileSystem();
+ // the filesystem reference may be absent, hence the null guard
+ if (fs != null) {
+ fs.close();
+ }
}
}
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
+import org.apache.poi.sl.usermodel.ObjectShape;
+import org.apache.poi.sl.usermodel.SlideShow;
+import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.IOUtils;
import org.junit.Test;
// ppe.close();
// }
- private PowerPointExtractor openExtractor(String fileName) throws IOException {
- InputStream is = slTests.openResourceAsStream(fileName);
- try {
- return new PowerPointExtractor(is);
- } finally {
- is.close();
+ /**
+ * Opens the named sample through slTests, lets SlideShowFactory create the
+ * slide show from the stream and wraps it in a SlideShowExtractor.
+ * The stream is closed before returning; callers close the extractor.
+ */
+ private SlideShowExtractor<?,?> openExtractor(String fileName) throws IOException {
+ try (InputStream is = slTests.openResourceAsStream(fileName)) {
+ return new SlideShowExtractor(SlideShowFactory.create(is));
}
}
+ /**
+ * Compares getText() with the extractor's initial settings against the
+ * expected text constants for two sample files.
+ */
@Test
public void testReadSheetText() throws IOException {
// Basic 2 page example
- PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
- assertEquals(expectText, ppe.getText());
- ppe.close();
+ try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
+ assertEquals(expectText, ppe.getText());
+ }
// 1 page example with text boxes
- PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
- assertEquals(expectText2, ppe2.getText());
- ppe2.close();
+ try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
+ assertEquals(expectText2, ppe.getText());
+ }
}
+ /**
+ * Checks the notes text of two sample files. Notes are the only source
+ * enabled (slides and master are switched off), so getText() stands in
+ * for the removed getNotes() call.
+ */
@Test
public void testReadNoteText() throws IOException {
// Basic 2 page example
- PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
- String notesText = ppe.getNotes();
- String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
- assertEquals(expText, notesText);
- ppe.close();
+ try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
+ // notes only
+ ppe.setNotesByDefault(true);
+ ppe.setSlidesByDefault(false);
+ ppe.setMasterByDefault(false);
+ String notesText = ppe.getText();
+ String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
+ assertEquals(expText, notesText);
+ }
// Other one doesn't have notes
- PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
- notesText = ppe2.getNotes();
- expText = "";
- assertEquals(expText, notesText);
- ppe2.close();
+ try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
+ // notes only; an empty string is expected here
+ ppe.setNotesByDefault(true);
+ ppe.setSlidesByDefault(false);
+ ppe.setMasterByDefault(false);
+ String notesText = ppe.getText();
+ String expText = "";
+ assertEquals(expText, notesText);
+ }
}
@Test
"\nThese are the notes on page two, again lacking formatting\n"
};
- PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
- ppe.setSlidesByDefault(true);
- ppe.setNotesByDefault(false);
- assertEquals(slText[0] + slText[1], ppe.getText());
+ try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
+ ppe.setSlidesByDefault(true);
+ ppe.setNotesByDefault(false);
+ assertEquals(slText[0] + slText[1], ppe.getText());
- ppe.setSlidesByDefault(false);
- ppe.setNotesByDefault(true);
- assertEquals(ntText[0] + ntText[1], ppe.getText());
+ ppe.setSlidesByDefault(false);
+ ppe.setNotesByDefault(true);
+ assertEquals(ntText[0] + ntText[1], ppe.getText());
- ppe.setSlidesByDefault(true);
- ppe.setNotesByDefault(true);
- assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
- ppe.close();
+ ppe.setSlidesByDefault(true);
+ ppe.setNotesByDefault(true);
+ assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
+ }
}
/**
*/
@Test
public void testMissingCoreRecords() throws IOException {
- PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
-
- String text = ppe.getText(true, false);
- String nText = ppe.getNotes();
-
- assertNotNull(text);
- assertNotNull(nText);
-
- // Notes record were corrupt, so don't expect any
- assertEquals(nText.length(), 0);
-
- // Slide records were fine
- assertContains(text, "Using Disease Surveillance and Response");
-
- ppe.close();
+ try (SlideShowExtractor<?,?> ppe = openExtractor("missing_core_records.ppt")) {
+ // first pass: slide text only
+ ppe.setSlidesByDefault(true);
+ ppe.setNotesByDefault(false);
+ String text = ppe.getText();
+ // second pass: notes text only
+ ppe.setSlidesByDefault(false);
+ ppe.setNotesByDefault(true);
+ String nText = ppe.getText();
+
+ assertNotNull(text);
+ assertNotNull(nText);
+
+ // Notes records were corrupt, so don't expect any
+ // (expected value first, actual second, so a failure message reads correctly)
+ assertEquals(0, nText.length());
+
+ // Slide records were fine
+ assertContains(text, "Using Disease Surveillance and Response");
+ }
}
+ /**
+ * Opens excel_with_embeded.xls as a POIFS filesystem and extracts the text of
+ * two embedded PowerPoint documents found under named directory entries.
+ */
@Test
public void testExtractFromEmbeded() throws IOException {
- InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
- POIFSFileSystem fs = new POIFSFileSystem(is);
- DirectoryNode root = fs.getRoot();
- PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
- PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
- ppe2.close();
- ppe1.close();
- fs.close();
- }
-
- private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
- throws IOException {
- DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
- assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
-
- // Check the first file
- HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
- PowerPointExtractor ppe = new PowerPointExtractor(ppt);
- assertEquals(expected, ppe.getText(true, false));
- return ppe;
+ try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
+ final POIFSFileSystem fs = new POIFSFileSystem(is)) {
+ final DirectoryNode root = fs.getRoot();
+
+ // flat list of pairs: directory entry name, then the text expected from it
+ final String[] TEST_SET = {
+ "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
+ "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n"
+ };
+
+ // step by two: TEST_SET[i] is the entry name, TEST_SET[i+1] the expected text
+ for (int i=0; i<TEST_SET.length; i+=2) {
+ DirectoryNode dir = (DirectoryNode)root.getEntry(TEST_SET[i]);
+ assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
+
+ // each embedded slide show and its extractor are closed before the next iteration
+ try (final SlideShow<?,?> ppt = SlideShowFactory.create(dir);
+ final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
+ assertEquals(TEST_SET[i+1], ppe.getText());
+ }
+ }
+ }
}
/**
*/
@Test
public void testExtractFromOwnEmbeded() throws IOException {
- PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
- List<HSLFObjectShape> shapes = ppe.getOLEShapes();
- assertEquals("Expected 6 ole shapes", 6, shapes.size());
- int num_ppt = 0, num_doc = 0, num_xls = 0;
- for (HSLFObjectShape ole : shapes) {
- String name = ole.getInstanceName();
- InputStream data = ole.getObjectData().getInputStream();
- if ("Worksheet".equals(name)) {
- HSSFWorkbook wb = new HSSFWorkbook(data);
- num_xls++;
- wb.close();
- } else if ("Document".equals(name)) {
- HWPFDocument doc = new HWPFDocument(data);
- num_doc++;
- doc.close();
- } else if ("Presentation".equals(name)) {
- num_ppt++;
- HSLFSlideShow ppt = new HSLFSlideShow(data);
- ppt.close();
+ try (SlideShowExtractor<?,?> ppe = openExtractor("ppt_with_embeded.ppt")) {
+ List<? extends ObjectShape> shapes = ppe.getOLEShapes();
+ assertEquals("Expected 6 ole shapes", 6, shapes.size());
+ // per-format counters, compared against the expected totals after the loop
+ int num_ppt = 0, num_doc = 0, num_xls = 0;
+ for (ObjectShape ole : shapes) {
+ // getInstanceName() is called on HSLFObjectShape, hence the cast
+ String name = ((HSLFObjectShape)ole).getInstanceName();
+ InputStream data = ole.getObjectData().getInputStream();
+ // the instance name selects which document class parses the embedded stream
+ if ("Worksheet".equals(name)) {
+ HSSFWorkbook wb = new HSSFWorkbook(data);
+ num_xls++;
+ wb.close();
+ } else if ("Document".equals(name)) {
+ HWPFDocument doc = new HWPFDocument(data);
+ num_doc++;
+ doc.close();
+ } else if ("Presentation".equals(name)) {
+ num_ppt++;
+ HSLFSlideShow ppt = new HSLFSlideShow(data);
+ ppt.close();
+ }
+ // NOTE(review): data is only closed on the success path; a parse failure above skips this line
+ data.close();
}
- data.close();
+ assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
+ assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
+ assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
}
- assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
- assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
- assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
- ppe.close();
}
/**
*/
@Test
public void test52991() throws IOException {
- PowerPointExtractor ppe = openExtractor("badzip.ppt");
- for (HSLFObjectShape shape : ppe.getOLEShapes()) {
- IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
+ try (SlideShowExtractor<?,?> ppe = openExtractor("badzip.ppt")) {
+ for (ObjectShape shape : ppe.getOLEShapes()) {
+ // drain every embedded object; the stream is closed even if the copy fails
+ try (InputStream data = shape.getObjectData().getInputStream()) {
+ IOUtils.copy(data, new ByteArrayOutputStream());
+ }
+ }
}
- ppe.close();
}
/**
*/
@Test
public void testWithComments() throws IOException {
- PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
- String text = ppe1.getText();
- assertFalse("Comments not in by default", text.contains("This is a test comment"));
+ try (final SlideShowExtractor ppe = openExtractor("WithComments.ppt")) {
+ // initial settings: the comment text must be absent
+ String text = ppe.getText();
+ assertFalse("Comments not in by default", text.contains("This is a test comment"));
- ppe1.setCommentsByDefault(true);
+ // after enabling comments the same text must appear
+ ppe.setCommentsByDefault(true);
- text = ppe1.getText();
- assertContains(text, "This is a test comment");
- ppe1.close();
+ text = ppe.getText();
+ assertContains(text, "This is a test comment");
+ }
// And another file
- PowerPointExtractor ppe2 = openExtractor("45543.ppt");
- text = ppe2.getText();
- assertFalse("Comments not in by default", text.contains("testdoc"));
+ try (SlideShowExtractor ppe = openExtractor("45543.ppt")) {
+ // same two-step check against the second sample
+ String text = ppe.getText();
+ assertFalse("Comments not in by default", text.contains("testdoc"));
- ppe2.setCommentsByDefault(true);
+ ppe.setCommentsByDefault(true);
- text = ppe2.getText();
- assertContains(text, "testdoc");
- ppe2.close();
+ text = ppe.getText();
+ assertContains(text, "testdoc");
+ }
}
/**
*/
@Test
public void testHeaderFooter() throws IOException {
- String text;
-
// With a header on the notes
- InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
- HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
- is1.close();
- assertNotNull(ppt1.getNotesHeadersFooters());
- assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
-
- PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
+ try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt");
+ HSLFSlideShow ppt = new HSLFSlideShow(is)) {
- text = ppe1.getText();
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
+ // the model itself must report the header text
+ assertNotNull(ppt.getNotesHeadersFooters());
+ assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText());
- ppe1.setNotesByDefault(true);
- text = ppe1.getText();
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
- ppe1.close();
- ppt1.close();
+ // then the extractor checks shared with the footer case
+ testHeaderFooterInner(ppt);
+ }
// And with a footer, also on notes
- InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
- HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
- is2.close();
-
- assertNotNull(ppt2.getNotesHeadersFooters());
- assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
- ppt2.close();
-
- PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
-
- text = ppe2.getText();
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
-
- ppe2.setNotesByDefault(true);
- text = ppe2.getText();
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
- ppe2.close();
+ try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt");
+ final HSLFSlideShow ppt = new HSLFSlideShow(is)) {
+ assertNotNull(ppt.getNotesHeadersFooters());
+ assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText());
+
+ testHeaderFooterInner(ppt);
+ }
+ }
+
+ /**
+ * Shared extractor checks for the header and the footer sample: the phrase
+ * "testdoc test phrase" must be missing from getText() with the initial
+ * settings and present once notes are enabled.
+ * The assertion messages say "Header" for both samples.
+ */
+ private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException {
+ try (final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
+ String text = ppe.getText();
+ assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
+ assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
+
+ ppe.setNotesByDefault(true);
+ text = ppe.getText();
+ assertContains(text, "testdoc");
+ assertContains(text, "test phrase");
+ }
}
@SuppressWarnings("unused")
String masterTitleText = "This is the Master Title";
String masterRandomText = "This text comes from the Master Slide";
String masterFooterText = "Footer from the master slide";
- PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
- ppe.setMasterByDefault(true);
+ try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
+ ppe.setMasterByDefault(true);
- String text = ppe.getText();
- assertContains(text, masterRandomText);
- assertContains(text, masterFooterText);
- ppe.close();
+ String text = ppe.getText();
+ assertContains(text, masterRandomText);
+ assertContains(text, masterFooterText);
+ }
}
+ /**
+ * Checks when master-slide text shows up in the extracted text, using
+ * master_text.ppt and WithMaster.ppt.
+ */
@Test
public void testMasterText() throws IOException {
- PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
-
- // Initially not there
- String text = ppe1.getText();
- assertFalse(text.contains("Text that I added to the master slide"));
-
- // Enable, shows up
- ppe1.setMasterByDefault(true);
- text = ppe1.getText();
- assertContains(text, "Text that I added to the master slide");
-
- // Make sure placeholder text does not come out
- assertNotContained(text, "Click to edit Master");
- ppe1.close();
+ try (final SlideShowExtractor ppe = openExtractor("master_text.ppt")) {
+ // Initially not there
+ String text = ppe.getText();
+ assertFalse(text.contains("Text that I added to the master slide"));
+
+ // Enable, shows up
+ ppe.setMasterByDefault(true);
+ text = ppe.getText();
+ assertContains(text, "Text that I added to the master slide");
+
+ // Make sure placeholder text does not come out
+ assertNotContained(text, "Click to edit Master");
+ }
// Now with another file only containing master text
// Will always show up
- PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
- String masterText = "Footer from the master slide";
+ try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
+ // setMasterByDefault is deliberately not called for this sample
+ String masterText = "Footer from the master slide";
- text = ppe2.getText();
- assertContainsIgnoreCase(text, "master");
- assertContains(text, masterText);
- ppe2.close();
+ String text = ppe.getText();
+ assertContainsIgnoreCase(text, "master");
+ assertContains(text, masterText);
+ }
}
/**
*/
@Test
public void testChineseText() throws IOException {
- PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
-
- String text = ppe.getText();
+ try (final SlideShowExtractor ppe = openExtractor("54880_chinese.ppt")) {
+ String text = ppe.getText();
- // Check for the english text line
- assertContains(text, "Single byte");
+ // Check for the english text line
+ assertContains(text, "Single byte");
- // Check for the english text in the mixed line
- assertContains(text, "Mix");
+ // Check for the english text in the mixed line
+ assertContains(text, "Mix");
- // Check for the chinese text in the mixed line
- assertContains(text, "\u8868");
+ // Check for the chinese text in the mixed line
+ assertContains(text, "\u8868");
- // Check for the chinese only text line
- assertContains(text, "\uff8a\uff9d\uff76\uff78");
- ppe.close();
+ // Check for the chinese only text line
+ // NOTE(review): U+FF8A U+FF9D U+FF76 U+FF78 are halfwidth katakana code points,
+ // so this line looks Japanese rather than Chinese - confirm against the sample file
+ assertContains(text, "\uff8a\uff9d\uff76\uff78");
+ }
}
/**
public void testDifferentPOIFS() throws IOException {
// Open the two filesystems
File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
- InputStream is1 = new FileInputStream(pptFile);
- OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
- is1.close();
- NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
-
- DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
-
- // Open directly
- for (DirectoryNode dir : files) {
- PowerPointExtractor extractor = new PowerPointExtractor(dir);
- assertEquals(expectText, extractor.getText());
- }
+ // the stream feeds the OPOIFS filesystem; the NPOIFS filesystem is opened from the File itself
+ try (final InputStream is1 = new FileInputStream(pptFile);
+ final NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile)) {
- // Open via a HSLFSlideShow
- for (DirectoryNode dir : files) {
- HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
- PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
- assertEquals(expectText, extractor.getText());
- extractor.close();
- slideshow.close();
- }
+ final OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
- npoifs.close();
+ // the same file seen through both filesystem implementations
+ DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()};
+
+ // Open directly
+ // (the slide show for each root is created by SlideShowFactory and must yield expectText)
+ for (DirectoryNode dir : files) {
+ try (SlideShow<?,?> ppt = SlideShowFactory.create(dir);
+ SlideShowExtractor<?,?> extractor = new SlideShowExtractor(ppt)) {
+ assertEquals(expectText, extractor.getText());
+ }
+ }
+ }
}
+ /**
+ * Checks that table text is extracted with tabs between the cells of a row
+ * and newlines between rows, using 54111.ppt and 54722.ppt.
+ */
@Test
public void testTable() throws Exception {
- PowerPointExtractor ppe1 = openExtractor("54111.ppt");
- String text1 = ppe1.getText();
- String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
- "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
- "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
- "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
- "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
- "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
- assertContains(text1, target1);
- ppe1.close();
-
- PowerPointExtractor ppe2 = openExtractor("54722.ppt");
- String text2 = ppe2.getText();
-
- String target2 = "this\tText\tis\twithin\ta\n" +
- "table\t1\t2\t3\t4";
- assertContains(text2, target2);
- ppe2.close();
+ try (SlideShowExtractor ppe = openExtractor("54111.ppt")) {
+ String text = ppe.getText();
+ // header row plus five data rows, four cells each
+ String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" +
+ "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" +
+ "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" +
+ "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" +
+ "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" +
+ "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
+ assertContains(text, target);
+ }
+
+ try (SlideShowExtractor ppe = openExtractor("54722.ppt")) {
+ String text = ppe.getText();
+
+ // two rows of five cells; the expected fragment has no trailing newline
+ String target = "this\tText\tis\twithin\ta\n" +
+ "table\t1\t2\t3\t4";
+ assertContains(text, target);
+ }
}
// bug 60003
@Test
public void testExtractMasterSlideFooterText() throws Exception {
- PowerPointExtractor ppe = openExtractor("60003.ppt");
- ppe.setMasterByDefault(true);
+ try (SlideShowExtractor ppe = openExtractor("60003.ppt")) {
+ // master text has to be switched on before the text is extracted
+ ppe.setMasterByDefault(true);
- String text = ppe.getText();
- assertContains(text, "Prague");
- ppe.close();
+ String text = ppe.getText();
+ assertContains(text, "Prague");
+ }
}
@Test
public void testExtractGroupedShapeText() throws Exception {
- try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
+ try (final SlideShowExtractor ppe = openExtractor("bug62092.ppt")) {
final String text = ppe.getText();
//this tests that we're ignoring text shapes at depth=0
import org.apache.poi.sl.draw.DrawFactory;
import org.apache.poi.sl.draw.DrawPaint;
import org.apache.poi.sl.draw.DrawTextParagraph;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.ColorStyle;
import org.apache.poi.sl.usermodel.PaintStyle;
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", };
for (String f : files) {
File sample = HSLFTestDataSamples.getSampleFile(f);
- PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
- assertNotNull(ex.getText());
- ex.close();
+ try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
+ assertNotNull(ex.getText());
+ }
}
}
+ /**
+ * Smoke test: text extraction from bug58733_671884.ppt must complete and return non-null.
+ */
@Test
public void bug58733() throws IOException {
File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt");
- PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
- assertNotNull(ex.getText());
- ex.close();
+ // the slide show is created from the File by SlideShowFactory; the extractor is closed by try-with-resources
+ try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
+ assertNotNull(ex.getText());
+ }
}
@Test