From dfdf9e6d6f470b82ad2a6b77e3059dd0df23905b Mon Sep 17 00:00:00 2001
From: Andreas Beeker
Date: Thu, 13 Aug 2020 21:08:24 +0000
Subject: #64411 - Provide JigSaw modules - rework extractors - see bugzilla
entry for more information
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1880839 13f79535-47bb-0310-9956-ffa450edef68
---
.../org/apache/poi/stress/AbstractFileHandler.java | 2 +-
.../org/apache/poi/stress/XSLFFileHandler.java | 13 +-
.../org/apache/poi/extractor/ExtractorFactory.java | 304 ++++++++++++++++
.../apache/poi/extractor/ExtractorProvider.java | 76 ++++
.../apache/poi/extractor/MainExtractorFactory.java | 76 ++++
.../apache/poi/extractor/OLE2ExtractorFactory.java | 279 ---------------
.../apache/poi/extractor/POIOLE2TextExtractor.java | 49 +--
.../org/apache/poi/extractor/POITextExtractor.java | 49 +--
.../hpsf/extractor/HPSFPropertiesExtractor.java | 47 ++-
.../hssf/extractor/EventBasedExcelExtractor.java | 47 ++-
.../apache/poi/hssf/extractor/ExcelExtractor.java | 36 +-
.../poi/hssf/extractor/OldExcelExtractor.java | 38 +-
.../poi/sl/extractor/SlideShowExtractor.java | 37 +-
.../apache/poi/ss/extractor/ExcelExtractor.java | 20 +-
src/multimodule/ooxml/java9/module-info.class | Bin 2602 -> 2715 bytes
src/multimodule/ooxml/java9/module-info.java | 1 +
src/multimodule/ooxml/test9/module-info.class | Bin 3471 -> 3584 bytes
src/multimodule/ooxml/test9/module-info.java | 1 +
src/multimodule/poi/java9/module-info.class | Bin 3062 -> 3169 bytes
src/multimodule/poi/java9/module-info.java | 4 +
src/multimodule/poi/test9/module-info.class | Bin 3189 -> 3296 bytes
src/multimodule/poi/test9/module-info.java | 2 +
src/multimodule/scratchpad/java9/module-info.class | Bin 2099 -> 2219 bytes
src/multimodule/scratchpad/java9/module-info.java | 2 +
src/multimodule/scratchpad/test9/module-info.class | Bin 2273 -> 2393 bytes
src/multimodule/scratchpad/test9/module-info.java | 2 +
.../ooxml/extractor/CommandLineTextExtractor.java | 6 +-
.../poi/ooxml/extractor/ExtractorFactory.java | 384 ---------------------
.../ooxml/extractor/POIXMLExtractorFactory.java | 281 +++++++++++++++
.../extractor/POIXMLPropertiesTextExtractor.java | 29 +-
.../poi/ooxml/extractor/POIXMLTextExtractor.java | 60 ++--
.../poi/xdgf/extractor/XDGFVisioExtractor.java | 43 ++-
.../apache/poi/xslf/extractor/XSLFExtractor.java | 45 +++
.../extractor/XSSFBEventBasedExcelExtractor.java | 16 +-
.../extractor/XSSFEventBasedExcelExtractor.java | 48 +--
.../poi/xssf/extractor/XSSFExcelExtractor.java | 45 ++-
.../poi/xwpf/extractor/XWPFWordExtractor.java | 45 +--
.../poi/extractor/ooxml/TestExtractorFactory.java | 143 ++++----
.../org/apache/poi/openxml4j/opc/TestPackage.java | 6 +-
.../poi/poifs/crypt/tests/TestHxxFEncryption.java | 16 +-
.../org/apache/poi/xslf/TestXSLFBugs.java | 2 +-
.../extractor/TestXSLFPowerPointExtractor.java | 14 +-
...stXSSFEventBasedExcelExtractorUsingFactory.java | 4 +-
.../TestXSSFExcelExtractorUsingFactory.java | 2 +-
.../org.apache.poi.extractor.ExtractorProvider | 18 +
.../org.apache.poi.extractor.ExtractorProvider | 18 +
.../org.apache.poi.extractor.ExtractorProvider | 18 +
.../ole2/OLE2ScratchpadExtractorFactory.java | 132 ++++---
.../poi/hdgf/extractor/VisioTextExtractor.java | 37 +-
.../poi/hpbf/extractor/PublisherTextExtractor.java | 44 ++-
.../poi/hslf/extractor/PowerPointExtractor.java | 279 ---------------
.../apache/poi/hslf/usermodel/HSLFSlideShow.java | 103 +++++-
.../poi/hslf/usermodel/HSLFSlideShowImpl.java | 35 +-
.../poi/hsmf/extractor/OutlookTextExtactor.java | 61 ----
.../poi/hsmf/extractor/OutlookTextExtractor.java | 30 +-
.../apache/poi/hwpf/extractor/Word6Extractor.java | 53 +--
.../apache/poi/hwpf/extractor/WordExtractor.java | 51 ++-
.../poi/hdgf/extractor/TestVisioExtractor.java | 34 +-
.../apache/poi/hsmf/TestFixedSizedProperties.java | 53 ++-
.../hsmf/extractor/TestOutlookTextExtractor.java | 282 +++++++--------
.../poi/hwpf/extractor/TestWordExtractorBugs.java | 14 +-
.../extractor/TestHPSFPropertiesExtractor.java | 57 ++-
.../poi/hssf/extractor/TestExcelExtractor.java | 18 +-
63 files changed, 1811 insertions(+), 1800 deletions(-)
create mode 100644 src/java/org/apache/poi/extractor/ExtractorFactory.java
create mode 100644 src/java/org/apache/poi/extractor/ExtractorProvider.java
create mode 100644 src/java/org/apache/poi/extractor/MainExtractorFactory.java
delete mode 100644 src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
delete mode 100644 src/ooxml/java/org/apache/poi/ooxml/extractor/ExtractorFactory.java
create mode 100644 src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java
create mode 100644 src/ooxml/java/org/apache/poi/xslf/extractor/XSLFExtractor.java
create mode 100644 src/resources/main/META-INF/services/org.apache.poi.extractor.ExtractorProvider
create mode 100644 src/resources/ooxml/META-INF/services/org.apache.poi.extractor.ExtractorProvider
create mode 100644 src/resources/scratchpad/META-INF/services/org.apache.poi.extractor.ExtractorProvider
delete mode 100644 src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
delete mode 100644 src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
(limited to 'src')
diff --git a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
index 94bc98b9ec..d5a019c448 100644
--- a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
@@ -29,11 +29,11 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.ss.extractor.ExcelExtractor;
import org.apache.poi.util.IOUtils;
diff --git a/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java b/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
index ba6e4c1b0a..fa4d038682 100644
--- a/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
@@ -23,7 +23,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@@ -37,11 +37,11 @@ public class XSLFFileHandler extends SlideShowHandler {
assertNotNull(slideInner.getPresentation());
assertNotNull(slideInner.getSlideMasterReferences());
assertNotNull(slideInner.getSlideReferences());
-
+
new POIXMLDocumentHandler().handlePOIXMLDocument(slide);
handleSlideShow(slide);
-
+
slideInner.close();
slide.close();
}
@@ -49,11 +49,12 @@ public class XSLFFileHandler extends SlideShowHandler {
@Override
public void handleExtracting(File file) throws Exception {
super.handleExtracting(file);
-
-
+
+
// additionally try the other getText() methods
- try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
+ //noinspection rawtypes
+ try (SlideShowExtractor extractor = (SlideShowExtractor) ExtractorFactory.createExtractor(file)) {
assertNotNull(extractor);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
diff --git a/src/java/org/apache/poi/extractor/ExtractorFactory.java b/src/java/org/apache/poi/extractor/ExtractorFactory.java
new file mode 100644
index 0000000000..fa57be7a8c
--- /dev/null
+++ b/src/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -0,0 +1,304 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.extractor;
+
+import static org.apache.poi.hssf.record.crypto.Biff8EncryptionKey.getCurrentUserPassword;
+import static org.apache.poi.poifs.crypt.EncryptionInfo.ENCRYPTION_INFO_ENTRY;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ServiceLoader;
+import java.util.stream.StreamSupport;
+
+import org.apache.poi.EmptyFileException;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.poifs.crypt.Decryptor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+/**
+ * Figures out the correct POIOLE2TextExtractor for your supplied
+ * document, and returns it.
+ *
+ * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
+ * not present on the runtime classpath
+ * <p>Note 2 - for text extractor creation across all formats, use
+ * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
+ * the OOXML jar.
+ * <p>Note 3 - rather than using this, for most cases you would be better
+ * off switching to Apache Tika instead!
+ */
+@SuppressWarnings({"WeakerAccess", "JavadocReference"})
+public final class ExtractorFactory {
+ private static final POILogger LOGGER = POILogFactory.getLogger(ExtractorFactory.class);
+
+ /** Should this thread prefer event based over usermodel based extractors? */
+ private static final ThreadLocal threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);
+
+ /** Should all threads prefer event based over usermodel based extractors? */
+ private static Boolean allPreferEventExtractors;
+
+
+ private static class Singleton {
+ private static final ExtractorFactory INSTANCE = new ExtractorFactory();
+ }
+
+ private interface ProviderMethod {
+ POITextExtractor create(ExtractorProvider prov) throws IOException;
+ }
+
+ private final List provider = new ArrayList<>();
+
+
+ private ExtractorFactory() {
+ ServiceLoader.load(ExtractorProvider.class).forEach(provider::add);
+ }
+
+ /**
+ * Should this thread prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is false.
+ *
+ * @return true if event extractors should be preferred in the current thread, false otherwise.
+ */
+ public static boolean getThreadPrefersEventExtractors() {
+ return threadPreferEventExtractors.get();
+ }
+
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is to use the thread level setting, which defaults to false.
+ *
+ * @return true if event extractors should be preferred in all threads, false otherwise.
+ */
+ public static Boolean getAllThreadsPreferEventExtractors() {
+ return allPreferEventExtractors;
+ }
+
+ /**
+ * Should this thread prefer event based over usermodel based extractors?
+ * Will only be used if the All Threads setting is null.
+ *
+ * @param preferEventExtractors If this thread should prefer event based extractors.
+ */
+ public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
+ threadPreferEventExtractors.set(preferEventExtractors);
+ }
+
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * If set, will take preference over the Thread level setting.
+ *
+ * @param preferEventExtractors If all threads should prefer event based extractors.
+ */
+ public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
+ allPreferEventExtractors = preferEventExtractors;
+ }
+
+ /**
+ * Should this thread use event based extractors if available?
+ * Checks the all-threads one first, then thread specific.
+ *
+ * @return If the current thread should use event based extractors.
+ */
+ public static boolean getPreferEventExtractor() {
+ return (allPreferEventExtractors != null) ? allPreferEventExtractors : threadPreferEventExtractors.get();
+ }
+
+ public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
+ return createExtractor(fs, getCurrentUserPassword());
+ }
+
+ public static POITextExtractor createExtractor(POIFSFileSystem fs, String password) throws IOException {
+ return createExtractor(fs.getRoot(), password);
+ }
+
+ public static POITextExtractor createExtractor(InputStream input) throws IOException {
+ return createExtractor(input, getCurrentUserPassword());
+ }
+
+ public static POITextExtractor createExtractor(InputStream input, String password) throws IOException {
+ final InputStream is = FileMagic.prepareToCheckMagic(input);
+ byte[] emptyFileCheck = new byte[1];
+ is.mark(emptyFileCheck.length);
+ if (is.read(emptyFileCheck) < emptyFileCheck.length) {
+ throw new EmptyFileException();
+ }
+ is.reset();
+
+ final FileMagic fm = FileMagic.valueOf(is);
+ if (FileMagic.OOXML == fm) {
+ return wp(fm, w -> w.create(is, password));
+ }
+
+ if (FileMagic.OLE2 != fm) {
+ throw new IOException("Can't create extractor - unsupported file type: "+fm);
+ }
+
+ POIFSFileSystem poifs = new POIFSFileSystem(is);
+ boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
+
+ return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
+ }
+
+ public static POITextExtractor createExtractor(File file) throws IOException {
+ return createExtractor(file, getCurrentUserPassword());
+ }
+
+ public static POITextExtractor createExtractor(File file, String password) throws IOException {
+ if (file.length() == 0) {
+ throw new EmptyFileException();
+ }
+
+ final FileMagic fm = FileMagic.valueOf(file);
+ if (FileMagic.OOXML == fm) {
+ return wp(fm, w -> w.create(file, password));
+ }
+
+ if (FileMagic.OLE2 != fm) {
+ throw new IOException("Can't create extractor - unsupported file type: "+fm);
+ }
+
+ POIFSFileSystem poifs = new POIFSFileSystem(file, true);
+ try {
+ boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
+ return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
+ } catch (IOException | RuntimeException e) {
+ IOUtils.closeQuietly(poifs);
+ throw e;
+ }
+ }
+
+
+ /**
+ * Create the Extractor, if possible. Generally needs the Scratchpad jar.
+ * Note that this won't check for embedded OOXML resources either, use
+ * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
+ *
+ * @param root The {@link DirectoryNode} pointing to a document.
+ *
+ * @return The resulting {@link POITextExtractor}, an exception is thrown if
+ * no TextExtractor can be created for some reason.
+ *
+ * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
+ * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
+ * an unsupported version of Excel.
+ * @throws IllegalArgumentException If creating the Extractor fails
+ */
+ public static POITextExtractor createExtractor(DirectoryNode root) throws IOException {
+ return createExtractor(root, getCurrentUserPassword());
+ }
+
+ public static POITextExtractor createExtractor(final DirectoryNode root, String password) throws IOException {
+ // Encrypted OOXML files go inside OLE2 containers, is this one?
+ if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY) || root.hasEntry("Package")) {
+ return wp(FileMagic.OOXML, w -> w.create(root, password));
+ } else {
+ return wp(FileMagic.OLE2, w -> w.create(root, password));
+ }
+ }
+
+ /**
+ * Returns an array of text extractors, one for each of
+ * the embedded documents in the file (if there are any).
+ * If there are no embedded documents, you'll get back an
+ * empty array. Otherwise, you'll get one open
+ * {@link POITextExtractor} for each embedded file.
+ *
+ * @param ext The extractor to look at for embedded documents
+ *
+ * @return An array of resulting extractors. Empty if no embedded documents are found.
+ *
+ * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
+ * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
+ * an unsupported version of Excel.
+ * @throws IllegalArgumentException If creating the Extractor fails
+ */
+ public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+ if (ext == null) {
+ throw new IllegalStateException("extractor must be given");
+ }
+
+ // All the embedded directories we spotted
+ List dirs = new ArrayList<>();
+ // For anything else not directly held in as a POIFS directory
+ List nonPOIFS = new ArrayList<>();
+
+ // Find all the embedded directories
+ DirectoryEntry root = ext.getRoot();
+ if(root == null) {
+ throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+ }
+
+ if(ext instanceof ExcelExtractor) {
+ // These are in MBD... under the root
+ StreamSupport.stream(root.spliterator(), false)
+ .filter(entry -> entry.getName().startsWith("MBD"))
+ .forEach(dirs::add);
+ } else {
+ for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
+ if (prov.accepts(FileMagic.OLE2)) {
+ prov.identifyEmbeddedResources(ext, dirs, nonPOIFS);
+ break;
+ }
+ }
+ }
+
+ // Create the extractors
+ if(dirs.size() == 0 && nonPOIFS.size() == 0){
+ return new POITextExtractor[0];
+ }
+
+ ArrayList textExtractors = new ArrayList<>();
+ for (Entry dir : dirs) {
+ textExtractors.add(createExtractor((DirectoryNode) dir));
+ }
+ for (InputStream stream : nonPOIFS) {
+ try {
+ textExtractors.add(createExtractor(stream));
+ } catch (IOException e) {
+ // Ignore, just means it didn't contain a format we support as yet
+ LOGGER.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
+ }
+ }
+ return textExtractors.toArray(new POITextExtractor[0]);
+ }
+
+ private static POITextExtractor wp(FileMagic fm, ProviderMethod fun) throws IOException {
+ for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
+ if (prov.accepts(fm)) {
+ POITextExtractor ext = fun.create(prov);
+ if (ext != null) {
+ return ext;
+ }
+ }
+ }
+ throw new IOException("Your InputStream was neither an OLE2 stream, nor an OOXML stream " +
+ "or you haven't provide the poi-ooxml*.jar and/or poi-scratchpad*.jar in the classpath/modulepath - FileMagic: "+fm);
+ }
+
+}
diff --git a/src/java/org/apache/poi/extractor/ExtractorProvider.java b/src/java/org/apache/poi/extractor/ExtractorProvider.java
new file mode 100644
index 0000000000..ccbeee15d0
--- /dev/null
+++ b/src/java/org/apache/poi/extractor/ExtractorProvider.java
@@ -0,0 +1,76 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.extractor;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+
+public interface ExtractorProvider {
+ boolean accepts(FileMagic fm);
+
+ /**
+ * Create Extractor via file
+ * @param file the file
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if file can't be read or parsed
+ */
+ POITextExtractor create(File file, String password) throws IOException;
+
+ /**
+ * Create Extractor via InputStream
+ * @param inputStream the stream
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if stream can't be read or parsed
+ */
+ POITextExtractor create(InputStream inputStream, String password) throws IOException;
+
+ /**
+ * Create Extractor from POIFS node
+ * @param poifsDir the node
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if node can't be parsed
+ */
+ POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException;
+
+ /**
+ * Identifies the embedded documents of the given extractor (if there
+ * are any) and collects them into the two supplied lists: embedded
+ * POIFS directory entries are added to {@code dirs}, everything else
+ * is added to {@code nonPOIFS} as a stream. The default implementation
+ * always throws an {@link IllegalArgumentException}.
+ *
+ * @param ext the extractor holding the directory to start parsing
+ * @param dirs a list to be filled with directory references holding embedded documents
+ * @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
+ *
+ * @throws IOException when the format specific extraction fails because of invalid entries
+ */
+ default void identifyEmbeddedResources(POIOLE2TextExtractor ext, List dirs, List nonPOIFS) throws IOException {
+ throw new IllegalArgumentException("Error checking for Scratchpad embedded resources");
+ }
+
+}
diff --git a/src/java/org/apache/poi/extractor/MainExtractorFactory.java b/src/java/org/apache/poi/extractor/MainExtractorFactory.java
new file mode 100644
index 0000000000..7f8733eecf
--- /dev/null
+++ b/src/java/org/apache/poi/extractor/MainExtractorFactory.java
@@ -0,0 +1,76 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.extractor;
+
+import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.poi.hssf.model.InternalWorkbook;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * ExtractorFactory for HSSF and Old Excel format
+ */
+public class MainExtractorFactory implements ExtractorProvider {
+ @Override
+ public boolean accepts(FileMagic fm) {
+ return FileMagic.OLE2 == fm;
+ }
+
+ @Override
+ public POITextExtractor create(File file, String password) throws IOException {
+ return create(new POIFSFileSystem(file, true).getRoot(), password);
+ }
+
+ @Override
+ public POITextExtractor create(InputStream inputStream, String password) throws IOException {
+ return create(new POIFSFileSystem(inputStream).getRoot(), password);
+ }
+
+ @Override
+ public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
+ final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
+ try {
+ Biff8EncryptionKey.setCurrentUserPassword(password);
+
+ // Look for certain entries in the stream, to figure it out from
+ for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
+ if (poifsDir.hasEntry(workbookName)) {
+ return ExtractorFactory.getPreferEventExtractor() ? new EventBasedExcelExtractor(poifsDir) : new ExcelExtractor(poifsDir);
+ }
+ }
+
+ if (poifsDir.hasEntry(InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME)) {
+ return new OldExcelExtractor(poifsDir);
+ }
+ } finally {
+ Biff8EncryptionKey.setCurrentUserPassword(oldPW);
+ }
+
+ return null;
+ }
+}
diff --git a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
deleted file mode 100644
index 1f5eee039d..0000000000
--- a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
+++ /dev/null
@@ -1,279 +0,0 @@
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.extractor;
-
-import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME;
-import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.Method;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.poi.hssf.OldExcelFormatException;
-import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
-import org.apache.poi.hssf.extractor.ExcelExtractor;
-import org.apache.poi.poifs.filesystem.DirectoryEntry;
-import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.POILogFactory;
-import org.apache.poi.util.POILogger;
-
-/**
- * Figures out the correct POIOLE2TextExtractor for your supplied
- * document, and returns it.
- *
- * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
- * not present on the runtime classpath
- * <p>Note 2 - for text extractor creation across all formats, use
- * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
- * the OOXML jar.
- * <p>Note 3 - rather than using this, for most cases you would be better
- * off switching to Apache Tika instead!
- */
-@SuppressWarnings({"WeakerAccess", "JavadocReference"})
-public final class OLE2ExtractorFactory {
- private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
-
- /** Should this thread prefer event based over usermodel based extractors? */
- private static final ThreadLocal threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);
-
- /** Should all threads prefer event based over usermodel based extractors? */
- private static Boolean allPreferEventExtractors;
-
- private OLE2ExtractorFactory() {
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is false.
- *
- * @return true if event extractors should be preferred in the current thread, fals otherwise.
- */
- public static boolean getThreadPrefersEventExtractors() {
- return threadPreferEventExtractors.get();
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is to use the thread level setting, which defaults to false.
- *
- * @return true if event extractors should be preferred in all threads, fals otherwise.
- */
- public static Boolean getAllThreadsPreferEventExtractors() {
- return allPreferEventExtractors;
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * Will only be used if the All Threads setting is null.
- *
- * @param preferEventExtractors If this threads should prefer event based extractors.
- */
- public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
- threadPreferEventExtractors.set(preferEventExtractors);
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * If set, will take preference over the Thread level setting.
- *
- * @param preferEventExtractors If all threads should prefer event based extractors.
- */
- public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
- allPreferEventExtractors = preferEventExtractors;
- }
-
- /**
- * Should this thread use event based extractors is available?
- * Checks the all-threads one first, then thread specific.
- *
- * @return If the current thread should use event based extractors.
- */
- public static boolean getPreferEventExtractor() {
- if(allPreferEventExtractors != null) {
- return allPreferEventExtractors;
- }
- return threadPreferEventExtractors.get();
- }
-
- @SuppressWarnings("unchecked")
- public static T createExtractor(POIFSFileSystem fs) throws IOException {
- return (T)createExtractor(fs.getRoot());
- }
-
- @SuppressWarnings("unchecked")
- public static T createExtractor(InputStream input) throws IOException {
- Class> cls = getOOXMLClass();
- if (cls != null) {
- // Use Reflection to get us the full OOXML-enabled version
- try {
- Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
- return (T)m.invoke(null, input);
- } catch (IllegalArgumentException iae) {
- throw iae;
- } catch (Exception e) {
- throw new IllegalArgumentException("Error creating Extractor for InputStream", e);
- }
- } else {
- // Best hope it's OLE2....
- return createExtractor(new POIFSFileSystem(input));
- }
- }
-
- private static Class> getOOXMLClass() {
- try {
- return OLE2ExtractorFactory.class.getClassLoader().loadClass(
- "org.apache.poi.extractor.ExtractorFactory"
- );
- } catch (ClassNotFoundException e) {
- LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
- return null;
- }
- }
- private static Class<?> getScratchpadClass() {
- try {
- return OLE2ExtractorFactory.class.getClassLoader().loadClass(
- "org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory"
- );
- } catch (ClassNotFoundException e) {
- LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
- throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory");
- }
- }
-
- /**
- * Create the Extractor, if possible. Generally needs the Scratchpad jar.
- * Note that this won't check for embedded OOXML resources either, use
- * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
- *
- * @param poifsDir The {@link DirectoryNode} pointing to a document.
- *
- * @return The resulting {@link POITextExtractor}, an exception is thrown if
- * no TextExtractor can be created for some reason.
- *
- * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
- * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
- * an unsupported version of Excel.
- * @throws IllegalArgumentException If creating the Extractor fails
- */
- public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
- // Look for certain entries in the stream, to figure it
- // out from
- for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
- if (poifsDir.hasEntry(workbookName)) {
- if (getPreferEventExtractor()) {
- return new EventBasedExcelExtractor(poifsDir);
- }
- return new ExcelExtractor(poifsDir);
- }
- }
- if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
- throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
- + "found. Please call OldExcelExtractor directly for basic text extraction");
- }
-
- // Ask Scratchpad, or fail trying
- Class<?> cls = getScratchpadClass();
- try {
- Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
- POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
- if (ext != null) return ext;
- } catch (IllegalArgumentException iae) {
- throw iae;
- } catch (Exception e) {
- throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
- }
-
- throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- *
- * @param ext The extractor to look at for embedded documents
- *
- * @return An array of resulting extractors. Empty if no embedded documents are found.
- *
- * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
- * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
- * an unsupported version of Excel.
- * @throws IllegalArgumentException If creating the Extractor fails
- */
- public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
- // All the embedded directories we spotted
- List<Entry> dirs = new ArrayList<>();
- // For anything else not directly held in as a POIFS directory
- List<InputStream> nonPOIFS = new ArrayList<>();
-
- // Find all the embedded directories
- DirectoryEntry root = ext.getRoot();
- if(root == null) {
- throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
- }
-
- if(ext instanceof ExcelExtractor) {
- // These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while(it.hasNext()) {
- Entry entry = it.next();
- if(entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
- } else {
- // Ask Scratchpad, or fail trying
- Class<?> cls = getScratchpadClass();
- try {
- Method m = cls.getDeclaredMethod(
- "identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
- m.invoke(null, ext, dirs, nonPOIFS);
- } catch (Exception e) {
- throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
- }
- }
-
- // Create the extractors
- if(dirs.size() == 0 && nonPOIFS.size() == 0){
- return new POITextExtractor[0];
- }
-
- ArrayList<POITextExtractor> e = new ArrayList<>();
- for (Entry dir : dirs) {
- e.add(createExtractor((DirectoryNode) dir
- ));
- }
- for (InputStream stream : nonPOIFS) {
- try {
- e.add(createExtractor(stream));
- } catch (Exception xe) {
- // Ignore, invalid format
- LOGGER.log(POILogger.WARN, xe);
- }
- }
- return e.toArray(new POITextExtractor[0]);
- }
-}
diff --git a/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java b/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java
index 465de40375..a389b71d0c 100644
--- a/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java
+++ b/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java
@@ -30,55 +30,28 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
* org.apache.poi.[format].extractor .
*
* @see org.apache.poi.hssf.extractor.ExcelExtractor
- * @see org.apache.poi.hslf.extractor.PowerPointExtractor
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
* @see org.apache.poi.hwpf.extractor.WordExtractor
*/
-public abstract class POIOLE2TextExtractor extends POITextExtractor {
- /** The POIDocument that's open */
- protected POIDocument document;
-
- /**
- * Creates a new text extractor for the given document
- *
- * @param document The POIDocument to use in this extractor.
- */
- public POIOLE2TextExtractor(POIDocument document) {
- this.document = document;
-
- // Ensure any underlying resources, such as open files,
- // will get cleaned up if the user calls #close()
- setFilesystem(document);
- }
-
- /**
- * Creates a new text extractor, using the same
- * document as another text extractor. Normally
- * only used by properties extractors.
- *
- * @param otherExtractor the extractor which document to be used
- */
- protected POIOLE2TextExtractor(POIOLE2TextExtractor otherExtractor) {
- this.document = otherExtractor.document;
- }
-
+public interface POIOLE2TextExtractor extends POITextExtractor {
/**
* Returns the document information metadata for the document
*
* @return The Document Summary Information or null
* if it could not be read for this document.
*/
- public DocumentSummaryInformation getDocSummaryInformation() {
- return document.getDocumentSummaryInformation();
+ default DocumentSummaryInformation getDocSummaryInformation() {
+ return getDocument().getDocumentSummaryInformation();
}
+
/**
* Returns the summary information metadata for the document.
*
* @return The Summary information for the document or null
* if it could not be read for this document.
*/
- public SummaryInformation getSummaryInformation() {
- return document.getSummaryInformation();
+ default SummaryInformation getSummaryInformation() {
+ return getDocument().getSummaryInformation();
}
/**
@@ -88,7 +61,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
* @return an instance of POIExtractor that can extract meta-data.
*/
@Override
- public POITextExtractor getMetadataTextExtractor() {
+ default POITextExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(this);
}
@@ -97,8 +70,8 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
*
* @return the DirectoryEntry that is associated with the POIDocument of this extractor.
*/
- public DirectoryEntry getRoot() {
- return document.getDirectory();
+ default DirectoryEntry getRoot() {
+ return getDocument().getDirectory();
}
/**
@@ -107,7 +80,5 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
* @return the underlying POIDocument
*/
@Override
- public POIDocument getDocument() {
- return document;
- }
+ POIDocument getDocument();
}
\ No newline at end of file
diff --git a/src/java/org/apache/poi/extractor/POITextExtractor.java b/src/java/org/apache/poi/extractor/POITextExtractor.java
index e32adcb12c..cf88c57bae 100644
--- a/src/java/org/apache/poi/extractor/POITextExtractor.java
+++ b/src/java/org/apache/poi/extractor/POITextExtractor.java
@@ -21,19 +21,16 @@ import java.io.IOException;
/**
* Common Parent for Text Extractors
- * of POI Documents.
+ * of POI Documents.
* You will typically find the implementation of
* a given format's text extractor under
* org.apache.poi.[format].extractor .
- *
+ *
* @see org.apache.poi.hssf.extractor.ExcelExtractor
- * @see org.apache.poi.hslf.extractor.PowerPointExtractor
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
* @see org.apache.poi.hwpf.extractor.WordExtractor
*/
-public abstract class POITextExtractor implements Closeable {
- private Closeable fsToClose;
-
+public interface POITextExtractor extends Closeable {
/**
* Retrieves all the text from the document.
* How cells, paragraphs etc are separated in the text
@@ -41,42 +38,50 @@ public abstract class POITextExtractor implements Closeable {
* a specific project for details.
* @return All the text from the document
*/
- public abstract String getText();
-
+ String getText();
+
/**
* Returns another text extractor, which is able to
* output the textual content of the document
* metadata / properties, such as author and title.
- *
+ *
* @return the metadata and text extractor
*/
- public abstract POITextExtractor getMetadataTextExtractor();
+ POITextExtractor getMetadataTextExtractor();
/**
- * Used to ensure file handle cleanup.
- *
- * @param fs filesystem to close
+ * @param doCloseFilesystem {@code true} (default), if underlying resources/filesystem should be
+ * closed on {@link #close()}
*/
- public void setFilesystem(Closeable fs) {
- fsToClose = fs;
- }
-
+ void setCloseFilesystem(boolean doCloseFilesystem);
+
+ /**
+ * @return {@code true}, if resources/filesystem should be closed on {@link #close()}
+ */
+ boolean isCloseFilesystem();
+
+ /**
+ * @return The underlying resources/filesystem
+ */
+ Closeable getFilesystem();
+
/**
* Allows to free resources of the Extractor as soon as
* it is not needed any more. This may include closing
* open file handles and freeing memory.
- *
+ *
* The Extractor cannot be used after close has been called.
*/
@Override
- public void close() throws IOException {
- if(fsToClose != null) {
- fsToClose.close();
+ default void close() throws IOException {
+ Closeable fs = getFilesystem();
+ if (isCloseFilesystem() && fs != null) {
+ fs.close();
}
}
/**
* @return the processed document
*/
- public abstract Object getDocument();
+ Object getDocument();
}
diff --git a/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java
index 0069fafa9d..fde938d258 100644
--- a/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java
+++ b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java
@@ -17,9 +17,6 @@
package org.apache.poi.hpsf.extractor;
-import java.io.File;
-import java.io.IOException;
-
import org.apache.poi.POIDocument;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
@@ -37,15 +34,20 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* build in and custom, returning them in
* textual form.
*/
-public class HPSFPropertiesExtractor extends POIOLE2TextExtractor {
+public class HPSFPropertiesExtractor implements POIOLE2TextExtractor {
+ private final POIDocument document;
+ private boolean doCloseFilesystem = true;
+
public HPSFPropertiesExtractor(POIOLE2TextExtractor mainExtractor) {
- super(mainExtractor);
+ document = mainExtractor.getDocument();
}
- public HPSFPropertiesExtractor(POIDocument doc) {
- super(doc);
+
+ public HPSFPropertiesExtractor(POIDocument document) {
+ this.document = document;
}
+
public HPSFPropertiesExtractor(POIFSFileSystem fs) {
- super(new HPSFPropertiesOnlyDocument(fs));
+ document = new HPSFPropertiesOnlyDocument(fs);
}
public String getDocumentSummaryInformationText() {
@@ -122,11 +124,11 @@ public class HPSFPropertiesExtractor extends POIOLE2TextExtractor {
}
private static String getPropertyValueText(Object val) {
- return (val == null)
+ return (val == null)
? "(not set)"
: PropertySet.getPropertyStringValue(val);
}
-
+
@Override
public boolean equals(Object o) {
return super.equals(o);
@@ -137,12 +139,23 @@ public class HPSFPropertiesExtractor extends POIOLE2TextExtractor {
return super.hashCode();
}
- public static void main(String[] args) throws IOException {
- for (String file : args) {
- try (HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(
- new POIFSFileSystem(new File(file)))) {
- System.out.println(ext.getText());
- }
- }
+ @Override
+ public POIDocument getDocument() {
+ return document;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public POIDocument getFilesystem() {
+ return document;
}
}
diff --git a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
index 122eddf109..68c69cf1d3 100644
--- a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
@@ -17,6 +17,7 @@
package org.apache.poi.hssf.extractor;
+import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -37,9 +38,9 @@ import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
-import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -56,29 +57,31 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* To turn an excel file into a CSV or similar, then see
* the XLS2CSVmra example
*
- *
+ *
* @see XLS2CSVmra
*/
-public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
- private DirectoryNode _dir;
+public class EventBasedExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
+ private final POIFSFileSystem poifs;
+ private final DirectoryNode _dir;
+ private boolean doCloseFilesystem = true;
boolean _includeSheetNames = true;
boolean _formulasNotResults;
- public EventBasedExcelExtractor( DirectoryNode dir )
- {
- super( (POIDocument)null );
+ public EventBasedExcelExtractor(DirectoryNode dir) {
+ poifs = null;
_dir = dir;
}
public EventBasedExcelExtractor(POIFSFileSystem fs) {
- this(fs.getRoot());
- super.setFilesystem(fs);
+ poifs = fs;
+ _dir = fs.getRoot();
}
/**
* Would return the document information metadata for the document,
* if we supported it
*/
+ @Override
public DocumentSummaryInformation getDocSummaryInformation() {
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
}
@@ -86,6 +89,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or
* Would return the summary information metadata for the document,
* if we supported it
*/
+ @Override
public SummaryInformation getSummaryInformation() {
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
}
@@ -262,4 +266,29 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or
}
}
}
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return poifs;
+ }
+
+ @Override
+ public POIDocument getDocument() {
+ return null;
+ }
+
+ @Override
+ public DirectoryEntry getRoot() {
+ return _dir;
+ }
}
diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
index ec40f097f1..5fa855546f 100644
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@@ -50,12 +50,13 @@ import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;
* To turn an excel file into a CSV or similar, then see
* the XLS2CSVmra example
*
- *
+ *
* @see XLS2CSVmra
*/
-public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
+public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
private final HSSFWorkbook _wb;
private final HSSFDataFormatter _formatter;
+ private boolean doCloseFilesystem = true;
private boolean _includeSheetNames = true;
private boolean _shouldEvaluateFormulas = true;
private boolean _includeCellComments;
@@ -63,13 +64,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
private boolean _includeHeadersFooters = true;
public ExcelExtractor(HSSFWorkbook wb) {
- super(wb);
_wb = wb;
_formatter = new HSSFDataFormatter();
}
+
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
}
+
public ExcelExtractor(DirectoryNode dir) throws IOException {
this(new HSSFWorkbook(dir, true));
}
@@ -201,9 +203,9 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
/**
* Command line extractor.
- *
+ *
* @param args the command line parameters
- *
+ *
* @throws IOException if the file can't be read or contains errors
*/
public static void main(String[] args) throws IOException {
@@ -225,7 +227,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile());
HSSFWorkbook wb = new HSSFWorkbook(is);
- ExcelExtractor extractor = new ExcelExtractor(wb);
+ ExcelExtractor extractor = new ExcelExtractor(wb)
) {
extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames());
extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas());
@@ -255,7 +257,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
* Should blank cells be output? Default is to only
* output cells that are present in the file and are
* non-blank.
- *
+ *
* @param includeBlankCells {@code true} if blank cells should be included
*/
public void setIncludeBlankCells(boolean includeBlankCells) {
@@ -411,4 +413,24 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
return text.toString();
}
+
+ @Override
+ public HSSFWorkbook getDocument() {
+ return _wb;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public HSSFWorkbook getFilesystem() {
+ return _wb;
+ }
}
diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
index a4f334a4e6..56b1424b3d 100644
--- a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
@@ -29,6 +29,7 @@ import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.CodepageRecord;
@@ -58,7 +59,7 @@ import org.apache.poi.util.IOUtils;
* by Apache Tika, but not really intended for display to the user.
*
*/
-public class OldExcelExtractor implements Closeable {
+public class OldExcelExtractor implements POITextExtractor {
private final static int FILE_PASS_RECORD_SID = 0x2f;
//arbitrarily selected; may need to increase
@@ -295,24 +296,39 @@ public class OldExcelExtractor implements Closeable {
}
}
- close();
ris = null;
return text.toString();
}
- @Override
- public void close() {
- // some cases require this close here
- if(toClose != null) {
- IOUtils.closeQuietly(toClose);
- toClose = null;
- }
- }
-
protected void handleNumericCell(StringBuilder text, double value) {
// TODO Need to fetch / use format strings
text.append(value);
text.append('\n');
}
+
+ @Override
+ public POITextExtractor getMetadataTextExtractor() {
+ return null;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return toClose != null;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return toClose;
+ }
+
+ @Override
+ public Object getDocument() {
+ return ris;
+ }
}
diff --git a/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java b/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java
index d4a2645ca2..fa454501d6 100644
--- a/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java
+++ b/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java
@@ -54,14 +54,14 @@ import org.apache.poi.util.POILogger;
public class SlideShowExtractor<
S extends Shape,
P extends TextParagraph
-> extends POITextExtractor {
+> implements POITextExtractor {
private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class);
// placeholder text for slide numbers
private static final String SLIDE_NUMBER_PH = "‹#›";
- private SlideShow slideshow;
+ protected final SlideShow slideshow;
private boolean slidesByDefault = true;
private boolean notesByDefault;
@@ -69,9 +69,9 @@ public class SlideShowExtractor<
private boolean masterByDefault;
private Predicate