import java.util.Set;
import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.ss.extractor.ExcelExtractor;
import org.apache.poi.util.IOUtils;
import java.io.FileInputStream;
import java.io.InputStream;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
assertNotNull(slideInner.getPresentation());
assertNotNull(slideInner.getSlideMasterReferences());
assertNotNull(slideInner.getSlideReferences());
-
+
new POIXMLDocumentHandler().handlePOIXMLDocument(slide);
handleSlideShow(slide);
-
+
slideInner.close();
slide.close();
}
@Override
public void handleExtracting(File file) throws Exception {
super.handleExtracting(file);
-
-
+
+
// additionally try the other getText() methods
- try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
+ //noinspection rawtypes
+ try (SlideShowExtractor extractor = (SlideShowExtractor) ExtractorFactory.createExtractor(file)) {
assertNotNull(extractor);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.extractor;
+
+import static org.apache.poi.hssf.record.crypto.Biff8EncryptionKey.getCurrentUserPassword;
+import static org.apache.poi.poifs.crypt.EncryptionInfo.ENCRYPTION_INFO_ENTRY;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ServiceLoader;
+import java.util.stream.StreamSupport;
+
+import org.apache.poi.EmptyFileException;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.poifs.crypt.Decryptor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+/**
+ * Figures out the correct {@link POITextExtractor} for your supplied
+ * document, and returns it.
+ *
+ * <p>The format specific work is delegated to the {@link ExtractorProvider}
+ * implementations found via {@link ServiceLoader}: the first provider which
+ * accepts the detected {@link FileMagic} and returns a non-null extractor wins.</p>
+ *
+ * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
+ * not present on the runtime classpath</p>
+ * <p>Note 2 - for text extractor creation across all formats, use
+ * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
+ * the OOXML jar.</p>
+ * <p>Note 3 - rather than using this, for most cases you would be better
+ * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
+ */
+@SuppressWarnings({"WeakerAccess", "JavadocReference"})
+public final class ExtractorFactory {
+    private static final POILogger LOGGER = POILogFactory.getLogger(ExtractorFactory.class);
+
+    /** Should this thread prefer event based over usermodel based extractors? */
+    private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);
+
+    /**
+     * Should all threads prefer event based over usermodel based extractors?
+     * Declared volatile, because it is written and read by different threads
+     * without any other synchronization.
+     */
+    private static volatile Boolean allPreferEventExtractors;
+
+
+    /** Holder which defers the provider lookup until the factory is used for the first time. */
+    private static class Singleton {
+        private static final ExtractorFactory INSTANCE = new ExtractorFactory();
+    }
+
+    /** A single creation attempt which is run against one {@link ExtractorProvider}. */
+    @FunctionalInterface
+    private interface ProviderMethod {
+        POITextExtractor create(ExtractorProvider prov) throws IOException;
+    }
+
+    /** The providers in the order in which the {@link ServiceLoader} returned them. */
+    private final List<ExtractorProvider> provider = new ArrayList<>();
+
+
+    private ExtractorFactory() {
+        ServiceLoader.load(ExtractorProvider.class).forEach(provider::add);
+    }
+
+    /**
+     * Should this thread prefer event based over usermodel based extractors?
+     * (usermodel extractors tend to be more accurate, but use more memory)
+     * Default is false.
+     *
+     * @return true if event extractors should be preferred in the current thread, false otherwise.
+     */
+    public static boolean getThreadPrefersEventExtractors() {
+        return threadPreferEventExtractors.get();
+    }
+
+    /**
+     * Should all threads prefer event based over usermodel based extractors?
+     * (usermodel extractors tend to be more accurate, but use more memory)
+     * Default is to use the thread level setting, which defaults to false.
+     *
+     * @return true if event extractors should be preferred in all threads, false otherwise;
+     *      {@code null} if the all-threads setting hasn't been set.
+     */
+    public static Boolean getAllThreadsPreferEventExtractors() {
+        return allPreferEventExtractors;
+    }
+
+    /**
+     * Should this thread prefer event based over usermodel based extractors?
+     * Will only be used if the All Threads setting is null.
+     *
+     * @param preferEventExtractors If this threads should prefer event based extractors.
+     */
+    public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
+        threadPreferEventExtractors.set(preferEventExtractors);
+    }
+
+    /**
+     * Should all threads prefer event based over usermodel based extractors?
+     * If set, will take preference over the Thread level setting.
+     *
+     * @param preferEventExtractors If all threads should prefer event based extractors,
+     *      or {@code null} to fall back to the thread level setting.
+     */
+    public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
+        allPreferEventExtractors = preferEventExtractors;
+    }
+
+    /**
+     * Should this thread use event based extractors if available?
+     * Checks the all-threads one first, then thread specific.
+     *
+     * @return If the current thread should use event based extractors.
+     */
+    public static boolean getPreferEventExtractor() {
+        // read the shared field only once, otherwise a concurrent reset to null between
+        // the null check and the unboxing would result in a NullPointerException
+        final Boolean allThreads = allPreferEventExtractors;
+        return (allThreads != null) ? allThreads : threadPreferEventExtractors.get();
+    }
+
+    /**
+     * Create an extractor for the given filesystem using the current user password.
+     *
+     * @param fs the filesystem holding the document
+     * @return the extractor
+     * @throws IOException if no matching extractor can be created
+     */
+    public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
+        return createExtractor(fs, getCurrentUserPassword());
+    }
+
+    /**
+     * Create an extractor for the root node of the given filesystem.
+     *
+     * @param fs the filesystem holding the document
+     * @param password the password which is handed over to the providers
+     * @return the extractor
+     * @throws IOException if no matching extractor can be created
+     */
+    public static POITextExtractor createExtractor(POIFSFileSystem fs, String password) throws IOException {
+        return createExtractor(fs.getRoot(), password);
+    }
+
+    /**
+     * Create an extractor for the given stream using the current user password.
+     *
+     * @param input the stream to read the document from
+     * @return the extractor
+     * @throws IOException if the stream can't be read or no matching extractor can be created
+     */
+    public static POITextExtractor createExtractor(InputStream input) throws IOException {
+        return createExtractor(input, getCurrentUserPassword());
+    }
+
+    /**
+     * Create an extractor for the given stream. The file type is determined by the
+     * magic bytes at the start of the stream.
+     *
+     * @param input the stream to read the document from
+     * @param password the password which is handed over to the providers
+     * @return the extractor
+     * @throws EmptyFileException if not even a single byte can be read from the stream
+     * @throws IOException if the stream is neither OOXML nor OLE2 or no provider
+     *      was able to create an extractor
+     */
+    public static POITextExtractor createExtractor(InputStream input, String password) throws IOException {
+        final InputStream is = FileMagic.prepareToCheckMagic(input);
+        byte[] emptyFileCheck = new byte[1];
+        is.mark(emptyFileCheck.length);
+        if (is.read(emptyFileCheck) < emptyFileCheck.length) {
+            throw new EmptyFileException();
+        }
+        is.reset();
+
+        final FileMagic fm = FileMagic.valueOf(is);
+        if (FileMagic.OOXML == fm) {
+            return wp(fm, w -> w.create(is, password));
+        }
+
+        if (FileMagic.OLE2 != fm) {
+            throw new IOException("Can't create extractor - unsupported file type: "+fm);
+        }
+
+        final POIFSFileSystem poifs = new POIFSFileSystem(is);
+        try {
+            // an OLE2 container holding an encryption info entry is treated as OOXML
+            boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
+            return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
+        } catch (IOException | RuntimeException e) {
+            // the filesystem was opened here and nobody else holds a reference to it yet,
+            // so release it in the same way as the File based variant does
+            IOUtils.closeQuietly(poifs);
+            throw e;
+        }
+    }
+
+    /**
+     * Create an extractor for the given file using the current user password.
+     *
+     * @param file the file to read the document from
+     * @return the extractor
+     * @throws IOException if the file can't be read or no matching extractor can be created
+     */
+    public static POITextExtractor createExtractor(File file) throws IOException {
+        return createExtractor(file, getCurrentUserPassword());
+    }
+
+    /**
+     * Create an extractor for the given file. The file type is determined by the
+     * magic bytes at the start of the file.
+     *
+     * @param file the file to read the document from
+     * @param password the password which is handed over to the providers
+     * @return the extractor
+     * @throws EmptyFileException if the file has a length of 0
+     * @throws IOException if the file is neither OOXML nor OLE2 or no provider
+     *      was able to create an extractor
+     */
+    public static POITextExtractor createExtractor(File file, String password) throws IOException {
+        if (file.length() == 0) {
+            throw new EmptyFileException();
+        }
+
+        final FileMagic fm = FileMagic.valueOf(file);
+        if (FileMagic.OOXML == fm) {
+            return wp(fm, w -> w.create(file, password));
+        }
+
+        if (FileMagic.OLE2 != fm) {
+            throw new IOException("Can't create extractor - unsupported file type: "+fm);
+        }
+
+        final POIFSFileSystem poifs = new POIFSFileSystem(file, true);
+        try {
+            // an OLE2 container holding an encryption info entry is treated as OOXML
+            boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
+            return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
+        } catch (IOException | RuntimeException e) {
+            IOUtils.closeQuietly(poifs);
+            throw e;
+        }
+    }
+
+
+    /**
+     * Create the Extractor, if possible. Generally needs the Scratchpad jar.
+     * Note that this won't check for embedded OOXML resources either, use
+     * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
+     *
+     * @param root The {@link DirectoryNode} pointing to a document.
+     *
+     * @return The resulting {@link POITextExtractor}, an exception is thrown if
+     *      no TextExtractor can be created for some reason.
+     *
+     * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
+     * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
+     *      an unsupported version of Excel.
+     * @throws IllegalArgumentException If creating the Extractor fails
+     */
+    public static POITextExtractor createExtractor(DirectoryNode root) throws IOException {
+        return createExtractor(root, getCurrentUserPassword());
+    }
+
+    /**
+     * Create the Extractor for the given node, if possible.
+     *
+     * @param root The {@link DirectoryNode} pointing to a document.
+     * @param password the password which is handed over to the providers
+     *
+     * @return The resulting {@link POITextExtractor}
+     *
+     * @throws IOException if no provider was able to create an extractor
+     */
+    public static POITextExtractor createExtractor(final DirectoryNode root, String password) throws IOException {
+        // Encrypted OOXML files go inside OLE2 containers, is this one?
+        if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY) || root.hasEntry("Package")) {
+            return wp(FileMagic.OOXML, w -> w.create(root, password));
+        } else {
+            return wp(FileMagic.OLE2, w -> w.create(root, password));
+        }
+    }
+
+    /**
+     * Returns an array of text extractors, one for each of
+     * the embedded documents in the file (if there are any).
+     * If there are no embedded documents, you'll get back an
+     * empty array. Otherwise, you'll get one open
+     * {@link POITextExtractor} for each embedded file.
+     *
+     * @param ext The extractor to look at for embedded documents
+     *
+     * @return An array of resulting extractors. Empty if no embedded documents are found.
+     *
+     * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
+     * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
+     *      an unsupported version of Excel.
+     * @throws IllegalArgumentException If creating the Extractor fails
+     * @throws IllegalStateException If no extractor is given or the extractor has no root entry
+     */
+    public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+        if (ext == null) {
+            throw new IllegalStateException("extractor must be given");
+        }
+
+        // All the embedded directories we spotted
+        List<Entry> dirs = new ArrayList<>();
+        // For anything else not directly held in as a POIFS directory
+        List<InputStream> nonPOIFS = new ArrayList<>();
+
+        // Find all the embedded directories
+        DirectoryEntry root = ext.getRoot();
+        if (root == null) {
+            throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+        }
+
+        if (ext instanceof ExcelExtractor) {
+            // These are in MBD... under the root
+            StreamSupport.stream(root.spliterator(), false)
+                .filter(entry -> entry.getName().startsWith("MBD"))
+                .forEach(dirs::add);
+        } else {
+            // only the first provider which is responsible for OLE2 is asked
+            for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
+                if (prov.accepts(FileMagic.OLE2)) {
+                    prov.identifyEmbeddedResources(ext, dirs, nonPOIFS);
+                    break;
+                }
+            }
+        }
+
+        // Create the extractors
+        if (dirs.isEmpty() && nonPOIFS.isEmpty()) {
+            return new POITextExtractor[0];
+        }
+
+        List<POITextExtractor> textExtractors = new ArrayList<>();
+        for (Entry dir : dirs) {
+            textExtractors.add(createExtractor((DirectoryNode) dir));
+        }
+        for (InputStream stream : nonPOIFS) {
+            try {
+                textExtractors.add(createExtractor(stream));
+            } catch (IOException e) {
+                // Ignore, just means it didn't contain a format we support as yet
+                LOGGER.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
+            }
+        }
+        return textExtractors.toArray(new POITextExtractor[0]);
+    }
+
+    /**
+     * Runs the creation attempt against every provider which accepts the given file type
+     * and returns the first non-null result.
+     *
+     * @param fm the file type the providers need to accept
+     * @param fun the creation attempt
+     * @return the extractor, never {@code null}
+     * @throws IOException if a provider fails or none of them returned an extractor
+     */
+    private static POITextExtractor wp(FileMagic fm, ProviderMethod fun) throws IOException {
+        for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
+            if (prov.accepts(fm)) {
+                POITextExtractor ext = fun.create(prov);
+                if (ext != null) {
+                    return ext;
+                }
+            }
+        }
+        throw new IOException("Your InputStream was neither an OLE2 stream, nor an OOXML stream " +
+            "or you haven't provide the poi-ooxml*.jar and/or poi-scratchpad*.jar in the classpath/modulepath - FileMagic: "+fm);
+    }
+
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.extractor;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+
+/**
+ * Service interface implemented by the format specific extractor factories.
+ * {@link ExtractorFactory} loads the implementations via {@link java.util.ServiceLoader}
+ * and only calls the {@code create} methods of providers whose
+ * {@link #accepts(FileMagic)} returned {@code true} for the detected file type.
+ */
+public interface ExtractorProvider {
+ /**
+ * Checks if this provider is responsible for the given file type
+ * @param fm the file type to check
+ * @return {@code true} if this provider should be asked to create the extractor
+ */
+ boolean accepts(FileMagic fm);
+
+ /**
+ * Create Extractor via file
+ * @param file the file
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if file can't be read or parsed
+ */
+ POITextExtractor create(File file, String password) throws IOException;
+
+ /**
+ * Create Extractor via InputStream
+ * @param inputStream the stream
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if stream can't be read or parsed
+ */
+ POITextExtractor create(InputStream inputStream, String password) throws IOException;
+
+ /**
+ * Create Extractor from POIFS node
+ * @param poifsDir the node
+ * @param password the password or {@code null} if not encrypted
+ * @return the extractor
+ * @throws IOException if node can't be parsed
+ */
+ POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException;
+
+ /**
+ * Identifies the embedded documents of the given extractor (if there are any).
+ * Nothing is returned - the findings are meant to be added to the two given lists.
+ * The default implementation doesn't inspect anything and always throws
+ * an {@link IllegalArgumentException}.
+ *
+ * @param ext the extractor holding the directory to start parsing
+ * @param dirs a list to be filled with directory references holding embedded
+ * @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
+ *
+ * @throws IOException when the format specific extraction fails because of invalid entries
+ * @throws IllegalArgumentException if the provider doesn't override this default implementation
+ */
+ default void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
+ throw new IllegalArgumentException("Error checking for Scratchpad embedded resources");
+ }
+
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.extractor;
+
+import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.poi.hssf.model.InternalWorkbook;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * ExtractorFactory for HSSF and Old Excel format
+ */
+public class MainExtractorFactory implements ExtractorProvider {
+    /**
+     * @return {@code true} for {@link FileMagic#OLE2} only
+     */
+    @Override
+    public boolean accepts(FileMagic fm) {
+        return FileMagic.OLE2 == fm;
+    }
+
+    /**
+     * Opens the file read-only as POIFS filesystem and creates the extractor from its root node.
+     * The filesystem is closed again, if no extractor could be created.
+     *
+     * @return the extractor or {@code null} if the file doesn't contain a known workbook entry
+     */
+    @Override
+    public POITextExtractor create(File file, String password) throws IOException {
+        return createAndReleaseOnFailure(new POIFSFileSystem(file, true), password);
+    }
+
+    /**
+     * Reads the stream into a POIFS filesystem and creates the extractor from its root node.
+     * The filesystem is closed again, if no extractor could be created.
+     *
+     * @return the extractor or {@code null} if the stream doesn't contain a known workbook entry
+     */
+    @Override
+    public POITextExtractor create(InputStream inputStream, String password) throws IOException {
+        return createAndReleaseOnFailure(new POIFSFileSystem(inputStream), password);
+    }
+
+    /**
+     * Creates the extractor based on the workbook entry found in the given node.
+     * The given password is only set as current user password while the extractor
+     * is created, the previous one is restored afterwards.
+     *
+     * @return the extractor or {@code null} if the node doesn't contain a known workbook entry
+     */
+    @Override
+    public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
+        final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
+        try {
+            Biff8EncryptionKey.setCurrentUserPassword(password);
+
+            // Look for certain entries in the stream, to figure it out from
+            for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
+                if (poifsDir.hasEntry(workbookName)) {
+                    return ExtractorFactory.getPreferEventExtractor() ? new EventBasedExcelExtractor(poifsDir) : new ExcelExtractor(poifsDir);
+                }
+            }
+
+            if (poifsDir.hasEntry(InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME)) {
+                return new OldExcelExtractor(poifsDir);
+            }
+        } finally {
+            Biff8EncryptionKey.setCurrentUserPassword(oldPW);
+        }
+
+        return null;
+    }
+
+    /**
+     * Creates the extractor for a filesystem which was opened by this provider.
+     * If no extractor is handed out, nobody else holds a reference to the filesystem,
+     * hence it's closed here to not leak the underlying resources.
+     */
+    private POITextExtractor createAndReleaseOnFailure(POIFSFileSystem fs, String password) throws IOException {
+        final POITextExtractor extractor;
+        try {
+            extractor = create(fs.getRoot(), password);
+        } catch (IOException | RuntimeException e) {
+            try {
+                fs.close();
+            } catch (IOException closeFailure) {
+                // keep the original failure as primary cause
+                e.addSuppressed(closeFailure);
+            }
+            throw e;
+        }
+
+        if (extractor == null) {
+            // no supported workbook entry found
+            fs.close();
+        }
+        return extractor;
+    }
+}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.extractor;
-
-import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME;
-import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.Method;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.poi.hssf.OldExcelFormatException;
-import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
-import org.apache.poi.hssf.extractor.ExcelExtractor;
-import org.apache.poi.poifs.filesystem.DirectoryEntry;
-import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.POILogFactory;
-import org.apache.poi.util.POILogger;
-
-/**
- * Figures out the correct POIOLE2TextExtractor for your supplied
- * document, and returns it.
- *
- * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
- * not present on the runtime classpath</p>
- * <p>Note 2 - for text extractor creation across all formats, use
- * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
- * the OOXML jar.</p>
- * <p>Note 3 - rather than using this, for most cases you would be better
- * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
- */
-@SuppressWarnings({"WeakerAccess", "JavadocReference"})
-public final class OLE2ExtractorFactory {
- private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
-
- /** Should this thread prefer event based over usermodel based extractors? */
- private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);
-
- /** Should all threads prefer event based over usermodel based extractors? */
- private static Boolean allPreferEventExtractors;
-
- private OLE2ExtractorFactory() {
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is false.
- *
- * @return true if event extractors should be preferred in the current thread, fals otherwise.
- */
- public static boolean getThreadPrefersEventExtractors() {
- return threadPreferEventExtractors.get();
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is to use the thread level setting, which defaults to false.
- *
- * @return true if event extractors should be preferred in all threads, fals otherwise.
- */
- public static Boolean getAllThreadsPreferEventExtractors() {
- return allPreferEventExtractors;
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * Will only be used if the All Threads setting is null.
- *
- * @param preferEventExtractors If this threads should prefer event based extractors.
- */
- public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
- threadPreferEventExtractors.set(preferEventExtractors);
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * If set, will take preference over the Thread level setting.
- *
- * @param preferEventExtractors If all threads should prefer event based extractors.
- */
- public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
- allPreferEventExtractors = preferEventExtractors;
- }
-
- /**
- * Should this thread use event based extractors is available?
- * Checks the all-threads one first, then thread specific.
- *
- * @return If the current thread should use event based extractors.
- */
- public static boolean getPreferEventExtractor() {
- if(allPreferEventExtractors != null) {
- return allPreferEventExtractors;
- }
- return threadPreferEventExtractors.get();
- }
-
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
- return (T)createExtractor(fs.getRoot());
- }
-
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
- Class<?> cls = getOOXMLClass();
- if (cls != null) {
- // Use Reflection to get us the full OOXML-enabled version
- try {
- Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
- return (T)m.invoke(null, input);
- } catch (IllegalArgumentException iae) {
- throw iae;
- } catch (Exception e) {
- throw new IllegalArgumentException("Error creating Extractor for InputStream", e);
- }
- } else {
- // Best hope it's OLE2....
- return createExtractor(new POIFSFileSystem(input));
- }
- }
-
- private static Class<?> getOOXMLClass() {
- try {
- return OLE2ExtractorFactory.class.getClassLoader().loadClass(
- "org.apache.poi.extractor.ExtractorFactory"
- );
- } catch (ClassNotFoundException e) {
- LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
- return null;
- }
- }
- private static Class<?> getScratchpadClass() {
- try {
- return OLE2ExtractorFactory.class.getClassLoader().loadClass(
- "org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory"
- );
- } catch (ClassNotFoundException e) {
- LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
- throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory");
- }
- }
-
- /**
- * Create the Extractor, if possible. Generally needs the Scratchpad jar.
- * Note that this won't check for embedded OOXML resources either, use
- * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
- *
- * @param poifsDir The {@link DirectoryNode} pointing to a document.
- *
- * @return The resulting {@link POITextExtractor}, an exception is thrown if
- * no TextExtractor can be created for some reason.
- *
- * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
- * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
- * an unsupported version of Excel.
- * @throws IllegalArgumentException If creating the Extractor fails
- */
- public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
- // Look for certain entries in the stream, to figure it
- // out from
- for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
- if (poifsDir.hasEntry(workbookName)) {
- if (getPreferEventExtractor()) {
- return new EventBasedExcelExtractor(poifsDir);
- }
- return new ExcelExtractor(poifsDir);
- }
- }
- if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
- throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
- + "found. Please call OldExcelExtractor directly for basic text extraction");
- }
-
- // Ask Scratchpad, or fail trying
- Class<?> cls = getScratchpadClass();
- try {
- Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
- POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
- if (ext != null) return ext;
- } catch (IllegalArgumentException iae) {
- throw iae;
- } catch (Exception e) {
- throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
- }
-
- throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- *
- * @param ext The extractor to look at for embedded documents
- *
- * @return An array of resulting extractors. Empty if no embedded documents are found.
- *
- * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
- * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
- * an unsupported version of Excel.
- * @throws IllegalArgumentException If creating the Extractor fails
- */
- public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
- // All the embedded directories we spotted
- List<Entry> dirs = new ArrayList<>();
- // For anything else not directly held in as a POIFS directory
- List<InputStream> nonPOIFS = new ArrayList<>();
-
- // Find all the embedded directories
- DirectoryEntry root = ext.getRoot();
- if(root == null) {
- throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
- }
-
- if(ext instanceof ExcelExtractor) {
- // These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while(it.hasNext()) {
- Entry entry = it.next();
- if(entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
- } else {
- // Ask Scratchpad, or fail trying
- Class<?> cls = getScratchpadClass();
- try {
- Method m = cls.getDeclaredMethod(
- "identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
- m.invoke(null, ext, dirs, nonPOIFS);
- } catch (Exception e) {
- throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
- }
- }
-
- // Create the extractors
- if(dirs.size() == 0 && nonPOIFS.size() == 0){
- return new POITextExtractor[0];
- }
-
- ArrayList<POITextExtractor> e = new ArrayList<>();
- for (Entry dir : dirs) {
- e.add(createExtractor((DirectoryNode) dir
- ));
- }
- for (InputStream stream : nonPOIFS) {
- try {
- e.add(createExtractor(stream));
- } catch (Exception xe) {
- // Ignore, invalid format
- LOGGER.log(POILogger.WARN, xe);
- }
- }
- return e.toArray(new POITextExtractor[0]);
- }
-}
* org.apache.poi.[format].extractor .
*
* @see org.apache.poi.hssf.extractor.ExcelExtractor
- * @see org.apache.poi.hslf.extractor.PowerPointExtractor
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
* @see org.apache.poi.hwpf.extractor.WordExtractor
*/
-public abstract class POIOLE2TextExtractor extends POITextExtractor {
- /** The POIDocument that's open */
- protected POIDocument document;
-
- /**
- * Creates a new text extractor for the given document
- *
- * @param document The POIDocument to use in this extractor.
- */
- public POIOLE2TextExtractor(POIDocument document) {
- this.document = document;
-
- // Ensure any underlying resources, such as open files,
- // will get cleaned up if the user calls #close()
- setFilesystem(document);
- }
-
- /**
- * Creates a new text extractor, using the same
- * document as another text extractor. Normally
- * only used by properties extractors.
- *
- * @param otherExtractor the extractor which document to be used
- */
- protected POIOLE2TextExtractor(POIOLE2TextExtractor otherExtractor) {
- this.document = otherExtractor.document;
- }
-
+public interface POIOLE2TextExtractor extends POITextExtractor {
/**
* Returns the document information metadata for the document
*
* @return The Document Summary Information or null
* if it could not be read for this document.
*/
- public DocumentSummaryInformation getDocSummaryInformation() {
- return document.getDocumentSummaryInformation();
+ default DocumentSummaryInformation getDocSummaryInformation() {
+ return getDocument().getDocumentSummaryInformation();
}
+
/**
* Returns the summary information metadata for the document.
*
* @return The Summary information for the document or null
* if it could not be read for this document.
*/
- public SummaryInformation getSummaryInformation() {
- return document.getSummaryInformation();
+ default SummaryInformation getSummaryInformation() {
+ return getDocument().getSummaryInformation();
}
/**
* @return an instance of POIExtractor that can extract meta-data.
*/
@Override
- public POITextExtractor getMetadataTextExtractor() {
+ default POITextExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(this);
}
*
* @return the DirectoryEntry that is associated with the POIDocument of this extractor.
*/
- public DirectoryEntry getRoot() {
- return document.getDirectory();
+ default DirectoryEntry getRoot() {
+ return getDocument().getDirectory();
}
/**
* @return the underlying POIDocument
*/
@Override
- public POIDocument getDocument() {
- return document;
- }
+ POIDocument getDocument();
}
\ No newline at end of file
/**
* Common Parent for Text Extractors
- * of POI Documents.
+ * of POI Documents.
* You will typically find the implementation of
* a given format's text extractor under
* org.apache.poi.[format].extractor .
- *
+ *
* @see org.apache.poi.hssf.extractor.ExcelExtractor
- * @see org.apache.poi.hslf.extractor.PowerPointExtractor
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
* @see org.apache.poi.hwpf.extractor.WordExtractor
*/
-public abstract class POITextExtractor implements Closeable {
- private Closeable fsToClose;
-
+public interface POITextExtractor extends Closeable {
/**
* Retrieves all the text from the document.
* How cells, paragraphs etc are separated in the text
* a specific project for details.
* @return All the text from the document
*/
- public abstract String getText();
-
+ String getText();
+
/**
* Returns another text extractor, which is able to
* output the textual content of the document
* metadata / properties, such as author and title.
- *
+ *
* @return the metadata and text extractor
*/
- public abstract POITextExtractor getMetadataTextExtractor();
+ POITextExtractor getMetadataTextExtractor();
/**
- * Used to ensure file handle cleanup.
- *
- * @param fs filesystem to close
+ * @param doCloseFilesystem {@code true} (default), if underlying resources/filesystem should be
+ * closed on {@link #close()}
*/
- public void setFilesystem(Closeable fs) {
- fsToClose = fs;
- }
-
+ void setCloseFilesystem(boolean doCloseFilesystem);
+
+ /**
+ * @return {@code true}, if resources/filesystem should be closed on {@link #close()}
+ */
+ boolean isCloseFilesystem();
+
+ /**
+ * @return The underlying resources/filesystem
+ */
+ Closeable getFilesystem();
+
/**
* Frees the resources of the Extractor as soon as
* it is no longer needed. This may include closing
* open file handles and freeing memory.
- *
+ *
* The Extractor cannot be used after close has been called.
*/
@Override
- public void close() throws IOException {
- if(fsToClose != null) {
- fsToClose.close();
+ default void close() throws IOException {
+ Closeable fs = getFilesystem(); // implementations may return null when they hold no closeable resource
+ if (isCloseFilesystem() && fs != null) { // closing is skipped when disabled via setCloseFilesystem(false)
+ fs.close();
}
}
/**
* @return the processed document
*/
- public abstract Object getDocument();
+ Object getDocument();
}
package org.apache.poi.hpsf.extractor;
-import java.io.File;
-import java.io.IOException;
-
import org.apache.poi.POIDocument;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
* built-in and custom, returning them in
* textual form.
*/
-public class HPSFPropertiesExtractor extends POIOLE2TextExtractor {
+public class HPSFPropertiesExtractor implements POIOLE2TextExtractor {
+ private final POIDocument document;
+ private boolean doCloseFilesystem = true;
+
+ /**
+ * Creates a properties extractor that shares the document of another
+ * text extractor.
+ *
+ * The document stays owned by {@code mainExtractor}, so closing this
+ * extractor does not close it by default. Call
+ * {@link #setCloseFilesystem(boolean) setCloseFilesystem(true)} to opt in.
+ *
+ * @param mainExtractor the extractor whose document is to be used
+ */
public HPSFPropertiesExtractor(POIOLE2TextExtractor mainExtractor) {
- super(mainExtractor);
+ document = mainExtractor.getDocument();
+ // The removed base-class constructor only copied the document and never
+ // registered it for cleanup; keep close() from closing the shared
+ // document underneath the main extractor.
+ doCloseFilesystem = false;
}
- public HPSFPropertiesExtractor(POIDocument doc) {
- super(doc);
+
+ public HPSFPropertiesExtractor(POIDocument document) {
+ this.document = document;
}
+
public HPSFPropertiesExtractor(POIFSFileSystem fs) {
- super(new HPSFPropertiesOnlyDocument(fs));
+ document = new HPSFPropertiesOnlyDocument(fs);
}
public String getDocumentSummaryInformationText() {
}
private static String getPropertyValueText(Object val) {
- return (val == null)
+ return (val == null)
? "(not set)"
: PropertySet.getPropertyStringValue(val);
}
-
+
@Override
public boolean equals(Object o) {
return super.equals(o);
return super.hashCode();
}
- public static void main(String[] args) throws IOException {
- for (String file : args) {
- try (HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(
- new POIFSFileSystem(new File(file)))) {
- System.out.println(ext.getText());
- }
- }
+ @Override
+ public POIDocument getDocument() {
+ return document;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public POIDocument getFilesystem() { // the wrapped document itself is handed out as the closeable resource
+ return document;
}
}
package org.apache.poi.hssf.extractor;
+import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
-import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* To turn an Excel file into a CSV or similar, see
* the XLS2CSVmra example
* </p>
- *
+ *
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
*/
-public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
- private DirectoryNode _dir;
+public class EventBasedExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
+ private final POIFSFileSystem poifs;
+ private final DirectoryNode _dir;
+ private boolean doCloseFilesystem = true;
boolean _includeSheetNames = true;
boolean _formulasNotResults;
- public EventBasedExcelExtractor( DirectoryNode dir )
- {
- super( (POIDocument)null );
+ public EventBasedExcelExtractor(DirectoryNode dir) {
+ poifs = null;
_dir = dir;
}
public EventBasedExcelExtractor(POIFSFileSystem fs) {
- this(fs.getRoot());
- super.setFilesystem(fs);
+ poifs = fs;
+ _dir = fs.getRoot();
}
/**
* Would return the document information metadata for the document,
* if we supported it
*/
+ @Override
public DocumentSummaryInformation getDocSummaryInformation() {
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
}
* Would return the summary information metadata for the document,
* if we supported it
*/
+ @Override
public SummaryInformation getSummaryInformation() {
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
}
}
}
}
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public Closeable getFilesystem() { // null when constructed from a DirectoryNode; only the POIFSFileSystem constructor sets poifs
+ return poifs;
+ }
+
+ @Override
+ public POIDocument getDocument() { // always null: this extractor is built from a DirectoryNode or POIFSFileSystem, never a POIDocument
+ return null;
+ }
+
+ @Override
+ public DirectoryEntry getRoot() { // overrides the interface default, which dereferences getDocument() (null for this class)
+ return _dir;
+ }
}
* To turn an Excel file into a CSV or similar, see
* the XLS2CSVmra example
* </p>
- *
+ *
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
*/
-public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
+public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
private final HSSFWorkbook _wb;
private final HSSFDataFormatter _formatter;
+ private boolean doCloseFilesystem = true;
private boolean _includeSheetNames = true;
private boolean _shouldEvaluateFormulas = true;
private boolean _includeCellComments;
private boolean _includeHeadersFooters = true;
public ExcelExtractor(HSSFWorkbook wb) {
- super(wb);
_wb = wb;
_formatter = new HSSFDataFormatter();
}
+
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
}
+
public ExcelExtractor(DirectoryNode dir) throws IOException {
this(new HSSFWorkbook(dir, true));
}
/**
* Command line extractor.
- *
+ *
* @param args the command line parameters
- *
+ *
* @throws IOException if the file can't be read or contains errors
*/
public static void main(String[] args) throws IOException {
try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile());
HSSFWorkbook wb = new HSSFWorkbook(is);
- ExcelExtractor extractor = new ExcelExtractor(wb);
+ ExcelExtractor extractor = new ExcelExtractor(wb)
) {
extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames());
extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas());
* Should blank cells be output? Default is to only
* output cells that are present in the file and are
* non-blank.
- *
+ *
* @param includeBlankCells {@code true} if blank cells should be included
*/
public void setIncludeBlankCells(boolean includeBlankCells) {
return text.toString();
}
+
+ @Override
+ public HSSFWorkbook getDocument() {
+ return _wb;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public HSSFWorkbook getFilesystem() { // same workbook instance that getDocument() returns
+ return _wb;
+ }
}
import java.io.InputStream;
import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.CodepageRecord;
* by Apache Tika, but not really intended for display to the user.
* </p>
*/
-public class OldExcelExtractor implements Closeable {
+public class OldExcelExtractor implements POITextExtractor {
private final static int FILE_PASS_RECORD_SID = 0x2f;
//arbitrarily selected; may need to increase
}
}
- close();
ris = null;
return text.toString();
}
- @Override
- public void close() {
- // some cases require this close here
- if(toClose != null) {
- IOUtils.closeQuietly(toClose);
- toClose = null;
- }
- }
-
protected void handleNumericCell(StringBuilder text, double value) {
// TODO Need to fetch / use format strings
text.append(value);
text.append('\n');
}
+
+ @Override
+ public POITextExtractor getMetadataTextExtractor() { // no metadata extractor is provided here; callers must handle null
+ return null;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ // Intentionally a no-op: the flag is not stored; isCloseFilesystem() is derived from whether toClose is set.
+ }
+
+ @Override
+ public boolean isCloseFilesystem() { // derived, not configurable: true exactly when there is a resource to close
+ return toClose != null;
+ }
+
+ @Override
+ public Closeable getFilesystem() {
+ return toClose;
+ }
+
+ @Override
+ public Object getDocument() { // NOTE(review): ris is reset to null elsewhere in this class once text has been built, so this may return null
+ return ris;
+ }
}
public class SlideShowExtractor<
S extends Shape<S,P>,
P extends TextParagraph<S,P,? extends TextRun>
-> extends POITextExtractor {
+> implements POITextExtractor {
private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class);
// placeholder text for slide numbers
private static final String SLIDE_NUMBER_PH = "‹#›";
- private SlideShow<S,P> slideshow;
+ protected final SlideShow<S,P> slideshow;
private boolean slidesByDefault = true;
private boolean notesByDefault;
private boolean masterByDefault;
private Predicate<Object> filter = o -> true;
+ private boolean doCloseFilesystem = true;
public SlideShowExtractor(final SlideShow<S,P> slideshow) {
- setFilesystem(slideshow);
this.slideshow = slideshow;
}
* @return the opened document
*/
@Override
- public final Object getDocument() {
- return slideshow.getPersistDocument();
+ public SlideShow<S,P> getDocument() {
+ return slideshow;
}
/**
return raw;
}
- TextParagraph tp = tr.getParagraph();
- TextShape ps = (tp != null) ? tp.getParentShape() : null;
- Sheet sh = (ps != null) ? ps.getSheet() : null;
- String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide)sh).getSlideNumber() + 1) : "";
+ TextParagraph<?,?,?> tp = tr.getParagraph();
+ TextShape<?,?> ps = (tp != null) ? tp.getParentShape() : null;
+ Sheet<?,?> sh = (ps != null) ? ps.getSheet() : null;
+ String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide<?,?>)sh).getSlideNumber() + 1) : "";
return raw.replace(SLIDE_NUMBER_PH, slideNr);
}
private static String replaceTextCap(TextRun tr) {
- final TextParagraph tp = tr.getParagraph();
- final TextShape sh = (tp != null) ? tp.getParentShape() : null;
+ final TextParagraph<?,?,?> tp = tr.getParagraph();
+ final TextShape<?,?> sh = (tp != null) ? tp.getParentShape() : null;
final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null;
// 0xB acts like carriage return in page titles and like blank in the others
(italic == null || tr.isItalic() == italic) &&
(bold == null || tr.isBold() == bold);
}
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public SlideShow<S,P> getFilesystem() { // the slideshow returned by getDocument() is also exposed as the closeable resource
+ return getDocument();
+ }
}
/**
* Should sheet names be included?
* Default is true
- *
+ *
* @param includeSheetNames {@code true} if the sheet names should be included
*/
- public void setIncludeSheetNames(boolean includeSheetNames);
+ void setIncludeSheetNames(boolean includeSheetNames);
/**
* Should we return the formula itself, and not the result it produces?
* Default is false
- *
+ *
* @param formulasNotResults {@code true} if the formula itself is returned
*/
- public void setFormulasNotResults(boolean formulasNotResults);
+ void setFormulasNotResults(boolean formulasNotResults);
/**
* Should headers and footers be included in the output?
* Default is true
- *
+ *
* @param includeHeadersFooters {@code true} if headers and footers should be included
*/
- public void setIncludeHeadersFooters(boolean includeHeadersFooters);
+ void setIncludeHeadersFooters(boolean includeHeadersFooters);
/**
* Should cell comments be included?
* Default is false
- *
+ *
* @param includeCellComments {@code true} if cell comments should be included
*/
- public void setIncludeCellComments(boolean includeCellComments);
+ void setIncludeCellComments(boolean includeCellComments);
/**
* Retrieves the text contents of the file
- *
+ *
* @return the text contents of the file
*/
- public String getText();
+ String getText();
}
requires java.security.jgss;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
exports org.apache.poi.xwpf.extractor;
exports org.apache.poi.xwpf.usermodel;
requires java.security.jgss;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
exports org.apache.poi.xwpf.extractor;
exports org.apache.poi.xwpf.usermodel;
requires jdk.unsupported;
uses org.apache.poi.ss.usermodel.WorkbookProvider;
+ uses org.apache.poi.extractor.ExtractorProvider;
+
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory;
+
exports org.apache.poi;
exports org.apache.poi.common;
requires jdk.unsupported;
uses org.apache.poi.ss.usermodel.WorkbookProvider;
+ uses org.apache.poi.extractor.ExtractorProvider;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory;
exports org.apache.poi;
exports org.apache.poi.common;
requires java.desktop;
requires commons.math3;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory;
+
exports org.apache.poi.hmef;
exports org.apache.poi.hmef.dev;
exports org.apache.poi.hmef.extractor;
requires java.desktop;
requires commons.math3;
+ provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory;
+
exports org.apache.poi.hmef;
exports org.apache.poi.hmef.dev;
exports org.apache.poi.hmef.extractor;
import java.io.File;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor;
/**
* A command line wrapper around {@link ExtractorFactory}, useful
* when debugging.
*/
-public class CommandLineTextExtractor {
+public final class CommandLineTextExtractor {
public static final String DIVIDER = "=======================";
+ private CommandLineTextExtractor() {
+ }
+
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Use:");
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.ooxml.extractor;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.reflect.Method;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.poi.EncryptedDocumentException;
-import org.apache.poi.extractor.OLE2ExtractorFactory;
-import org.apache.poi.extractor.POIOLE2TextExtractor;
-import org.apache.poi.extractor.POITextExtractor;
-import org.apache.poi.hssf.extractor.ExcelExtractor;
-import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
-import org.apache.poi.openxml4j.opc.OPCPackage;
-import org.apache.poi.openxml4j.opc.PackageAccess;
-import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
-import org.apache.poi.poifs.crypt.Decryptor;
-import org.apache.poi.poifs.crypt.EncryptionInfo;
-import org.apache.poi.poifs.filesystem.DirectoryEntry;
-import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.FileMagic;
-import org.apache.poi.poifs.filesystem.NotOLE2FileException;
-import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.sl.extractor.SlideShowExtractor;
-import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.NotImplemented;
-import org.apache.poi.util.POILogFactory;
-import org.apache.poi.util.POILogger;
-import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
-import org.apache.poi.xslf.usermodel.XMLSlideShow;
-import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
-import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
-import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFRelation;
-import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
-import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.xmlbeans.XmlException;
-
-/**
- * Figures out the correct POITextExtractor for your supplied
- * document, and returns it.
- *
- * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
- * not present on the runtime classpath</p>
- * <p>Note 2 - rather than using this, for most cases you would be better
- * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
- */
-@SuppressWarnings("WeakerAccess")
-public final class ExtractorFactory {
- private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class);
-
- public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
- private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
- private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
-
- private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{
- XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
- XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
- XSLFRelation.PRESENTATION_MACRO
- };
-
- private ExtractorFactory() {
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is false.
- */
- public static boolean getThreadPrefersEventExtractors() {
- return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is to use the thread level setting, which defaults to false.
- */
- public static Boolean getAllThreadsPreferEventExtractors() {
- return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * Will only be used if the All Threads setting is null.
- */
- public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
- OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * If set, will take preference over the Thread level setting.
- */
- public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
- OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
- }
-
- /**
- * Should this thread use event based extractors is available?
- * Checks the all-threads one first, then thread specific.
- */
- public static boolean getPreferEventExtractor() {
- return OLE2ExtractorFactory.getPreferEventExtractor();
- }
-
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
- POIFSFileSystem fs = null;
- try {
- fs = new POIFSFileSystem(f);
- if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
- return (T)createEncryptedOOXMLExtractor(fs);
- }
- POITextExtractor extractor = createExtractor(fs);
- extractor.setFilesystem(fs);
- return (T)extractor;
- } catch (OfficeXmlFileException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- OPCPackage pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
- T t = (T)createExtractor(pkg);
- t.setFilesystem(pkg);
- return t;
- } catch (NotOLE2FileException ne) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne);
- } catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw e;
- }
- }
-
- public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException {
- InputStream is = FileMagic.prepareToCheckMagic(inp);
-
- FileMagic fm = FileMagic.valueOf(is);
-
- switch (fm) {
- case OLE2:
- POIFSFileSystem fs = new POIFSFileSystem(is);
- boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY);
- return isEncrypted ? createEncryptedOOXMLExtractor(fs) : createExtractor(fs);
- case OOXML:
- return createExtractor(OPCPackage.open(is));
- default:
- throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream, found type: " + fm);
- }
- }
-
- /**
- * Tries to determine the actual type of file and produces a matching text-extractor for it.
- *
- * @param pkg An {@link OPCPackage}.
- * @return A {@link POIXMLTextExtractor} for the given file.
- * @throws IOException If an error occurs while reading the file
- * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
- * @throws XmlException If an XML parsing error occurs.
- * @throws IllegalArgumentException If no matching file type could be found.
- */
- public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
- try {
- // Check for the normal Office core document
- PackageRelationshipCollection core;
- core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
-
- // If nothing was found, try some of the other OOXML-based core types
- if (core.size() == 0) {
- // Could it be an OOXML-Strict one?
- core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
- }
- if (core.size() == 0) {
- // Could it be a visio one?
- core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
- if (core.size() == 1)
- return new XDGFVisioExtractor(pkg);
- }
-
- // Should just be a single core document, complain if not
- if (core.size() != 1) {
- throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
- }
-
- // Grab the core document part, and try to identify from that
- final PackagePart corePart = pkg.getPart(core.getRelationship(0));
- final String contentType = corePart.getContentType();
-
- // Is it XSSF?
- for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- if (getPreferEventExtractor()) {
- return new XSSFEventBasedExcelExtractor(pkg);
- }
- return new XSSFExcelExtractor(pkg);
- }
- }
-
- // Is it XWPF?
- for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- return new XWPFWordExtractor(pkg);
- }
- }
-
- // Is it XSLF?
- for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- return new SlideShowExtractor<>(new XMLSlideShow(pkg));
- }
- }
-
- // special handling for SlideShow-Theme-files,
- if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
- return new SlideShowExtractor<>(new XMLSlideShow(pkg));
- }
-
- // How about xlsb?
- for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
- if (rel.getContentType().equals(contentType)) {
- return new XSSFBEventBasedExcelExtractor(pkg);
- }
- }
-
- throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")");
-
- } catch (IOException | Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
- throw e;
- }
- }
-
- public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return createExtractor(fs.getRoot());
- }
-
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
- {
- // First, check for OOXML
- for (String entryName : poifsDir.getEntryNames()) {
- if (entryName.equals("Package")) {
- OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
- return (T)createExtractor(pkg);
- }
- }
-
- // If not, ask the OLE2 code to check, with Scratchpad if possible
- return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
- // All the embedded directories we spotted
- ArrayList<Entry> dirs = new ArrayList<>();
- // For anything else not directly held in as a POIFS directory
- ArrayList<InputStream> nonPOIFS = new ArrayList<>();
-
- // Find all the embedded directories
- DirectoryEntry root = ext.getRoot();
- if (root == null) {
- throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
- }
-
- // provide ExcelExtractor also in OOXML module, because scratchpad is not necessary for it
- if (ext instanceof ExcelExtractor) {
- // These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while (it.hasNext()) {
- Entry entry = it.next();
- if (entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
- } else {
- try {
- Class<?> clazz = Class.forName("org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory");
- Method m = clazz.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
- m.invoke(null, ext, dirs, nonPOIFS);
- } catch (ReflectiveOperationException e) {
- logger.log(POILogger.WARN, "POI Scratchpad jar not included ", e.getLocalizedMessage());
- return new POITextExtractor[0];
- }
- }
-
- // Create the extractors
- if (dirs.size() == 0 && nonPOIFS.size() == 0){
- return new POITextExtractor[0];
- }
-
- ArrayList<POITextExtractor> textExtractors = new ArrayList<>();
- for (Entry dir : dirs) {
- textExtractors.add(createExtractor((DirectoryNode) dir));
- }
- for (InputStream nonPOIF : nonPOIFS) {
- try {
- textExtractors.add(createExtractor(nonPOIF));
- } catch (IllegalArgumentException e) {
- // Ignore, just means it didn't contain
- // a format we support as yet
- logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
- } catch (XmlException | OpenXML4JException e) {
- throw new IOException(e.getMessage(), e);
- }
- }
- return textExtractors.toArray(new POITextExtractor[0]);
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- @NotImplemented
- @SuppressWarnings({"UnusedParameters", "UnusedReturnValue"})
- public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) {
- throw new IllegalStateException("Not yet supported");
- }
-
- private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs)
- throws IOException {
- String pass = Biff8EncryptionKey.getCurrentUserPassword();
- if (pass == null) {
- pass = Decryptor.DEFAULT_PASSWORD;
- }
-
- EncryptionInfo ei = new EncryptionInfo(fs);
- Decryptor dec = ei.getDecryptor();
- InputStream is = null;
- try {
- if (!dec.verifyPassword(pass)) {
- throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor");
- }
- is = dec.getDataStream(fs);
- return createExtractor(OPCPackage.open(is));
- } catch (IOException e) {
- throw e;
- } catch (Exception e) {
- throw new EncryptedDocumentException(e);
- } finally {
- IOUtils.closeQuietly(is);
-
- // also close the POIFSFileSystem here as we read all the data
- // while decrypting
- fs.close();
- }
- }
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.ooxml.extractor;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.extractor.ExtractorProvider;
+import org.apache.poi.extractor.POITextExtractor;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.poi.poifs.crypt.Decryptor;
+import org.apache.poi.poifs.crypt.EncryptionInfo;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
+import org.apache.poi.xslf.extractor.XSLFExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.xmlbeans.XmlException;
+
+/**
+ * Figures out the correct POITextExtractor for your supplied
+ * document, and returns it.
+ *
+ * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
+ * not present on the runtime classpath</p>
+ * <p>Note 2 - rather than using this, for most cases you would be better
+ * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
+ */
+@SuppressWarnings("WeakerAccess")
+public final class POIXMLExtractorFactory implements ExtractorProvider {
+ // Relationship types that are probed one after the other to find the core document part of a package
+ private static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
+ private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
+ private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
+
+ // Relations whose content type identifies the core part as a presentation for the XSLFExtractor
+ private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{
+ XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
+ XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
+ XSLFRelation.PRESENTATION_MACRO
+ };
+
+ /**
+ * Reports whether this provider is responsible for the given file magic.
+ *
+ * @param fm the file magic to check
+ * @return {@code true} only if the magic is {@link FileMagic#OOXML}
+ */
+ @Override
+ public boolean accepts(FileMagic fm) {
+ return fm == FileMagic.OOXML;
+ }
+
+ /**
+ * Should this thread prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is false.
+ *
+ * <p>This is a convenience accessor, the setting itself is held by {@link ExtractorFactory}.</p>
+ *
+ * @return the value of {@link ExtractorFactory#getThreadPrefersEventExtractors()}
+ */
+ public static boolean getThreadPrefersEventExtractors() {
+ return ExtractorFactory.getThreadPrefersEventExtractors();
+ }
+
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is to use the thread level setting, which defaults to false.
+ *
+ * <p>This is a convenience accessor, the setting itself is held by {@link ExtractorFactory}.</p>
+ *
+ * @return the value of {@link ExtractorFactory#getAllThreadsPreferEventExtractors()}
+ */
+ public static Boolean getAllThreadsPreferEventExtractors() {
+ return ExtractorFactory.getAllThreadsPreferEventExtractors();
+ }
+
+ /**
+ * Should this thread prefer event based over usermodel based extractors?
+ * Will only be used if the All Threads setting is null.
+ *
+ * <p>The value is forwarded to {@link ExtractorFactory}, nothing is stored in this class.</p>
+ *
+ * @param preferEventExtractors the new thread level preference
+ */
+ public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
+ ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
+ }
+
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * If set, will take preference over the Thread level setting.
+ *
+ * <p>The value is forwarded to {@link ExtractorFactory}, nothing is stored in this class.</p>
+ *
+ * @param preferEventExtractors the new preference for all threads
+ */
+ public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
+ ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
+ }
+
+ /**
+ * Should this thread use event based extractors if available?
+ * Checks the all-threads one first, then thread specific.
+ *
+ * <p>The decision is taken by {@link ExtractorFactory#getPreferEventExtractor()}.</p>
+ *
+ * @return {@code true} if event based extractors should be preferred
+ */
+ public static boolean getPreferEventExtractor() {
+ return ExtractorFactory.getPreferEventExtractor();
+ }
+
+ /**
+  * Creates a text extractor for the given file.
+  *
+  * <p>Files which are not detected as OOXML are handed over to
+  * {@link ExtractorFactory#createExtractor(File, String)}. OOXML files are opened
+  * read-only and the package is reverted again, if no extractor could be created
+  * for it.</p>
+  *
+  * @param f the file to extract from
+  * @param password the password, which is only passed on for non-OOXML files
+  * @return the extractor or {@code null} if the type of the core document isn't supported
+  * @throws IOException if the file can't be read or isn't a valid package
+  */
+ @Override
+ public POITextExtractor create(File f, String password) throws IOException {
+     if (FileMagic.valueOf(f) != FileMagic.OOXML) {
+         return ExtractorFactory.createExtractor(f, password);
+     }
+
+     OPCPackage pkg = null;
+     try {
+         pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
+         POIXMLTextExtractor ex = create(pkg);
+         if (ex == null) {
+             pkg.revert();
+         }
+         return ex;
+     } catch (InvalidFormatException ife) {
+         throw new IOException(ife);
+     } catch (RuntimeException | IOException e) {
+         // pkg is still null if opening itself failed, hence the guard - this
+         // is the same cleanup as in the InputStream based variant
+         if (pkg != null) {
+             pkg.revert();
+         }
+         throw e;
+     }
+ }
+
+ /**
+  * Creates a text extractor for the given stream.
+  *
+  * <p>The stream is prepared for a magic check first. Anything which isn't OOXML
+  * is handed over to {@link ExtractorFactory#createExtractor(InputStream, String)},
+  * otherwise the stream is opened as a package. The package is reverted if no
+  * extractor could be created or if creating it failed.</p>
+  *
+  * @param inp the stream to extract from
+  * @param password the password, which is only passed on for non-OOXML content
+  * @return the extractor or {@code null} if the type of the core document isn't supported
+  * @throws IOException if the stream can't be read or isn't a valid package
+  */
+ // NOTE(review): unlike the File and DirectoryNode variants there is no @Override here -
+ // confirm whether ExtractorProvider declares this signature
+ public POITextExtractor create(InputStream inp, String password) throws IOException {
+     final InputStream checkable = FileMagic.prepareToCheckMagic(inp);
+
+     final boolean isOoxml = FileMagic.valueOf(checkable) == FileMagic.OOXML;
+     if (!isOoxml) {
+         return ExtractorFactory.createExtractor(checkable, password);
+     }
+
+     OPCPackage opened = null;
+     try {
+         opened = OPCPackage.open(checkable);
+         final POIXMLTextExtractor extractor = create(opened);
+         if (extractor != null) {
+             return extractor;
+         }
+         // unsupported core document - release the package and report "nothing found"
+         opened.revert();
+         return null;
+     } catch (InvalidFormatException e) {
+         throw new IOException(e);
+     } catch (RuntimeException | IOException e) {
+         if (opened != null) {
+             opened.revert();
+         }
+         throw e;
+     }
+ }
+
+ /**
+ * Tries to determine the actual type of file and produces a matching text-extractor for it.
+ *
+ * <p>The core document relationship is looked up first, followed by the strict and the
+ * Visio one. The content type of the core part is then matched against the XSSF, XWPF,
+ * XSLF (including theme files) and XLSB types, in that order.</p>
+ *
+ * @param pkg An {@link OPCPackage}.
+ * @return A {@link POIXMLTextExtractor} for the given file, or {@code null} if the
+ * content type of the core document matches none of the supported types.
+ * @throws IOException If an error occurs while reading the file. Errors and runtime
+ * exceptions are wrapped into an IOException too - this includes the
+ * IllegalArgumentException raised when the package doesn't contain exactly one
+ * core document.
+ */
+ public POIXMLTextExtractor create(OPCPackage pkg) throws IOException {
+ try {
+ // Check for the normal Office core document
+ PackageRelationshipCollection core;
+ core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
+
+ // If nothing was found, try some of the other OOXML-based core types
+ if (core.size() == 0) {
+ // Could it be an OOXML-Strict one?
+ core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
+ }
+ if (core.size() == 0) {
+ // Could it be a visio one?
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+ if (core.size() == 1) {
+ return new XDGFVisioExtractor(pkg);
+ }
+ }
+
+ // Should just be a single core document, complain if not
+ if (core.size() != 1) {
+ throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
+ }
+
+ // Grab the core document part, and try to identify from that
+ final PackagePart corePart = pkg.getPart(core.getRelationship(0));
+ final String contentType = corePart.getContentType();
+
+ // Is it XSSF?
+ for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
+ if (rel.getContentType().equals(contentType)) {
+ if (getPreferEventExtractor()) {
+ return new XSSFEventBasedExcelExtractor(pkg);
+ }
+ return new XSSFExcelExtractor(pkg);
+ }
+ }
+
+ // Is it XWPF?
+ for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
+ if (rel.getContentType().equals(contentType)) {
+ return new XWPFWordExtractor(pkg);
+ }
+ }
+
+ // Is it XSLF?
+ for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) {
+ if (rel.getContentType().equals(contentType)) {
+ return new XSLFExtractor(new XMLSlideShow(pkg));
+ }
+ }
+
+ // special handling for SlideShow-Theme-files,
+ if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
+ return new XSLFExtractor(new XMLSlideShow(pkg));
+ }
+
+ // How about xlsb?
+ for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
+ if (rel.getContentType().equals(contentType)) {
+ return new XSSFBEventBasedExcelExtractor(pkg);
+ }
+ }
+
+ // nothing matched - the callers revert the package in this case
+ return null;
+ } catch (IOException e) {
+ throw e;
+ } catch (Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
+ throw new IOException(e);
+ }
+ // we used to close (revert()) the package here, but this is the callers responsibility
+ // and we can't reuse the package
+ }
+
+ /**
+ * Creates a text extractor for an OOXML package which is wrapped in an OLE2 filesystem.
+ *
+ * <p>Uses the root node of the filesystem and the password currently set via
+ * {@link Biff8EncryptionKey#getCurrentUserPassword()}.</p>
+ *
+ * @param fs the filesystem containing the package
+ * @return the extractor as returned by {@link #create(DirectoryNode, String)}
+ * @throws IOException if the package can't be found, decrypted or read
+ */
+ public POITextExtractor create(POIFSFileSystem fs) throws IOException {
+ return create(fs.getRoot(), Biff8EncryptionKey.getCurrentUserPassword());
+ }
+
+ /**
+  * Creates a text extractor for an OOXML package which is stored inside an OLE2 directory.
+  *
+  * <p>A plain {@code "Package"} entry is read as it is. Otherwise the entry named by
+  * {@link Decryptor#DEFAULT_POIFS_ENTRY} is decrypted with the given password and the
+  * decrypted stream is read instead.</p>
+  *
+  * @param poifsDir the directory to look into
+  * @param password the password used to verify and decrypt an encrypted package
+  * @return the extractor as returned by {@link #create(InputStream, String)}
+  * @throws IOException if neither entry exists, the password is invalid or
+  *      reading/decrypting fails - other exceptions are wrapped into an IOException
+  */
+ @Override
+ public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
+     // the unencrypted package wins, if there is one
+     if (poifsDir.hasEntry("Package")) {
+         try (InputStream plain = poifsDir.createDocumentInputStream("Package")) {
+             return create(plain, password);
+         }
+     }
+
+     if (!poifsDir.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
+         throw new IOException("The OLE2 file neither contained a plain OOXML package node (\"Package\") nor an encrypted one (\"EncryptedPackage\").");
+     }
+
+     final Decryptor decryptor = new EncryptionInfo(poifsDir).getDecryptor();
+     try {
+         if (decryptor.verifyPassword(password)) {
+             try (InputStream decrypted = decryptor.getDataStream(poifsDir)) {
+                 return create(decrypted, password);
+             }
+         }
+         throw new IOException("Invalid password specified");
+     } catch (IOException e) {
+         throw e;
+     } catch (Exception e) {
+         throw new IOException(e);
+     }
+ }
+}
* content of the OOXML file properties, eg author
* and title.
*/
-public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor {
-
+public class POIXMLPropertiesTextExtractor implements POIXMLTextExtractor {
+ private final POIXMLDocument doc;
private final DateFormat dateFormat;
+ private boolean doCloseFilesystem = true;
/**
* Creates a new POIXMLPropertiesTextExtractor for the given open document.
* @param doc the given open document
*/
public POIXMLPropertiesTextExtractor(POIXMLDocument doc) {
- super(doc);
+ this.doc = doc;
DateFormatSymbols dfs = DateFormatSymbols.getInstance(Locale.ROOT);
dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", dfs);
dateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC);
}
/*else if (property.isSetArray()) {
- // TODO Fetch the array values and output
+ // TODO Fetch the array values and output
}
else if (property.isSetVector()) {
// TODO Fetch the vector values and output
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
}
+
+ @Override
+ public POIXMLDocument getDocument() {
+ return doc;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public POIXMLDocument getFilesystem() {
+ return null;
+ }
}
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.util.ZipSecureFile;
-public abstract class POIXMLTextExtractor extends POITextExtractor {
- /** The POIXMLDocument that's open */
- private final POIXMLDocument _document;
-
- /**
- * Creates a new text extractor for the given document
- *
- * @param document the document to extract from
- */
- public POIXMLTextExtractor(POIXMLDocument document) {
- _document = document;
- }
-
+public interface POIXMLTextExtractor extends POITextExtractor {
/**
* Returns the core document properties
- *
+ *
* @return the core document properties
*/
- public CoreProperties getCoreProperties() {
- return _document.getProperties().getCoreProperties();
+ default CoreProperties getCoreProperties() {
+ return getDocument().getProperties().getCoreProperties();
}
/**
* Returns the extended document properties
- *
+ *
* @return the extended document properties
*/
- public ExtendedProperties getExtendedProperties() {
- return _document.getProperties().getExtendedProperties();
+ default ExtendedProperties getExtendedProperties() {
+ return getDocument().getProperties().getExtendedProperties();
}
/**
* Returns the custom document properties
- *
+ *
* @return the custom document properties
*/
- public CustomProperties getCustomProperties() {
- return _document.getProperties().getCustomProperties();
+ default CustomProperties getCustomProperties() {
+ return getDocument().getProperties().getCustomProperties();
}
/**
* Returns opened document
- *
+ *
* @return the opened document
*/
@Override
- public final POIXMLDocument getDocument() {
- return _document;
- }
+ POIXMLDocument getDocument();
/**
* Returns the opened OPCPackage that contains the document
- *
+ *
* @return the opened OPCPackage
*/
- public OPCPackage getPackage() {
- return _document.getPackage();
+ default OPCPackage getPackage() {
+ POIXMLDocument doc = getDocument();
+ return doc != null ? doc.getPackage() : null;
}
/**
* document properties metadata, such as title and author.
*/
@Override
- public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
- return new POIXMLPropertiesTextExtractor(_document);
+ default POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
+ return new POIXMLPropertiesTextExtractor(getDocument());
}
@Override
- public void close() throws IOException {
+ default void close() throws IOException {
// e.g. XSSFEventBaseExcelExtractor passes a null-document
- if(_document != null) {
+ if (isCloseFilesystem()) {
@SuppressWarnings("resource")
- OPCPackage pkg = _document.getPackage();
- if(pkg != null) {
+ OPCPackage pkg = getPackage();
+ if (pkg != null) {
// revert the package to not re-write the file, which is very likely not wanted for a TextExtractor!
pkg.revert();
}
}
- super.close();
}
- protected void checkMaxTextSize(CharSequence text, String string) {
+ default void checkMaxTextSize(CharSequence text, String string) {
if(string == null) {
return;
}
import java.io.IOException;
-import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xdgf.usermodel.XDGFPage;
/**
* Helper class to extract text from an OOXML Visio File
*/
-public class XDGFVisioExtractor extends POIXMLTextExtractor {
+public class XDGFVisioExtractor implements POIXMLTextExtractor {
protected final XmlVisioDocument document;
-
+ private boolean doCloseFilesystem = true;
+
public XDGFVisioExtractor(XmlVisioDocument document) {
- super(document);
this.document = document;
}
public String getText() {
ShapeTextVisitor visitor = new ShapeTextVisitor();
-
+
for (XDGFPage page: document.getPages()) {
page.getContent().visitShapes(visitor);
}
-
+
return visitor.getText();
}
-
- public static void main(String [] args) throws IOException {
- if (args.length < 1) {
- System.err.println("Use:");
- System.err.println(" XDGFVisioExtractor <filename.vsdx>");
- System.exit(1);
- }
- POIXMLTextExtractor extractor =
- new XDGFVisioExtractor(POIXMLDocument.openPackage(
- args[0]
- ));
- System.out.println(extractor.getText());
- extractor.close();
+
+ @Override
+ public XmlVisioDocument getDocument() {
+ return document;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public XmlVisioDocument getFilesystem() {
+ return document;
}
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xslf.extractor;
+
+import org.apache.poi.ooxml.extractor.POIXMLPropertiesTextExtractor;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
+import org.apache.poi.sl.extractor.SlideShowExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFShape;
+import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
+
+
+/**
+ * Helper class to extract text from an OOXML Powerpoint file.
+ *
+ * <p>The extraction itself is inherited from {@link SlideShowExtractor}, this class
+ * adds the {@link POIXMLTextExtractor} view on the slideshow.</p>
+ */
+public class XSLFExtractor extends SlideShowExtractor<XSLFShape, XSLFTextParagraph> implements POIXMLTextExtractor {
+ /**
+ * Creates an extractor for the given slideshow.
+ *
+ * @param slideshow the slideshow to extract from
+ */
+ public XSLFExtractor(XMLSlideShow slideshow) {
+ super(slideshow);
+ }
+
+ /**
+ * Returns the slideshow field of the superclass as {@link XMLSlideShow}.
+ *
+ * @return the slideshow - presumably the instance handed to the constructor
+ */
+ @Override
+ public XMLSlideShow getDocument() {
+ return (XMLSlideShow)slideshow;
+ }
+
+ /**
+ * Returns the metadata extractor of {@link POIXMLTextExtractor}.
+ *
+ * @return the extractor provided by the default method of the interface
+ */
+ // NOTE(review): presumably needed to pick the interface default over the
+ // implementation inherited from SlideShowExtractor - confirm against the superclass
+ @Override
+ public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
+ return POIXMLTextExtractor.super.getMetadataTextExtractor();
+ }
+}
import java.io.IOException;
import java.io.InputStream;
-import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.DataFormatter;
*
* @since 3.16-beta3
*/
-public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor
- implements org.apache.poi.ss.extractor.ExcelExtractor {
+public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor {
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFBEventBasedExcelExtractor.class);
super(container);
}
- public static void main(String[] args) throws Exception {
- if (args.length < 1) {
- System.err.println("Use:");
- System.err.println(" XSSFBEventBasedExcelExtractor <filename.xlsb>");
- System.exit(1);
- }
- POIXMLTextExtractor extractor =
- new XSSFBEventBasedExcelExtractor(args[0]);
- System.out.println(extractor.getText());
- extractor.close();
- }
-
public void setHandleHyperlinksInCells(boolean handleHyperlinksInCells) {
this.handleHyperlinksInCells = handleHyperlinksInCells;
}
import javax.xml.parsers.ParserConfigurationException;
+import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.POIXMLProperties.CoreProperties;
import org.apache.poi.ooxml.POIXMLProperties.CustomProperties;
* Implementation of a text extractor from OOXML Excel
* files that uses SAX event based parsing.
*/
-public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
- implements org.apache.poi.ss.extractor.ExcelExtractor {
+public class XSSFEventBasedExcelExtractor
+ implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFEventBasedExcelExtractor.class);
- protected OPCPackage container;
- protected POIXMLProperties properties;
+ protected final OPCPackage container;
+ protected final POIXMLProperties properties;
protected Locale locale;
protected boolean includeTextBoxes = true;
protected boolean formulasNotResults;
protected boolean concatenatePhoneticRuns = true;
+ private boolean doCloseFilesystem = true;
+
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(OPCPackage.open(path));
}
public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
- super(null);
this.container = container;
-
properties = new POIXMLProperties(container);
}
- public static void main(String[] args) throws Exception {
- if (args.length < 1) {
- System.err.println("Use:");
- System.err.println(" XSSFEventBasedExcelExtractor <filename.xlsx>");
- System.exit(1);
- }
- POIXMLTextExtractor extractor =
- new XSSFEventBasedExcelExtractor(args[0]);
- System.out.println(extractor.getText());
- extractor.close();
- }
-
/**
* Should sheet names be included? Default is true
*/
}
@Override
- public void close() throws IOException {
- if (container != null) {
- container.close();
- container = null;
- }
- super.close();
+ public POIXMLDocument getDocument() {
+ return null;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public OPCPackage getFilesystem() {
+ return container;
}
protected class SheetTextExtractor implements SheetContentsHandler {
import java.util.Iterator;
import java.util.Locale;
-import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
/**
* Helper class to extract text from an OOXML Excel file
*/
-public class XSSFExcelExtractor extends POIXMLTextExtractor
- implements org.apache.poi.ss.extractor.ExcelExtractor {
+public class XSSFExcelExtractor
+ implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
public static final XSSFRelation[] SUPPORTED_TYPES = new XSSFRelation[] {
XSSFRelation.WORKBOOK, XSSFRelation.MACRO_TEMPLATE_WORKBOOK,
XSSFRelation.MACRO_ADDIN_WORKBOOK, XSSFRelation.TEMPLATE_WORKBOOK,
};
private Locale locale;
- private XSSFWorkbook workbook;
+ private final XSSFWorkbook workbook;
private boolean includeSheetNames = true;
private boolean formulasNotResults;
private boolean includeCellComments;
private boolean includeHeadersFooters = true;
private boolean includeTextBoxes = true;
+ private boolean doCloseFilesystem = true;
public XSSFExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
this(new XSSFWorkbook(container));
}
public XSSFExcelExtractor(XSSFWorkbook workbook) {
- super(workbook);
this.workbook = workbook;
}
- public static void main(String[] args) throws Exception {
- if(args.length < 1) {
- System.err.println("Use:");
- System.err.println(" XSSFExcelExtractor <filename.xlsx>");
- System.exit(1);
- }
-
- try (OPCPackage pkg = OPCPackage.create(args[0]);
- POIXMLTextExtractor extractor = new XSSFExcelExtractor(pkg)) {
- System.out.println(extractor.getText());
- }
- }
-
/**
* Should sheet names be included? Default is true
*/
}
text.append("\n");
}
-
+
// add textboxes
if (includeTextBoxes){
XSSFDrawing drawing = sheet.getDrawingPatriarch();
private String extractHeaderFooter(HeaderFooter hf) {
return ExcelExtractor._extractHeaderFooter(hf);
}
+
+ @Override
+ public XSSFWorkbook getDocument() {
+ return workbook;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public XSSFWorkbook getFilesystem() {
+ return workbook;
+ }
}
import java.io.IOException;
import java.util.List;
-import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
-import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
/**
* Helper class to extract text from an OOXML Word file
*/
-public class XWPFWordExtractor extends POIXMLTextExtractor {
+public class XWPFWordExtractor implements POIXMLTextExtractor {
public static final XWPFRelation[] SUPPORTED_TYPES = {
XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE,
XWPFRelation.MACRO_DOCUMENT,
XWPFRelation.MACRO_TEMPLATE_DOCUMENT
};
- private XWPFDocument document;
+ private final XWPFDocument document;
private boolean fetchHyperlinks;
private boolean concatenatePhoneticRuns = true;
+ private boolean doCloseFilesystem = true;
- public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+ public XWPFWordExtractor(OPCPackage container) throws IOException {
this(new XWPFDocument(container));
}
public XWPFWordExtractor(XWPFDocument document) {
- super(document);
this.document = document;
}
- public static void main(String[] args) throws Exception {
- if (args.length < 1) {
- System.err.println("Use:");
- System.err.println(" XWPFWordExtractor <filename.docx>");
- System.exit(1);
- }
- POIXMLTextExtractor extractor =
- new XWPFWordExtractor(POIXMLDocument.openPackage(
- args[0]
- ));
- System.out.println(extractor.getText());
- extractor.close();
- }
-
/**
* Should we also fetch the hyperlinks, when fetching
* the text content? Default is to only output the
text.append(hfPolicy.getDefaultHeader().getText());
}
}
+
+ @Override
+ public XWPFDocument getDocument() {
+ return document;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ @Override
+ public XWPFDocument getFilesystem() {
+ return document;
+ }
}
import org.apache.poi.POIDataSamples;
import org.apache.poi.UnsupportedFileFormatException;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples;
-import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
-import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
+import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.NotOLE2FileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.xmlbeans.XmlException;
+import org.junit.Rule;
import org.junit.Test;
+import org.junit.rules.ExpectedException;
/**
* Test that the extractor factory plays nicely
private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
private static File pub = getFileAndCheck(pubTests, "Simple.pub");
+ private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory();
+
private static File getFileAndCheck(POIDataSamples samples, String name) {
File file = samples.getFile(name);
"Word 6", doc6, "Word6Extractor", 20,
"Word 95", doc95, "Word6Extractor", 120,
"PowerPoint", ppt, "SlideShowExtractor", 120,
- "PowerPoint - pptx", pptx, "SlideShowExtractor", 120,
+ "PowerPoint - pptx", pptx, "XSLFExtractor", 120,
"Visio", vsd, "VisioTextExtractor", 50,
"Visio - vsdx", vsdx, "XDGFVisioExtractor", 20,
"Publisher", pub, "PublisherTextExtractor", 50,
R apply(T t) throws IOException, OpenXML4JException, XmlException;
}
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
@Test
public void testFile() throws Exception {
}
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testFileInvalid() throws Exception {
+ thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN");
+ thrown.expect(IOException.class);
// Text
- try (POITextExtractor ignored = ExtractorFactory.createExtractor(txt)) {
- fail("extracting from invalid package");
- }
+ ExtractorFactory.createExtractor(txt);
}
@Test
testStream(ExtractorFactory::createExtractor, true);
}
- @Test(expected = IllegalArgumentException.class)
+ @Test
public void testInputStreamInvalid() throws Exception {
+ thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN");
+ thrown.expect(IOException.class);
testInvalid(ExtractorFactory::createExtractor);
}
testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
}
- @Test(expected = IOException.class)
+ @Test
public void testPOIFSInvalid() throws Exception {
+ thrown.expectMessage("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0");
+ thrown.expect(NotOLE2FileException.class);
testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
}
POITextExtractor ignored = poifs.apply(fis)) {
fail("extracting from invalid package");
} catch (IllegalArgumentException e) {
- assertTrue("Had: " + e,
- e.getMessage().contains(FileMagic.UNKNOWN.name()));
-
+ assertTrue("Had: " + e, e.getMessage().contains(FileMagic.UNKNOWN.name()));
throw e;
}
}
}
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
- final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
+ final POITextExtractor ext = xmlFactory.create(pkg)) {
testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
pkg.revert();
}
public void testPackageInvalid() throws Exception {
// Text
try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
- final POITextExtractor ignored = ExtractorFactory.createExtractor(pkg)) {
+ final POITextExtractor ignored = xmlFactory.create(pkg)) {
fail("extracting from invalid package");
}
}
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
+ try {
+ // Check we get the right extractors now
+ try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
+ assertTrue(extractor instanceof EventBasedExcelExtractor);
+ }
+ try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
+ assertTrue(extractor.getText().length() > 200);
+ }
+
+ try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
+ assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
+ }
+
+ try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
+ assertTrue(extractor.getText().length() > 200);
+ }
+ } finally {
+ // Put back to normal
+ ExtractorFactory.setThreadPrefersEventExtractors(false);
+ }
- // Check we get the right extractors now
- POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor
- instanceof EventBasedExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
-
- // Put back to normal
- ExtractorFactory.setThreadPrefersEventExtractors(false);
assertFalse(ExtractorFactory.getPreferEventExtractor());
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
// And back
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor
- instanceof ExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
+ try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
+ assertTrue(extractor instanceof ExcelExtractor);
+ }
+ try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
+ assertTrue(extractor.getText().length() > 200);
+ }
+
+ try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
+ assertTrue(extractor instanceof XSSFExcelExtractor);
+ }
+ try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) {
+ assertTrue(extractor.getText().length() > 200);
+ }
}
/**
};
for (int i=0; i<testObj.length; i+=3) {
- try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
+ try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor)ExtractorFactory.createExtractor((File)testObj[i+1])) {
final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
"spreadsheet/WithChartSheet.xlsx",
"spreadsheet/chart_sheet.xlsx",
};
-
+
@Test
public void testFileLeak() {
- // run a number of files that might fail in order to catch
+ // run a number of files that might fail in order to catch
// leaked file resources when using file-leak-detector while
// running the test
-
+
for(String file : EXPECTED_FAILURES) {
try {
ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
}
}
}
-
+
/**
- * #59074 - Excel 95 files should give a helpful message, not just
+ * #59074 - Excel 95 files should give a helpful message, not just
* "No supported documents found in the OLE2 stream"
*/
- @Test(expected = OldExcelFormatException.class)
+ @Test
public void bug59074() throws Exception {
- ExtractorFactory.createExtractor(
- POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
+ // The sample is no longer expected to be rejected with OldExcelFormatException:
+ // an extractor must be returned and its text must contain "testdoc".
+ try (POITextExtractor extractor = ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"))) {
+ String text = extractor.getText();
+ assertContains(text, "testdoc");
+ }
}
@Test(expected = IllegalStateException.class)
- public void testGetEmbeddedFromXMLExtractor() {
+ public void testGetEmbeddedFromXMLExtractor() throws IOException {
// currently not implemented
- ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
+ ExtractorFactory.getEmbeddedDocsTextExtractors(null);
}
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
import org.apache.poi.POIDataSamples;
import org.apache.poi.POITestCase;
import org.apache.poi.UnsupportedFileFormatException;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.ooxml.POIXMLException;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.ooxml.util.DocumentHelper;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@Test
public void testZipEntityExpansionExceedsMemory() throws IOException, OpenXML4JException, XmlException {
- expectedEx.expect(POIXMLException.class);
+ expectedEx.expect(IOException.class);
expectedEx.expectMessage("unable to parse shared strings table");
expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than"));
openXmlBombFile("poc-xmlbomb.xlsx");
@Test
public void testZipEntityExpansionExceedsMemory2() throws IOException, OpenXML4JException, XmlException {
- expectedEx.expect(POIXMLException.class);
+ expectedEx.expect(IOException.class);
expectedEx.expectMessage("unable to parse shared strings table");
expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than"));
openXmlBombFile("poc-xmlbomb-empty.xlsx");
import org.apache.poi.POIDataSamples;
import org.apache.poi.POIDocument;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.crypt.cryptoapi.CryptoAPIEncryptionHeader;
import org.apache.poi.poifs.storage.RawDataUtil;
-import org.apache.xmlbeans.XmlException;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
}
@Test
- public void extract() throws IOException, OpenXML4JException, XmlException {
+ public void extract() throws IOException {
File f = sampleDir.getFile(file);
Biff8EncryptionKey.setCurrentUserPassword(password);
try (POITextExtractor te = ExtractorFactory.createExtractor(f)) {
}
@Test
- public void changePassword() throws IOException, OpenXML4JException, XmlException {
+ public void changePassword() throws IOException {
newPassword("test");
}
@Test
- public void removePassword() throws IOException, OpenXML4JException, XmlException {
+ public void removePassword() throws IOException {
newPassword(null);
}
- private void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
+ private void newPassword(String newPass) throws IOException {
File f = sampleDir.getFile(file);
Biff8EncryptionKey.setCurrentUserPassword(password);
try (POITextExtractor te1 = ExtractorFactory.createExtractor(f)) {
/** changing the encryption mode and key size in poor mans style - see comments below */
@Test
- public void changeEncryption() throws IOException, OpenXML4JException, XmlException {
+ public void changeEncryption() throws IOException {
File f = sampleDir.getFile(file);
ByteArrayOutputStream bos = new ByteArrayOutputStream();
Biff8EncryptionKey.setCurrentUserPassword(password);
POIDocument doc = (POIDocument) te3.getDocument()) {
// need to cache data (i.e. read all data) before changing the key size
Class<?> clazz = doc.getClass();
- if ("HSLFSlideShowImpl".equals(clazz.getSimpleName())) {
+ if ("HSLFSlideShow".equals(clazz.getSimpleName())) {
try {
clazz.getDeclaredMethod("getPictureData").invoke(doc);
} catch (ReflectiveOperationException e) {
private String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
try (SlideShowExtractor<XSLFShape,XSLFTextParagraph> extr = new SlideShowExtractor<>(ppt)) {
// do not auto-close the slideshow
- extr.setFilesystem(null);
+ extr.setCloseFilesystem(false);
extr.setSlidesByDefault(true);
extr.setNotesByDefault(false);
extr.setMasterByDefault(false);
import java.io.InputStream;
import org.apache.poi.POIDataSamples;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
-import org.apache.xmlbeans.XmlException;
import org.junit.Test;
/**
* Tests for XSLFPowerPointExtractor
*/
public class TestXSLFPowerPointExtractor {
- private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
+ private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
/**
* Get text out of the simple file
}
@Test
- public void test45541() throws IOException, OpenXML4JException, XmlException {
+ public void test45541() throws IOException {
// extract text from a powerpoint that has a header in the notes-element
final File headerFile = slTests.getFile("45541_Header.pptx");
- try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
+ //noinspection rawtypes
+ try (final SlideShowExtractor extr = (SlideShowExtractor) ExtractorFactory.createExtractor(headerFile)) {
String text = extr.getText();
assertNotNull(text);
assertFalse("Had: " + text, text.contains("testdoc"));
// extract text from a powerpoint that has a footer in the master-slide
final File footerFile = slTests.getFile("45541_Footer.pptx");
- try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
+ //noinspection rawtypes
+ try (SlideShowExtractor extr = (SlideShowExtractor)ExtractorFactory.createExtractor(footerFile)) {
String text = extr.getText();
assertNotContained(text, "testdoc");
==================================================================== */
package org.apache.poi.xssf.extractor;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.junit.After;
ExtractorFactory.setAllThreadsPreferEventExtractors(true);
return (XSSFEventBasedExcelExtractor) ExtractorFactory.createExtractor(HSSFTestDataSamples.openSampleFileStream(sampleName));
}
-
+
@After
public void tearDown() {
// reset setting to not affect other tests
package org.apache.poi.xssf.extractor;
+import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.HSSFTestDataSamples;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.junit.After;
/**
--- /dev/null
+# ====================================================================
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================
+
+org.apache.poi.extractor.MainExtractorFactory
\ No newline at end of file
--- /dev/null
+# ====================================================================
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================
+
+org.apache.poi.ooxml.extractor.POIXMLExtractorFactory
\ No newline at end of file
--- /dev/null
+# ====================================================================
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================
+
+org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory
\ No newline at end of file
package org.apache.poi.extractor.ole2;
import java.io.ByteArrayInputStream;
+import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
-import java.util.Iterator;
import java.util.List;
+import java.util.stream.StreamSupport;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.extractor.ExtractorProvider;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
-import org.apache.poi.extractor.OLE2ExtractorFactory;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
+import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
- * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
+ * Scratchpad-specific logic for {@link ExtractorProvider} and
* {@link org.apache.poi.extractor.ExtractorFactory}, which permit the other two to run with
* no Scratchpad jar (though without functionality!)
* <p>Note - should not be used standalone, always use via the other
* two classes</p>
*/
@SuppressWarnings("WeakerAccess")
-public class OLE2ScratchpadExtractorFactory {
+public class OLE2ScratchpadExtractorFactory implements ExtractorProvider {
private static final POILogger logger = POILogFactory.getLogger(OLE2ScratchpadExtractorFactory.class);
+ @Override
+ public boolean accepts(FileMagic fm) {
+ return FileMagic.OLE2 == fm;
+ }
+
+ @Override
+ public POITextExtractor create(File file, String password) throws IOException {
+ return create(new POIFSFileSystem(file, true).getRoot(), password);
+ }
+
+ @Override
+ public POITextExtractor create(InputStream inputStream, String password) throws IOException {
+ return create(new POIFSFileSystem(inputStream).getRoot(), password);
+ }
+
/**
* Look for certain entries in the stream, to figure it
* out what format is desired
*
* @throws IOException when the format specific extraction fails because of invalid entires
*/
- public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
- if (poifsDir.hasEntry("WordDocument")) {
- // Old or new style word document?
- try {
- return new WordExtractor(poifsDir);
- } catch (OldWordFileFormatException e) {
- return new Word6Extractor(poifsDir);
+ public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
+ final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
+ try {
+ Biff8EncryptionKey.setCurrentUserPassword(password);
+ if (poifsDir.hasEntry("WordDocument")) {
+ // Old or new style word document?
+ try {
+ return new WordExtractor(poifsDir);
+ } catch (OldWordFileFormatException e) {
+ return new Word6Extractor(poifsDir);
+ }
}
- }
- if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
- return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
- }
+ if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
+ return new SlideShowExtractor<HSLFShape, HSLFTextParagraph>(SlideShowFactory.create(poifsDir));
+ }
- if (poifsDir.hasEntry("VisioDocument")) {
- return new VisioTextExtractor(poifsDir);
- }
+ if (poifsDir.hasEntry("VisioDocument")) {
+ return new VisioTextExtractor(poifsDir);
+ }
- if (poifsDir.hasEntry("Quill")) {
- return new PublisherTextExtractor(poifsDir);
- }
+ if (poifsDir.hasEntry("Quill")) {
+ return new PublisherTextExtractor(poifsDir);
+ }
- final String[] outlookEntryNames = new String[] {
- // message bodies, saved as plain text (PtypString)
- // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
- // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
- // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx
- // @see org.apache.poi.hsmf.Types.MAPIType
- "__substg1.0_1000001E", //PidTagBody ASCII
- "__substg1.0_1000001F", //PidTagBody Unicode
- "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
- "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
- "__substg1.0_0037001E", //PidTagSubject ASCII
- "__substg1.0_0037001F", //PidTagSubject Unicode
- };
- for (String entryName : outlookEntryNames) {
- if (poifsDir.hasEntry(entryName)) {
- return new OutlookTextExtractor(poifsDir);
+ final String[] outlookEntryNames = new String[]{
+ // message bodies, saved as plain text (PtypString)
+ // The first short (0x1000, 0x0047, 0x0037) refers to the Property ID (see [MS-OXPROPS].pdf)
+ // the second short (0x001e, 0x001f, 0x0102) refers to the type of data stored in this entry
+ // https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx
+ // @see org.apache.poi.hsmf.Types.MAPIType
+ "__substg1.0_1000001E", //PidTagBody ASCII
+ "__substg1.0_1000001F", //PidTagBody Unicode
+ "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
+ "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
+ "__substg1.0_0037001E", //PidTagSubject ASCII
+ "__substg1.0_0037001F", //PidTagSubject Unicode
+ };
+ for (String entryName : outlookEntryNames) {
+ if (poifsDir.hasEntry(entryName)) {
+ return new OutlookTextExtractor(poifsDir);
+ }
}
+ } finally {
+ Biff8EncryptionKey.setCurrentUserPassword(oldPW);
}
- throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+ return null;
}
/**
* @param ext the extractor holding the directory to start parsing
* @param dirs a list to be filled with directory references holding embedded
* @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
- *
- * @throws IOException when the format specific extraction fails because of invalid entires
*/
- public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
+ @Override
+ public void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) {
// Find all the embedded directories
DirectoryEntry root = ext.getRoot();
if (root == null) {
if (ext instanceof ExcelExtractor) {
// These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while (it.hasNext()) {
- Entry entry = it.next();
- if (entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
+ StreamSupport.stream(root.spliterator(), false)
+ .filter(entry -> entry.getName().startsWith("MBD"))
+ .forEach(dirs::add);
} else if (ext instanceof WordExtractor) {
// These are in ObjectPool -> _... under the root
try {
- DirectoryEntry op = (DirectoryEntry)
- root.getEntry("ObjectPool");
- Iterator<Entry> it = op.getEntries();
- while(it.hasNext()) {
- Entry entry = it.next();
- if(entry.getName().startsWith("_")) {
- dirs.add(entry);
- }
- }
+ DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
+ StreamSupport.stream(op.spliterator(), false)
+ .filter(entry -> entry.getName().startsWith("_"))
+ .forEach(dirs::add);
} catch(FileNotFoundException e) {
logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage());
// ignored here
package org.apache.poi.hdgf.extractor;
-import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
* Can operate on the command line (outputs to stdout), or
* can return the text for you (example: for use with Lucene).
*/
-public final class VisioTextExtractor extends POIOLE2TextExtractor {
+public final class VisioTextExtractor implements POIOLE2TextExtractor {
private HDGFDiagram hdgf;
+ private boolean doCloseFilesystem = true;
public VisioTextExtractor(HDGFDiagram hdgf) {
- super(hdgf);
this.hdgf = hdgf;
}
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
// Capture the text, as long as it isn't
// simply an empty string
String str = cmd.getValue().toString();
- if(str.isEmpty() || "\n".equals(str)) {
- // Ignore empty strings
- } else {
+ if (!(str.isEmpty() || "\n".equals(str))) {
text.add( str );
}
}
return text.toString();
}
- public static void main(String[] args) throws Exception {
- if(args.length == 0) {
- System.err.println("Use:");
- System.err.println(" VisioTextExtractor <file.vsd>");
- System.exit(1);
- }
+ @Override
+ public HDGFDiagram getDocument() {
+ return hdgf;
+ }
- try (FileInputStream fis = new FileInputStream(args[0])) {
- VisioTextExtractor extractor =
- new VisioTextExtractor(fis);
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
- // Print not PrintLn as already has \n added to it
- System.out.print(extractor.getText());
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
- extractor.close();
- }
+ @Override
+ public HDGFDiagram getFilesystem() {
+ return hdgf;
}
}
package org.apache.poi.hpbf.extractor;
-import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.qcbits.QCBit;
-import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
+import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Extract text from HPBF Publisher files
*/
-public final class PublisherTextExtractor extends POIOLE2TextExtractor {
- private HPBFDocument doc;
+public final class PublisherTextExtractor implements POIOLE2TextExtractor {
+ private final HPBFDocument doc;
private boolean hyperlinksByDefault;
+ private boolean doCloseFilesystem = true;
public PublisherTextExtractor(HPBFDocument doc) {
- super(doc);
this.doc = doc;
}
+
public PublisherTextExtractor(DirectoryNode dir) throws IOException {
this(new HPBFDocument(dir));
}
+
public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
this(new HPBFDocument(fs));
}
+
public PublisherTextExtractor(InputStream is) throws IOException {
this(new POIFSFileSystem(is));
}
// Get the text from the Quill Contents
QCBit[] bits = doc.getQuillContents().getBits();
for (QCBit bit1 : bits) {
- if (bit1 != null && bit1 instanceof QCTextBit) {
+ if (bit1 instanceof QCTextBit) {
QCTextBit t = (QCTextBit) bit1;
text.append(t.getText().replace('\r', '\n'));
}
// how to tie that together.
if(hyperlinksByDefault) {
for (QCBit bit : bits) {
- if (bit != null && bit instanceof Type12) {
+ if (bit instanceof Type12) {
Type12 hyperlinks = (Type12) bit;
for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) {
text.append("<");
return text.toString();
}
+ @Override
+ public HPBFDocument getDocument() {
+ return doc;
+ }
+
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
- public static void main(String[] args) throws Exception {
- if(args.length == 0) {
- System.err.println("Use:");
- System.err.println(" PublisherTextExtractor <file.pub>");
- }
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
- for (String arg : args) {
- try (FileInputStream fis = new FileInputStream(arg)) {
- PublisherTextExtractor te = new PublisherTextExtractor(fis);
- System.out.println(te.getText());
- te.close();
- }
- }
+ @Override
+ public HPBFDocument getFilesystem() {
+ return doc;
}
}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-
-package org.apache.poi.hslf.extractor;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.List;
-
-import org.apache.poi.EncryptedDocumentException;
-import org.apache.poi.extractor.POIOLE2TextExtractor;
-import org.apache.poi.hslf.usermodel.HSLFObjectShape;
-import org.apache.poi.hslf.usermodel.HSLFShape;
-import org.apache.poi.hslf.usermodel.HSLFSlideShow;
-import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
-import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
-import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
-import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.sl.extractor.SlideShowExtractor;
-import org.apache.poi.sl.usermodel.SlideShow;
-import org.apache.poi.sl.usermodel.SlideShowFactory;
-import org.apache.poi.util.Removal;
-
-/**
- * This class can be used to extract text from a PowerPoint file. Can optionally
- * also get the notes from one.
- *
- * @deprecated in POI 4.0.0, use {@link SlideShowExtractor} instead
- */
-@SuppressWarnings("WeakerAccess")
-@Deprecated
-@Removal(version="5.0.0")
-public final class PowerPointExtractor extends POIOLE2TextExtractor {
- private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
-
- private boolean slidesByDefault = true;
- private boolean notesByDefault;
- private boolean commentsByDefault;
- private boolean masterByDefault;
-
- /**
- * Basic extractor. Returns all the text, and optionally all the notes
- */
- public static void main(String[] args) throws IOException {
- if (args.length < 1) {
- System.err.println("Usage:");
- System.err.println("\tPowerPointExtractor [-notes] <file>");
- System.exit(1);
- }
-
- boolean notes = false;
- boolean comments = false;
- boolean master = true;
-
- String file;
- if (args.length > 1) {
- notes = true;
- file = args[1];
- if (args.length > 2) {
- comments = true;
- }
- } else {
- file = args[0];
- }
-
- try (PowerPointExtractor ppe = new PowerPointExtractor(file)) {
- System.out.println(ppe.getText(true, notes, comments, master));
- }
- }
-
- public PowerPointExtractor(final HSLFSlideShow slideShow) {
- super(slideShow.getSlideShowImpl());
- setFilesystem(slideShow);
- delegate = new SlideShowExtractor<>(slideShow);
- }
-
- /**
- * Creates a PowerPointExtractor, from a file
- *
- * @param fileName The name of the file to extract from
- */
- public PowerPointExtractor(String fileName) throws IOException {
- this(createHSLF(new File(fileName), Biff8EncryptionKey.getCurrentUserPassword(), true));
- }
-
- /**
- * Creates a PowerPointExtractor, from an Input Stream
- *
- * @param iStream The input stream containing the PowerPoint document
- */
- public PowerPointExtractor(InputStream iStream) throws IOException {
- this(createHSLF(iStream, Biff8EncryptionKey.getCurrentUserPassword()));
- }
-
- /**
- * Creates a PowerPointExtractor, from an open POIFSFileSystem
- *
- * @param fs the POIFSFileSystem containing the PowerPoint document
- */
- public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
- this(createHSLF(fs, Biff8EncryptionKey.getCurrentUserPassword()));
- }
-
- /**
- * Creates a PowerPointExtractor, from a specific place
- * inside an open {@link POIFSFileSystem}
- *
- * @param dir the POIFS Directory containing the PowerPoint document
- */
- public PowerPointExtractor(DirectoryNode dir) throws IOException {
- this(new HSLFSlideShow(dir));
- }
-
- /**
- * Creates a PowerPointExtractor, from a HSLFSlideShow
- *
- * @param ss the HSLFSlideShow to extract text from
- */
- public PowerPointExtractor(HSLFSlideShowImpl ss) {
- this(new HSLFSlideShow(ss));
- }
-
- /**
- * Should a call to getText() return slide text? Default is yes
- */
- public void setSlidesByDefault(final boolean slidesByDefault) {
- this.slidesByDefault = slidesByDefault;
- delegate.setSlidesByDefault(slidesByDefault);
- }
-
- /**
- * Should a call to getText() return notes text? Default is no
- */
- public void setNotesByDefault(final boolean notesByDefault) {
- this.notesByDefault = notesByDefault;
- delegate.setNotesByDefault(notesByDefault);
- }
-
- /**
- * Should a call to getText() return comments text? Default is no
- */
- public void setCommentsByDefault(final boolean commentsByDefault) {
- this.commentsByDefault = commentsByDefault;
- delegate.setCommentsByDefault(commentsByDefault);
- }
-
- /**
- * Should a call to getText() return text from master? Default is no
- */
- public void setMasterByDefault(final boolean masterByDefault) {
- this.masterByDefault = masterByDefault;
- delegate.setMasterByDefault(masterByDefault);
- }
-
- /**
- * Fetches all the slide text from the slideshow, but not the notes, unless
- * you've called setSlidesByDefault() and setNotesByDefault() to change this
- */
- @Override
- public String getText() {
- return delegate.getText();
- }
-
- /**
- * Fetches text from the slideshow, be it slide text or note text. Because
- * the final block of text in a TextRun normally have their last \n
- * stripped, we add it back
- *
- * @param getSlideText fetch slide text
- * @param getNoteText fetch note text
- */
- public String getText(boolean getSlideText, boolean getNoteText) {
- return getText(getSlideText,getNoteText,commentsByDefault,masterByDefault);
- }
-
- public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) {
- delegate.setSlidesByDefault(getSlideText);
- delegate.setNotesByDefault(getNoteText);
- delegate.setCommentsByDefault(getCommentText);
- delegate.setMasterByDefault(getMasterText);
- try {
- return delegate.getText();
- } finally {
- delegate.setSlidesByDefault(slidesByDefault);
- delegate.setNotesByDefault(notesByDefault);
- delegate.setCommentsByDefault(commentsByDefault);
- delegate.setMasterByDefault(masterByDefault);
- }
- }
-
- /**
- * Fetches all the notes text from the slideshow, but not the slide text
- */
- public String getNotes() {
- return getText(false, true, false, false);
- }
-
- @SuppressWarnings("unchecked")
- public List<HSLFObjectShape> getOLEShapes() {
- return (List<HSLFObjectShape>)delegate.getOLEShapes();
- }
-
- /**
- * Helper method to avoid problems with compiling code in Eclipse
- *
- * Eclipse javac has some bugs with complex casts, this method tries
- * to work around this.
- *
- * @param fs The {@link POIFSFileSystem} to read the document from
- * @param password The password that should be used or null if no password is necessary.
- *
- * @return The created SlideShow
- *
- * @throws IOException if an error occurs while reading the data
- */
- private static HSLFSlideShow createHSLF(POIFSFileSystem fs, String password) throws IOException, EncryptedDocumentException {
- // Note: don't change the code here, it is required for Eclipse to compile the code
- SlideShow slideShowOrig = SlideShowFactory.create(fs, password);
- return (HSLFSlideShow)slideShowOrig;
- }
-
- /**
- * Helper method to avoid problems with compiling code in Eclipse
- *
- * Eclipse javac has some bugs with complex casts, this method tries
- * to work around this.
- *
- * @param inp The {@link InputStream} to read data from.
- * @param password The password that should be used or null if no password is necessary.
- *
- * @return The created SlideShow
- *
- * @throws IOException if an error occurs while reading the data
- * @throws EncryptedDocumentException If the wrong password is given for a protected file
- */
- private static HSLFSlideShow createHSLF(InputStream inp, String password) throws IOException, EncryptedDocumentException {
- // Note: don't change the code here, it is required for Eclipse to compile the code
- SlideShow slideShowOrig = SlideShowFactory.create(inp, password);
- return (HSLFSlideShow)slideShowOrig;
- }
-
- /**
- * Helper method to avoid problems with compiling code in Eclipse
- *
- * Eclipse javac has some bugs with complex casts, this method tries
- * to work around this.
- *
- * @param file The file to read data from.
- * @param password The password that should be used or null if no password is necessary.
- * @param readOnly If the SlideShow should be opened in read-only mode to avoid writing back
- * changes when the document is closed.
- *
- * @return The created SlideShow
- *
- * @throws IOException if an error occurs while reading the data
- * @throws EncryptedDocumentException If the wrong password is given for a protected file
- */
- private static HSLFSlideShow createHSLF(File file, String password, boolean readOnly) throws IOException, EncryptedDocumentException {
- // Note: don't change the code here, it is required for Eclipse to compile the code
- SlideShow slideShowOrig = SlideShowFactory.create(file, password, readOnly);
- return (HSLFSlideShow)slideShowOrig;
- }
-}
import java.util.Map;
import java.util.function.Supplier;
+import org.apache.poi.POIDocument;
import org.apache.poi.common.usermodel.GenericRecord;
import org.apache.poi.common.usermodel.fonts.FontInfo;
import org.apache.poi.ddf.EscherBSERecord;
import org.apache.poi.ddf.EscherOptRecord;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hpsf.ClassIDPredefined;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException;
import org.apache.poi.hslf.exceptions.HSLFException;
import org.apache.poi.hslf.model.MovieShape;
import org.apache.poi.hslf.record.*;
import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet;
+import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* TODO: - figure out how to match notes to their correct sheet (will involve
* understanding DocSlideList and DocNotesList) - handle Slide creation cleaner
*/
-public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord {
+public final class HSLFSlideShow extends POIDocument implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord {
//arbitrarily selected; may need to increase
private static final int MAX_RECORD_LENGTH = 10_000_000;
* @param hslfSlideShow the HSLFSlideShow to base on
*/
public HSLFSlideShow(HSLFSlideShowImpl hslfSlideShow) {
+ super(hslfSlideShow.getDirectory());
+
loadSavePhase.set(LoadSavePhase.INIT);
// Get useful things from our base slideshow
public HPSFPropertiesExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(getSlideShowImpl());
}
-
+
int addToObjListAtom(RecordContainer exObj) {
ExObjList lst = getDocumentRecord().getExObjList(true);
ExObjListAtom objAtom = lst.getExObjListAtom();
Map<String,ClassID> olemap = new HashMap<>();
olemap.put(POWERPOINT_DOCUMENT, ClassIDPredefined.POWERPOINT_V8.getClassID());
// as per BIFF8 spec
- olemap.put("Workbook", ClassIDPredefined.EXCEL_V8.getClassID());
+ olemap.put("Workbook", ClassIDPredefined.EXCEL_V8.getClassID());
// Typically from third party programs
olemap.put("WORKBOOK", ClassIDPredefined.EXCEL_V8.getClassID());
// Typically odd Crystal Reports exports
public List<? extends GenericRecord> getGenericChildren() {
return Arrays.asList(_hslfSlideShow.getRecords());
}
+
+ // ---- POIDocument overrides ----
+ // Each method below forwards to the object returned by getSlideShowImpl();
+ // none of them keeps state of its own in this class.
+
+ /** Writes the slideshow by calling {@code write()} on the underlying slideshow implementation. */
+ @Override
+ public void write() throws IOException {
+ getSlideShowImpl().write();
+ }
+
+ /**
+  * Writes the slideshow by calling {@code write(File)} on the underlying slideshow implementation.
+  *
+  * @param newFile the file passed through to the implementation
+  */
+ @Override
+ public void write(File newFile) throws IOException {
+ getSlideShowImpl().write(newFile);
+ }
+
+ /** @return the document summary information reported by the underlying slideshow implementation */
+ @Override
+ public DocumentSummaryInformation getDocumentSummaryInformation() {
+ return getSlideShowImpl().getDocumentSummaryInformation();
+ }
+
+ /** @return the summary information reported by the underlying slideshow implementation */
+ @Override
+ public SummaryInformation getSummaryInformation() {
+ return getSlideShowImpl().getSummaryInformation();
+ }
+
+ /** Forwards to {@code createInformationProperties()} of the underlying slideshow implementation. */
+ @Override
+ public void createInformationProperties() {
+ getSlideShowImpl().createInformationProperties();
+ }
+
+ /** Forwards to {@code readProperties()} of the underlying slideshow implementation. */
+ @Override
+ public void readProperties() {
+ getSlideShowImpl().readProperties();
+ }
+
+ /**
+  * Looks up a property set through the implementation's package-private
+  * {@code getPropertySetImpl(String)} bridge.
+  *
+  * @param setName the name passed through to the implementation
+  */
+ @Override
+ protected PropertySet getPropertySet(String setName) throws IOException {
+ return getSlideShowImpl().getPropertySetImpl(setName);
+ }
+
+ /**
+  * Looks up a property set through the implementation's package-private
+  * {@code getPropertySetImpl(String, EncryptionInfo)} bridge.
+  *
+  * @param setName the name passed through to the implementation
+  * @param encryptionInfo the encryption info passed through to the implementation
+  */
+ @Override
+ protected PropertySet getPropertySet(String setName, EncryptionInfo encryptionInfo) throws IOException {
+ return getSlideShowImpl().getPropertySetImpl(setName, encryptionInfo);
+ }
+
+ /** Forwards to the implementation's package-private {@code writePropertiesImpl()} bridge. */
+ @Override
+ protected void writeProperties() throws IOException {
+ getSlideShowImpl().writePropertiesImpl();
+ }
+
+ /**
+  * Forwards to {@code writeProperties(POIFSFileSystem)} of the underlying slideshow implementation.
+  *
+  * @param outFS the filesystem passed through to the implementation
+  */
+ @Override
+ public void writeProperties(POIFSFileSystem outFS) throws IOException {
+ getSlideShowImpl().writeProperties(outFS);
+ }
+
+ /**
+  * Forwards to the implementation's package-private
+  * {@code writePropertiesImpl(POIFSFileSystem, List)} bridge.
+  *
+  * @param outFS the filesystem passed through to the implementation
+  * @param writtenEntries the entry-name list passed through to the implementation
+  */
+ @Override
+ protected void writeProperties(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException {
+ getSlideShowImpl().writePropertiesImpl(outFS, writtenEntries);
+ }
+
+ /** Forwards to the implementation's package-private {@code validateInPlaceWritePossibleImpl()} bridge. */
+ @Override
+ protected void validateInPlaceWritePossible() throws IllegalStateException {
+ getSlideShowImpl().validateInPlaceWritePossibleImpl();
+ }
+
+ /** @return the directory node reported by the underlying slideshow implementation */
+ @Override
+ public DirectoryNode getDirectory() {
+ return getSlideShowImpl().getDirectory();
+ }
+
+ /** Forwards to the implementation's package-private {@code clearDirectoryImpl()} bridge. */
+ @Override
+ protected void clearDirectory() {
+ getSlideShowImpl().clearDirectoryImpl();
+ }
+
+ /** @return the result of the implementation's package-private {@code initDirectoryImpl()} bridge */
+ @Override
+ protected boolean initDirectory() {
+ return getSlideShowImpl().initDirectoryImpl();
+ }
+
+ /**
+  * Forwards to the implementation's package-private {@code replaceDirectoryImpl(DirectoryNode)} bridge.
+  *
+  * @param newDirectory the directory node passed through to the implementation
+  */
+ @Override
+ protected void replaceDirectory(DirectoryNode newDirectory) {
+ getSlideShowImpl().replaceDirectoryImpl(newDirectory);
+ }
+
+ /** @return the encrypted property stream name reported by the underlying slideshow implementation */
+ @Override
+ protected String getEncryptedPropertyStreamName() {
+ return getSlideShowImpl().getEncryptedPropertyStreamName();
+ }
+
+ /** @return the encryption info reported by the underlying slideshow implementation */
+ @Override
+ public EncryptionInfo getEncryptionInfo() throws IOException {
+ return getSlideShowImpl().getEncryptionInfo();
+ }
}
import java.util.TreeMap;
import org.apache.poi.POIDocument;
+import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException;
import org.apache.poi.hslf.exceptions.HSLFException;
import org.apache.poi.hslf.exceptions.OldPowerPointFormatException;
}
-
-
/* ******************* adding methods follow ********************* */
/**
return "EncryptedSummary";
}
+ // ---- package-private bridges to superclass methods ----
+ // Each *Impl method only calls the same-named superclass method.
+ // NOTE(review): presumably these exist so that HSLFSlideShow, which calls them,
+ // can reach superclass methods it cannot invoke on this object directly - confirm.
+
+ /** Calls the superclass {@code writeProperties()}. */
+ void writePropertiesImpl() throws IOException {
+ super.writeProperties();
+ }
+
+ /**
+  * Calls the superclass {@code getPropertySet(String)}.
+  *
+  * @param setName the name passed through to the superclass
+  * @return whatever the superclass lookup returns
+  */
+ PropertySet getPropertySetImpl(String setName) throws IOException {
+ return super.getPropertySet(setName);
+ }
+
+ /**
+  * Calls the superclass {@code getPropertySet(String, EncryptionInfo)}.
+  *
+  * @param setName the name passed through to the superclass
+  * @param encryptionInfo the encryption info passed through to the superclass
+  * @return whatever the superclass lookup returns
+  */
+ PropertySet getPropertySetImpl(String setName, EncryptionInfo encryptionInfo) throws IOException {
+ return super.getPropertySet(setName, encryptionInfo);
+ }
+
+ /**
+  * Calls the superclass {@code writeProperties(POIFSFileSystem, List)}.
+  *
+  * @param outFS the filesystem passed through to the superclass
+  * @param writtenEntries the entry-name list passed through to the superclass
+  */
+ void writePropertiesImpl(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException {
+ super.writeProperties(outFS, writtenEntries);
+ }
+
+ /** Calls the superclass {@code validateInPlaceWritePossible()}. */
+ void validateInPlaceWritePossibleImpl() throws IllegalStateException {
+ super.validateInPlaceWritePossible();
+ }
+
+ /** Calls the superclass {@code clearDirectory()}. */
+ void clearDirectoryImpl() {
+ super.clearDirectory();
+ }
+
+ /** @return the result of the superclass {@code initDirectory()} */
+ boolean initDirectoryImpl() {
+ return super.initDirectory();
+ }
+
+ /**
+  * Calls the superclass {@code replaceDirectory(DirectoryNode)}.
+  *
+  * @param newDirectory the directory node passed through to the superclass
+  */
+ void replaceDirectoryImpl(DirectoryNode newDirectory) {
+ super.replaceDirectory(newDirectory);
+ }
+
private static class BufAccessBAOS extends ByteArrayOutputStream {
public byte[] getBuf() {
return buf;
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hsmf.extractor;
-
-import org.apache.poi.hsmf.MAPIMessage;
-import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.Removal;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-
-/**
- * A text extractor for HSMF (Outlook) .msg files.
- * Outputs in a format somewhat like a plain text email.
- *
- * @deprecated use @{link OutlookTextExtractor} instead
- */
-@Deprecated
-@Removal(version = "5.0.0")
-public class OutlookTextExtactor extends OutlookTextExtractor {
- public OutlookTextExtactor(MAPIMessage msg) {
- super(msg);
- }
-
- public OutlookTextExtactor(DirectoryNode poifsDir) throws IOException {
- super(new MAPIMessage(poifsDir));
- }
-
- public OutlookTextExtactor(POIFSFileSystem fs) throws IOException {
- super(new MAPIMessage(fs));
- }
-
- public OutlookTextExtactor(InputStream inp) throws IOException {
- super(new MAPIMessage(inp));
- }
-
- public static void main(String[] args) throws Exception {
- for (String filename : args) {
- try (POIFSFileSystem poifs = new POIFSFileSystem(new File(filename));
- OutlookTextExtractor extractor = new OutlookTextExtractor(poifs)) {
- System.out.println(extractor.getText());
- }
- }
- }
-}
*
* @since 4.1.2
*/
-public class OutlookTextExtractor extends POIOLE2TextExtractor {
+public class OutlookTextExtractor implements POIOLE2TextExtractor {
+ private final MAPIMessage msg;
+ private boolean doCloseFilesystem = true;
+
public OutlookTextExtractor(MAPIMessage msg) {
- super(msg);
+ this.msg = msg;
}
public OutlookTextExtractor(DirectoryNode poifsDir) throws IOException {
* Returns the underlying MAPI message
*/
public MAPIMessage getMAPIMessage() {
- return (MAPIMessage) document;
+ return msg;
}
/**
* Outputs something a little like a RFC822 email
*/
public String getText() {
- MAPIMessage msg = (MAPIMessage) document;
StringBuilder s = new StringBuilder();
// See if we can get a suitable encoding for any
}
s.append("\n");
}
+
+ /** @return the MAPI message held in the {@code msg} field */
+ @Override
+ public MAPIMessage getDocument() {
+ return msg;
+ }
+
+ /**
+  * Stores the close-filesystem flag.
+  *
+  * @param doCloseFilesystem the new flag value; presumably {@code false} keeps the
+  *        underlying message open when the extractor is closed - TODO confirm against close()
+  */
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ /** @return the close-filesystem flag; the field is initialised to {@code true} */
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ /** @return the same MAPI message that {@link #getDocument()} returns */
+ @Override
+ public MAPIMessage getFilesystem() {
+ return msg;
+ }
}
* Class to extract the text from old (Word 6 / Word 95) Word Documents.
*
* This should only be used on the older files, for most uses you
- * should call {@link WordExtractor} which deals properly
+ * should call {@link WordExtractor} which deals properly
* with HWPF.
*
* @author Nick Burch
*/
-public final class Word6Extractor extends POIOLE2TextExtractor {
+public final class Word6Extractor implements POIOLE2TextExtractor {
private HWPFOldDocument doc;
+ private boolean doCloseFilesystem = true;
/**
* Create a new Word Extractor
/**
* Create a new Word Extractor
- *
+ *
* @param fs
* POIFSFileSystem containing the word file
*/
- public Word6Extractor( POIFSFileSystem fs ) throws IOException
- {
+ public Word6Extractor( POIFSFileSystem fs ) throws IOException {
this( fs.getRoot() );
}
* @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
*/
@Deprecated
- public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
- throws IOException
- {
+ public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) throws IOException {
this( dir );
}
- public Word6Extractor( DirectoryNode dir ) throws IOException
- {
+ public Word6Extractor( DirectoryNode dir ) throws IOException {
this( new HWPFOldDocument( dir ) );
}
* @param doc The HWPFOldDocument to extract from
*/
public Word6Extractor(HWPFOldDocument doc) {
- super(doc);
this.doc = doc;
}
ret = new String[doc.getTextTable().getTextPieces().size()];
for(int i=0; i<ret.length; i++) {
ret[i] = doc.getTextTable().getTextPieces().get(i).getStringBuilder().toString();
-
+
// Fix the line endings
ret[i] = ret[i].replaceAll("\r", "\ufffe");
ret[i] = ret[i].replaceAll("\ufffe","\r\n");
return ret;
}
- public String getText()
- {
- try
- {
+ public String getText() {
+ try {
WordToTextConverter wordToTextConverter = new WordToTextConverter();
wordToTextConverter.processDocument( doc );
return wordToTextConverter.getText();
- }
- catch ( Exception exc )
- {
+ } catch ( Exception exc ) {
// fall-back
StringBuilder text = new StringBuilder();
- for ( String t : getParagraphText() )
- {
+ for ( String t : getParagraphText() ) {
text.append( t );
}
return text.toString();
}
}
+
+ /** @return the old-format Word document held in the {@code doc} field */
+ @Override
+ public HWPFOldDocument getDocument() {
+ return doc;
+ }
+
+ /**
+  * Stores the close-filesystem flag.
+  *
+  * @param doCloseFilesystem the new flag value; presumably {@code false} keeps the
+  *        underlying document open when the extractor is closed - TODO confirm against close()
+  */
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ /** @return the close-filesystem flag; the field is initialised to {@code true} */
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ /** @return the same document that {@link #getDocument()} returns */
+ @Override
+ public HWPFOldDocument getFilesystem() {
+ return doc;
+ }
}
package org.apache.poi.hwpf.extractor;
-import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
*
* @author Nick Burch
*/
-public final class WordExtractor extends POIOLE2TextExtractor {
- private HWPFDocument doc;
+public final class WordExtractor implements POIOLE2TextExtractor {
+ private final HWPFDocument doc;
+ private boolean doCloseFilesystem = true;
/**
* Create a new Word Extractor
* The HWPFDocument to extract from
*/
public WordExtractor( HWPFDocument doc ) {
- super( doc );
this.doc = doc;
}
- /**
- * Command line extractor, so people will stop moaning that they can't just
- * run this.
- */
- public static void main( String[] args ) throws IOException {
- if ( args.length == 0 ) {
- System.err.println( "Use:" );
- System.err
- .println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
- System.exit( 1 );
- }
-
- // Process the first argument as a file
- InputStream fin = new FileInputStream( args[0] );
- try (WordExtractor extractor = new WordExtractor(fin)) {
- System.out.println(extractor.getText());
- }
- }
-
/**
* Get the text from the word file, as an array with one String per
* paragraph
return getParagraphText( r );
}
- protected static String[] getParagraphText( Range r ) {
+ static String[] getParagraphText( Range r ) {
String[] ret;
ret = new String[r.numParagraphs()];
for ( int i = 0; i < ret.length; i++ ) {
/**
* Removes any fields (eg macros, page markers etc) from the string.
*/
- public static String stripFields( String text )
- {
+ public static String stripFields( String text ) {
return Range.stripFields( text );
}
+
+ /** @return the Word document held in the {@code doc} field */
+ @Override
+ public HWPFDocument getDocument() {
+ return doc;
+ }
+
+ /**
+  * Stores the close-filesystem flag.
+  *
+  * @param doCloseFilesystem the new flag value; presumably {@code false} keeps the
+  *        underlying document open when the extractor is closed - TODO confirm against close()
+  */
+ @Override
+ public void setCloseFilesystem(boolean doCloseFilesystem) {
+ this.doCloseFilesystem = doCloseFilesystem;
+ }
+
+ /** @return the close-filesystem flag; the field is initialised to {@code true} */
+ @Override
+ public boolean isCloseFilesystem() {
+ return doCloseFilesystem;
+ }
+
+ /** @return the same document that {@link #getDocument()} returns */
+ @Override
+ public HWPFDocument getFilesystem() {
+ return doc;
+ }
}
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.PrintStream;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hdgf.HDGFDiagram;
import org.junit.Test;
public final class TestVisioExtractor {
- private static POIDataSamples _dgTests = POIDataSamples.getDiagramInstance();
+ private static final POIDataSamples _dgTests = POIDataSamples.getDiagramInstance();
private final String defFilename = "Test_Visio-Some_Random_Text.vsd";
private final int defTextChunks = 5;
is3.close();
HDGFDiagram hdgf3 = new HDGFDiagram(poifs3);
-
+
VisioTextExtractor extractor3 = new VisioTextExtractor(hdgf3);
assertNotNull(extractor3);
assertNotNull(extractor3.getAllText());
@Test
public void testProblemFiles() throws Exception {
String[] files = {
- "44594.vsd", "44594-2.vsd",
+ "44594.vsd", "44594-2.vsd",
"ShortChunk1.vsd", "ShortChunk2.vsd", "ShortChunk3.vsd",
"NegativeChunkLength.vsd", "NegativeChunkLength2.vsd"
};
}
}
- @Test
- public void testMain() throws Exception {
- PrintStream oldOut = System.out;
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- PrintStream capture = new PrintStream(baos);
- System.setOut(capture);
-
- String path = _dgTests.getFile(defFilename).getPath();
- VisioTextExtractor.main(new String[] {path});
-
- // Put things back
- System.setOut(oldOut);
-
- // Check
- capture.flush();
- String text = baos.toString();
- // YK: stdout can contain lots of other stuff if logging is sent to console
- // ( -Dorg.apache.poi.util.POILogger=org.apache.poi.util.SystemOutLogger)
- assertTrue( text.contains(
- "text\nView\n" +
- "Test View\nI am a test view\n" +
- "Some random text, on a page\n"
- ));
- }
-
private VisioTextExtractor openExtractor(String fileName) throws IOException {
try (InputStream is = _dgTests.openResourceAsStream(fileName)) {
return new VisioTextExtractor(is);
import org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue;
import org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue;
import org.apache.poi.hsmf.dev.HSMFDump;
-import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hsmf.extractor.OutlookTextExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LocaleUtil;
fsMessageFails = new POIFSFileSystem(samples.getFile(messageFails));
mapiMessageSucceeds = new MAPIMessage(fsMessageSucceeds);
- mapiMessageFails = new MAPIMessage(fsMessageFails);
-
+ mapiMessageFails = new MAPIMessage(fsMessageFails);
+
messageDateFormat = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss", Locale.ROOT);
- messageDateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC);
+ messageDateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC);
userTimeZone = LocaleUtil.getUserTimeZone();
LocaleUtil.setUserTimeZone(LocaleUtil.TIMEZONE_UTC);
}
-
-
+
+
@AfterClass
public static void closeFS() throws Exception {
LocaleUtil.setUserTimeZone(userTimeZone);
fsMessageSucceeds.close();
fsMessageFails.close();
}
-
+
/**
* Check we can find a sensible number of properties on a few
* of our test files
@Test
public void testPropertiesFound() {
Map<MAPIProperty,List<PropertyValue>> props;
-
+
props = mapiMessageSucceeds.getMainChunks().getProperties();
assertTrue(props.toString(), props.size() > 10);
-
+
props = mapiMessageFails.getMainChunks().getProperties();
assertTrue(props.toString(), props.size() > 10);
}
-
+
/**
* Check we find properties of a variety of different types
*/
@Test
public void testPropertyValueTypes() {
Chunks mainChunks = mapiMessageSucceeds.getMainChunks();
-
+
// Ask to have the values looked up
Map<MAPIProperty,List<PropertyValue>> props = mainChunks.getProperties();
HashSet<Class<? extends PropertyValue>> seenTypes =
assertTrue(seenTypes.toString(), seenTypes.contains(LongPropertyValue.class));
assertTrue(seenTypes.toString(), seenTypes.contains(TimePropertyValue.class));
assertFalse(seenTypes.toString(), seenTypes.contains(ChunkBasedPropertyValue.class));
-
+
// Ask for the raw values
seenTypes.clear();
for (PropertyValue pv : mainChunks.getRawProperties().values()) {
@Test
public void testReadMessageDateSucceedsWithOutlookTextExtractor() throws Exception {
OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageSucceeds);
- ext.setFilesystem(null); // Don't close re-used test resources here
-
+ ext.setCloseFilesystem(false);
+
String text = ext.getText();
assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n");
ext.close();
}
- @Test
- public void testReadMessageDateSucceedsWithOutlookTextExtactor() throws Exception {
- OutlookTextExtactor ext = new OutlookTextExtactor(mapiMessageSucceeds);
- ext.setFilesystem(null); // Don't close re-used test resources here
-
- String text = ext.getText();
- assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n");
- ext.close();
- }
-
/**
* Test to see if we can read the Date Chunk with OutlookTextExtractor.
*/
@Test
public void testReadMessageDateFailsWithOutlookTextExtractor() throws Exception {
OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageFails);
- ext.setFilesystem(null); // Don't close re-used test resources here
-
+ ext.setCloseFilesystem(false);
+
String text = ext.getText();
assertContains(text, "Date: Thu, 21 Jun 2012 14:14:04 +0000\n");
ext.close();
PrintStream stream = new PrintStream(new ByteArrayOutputStream());
HSMFDump dump = new HSMFDump(fsMessageSucceeds);
dump.dump(stream);
- }
+ }
/**
* Test to see if we can read the Date Chunk with HSMFDump.
// Check via the message date
Calendar clientSubmitTime = mapiMessageSucceeds.getMessageDate();
assertEquals(
- "Fri, 22 Jun 2012 18:32:54",
+ "Fri, 22 Jun 2012 18:32:54",
messageDateFormat.format(clientSubmitTime.getTime()));
-
+
// Fetch the property value directly
Map<MAPIProperty,List<PropertyValue>> props =
mapiMessageSucceeds.getMainChunks().getProperties();
- List<PropertyValue> pv = props.get(MAPIProperty.CLIENT_SUBMIT_TIME);
+ List<PropertyValue> pv = props.get(MAPIProperty.CLIENT_SUBMIT_TIME);
assertNotNull(pv);
assertEquals(1, pv.size());
-
+
clientSubmitTime = (Calendar)pv.get(0).getValue();
assertEquals(
- "Fri, 22 Jun 2012 18:32:54",
+ "Fri, 22 Jun 2012 18:32:54",
messageDateFormat.format(clientSubmitTime.getTime()));
}
}
import static org.apache.poi.POITestCase.assertContains;
import static org.apache.poi.POITestCase.assertNotContained;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
import java.io.FileInputStream;
import java.text.SimpleDateFormat;
@Test
public void testQuick() throws Exception {
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true);
- MAPIMessage msg = new MAPIMessage(poifs);
-
- OutlookTextExtractor ext = new OutlookTextExtractor(msg);
- String text = ext.getText();
-
- assertContains(text, "From: Kevin Roast\n");
- assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n");
- assertNotContained(text, "CC:");
- assertNotContained(text, "BCC:");
- assertNotContained(text, "Attachment:");
- assertContains(text, "Subject: Test the content transformer\n");
- Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55);
- SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
- f.setTimeZone(LocaleUtil.getUserTimeZone());
- String dateText = f.format(cal.getTime());
- assertContains(text, "Date: " + dateText + "\n");
- assertContains(text, "The quick brown fox jumps over the lazy dog");
-
- ext.close();
- poifs.close();
+ try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true);
+ MAPIMessage msg = new MAPIMessage(poifs);
+ OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
+ String text = ext.getText();
+
+ assertContains(text, "From: Kevin Roast\n");
+ assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n");
+ assertNotContained(text, "CC:");
+ assertNotContained(text, "BCC:");
+ assertNotContained(text, "Attachment:");
+ assertContains(text, "Subject: Test the content transformer\n");
+ Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55);
+ SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
+ f.setTimeZone(LocaleUtil.getUserTimeZone());
+ String dateText = f.format(cal.getTime());
+ assertContains(text, "Date: " + dateText + "\n");
+ assertContains(text, "The quick brown fox jumps over the lazy dog");
+ }
}
@Test
public void testSimple() throws Exception {
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
- MAPIMessage msg = new MAPIMessage(poifs);
-
- OutlookTextExtractor ext = new OutlookTextExtractor(msg);
- String text = ext.getText();
-
- assertContains(text, "From: Travis Ferguson\n");
- assertContains(text, "To: travis@overwrittenstack.com\n");
- assertNotContained(text, "CC:");
- assertNotContained(text, "BCC:");
- assertContains(text, "Subject: test message\n");
- assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n");
- assertContains(text, "This is a test message.");
-
- ext.close();
- poifs.close();
+ try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
+ MAPIMessage msg = new MAPIMessage(poifs);
+ OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
+ String text = ext.getText();
+
+ assertContains(text, "From: Travis Ferguson\n");
+ assertContains(text, "To: travis@overwrittenstack.com\n");
+ assertNotContained(text, "CC:");
+ assertNotContained(text, "BCC:");
+ assertContains(text, "Subject: test message\n");
+ assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n");
+ assertContains(text, "This is a test message.");
+ }
}
@Test
public void testConstructors() throws Exception {
- FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
- OutlookTextExtractor ext = new OutlookTextExtractor(fis);
- String inp = ext.getText();
- ext.close();
- fis.close();
-
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
- ext = new OutlookTextExtractor(poifs);
- String poifsTxt = ext.getText();
- ext.close();
- poifs.close();
-
- fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
- ext = new OutlookTextExtractor(new MAPIMessage(fis));
- String mapi = ext.getText();
- ext.close();
- fis.close();
+ String inp;
+ try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
+ OutlookTextExtractor ext = new OutlookTextExtractor(fis)) {
+ inp = ext.getText();
+ }
+
+ String poifsTxt;
+ try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
+ OutlookTextExtractor ext = new OutlookTextExtractor(poifs)){
+ poifsTxt = ext.getText();
+ }
+
+ String mapi;
+ try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
+ OutlookTextExtractor ext = new OutlookTextExtractor(new MAPIMessage(fis))) {
+ mapi = ext.getText();
+ }
assertEquals(inp, poifsTxt);
assertEquals(inp, mapi);
"example_sent_regular.msg", "example_sent_unicode.msg"
};
for (String file : files) {
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
- MAPIMessage msg = new MAPIMessage(poifs);
-
- OutlookTextExtractor ext = new OutlookTextExtractor(msg);
- String text = ext.getText();
-
- assertContains(text, "From: Mike Farman\n");
- assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
- "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
- assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " +
- "'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
- assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " +
- "'Vonka Jan' <jan.vonka@alfresco.com>\n");
- assertContains(text, "Subject: This is a test message please ignore\n");
- assertContains(text, "Date:");
- assertContains(text, "The quick brown fox jumps over the lazy dog");
-
- ext.close();
- poifs.close();
+ try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
+ MAPIMessage msg = new MAPIMessage(poifs);
+ OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
+ String text = ext.getText();
+
+ assertContains(text, "From: Mike Farman\n");
+ assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
+ "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
+ assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " +
+ "'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
+ assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " +
+ "'Vonka Jan' <jan.vonka@alfresco.com>\n");
+ assertContains(text, "Subject: This is a test message please ignore\n");
+ assertContains(text, "Date:");
+ assertContains(text, "The quick brown fox jumps over the lazy dog");
+ }
}
}
"example_received_regular.msg", "example_received_unicode.msg"
};
for (String file : files) {
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
- MAPIMessage msg = new MAPIMessage(poifs);
-
-
- OutlookTextExtractor ext = new OutlookTextExtractor(msg);
- String text = ext.getText();
-
- assertContains(text, "From: Mike Farman\n");
- assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
- "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
- assertContains(text, "CC: nickb@alfresco.com; " +
- "nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
- assertNotContained(text, "BCC:");
- assertContains(text, "Subject: This is a test message please ignore\n");
- assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly
- assertContains(text, "The quick brown fox jumps over the lazy dog");
-
- ext.close();
- poifs.close();
+ try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
+ MAPIMessage msg = new MAPIMessage(poifs);
+ OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
+ String text = ext.getText();
+
+ assertContains(text, "From: Mike Farman\n");
+ assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
+ "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
+ assertContains(text, "CC: nickb@alfresco.com; " +
+ "nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
+ assertNotContained(text, "BCC:");
+ assertContains(text, "Subject: This is a test message please ignore\n");
+ assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly
+ assertContains(text, "The quick brown fox jumps over the lazy dog");
+ }
}
}
@SuppressWarnings("JavadocReference")
@Test
public void testWithAttachments() throws Exception {
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true);
- MAPIMessage msg = new MAPIMessage(poifs);
- OutlookTextExtractor ext = new OutlookTextExtractor(msg);
-
- // Check the normal bits
- String text = ext.getText();
-
- assertContains(text, "From: Nicolas1");
- assertContains(text, "To: 'nicolas1.23456@free.fr'");
- assertNotContained(text, "CC:");
- assertNotContained(text, "BCC:");
- assertContains(text, "Subject: test");
- assertContains(text, "Date: Wed, 22 Apr");
- assertContains(text, "Attachment: test-unicode.doc\n");
- assertContains(text, "Attachment: pj1.txt\n");
- assertContains(text, "contenu");
-
- // Embeded bits are checked in
- // TestExtractorFactory
-
- ext.close();
- poifs.close();
+ try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true);
+ MAPIMessage msg = new MAPIMessage(poifs);
+ OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
+
+ // Check the normal bits
+ String text = ext.getText();
+
+ assertContains(text, "From: Nicolas1");
+ assertContains(text, "To: 'nicolas1.23456@free.fr'");
+ assertNotContained(text, "CC:");
+ assertNotContained(text, "BCC:");
+ assertContains(text, "Subject: test");
+ assertContains(text, "Date: Wed, 22 Apr");
+ assertContains(text, "Attachment: test-unicode.doc\n");
+ assertContains(text, "Attachment: pj1.txt\n");
+ assertContains(text, "contenu");
+
+ // Embedded bits are checked in
+ // TestExtractorFactory
+ }
}
@Test
public void testWithAttachedMessage() throws Exception {
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true);
- MAPIMessage msg = new MAPIMessage(poifs);
- OutlookTextExtractor ext = new OutlookTextExtractor(msg);
- String text = ext.getText();
-
- // Check we got bits from the main message
- assertContains(text, "Master mail");
- assertContains(text, "ante in lacinia euismod");
+ try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true);
+ MAPIMessage msg = new MAPIMessage(poifs);
+ OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
+ String text = ext.getText();
- // But not the attached message
- assertNotContained(text, "Test mail attachment");
- assertNotContained(text, "Lorem ipsum dolor sit");
+ // Check we got bits from the main message
+ assertContains(text, "Master mail");
+ assertContains(text, "ante in lacinia euismod");
- ext.close();
- poifs.close();
+ // But not the attached message
+ assertNotContained(text, "Test mail attachment");
+ assertNotContained(text, "Lorem ipsum dolor sit");
+ }
}
@Test
public void testEncodings() throws Exception {
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true);
- MAPIMessage msg = new MAPIMessage(poifs);
- OutlookTextExtractor ext = new OutlookTextExtractor(msg);
- String text = ext.getText();
-
- // Check the english bits
- assertContains(text, "From: Tests Chang@FT");
- assertContains(text, "tests.chang@fengttt.com");
-
- // And check some chinese bits
- assertContains(text, "(\u5f35\u6bd3\u502b)");
- assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
+ try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true);
+ MAPIMessage msg = new MAPIMessage(poifs);
+ OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
+ String text = ext.getText();
- ext.close();
- poifs.close();
- }
+ // Check the english bits
+ assertContains(text, "From: Tests Chang@FT");
+ assertContains(text, "tests.chang@fengttt.com");
- @Test
- public void testEncodingsDeprecatedClass() throws Exception {
- POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true);
- MAPIMessage msg = new MAPIMessage(poifs);
- OutlookTextExtactor ext = new OutlookTextExtactor(msg);
- assertTrue("OutlookTextExtactor instanceof OutlookTextExtractor", ext instanceof OutlookTextExtractor);
- String text = ext.getText();
-
- // Check the english bits
- assertContains(text, "From: Tests Chang@FT");
- assertContains(text, "tests.chang@fengttt.com");
-
- // And check some chinese bits
- assertContains(text, "(\u5f35\u6bd3\u502b)");
- assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
-
- ext.close();
- poifs.close();
+ // And check some chinese bits
+ assertContains(text, "(\u5f35\u6bd3\u502b)");
+ assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
+ }
}
}
package org.apache.poi.hwpf.extractor;
-import org.apache.poi.POIDataSamples;
-import org.apache.poi.extractor.POITextExtractor;
-import org.apache.poi.extractor.OLE2ExtractorFactory;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.junit.Test;
+import static org.junit.Assert.assertNotNull;
import java.io.IOException;
import java.io.InputStream;
-import static org.junit.Assert.assertNotNull;
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.extractor.POITextExtractor;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.junit.Test;
/**
* Tests for bugs with the WordExtractor
@Test
public void testBug60374() throws Exception {
POIFSFileSystem fs = new POIFSFileSystem(SAMPLES.openResourceAsStream("cn.orthodox.www_divenbog_APRIL_30-APRIL.DOC"));
- final POITextExtractor extractor = OLE2ExtractorFactory.createExtractor(fs);
+ final POITextExtractor extractor = ExtractorFactory.createExtractor(fs);
// Check it gives text without error
assertNotNull(extractor.getText());
import java.io.InputStream;
import org.apache.poi.POIDataSamples;
-import org.apache.poi.hpsf.*;
+import org.apache.poi.hpsf.Thumbnail;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
@Test
public void testConstructors() throws IOException {
- POIFSFileSystem fs;
- HSSFWorkbook wb;
- try {
- fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls"));
- wb = new HSSFWorkbook(fs);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- ExcelExtractor excelExt = new ExcelExtractor(wb);
-
final String fsText;
- HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs);
- fsExt.setFilesystem(null); // Don't close re-used test resources!
- try {
- fsText = fsExt.getText();
- } finally {
- fsExt.close();
- }
-
final String hwText;
- HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb);
- hwExt.setFilesystem(null); // Don't close re-used test resources!
- try {
- hwText = hwExt.getText();
- } finally {
- hwExt.close();
- }
-
final String eeText;
- HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt);
- eeExt.setFilesystem(null); // Don't close re-used test resources!
- try {
- eeText = eeExt.getText();
- } finally {
- eeExt.close();
- wb.close();
+
+ try (POIFSFileSystem fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls"));
+ HSSFWorkbook wb = new HSSFWorkbook(fs);
+ ExcelExtractor excelExt = new ExcelExtractor(wb)) {
+
+ try (HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs)) {
+ // Don't close re-used test resources!
+ fsExt.setCloseFilesystem(false);
+ fsText = fsExt.getText();
+ }
+
+ try (HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb)) {
+ // Don't close re-used test resources!
+ hwExt.setCloseFilesystem(false);
+ hwText = hwExt.getText();
+ }
+
+ try (HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt)) {
+ // Don't close re-used test resources!
+ eeExt.setCloseFilesystem(false);
+ eeText = eeExt.getText();
+ }
}
assertEquals(fsText, hwText);
private static ExcelExtractor createExtractor(String sampleFileName) throws IOException {
File file = HSSFTestDataSamples.getSampleFile(sampleFileName);
POIFSFileSystem fs = new POIFSFileSystem(file);
- ExcelExtractor extractor = new ExcelExtractor(fs);
- extractor.setFilesystem(fs);
- return extractor;
+ return new ExcelExtractor(fs);
}
@Test
extractor.setIncludeBlankCells(false);
extractor.setIncludeSheetNames(false);
String text = extractor.getText();
-
+
// Note - not all the formats in the file
// actually quite match what they claim to
// be, as some are auto-local builtins...
-
+
assertStartsWith(text, "Dates, all 24th November 2006\n");
assertContains(text, "yyyy/mm/dd\t2006/11/24\n");
assertContains(text, "yyyy-mm-dd\t2006-11-24\n");
assertContains(text, "dd-mm-yy\t24-11-06\n");
-
+
assertContains(text, "nn.nn\t10.52\n");
assertContains(text, "nn.nnn\t10.520\n");
assertContains(text, "\u00a3nn.nn\t\u00a310.52\n");
@Test
public void testWithEmbeded() throws Exception {
POIFSFileSystem fs = null;
-
+
HSSFWorkbook wbA = null, wbB = null;
ExcelExtractor exA = null, exB = null;
DirectoryNode objPool = (DirectoryNode) fs.getRoot().getEntry("ObjectPool");
DirectoryNode dirA = (DirectoryNode) objPool.getEntry("_1269427460");
DirectoryNode dirB = (DirectoryNode) objPool.getEntry("_1269427461");
-
+
wbA = new HSSFWorkbook(dirA, fs, true);
exA = new ExcelExtractor(wbA);
wbB = new HSSFWorkbook(dirB, fs, true);
exB = new ExcelExtractor(wbB);
assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", exA.getText());
assertEquals("Sample Excel", exA.getSummaryInformation().getTitle());
-
+
assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", exB.getText());
assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle());
-
+
// And the base file too
ex = new ExcelExtractor(fs);
assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n", ex.getText());