From dfdf9e6d6f470b82ad2a6b77e3059dd0df23905b Mon Sep 17 00:00:00 2001 From: Andreas Beeker Date: Thu, 13 Aug 2020 21:08:24 +0000 Subject: [PATCH] #64411 - Provide JigSaw modules - rework extractors - see bugzilla entry for more information git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1880839 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/stress/AbstractFileHandler.java | 2 +- .../apache/poi/stress/XSLFFileHandler.java | 13 +- .../poi/extractor/ExtractorFactory.java | 304 ++++++++++++++ .../poi/extractor/ExtractorProvider.java | 76 ++++ .../poi/extractor/MainExtractorFactory.java | 76 ++++ .../poi/extractor/OLE2ExtractorFactory.java | 279 ------------- .../poi/extractor/POIOLE2TextExtractor.java | 49 +-- .../poi/extractor/POITextExtractor.java | 49 ++- .../extractor/HPSFPropertiesExtractor.java | 47 ++- .../extractor/EventBasedExcelExtractor.java | 47 ++- .../poi/hssf/extractor/ExcelExtractor.java | 36 +- .../poi/hssf/extractor/OldExcelExtractor.java | 38 +- .../poi/sl/extractor/SlideShowExtractor.java | 37 +- .../poi/ss/extractor/ExcelExtractor.java | 20 +- src/multimodule/ooxml/java9/module-info.class | Bin 2602 -> 2715 bytes src/multimodule/ooxml/java9/module-info.java | 1 + src/multimodule/ooxml/test9/module-info.class | Bin 3471 -> 3584 bytes src/multimodule/ooxml/test9/module-info.java | 1 + src/multimodule/poi/java9/module-info.class | Bin 3062 -> 3169 bytes src/multimodule/poi/java9/module-info.java | 4 + src/multimodule/poi/test9/module-info.class | Bin 3189 -> 3296 bytes src/multimodule/poi/test9/module-info.java | 2 + .../scratchpad/java9/module-info.class | Bin 2099 -> 2219 bytes .../scratchpad/java9/module-info.java | 2 + .../scratchpad/test9/module-info.class | Bin 2273 -> 2393 bytes .../scratchpad/test9/module-info.java | 2 + .../extractor/CommandLineTextExtractor.java | 6 +- .../poi/ooxml/extractor/ExtractorFactory.java | 384 ------------------ .../extractor/POIXMLExtractorFactory.java | 281 +++++++++++++ .../POIXMLPropertiesTextExtractor.java | 29 +- .../ooxml/extractor/POIXMLTextExtractor.java | 60 ++- .../xdgf/extractor/XDGFVisioExtractor.java | 43 +- .../poi/xslf/extractor/XSLFExtractor.java | 45 ++ .../XSSFBEventBasedExcelExtractor.java | 16 +- .../XSSFEventBasedExcelExtractor.java | 48 +-- .../xssf/extractor/XSSFExcelExtractor.java | 45 +- .../poi/xwpf/extractor/XWPFWordExtractor.java | 45 +- .../extractor/ooxml/TestExtractorFactory.java | 143 ++++--- .../apache/poi/openxml4j/opc/TestPackage.java | 6 +- .../poifs/crypt/tests/TestHxxFEncryption.java | 16 +- .../org/apache/poi/xslf/TestXSLFBugs.java | 2 +- .../TestXSLFPowerPointExtractor.java | 14 +- ...FEventBasedExcelExtractorUsingFactory.java | 4 +- .../TestXSSFExcelExtractorUsingFactory.java | 2 +- ...org.apache.poi.extractor.ExtractorProvider | 18 + ...org.apache.poi.extractor.ExtractorProvider | 18 + ...org.apache.poi.extractor.ExtractorProvider | 18 + .../ole2/OLE2ScratchpadExtractorFactory.java | 132 +++--- .../hdgf/extractor/VisioTextExtractor.java | 37 +- .../extractor/PublisherTextExtractor.java | 44 +- .../hslf/extractor/PowerPointExtractor.java | 279 ------------- .../poi/hslf/usermodel/HSLFSlideShow.java | 103 ++++- .../poi/hslf/usermodel/HSLFSlideShowImpl.java | 35 +- .../hsmf/extractor/OutlookTextExtactor.java | 61 --- .../hsmf/extractor/OutlookTextExtractor.java | 30 +- .../poi/hwpf/extractor/Word6Extractor.java | 53 ++- .../poi/hwpf/extractor/WordExtractor.java | 51 ++- .../hdgf/extractor/TestVisioExtractor.java | 34 +- .../poi/hsmf/TestFixedSizedProperties.java | 53 +-- .../extractor/TestOutlookTextExtractor.java | 282 ++++++------- .../hwpf/extractor/TestWordExtractorBugs.java | 14 +- .../TestHPSFPropertiesExtractor.java | 57 ++- .../hssf/extractor/TestExcelExtractor.java | 18 +- 63 files changed, 1811 insertions(+), 1800 deletions(-) create mode 100644 src/java/org/apache/poi/extractor/ExtractorFactory.java create mode 100644 src/java/org/apache/poi/extractor/ExtractorProvider.java create mode 100644 src/java/org/apache/poi/extractor/MainExtractorFactory.java delete mode 100644 src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java delete mode 100644 src/ooxml/java/org/apache/poi/ooxml/extractor/ExtractorFactory.java create mode 100644 src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java create mode 100644 src/ooxml/java/org/apache/poi/xslf/extractor/XSLFExtractor.java create mode 100644 src/resources/main/META-INF/services/org.apache.poi.extractor.ExtractorProvider create mode 100644 src/resources/ooxml/META-INF/services/org.apache.poi.extractor.ExtractorProvider create mode 100644 src/resources/scratchpad/META-INF/services/org.apache.poi.extractor.ExtractorProvider delete mode 100644 src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java delete mode 100644 src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java diff --git a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java index 94bc98b9ec..d5a019c448 100644 --- a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java @@ -29,11 +29,11 @@ import java.util.HashSet; import java.util.Set; import org.apache.poi.EncryptedDocumentException; +import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; -import org.apache.poi.ooxml.extractor.ExtractorFactory; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.ss.extractor.ExcelExtractor; import org.apache.poi.util.IOUtils; diff --git a/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java b/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java index ba6e4c1b0a..fa4d038682 100644 --- a/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java @@ -23,7 +23,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.InputStream; -import org.apache.poi.ooxml.extractor.ExtractorFactory; +import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFSlideShow; @@ -37,11 +37,11 @@ public class XSLFFileHandler extends SlideShowHandler { assertNotNull(slideInner.getPresentation()); assertNotNull(slideInner.getSlideMasterReferences()); assertNotNull(slideInner.getSlideReferences()); - + new POIXMLDocumentHandler().handlePOIXMLDocument(slide); handleSlideShow(slide); - + slideInner.close(); slide.close(); } @@ -49,11 +49,12 @@ public class XSLFFileHandler extends SlideShowHandler { @Override public void handleExtracting(File file) throws Exception { super.handleExtracting(file); - - + + // additionally try the other getText() methods - try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) { + //noinspection rawtypes + try (SlideShowExtractor extractor = (SlideShowExtractor) ExtractorFactory.createExtractor(file)) { assertNotNull(extractor); extractor.setSlidesByDefault(true); extractor.setNotesByDefault(true); diff --git a/src/java/org/apache/poi/extractor/ExtractorFactory.java b/src/java/org/apache/poi/extractor/ExtractorFactory.java new file mode 100644 index 0000000000..fa57be7a8c --- /dev/null +++ b/src/java/org/apache/poi/extractor/ExtractorFactory.java @@ -0,0 +1,304 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.extractor; + +import static org.apache.poi.hssf.record.crypto.Biff8EncryptionKey.getCurrentUserPassword; +import static org.apache.poi.poifs.crypt.EncryptionInfo.ENCRYPTION_INFO_ENTRY; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.ServiceLoader; +import java.util.stream.StreamSupport; + +import org.apache.poi.EmptyFileException; +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.poifs.crypt.Decryptor; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.IOUtils; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +/** + * Figures out the correct POIOLE2TextExtractor for your supplied + * document, and returns it. + * + *

Note 1 - will fail for many file formats if the POI Scratchpad jar is + * not present on the runtime classpath

+ *

Note 2 - for text extractor creation across all formats, use + * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within + * the OOXML jar.

+ *

Note 3 - rather than using this, for most cases you would be better + * off switching to Apache Tika instead!

+ */ +@SuppressWarnings({"WeakerAccess", "JavadocReference"}) +public final class ExtractorFactory { + private static final POILogger LOGGER = POILogFactory.getLogger(ExtractorFactory.class); + + /** Should this thread prefer event based over usermodel based extractors? */ + private static final ThreadLocal threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE); + + /** Should all threads prefer event based over usermodel based extractors? */ + private static Boolean allPreferEventExtractors; + + + private static class Singleton { + private static final ExtractorFactory INSTANCE = new ExtractorFactory(); + } + + private interface ProviderMethod { + POITextExtractor create(ExtractorProvider prov) throws IOException; + } + + private final List provider = new ArrayList<>(); + + + private ExtractorFactory() { + ServiceLoader.load(ExtractorProvider.class).forEach(provider::add); + } + + /** + * Should this thread prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is false. + * + * @return true if event extractors should be preferred in the current thread, fals otherwise. + */ + public static boolean getThreadPrefersEventExtractors() { + return threadPreferEventExtractors.get(); + } + + /** + * Should all threads prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is to use the thread level setting, which defaults to false. + * + * @return true if event extractors should be preferred in all threads, fals otherwise. + */ + public static Boolean getAllThreadsPreferEventExtractors() { + return allPreferEventExtractors; + } + + /** + * Should this thread prefer event based over usermodel based extractors? + * Will only be used if the All Threads setting is null. + * + * @param preferEventExtractors If this threads should prefer event based extractors. + */ + public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { + threadPreferEventExtractors.set(preferEventExtractors); + } + + /** + * Should all threads prefer event based over usermodel based extractors? + * If set, will take preference over the Thread level setting. + * + * @param preferEventExtractors If all threads should prefer event based extractors. + */ + public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { + allPreferEventExtractors = preferEventExtractors; + } + + /** + * Should this thread use event based extractors is available? + * Checks the all-threads one first, then thread specific. + * + * @return If the current thread should use event based extractors. + */ + public static boolean getPreferEventExtractor() { + return (allPreferEventExtractors != null) ? allPreferEventExtractors : threadPreferEventExtractors.get(); + } + + public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException { + return createExtractor(fs, getCurrentUserPassword()); + } + + public static POITextExtractor createExtractor(POIFSFileSystem fs, String password) throws IOException { + return createExtractor(fs.getRoot(), password); + } + + public static POITextExtractor createExtractor(InputStream input) throws IOException { + return createExtractor(input, getCurrentUserPassword()); + } + + public static POITextExtractor createExtractor(InputStream input, String password) throws IOException { + final InputStream is = FileMagic.prepareToCheckMagic(input); + byte[] emptyFileCheck = new byte[1]; + is.mark(emptyFileCheck.length); + if (is.read(emptyFileCheck) < emptyFileCheck.length) { + throw new EmptyFileException(); + } + is.reset(); + + final FileMagic fm = FileMagic.valueOf(is); + if (FileMagic.OOXML == fm) { + return wp(fm, w -> w.create(is, password)); + } + + if (FileMagic.OLE2 != fm) { + throw new IOException("Can't create extractor - unsupported file type: "+fm); + } + + POIFSFileSystem poifs = new POIFSFileSystem(is); + boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY); + + return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password)); + } + + public static POITextExtractor createExtractor(File file) throws IOException { + return createExtractor(file, getCurrentUserPassword()); + } + + public static POITextExtractor createExtractor(File file, String password) throws IOException { + if (file.length() == 0) { + throw new EmptyFileException(); + } + + final FileMagic fm = FileMagic.valueOf(file); + if (FileMagic.OOXML == fm) { + return wp(fm, w -> w.create(file, password)); + } + + if (FileMagic.OLE2 != fm) { + throw new IOException("Can't create extractor - unsupported file type: "+fm); + } + + POIFSFileSystem poifs = new POIFSFileSystem(file, true); + try { + boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY); + return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password)); + } catch (IOException | RuntimeException e) { + IOUtils.closeQuietly(poifs); + throw e; + } + } + + + /** + * Create the Extractor, if possible. Generally needs the Scratchpad jar. + * Note that this won't check for embedded OOXML resources either, use + * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that. + * + * @param root The {@link DirectoryNode} pointing to a document. + * + * @return The resulting {@link POITextExtractor}, an exception is thrown if + * no TextExtractor can be created for some reason. + * + * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails + * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of + * an unsupported version of Excel. + * @throws IllegalArgumentException If creating the Extractor fails + */ + public static POITextExtractor createExtractor(DirectoryNode root) throws IOException { + return createExtractor(root, getCurrentUserPassword()); + } + + public static POITextExtractor createExtractor(final DirectoryNode root, String password) throws IOException { + // Encrypted OOXML files go inside OLE2 containers, is this one? + if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY) || root.hasEntry("Package")) { + return wp(FileMagic.OOXML, w -> w.create(root, password)); + } else { + return wp(FileMagic.OLE2, w -> w.create(root, password)); + } + } + + /** + * Returns an array of text extractors, one for each of + * the embedded documents in the file (if there are any). + * If there are no embedded documents, you'll get back an + * empty array. Otherwise, you'll get one open + * {@link POITextExtractor} for each embedded file. + * + * @param ext The extractor to look at for embedded documents + * + * @return An array of resulting extractors. Empty if no embedded documents are found. + * + * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails + * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of + * an unsupported version of Excel. + * @throws IllegalArgumentException If creating the Extractor fails + */ + public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { + if (ext == null) { + throw new IllegalStateException("extractor must be given"); + } + + // All the embedded directories we spotted + List dirs = new ArrayList<>(); + // For anything else not directly held in as a POIFS directory + List nonPOIFS = new ArrayList<>(); + + // Find all the embedded directories + DirectoryEntry root = ext.getRoot(); + if(root == null) { + throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); + } + + if(ext instanceof ExcelExtractor) { + // These are in MBD... under the root + StreamSupport.stream(root.spliterator(), false) + .filter(entry -> entry.getName().startsWith("MBD")) + .forEach(dirs::add); + } else { + for (ExtractorProvider prov : Singleton.INSTANCE.provider) { + if (prov.accepts(FileMagic.OLE2)) { + prov.identifyEmbeddedResources(ext, dirs, nonPOIFS); + break; + } + } + } + + // Create the extractors + if(dirs.size() == 0 && nonPOIFS.size() == 0){ + return new POITextExtractor[0]; + } + + ArrayList textExtractors = new ArrayList<>(); + for (Entry dir : dirs) { + textExtractors.add(createExtractor((DirectoryNode) dir)); + } + for (InputStream stream : nonPOIFS) { + try { + textExtractors.add(createExtractor(stream)); + } catch (IOException e) { + // Ignore, just means it didn't contain a format we support as yet + LOGGER.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage()); + } + } + return textExtractors.toArray(new POITextExtractor[0]); + } + + private static POITextExtractor wp(FileMagic fm, ProviderMethod fun) throws IOException { + for (ExtractorProvider prov : Singleton.INSTANCE.provider) { + if (prov.accepts(fm)) { + POITextExtractor ext = fun.create(prov); + if (ext != null) { + return ext; + } + } + } + throw new IOException("Your InputStream was neither an OLE2 stream, nor an OOXML stream " + + "or you haven't provide the poi-ooxml*.jar and/or poi-scratchpad*.jar in the classpath/modulepath - FileMagic: "+fm); + } + +} diff --git a/src/java/org/apache/poi/extractor/ExtractorProvider.java b/src/java/org/apache/poi/extractor/ExtractorProvider.java new file mode 100644 index 0000000000..ccbeee15d0 --- /dev/null +++ b/src/java/org/apache/poi/extractor/ExtractorProvider.java @@ -0,0 +1,76 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.extractor; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.FileMagic; + +public interface ExtractorProvider { + boolean accepts(FileMagic fm); + + /** + * Create Extractor via file + * @param file the file + * @param password the password or {@code null} if not encrypted + * @return the extractor + * @throws IOException if file can't be read or parsed + */ + POITextExtractor create(File file, String password) throws IOException; + + /** + * Create Extractor via InputStream + * @param inputStream the stream + * @param password the password or {@code null} if not encrypted + * @return the extractor + * @throws IOException if stream can't be read or parsed + */ + POITextExtractor create(InputStream inputStream, String password) throws IOException; + + /** + * Create Extractor from POIFS node + * @param poifsDir the node + * @param password the password or {@code null} if not encrypted + * @return the extractor + * @throws IOException if node can't be parsed + */ + POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException; + + /** + * Returns an array of text extractors, one for each of + * the embedded documents in the file (if there are any). + * If there are no embedded documents, you'll get back an + * empty array. Otherwise, you'll get one open + * {@link POITextExtractor} for each embedded file. + * + * @param ext the extractor holding the directory to start parsing + * @param dirs a list to be filled with directory references holding embedded + * @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries + * + * @throws IOException when the format specific extraction fails because of invalid entires + */ + default void identifyEmbeddedResources(POIOLE2TextExtractor ext, List dirs, List nonPOIFS) throws IOException { + throw new IllegalArgumentException("Error checking for Scratchpad embedded resources"); + } + +} diff --git a/src/java/org/apache/poi/extractor/MainExtractorFactory.java b/src/java/org/apache/poi/extractor/MainExtractorFactory.java new file mode 100644 index 0000000000..7f8733eecf --- /dev/null +++ b/src/java/org/apache/poi/extractor/MainExtractorFactory.java @@ -0,0 +1,76 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.extractor; + +import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hssf.extractor.OldExcelExtractor; +import org.apache.poi.hssf.model.InternalWorkbook; +import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * ExtractorFactory for HSSF and Old Excel format + */ +public class MainExtractorFactory implements ExtractorProvider { + @Override + public boolean accepts(FileMagic fm) { + return FileMagic.OLE2 == fm; + } + + @Override + public POITextExtractor create(File file, String password) throws IOException { + return create(new POIFSFileSystem(file, true).getRoot(), password); + } + + @Override + public POITextExtractor create(InputStream inputStream, String password) throws IOException { + return create(new POIFSFileSystem(inputStream).getRoot(), password); + } + + @Override + public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException { + final String oldPW = Biff8EncryptionKey.getCurrentUserPassword(); + try { + Biff8EncryptionKey.setCurrentUserPassword(password); + + // Look for certain entries in the stream, to figure it out from + for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) { + if (poifsDir.hasEntry(workbookName)) { + return ExtractorFactory.getPreferEventExtractor() ? new EventBasedExcelExtractor(poifsDir) : new ExcelExtractor(poifsDir); + } + } + + if (poifsDir.hasEntry(InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME)) { + return new OldExcelExtractor(poifsDir); + } + } finally { + Biff8EncryptionKey.setCurrentUserPassword(oldPW); + } + + return null; + } +} diff --git a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java deleted file mode 100644 index 1f5eee039d..0000000000 --- a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java +++ /dev/null @@ -1,279 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.extractor; - -import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME; -import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES; - -import java.io.IOException; -import java.io.InputStream; -import java.lang.reflect.Method; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.poi.hssf.OldExcelFormatException; -import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; -import org.apache.poi.hssf.extractor.ExcelExtractor; -import org.apache.poi.poifs.filesystem.DirectoryEntry; -import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.Entry; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.util.POILogFactory; -import org.apache.poi.util.POILogger; - -/** - * Figures out the correct POIOLE2TextExtractor for your supplied - * document, and returns it. - * - *

Note 1 - will fail for many file formats if the POI Scratchpad jar is - * not present on the runtime classpath

- *

Note 2 - for text extractor creation across all formats, use - * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within - * the OOXML jar.

- *

Note 3 - rather than using this, for most cases you would be better - * off switching to Apache Tika instead!

- */ -@SuppressWarnings({"WeakerAccess", "JavadocReference"}) -public final class OLE2ExtractorFactory { - private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class); - - /** Should this thread prefer event based over usermodel based extractors? */ - private static final ThreadLocal threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE); - - /** Should all threads prefer event based over usermodel based extractors? */ - private static Boolean allPreferEventExtractors; - - private OLE2ExtractorFactory() { - } - - /** - * Should this thread prefer event based over usermodel based extractors? - * (usermodel extractors tend to be more accurate, but use more memory) - * Default is false. - * - * @return true if event extractors should be preferred in the current thread, fals otherwise. - */ - public static boolean getThreadPrefersEventExtractors() { - return threadPreferEventExtractors.get(); - } - - /** - * Should all threads prefer event based over usermodel based extractors? - * (usermodel extractors tend to be more accurate, but use more memory) - * Default is to use the thread level setting, which defaults to false. - * - * @return true if event extractors should be preferred in all threads, fals otherwise. - */ - public static Boolean getAllThreadsPreferEventExtractors() { - return allPreferEventExtractors; - } - - /** - * Should this thread prefer event based over usermodel based extractors? - * Will only be used if the All Threads setting is null. - * - * @param preferEventExtractors If this threads should prefer event based extractors. - */ - public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { - threadPreferEventExtractors.set(preferEventExtractors); - } - - /** - * Should all threads prefer event based over usermodel based extractors? - * If set, will take preference over the Thread level setting. - * - * @param preferEventExtractors If all threads should prefer event based extractors. - */ - public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { - allPreferEventExtractors = preferEventExtractors; - } - - /** - * Should this thread use event based extractors is available? - * Checks the all-threads one first, then thread specific. - * - * @return If the current thread should use event based extractors. - */ - public static boolean getPreferEventExtractor() { - if(allPreferEventExtractors != null) { - return allPreferEventExtractors; - } - return threadPreferEventExtractors.get(); - } - - @SuppressWarnings("unchecked") - public static T createExtractor(POIFSFileSystem fs) throws IOException { - return (T)createExtractor(fs.getRoot()); - } - - @SuppressWarnings("unchecked") - public static T createExtractor(InputStream input) throws IOException { - Class cls = getOOXMLClass(); - if (cls != null) { - // Use Reflection to get us the full OOXML-enabled version - try { - Method m = cls.getDeclaredMethod("createExtractor", InputStream.class); - return (T)m.invoke(null, input); - } catch (IllegalArgumentException iae) { - throw iae; - } catch (Exception e) { - throw new IllegalArgumentException("Error creating Extractor for InputStream", e); - } - } else { - // Best hope it's OLE2.... - return createExtractor(new POIFSFileSystem(input)); - } - } - - private static Class getOOXMLClass() { - try { - return OLE2ExtractorFactory.class.getClassLoader().loadClass( - "org.apache.poi.extractor.ExtractorFactory" - ); - } catch (ClassNotFoundException e) { - LOGGER.log(POILogger.WARN, "POI OOXML jar missing"); - return null; - } - } - private static Class getScratchpadClass() { - try { - return OLE2ExtractorFactory.class.getClassLoader().loadClass( - "org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory" - ); - } catch (ClassNotFoundException e) { - LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing"); - throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory"); - } - } - - /** - * Create the Extractor, if possible. Generally needs the Scratchpad jar. - * Note that this won't check for embedded OOXML resources either, use - * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that. - * - * @param poifsDir The {@link DirectoryNode} pointing to a document. - * - * @return The resulting {@link POITextExtractor}, an exception is thrown if - * no TextExtractor can be created for some reason. - * - * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails - * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of - * an unsupported version of Excel. - * @throws IllegalArgumentException If creating the Extractor fails - */ - public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException { - // Look for certain entries in the stream, to figure it - // out from - for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) { - if (poifsDir.hasEntry(workbookName)) { - if (getPreferEventExtractor()) { - return new EventBasedExcelExtractor(poifsDir); - } - return new ExcelExtractor(poifsDir); - } - } - if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) { - throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) " - + "found. Please call OldExcelExtractor directly for basic text extraction"); - } - - // Ask Scratchpad, or fail trying - Class cls = getScratchpadClass(); - try { - Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class); - POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir); - if (ext != null) return ext; - } catch (IllegalArgumentException iae) { - throw iae; - } catch (Exception e) { - throw new IllegalArgumentException("Error creating Scratchpad Extractor", e); - } - - throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); - } - - /** - * Returns an array of text extractors, one for each of - * the embedded documents in the file (if there are any). - * If there are no embedded documents, you'll get back an - * empty array. Otherwise, you'll get one open - * {@link POITextExtractor} for each embedded file. - * - * @param ext The extractor to look at for embedded documents - * - * @return An array of resulting extractors. Empty if no embedded documents are found. - * - * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails - * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of - * an unsupported version of Excel. - * @throws IllegalArgumentException If creating the Extractor fails - */ - public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { - // All the embedded directories we spotted - List dirs = new ArrayList<>(); - // For anything else not directly held in as a POIFS directory - List nonPOIFS = new ArrayList<>(); - - // Find all the embedded directories - DirectoryEntry root = ext.getRoot(); - if(root == null) { - throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); - } - - if(ext instanceof ExcelExtractor) { - // These are in MBD... under the root - Iterator it = root.getEntries(); - while(it.hasNext()) { - Entry entry = it.next(); - if(entry.getName().startsWith("MBD")) { - dirs.add(entry); - } - } - } else { - // Ask Scratchpad, or fail trying - Class cls = getScratchpadClass(); - try { - Method m = cls.getDeclaredMethod( - "identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class); - m.invoke(null, ext, dirs, nonPOIFS); - } catch (Exception e) { - throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e); - } - } - - // Create the extractors - if(dirs.size() == 0 && nonPOIFS.size() == 0){ - return new POITextExtractor[0]; - } - - ArrayList e = new ArrayList<>(); - for (Entry dir : dirs) { - e.add(createExtractor((DirectoryNode) dir - )); - } - for (InputStream stream : nonPOIFS) { - try { - e.add(createExtractor(stream)); - } catch (Exception xe) { - // Ignore, invalid format - LOGGER.log(POILogger.WARN, xe); - } - } - return e.toArray(new POITextExtractor[0]); - } -} diff --git a/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java b/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java index 465de40375..a389b71d0c 100644 --- a/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java +++ b/src/java/org/apache/poi/extractor/POIOLE2TextExtractor.java @@ -30,55 +30,28 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry; * org.apache.poi.[format].extractor . * * @see org.apache.poi.hssf.extractor.ExcelExtractor - * @see org.apache.poi.hslf.extractor.PowerPointExtractor * @see org.apache.poi.hdgf.extractor.VisioTextExtractor * @see org.apache.poi.hwpf.extractor.WordExtractor */ -public abstract class POIOLE2TextExtractor extends POITextExtractor { - /** The POIDocument that's open */ - protected POIDocument document; - - /** - * Creates a new text extractor for the given document - * - * @param document The POIDocument to use in this extractor. - */ - public POIOLE2TextExtractor(POIDocument document) { - this.document = document; - - // Ensure any underlying resources, such as open files, - // will get cleaned up if the user calls #close() - setFilesystem(document); - } - - /** - * Creates a new text extractor, using the same - * document as another text extractor. Normally - * only used by properties extractors. - * - * @param otherExtractor the extractor which document to be used - */ - protected POIOLE2TextExtractor(POIOLE2TextExtractor otherExtractor) { - this.document = otherExtractor.document; - } - +public interface POIOLE2TextExtractor extends POITextExtractor { /** * Returns the document information metadata for the document * * @return The Document Summary Information or null * if it could not be read for this document. */ - public DocumentSummaryInformation getDocSummaryInformation() { - return document.getDocumentSummaryInformation(); + default DocumentSummaryInformation getDocSummaryInformation() { + return getDocument().getDocumentSummaryInformation(); } + /** * Returns the summary information metadata for the document. * * @return The Summary information for the document or null * if it could not be read for this document. */ - public SummaryInformation getSummaryInformation() { - return document.getSummaryInformation(); + default SummaryInformation getSummaryInformation() { + return getDocument().getSummaryInformation(); } /** @@ -88,7 +61,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { * @return an instance of POIExtractor that can extract meta-data. */ @Override - public POITextExtractor getMetadataTextExtractor() { + default POITextExtractor getMetadataTextExtractor() { return new HPSFPropertiesExtractor(this); } @@ -97,8 +70,8 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { * * @return the DirectoryEntry that is associated with the POIDocument of this extractor. */ - public DirectoryEntry getRoot() { - return document.getDirectory(); + default DirectoryEntry getRoot() { + return getDocument().getDirectory(); } /** @@ -107,7 +80,5 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { * @return the underlying POIDocument */ @Override - public POIDocument getDocument() { - return document; - } + POIDocument getDocument(); } \ No newline at end of file diff --git a/src/java/org/apache/poi/extractor/POITextExtractor.java b/src/java/org/apache/poi/extractor/POITextExtractor.java index e32adcb12c..cf88c57bae 100644 --- a/src/java/org/apache/poi/extractor/POITextExtractor.java +++ b/src/java/org/apache/poi/extractor/POITextExtractor.java @@ -21,19 +21,16 @@ import java.io.IOException; /** * Common Parent for Text Extractors - * of POI Documents. + * of POI Documents. * You will typically find the implementation of * a given format's text extractor under * org.apache.poi.[format].extractor . - * + * * @see org.apache.poi.hssf.extractor.ExcelExtractor - * @see org.apache.poi.hslf.extractor.PowerPointExtractor * @see org.apache.poi.hdgf.extractor.VisioTextExtractor * @see org.apache.poi.hwpf.extractor.WordExtractor */ -public abstract class POITextExtractor implements Closeable { - private Closeable fsToClose; - +public interface POITextExtractor extends Closeable { /** * Retrieves all the text from the document. * How cells, paragraphs etc are separated in the text @@ -41,42 +38,50 @@ public abstract class POITextExtractor implements Closeable { * a specific project for details. * @return All the text from the document */ - public abstract String getText(); - + String getText(); + /** * Returns another text extractor, which is able to * output the textual content of the document * metadata / properties, such as author and title. - * + * * @return the metadata and text extractor */ - public abstract POITextExtractor getMetadataTextExtractor(); + POITextExtractor getMetadataTextExtractor(); /** - * Used to ensure file handle cleanup. - * - * @param fs filesystem to close + * @param doCloseFilesystem {@code true} (default), if underlying resources/filesystem should be + * closed on {@link #close()} */ - public void setFilesystem(Closeable fs) { - fsToClose = fs; - } - + void setCloseFilesystem(boolean doCloseFilesystem); + + /** + * @return {@code true}, if resources/filesystem should be closed on {@link #close()} + */ + boolean isCloseFilesystem(); + + /** + * @return The underlying resources/filesystem + */ + Closeable getFilesystem(); + /** * Allows to free resources of the Extractor as soon as * it is not needed any more. This may include closing * open file handles and freeing memory. - * + * * The Extractor cannot be used after close has been called. */ @Override - public void close() throws IOException { - if(fsToClose != null) { - fsToClose.close(); + default void close() throws IOException { + Closeable fs = getFilesystem(); + if (isCloseFilesystem() && fs != null) { + fs.close(); } } /** * @return the processed document */ - public abstract Object getDocument(); + Object getDocument(); } diff --git a/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java index 0069fafa9d..fde938d258 100644 --- a/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java +++ b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java @@ -17,9 +17,6 @@ package org.apache.poi.hpsf.extractor; -import java.io.File; -import java.io.IOException; - import org.apache.poi.POIDocument; import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POITextExtractor; @@ -37,15 +34,20 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * build in and custom, returning them in * textual form. */ -public class HPSFPropertiesExtractor extends POIOLE2TextExtractor { +public class HPSFPropertiesExtractor implements POIOLE2TextExtractor { + private final POIDocument document; + private boolean doCloseFilesystem = true; + public HPSFPropertiesExtractor(POIOLE2TextExtractor mainExtractor) { - super(mainExtractor); + document = mainExtractor.getDocument(); } - public HPSFPropertiesExtractor(POIDocument doc) { - super(doc); + + public HPSFPropertiesExtractor(POIDocument document) { + this.document = document; } + public HPSFPropertiesExtractor(POIFSFileSystem fs) { - super(new HPSFPropertiesOnlyDocument(fs)); + document = new HPSFPropertiesOnlyDocument(fs); } public String getDocumentSummaryInformationText() { @@ -122,11 +124,11 @@ public class HPSFPropertiesExtractor extends POIOLE2TextExtractor { } private static String getPropertyValueText(Object val) { - return (val == null) + return (val == null) ? "(not set)" : PropertySet.getPropertyStringValue(val); } - + @Override public boolean equals(Object o) { return super.equals(o); @@ -137,12 +139,23 @@ public class HPSFPropertiesExtractor extends POIOLE2TextExtractor { return super.hashCode(); } - public static void main(String[] args) throws IOException { - for (String file : args) { - try (HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor( - new POIFSFileSystem(new File(file)))) { - System.out.println(ext.getText()); - } - } + @Override + public POIDocument getDocument() { + return document; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public POIDocument getFilesystem() { + return document; } } diff --git a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java index 122eddf109..68c69cf1d3 100644 --- a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java @@ -17,6 +17,7 @@ package org.apache.poi.hssf.extractor; +import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -37,9 +38,9 @@ import org.apache.poi.hssf.record.LabelRecord; import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.NoteRecord; import org.apache.poi.hssf.record.NumberRecord; -import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.hssf.record.StringRecord; +import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -56,29 +57,31 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * To turn an excel file into a CSV or similar, then see * the XLS2CSVmra example *

- * + * * @see XLS2CSVmra */ -public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor { - private DirectoryNode _dir; +public class EventBasedExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor { + private final POIFSFileSystem poifs; + private final DirectoryNode _dir; + private boolean doCloseFilesystem = true; boolean _includeSheetNames = true; boolean _formulasNotResults; - public EventBasedExcelExtractor( DirectoryNode dir ) - { - super( (POIDocument)null ); + public EventBasedExcelExtractor(DirectoryNode dir) { + poifs = null; _dir = dir; } public EventBasedExcelExtractor(POIFSFileSystem fs) { - this(fs.getRoot()); - super.setFilesystem(fs); + poifs = fs; + _dir = fs.getRoot(); } /** * Would return the document information metadata for the document, * if we supported it */ + @Override public DocumentSummaryInformation getDocSummaryInformation() { throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor"); } @@ -86,6 +89,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or * Would return the summary information metadata for the document, * if we supported it */ + @Override public SummaryInformation getSummaryInformation() { throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor"); } @@ -262,4 +266,29 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or } } } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public Closeable getFilesystem() { + return poifs; + } + + @Override + public POIDocument getDocument() { + return null; + } + + @Override + public DirectoryEntry getRoot() { + return _dir; + } } diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java index ec40f097f1..5fa855546f 100644 --- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java @@ -50,12 +50,13 @@ import org.apache.poi.ss.usermodel.Row.MissingCellPolicy; * To turn an excel file into a CSV or similar, then see * the XLS2CSVmra example *

- * + * * @see XLS2CSVmra */ -public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor { +public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor { private final HSSFWorkbook _wb; private final HSSFDataFormatter _formatter; + private boolean doCloseFilesystem = true; private boolean _includeSheetNames = true; private boolean _shouldEvaluateFormulas = true; private boolean _includeCellComments; @@ -63,13 +64,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p private boolean _includeHeadersFooters = true; public ExcelExtractor(HSSFWorkbook wb) { - super(wb); _wb = wb; _formatter = new HSSFDataFormatter(); } + public ExcelExtractor(POIFSFileSystem fs) throws IOException { this(fs.getRoot()); } + public ExcelExtractor(DirectoryNode dir) throws IOException { this(new HSSFWorkbook(dir, true)); } @@ -201,9 +203,9 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p /** * Command line extractor. - * + * * @param args the command line parameters - * + * * @throws IOException if the file can't be read or contains errors */ public static void main(String[] args) throws IOException { @@ -225,7 +227,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile()); HSSFWorkbook wb = new HSSFWorkbook(is); - ExcelExtractor extractor = new ExcelExtractor(wb); + ExcelExtractor extractor = new ExcelExtractor(wb) ) { extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames()); extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas()); @@ -255,7 +257,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p * Should blank cells be output? Default is to only * output cells that are present in the file and are * non-blank. - * + * * @param includeBlankCells {@code true} if blank cells should be included */ public void setIncludeBlankCells(boolean includeBlankCells) { @@ -411,4 +413,24 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p return text.toString(); } + + @Override + public HSSFWorkbook getDocument() { + return _wb; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public HSSFWorkbook getFilesystem() { + return _wb; + } } diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java index a4f334a4e6..56b1424b3d 100644 --- a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.io.InputStream; import org.apache.poi.EncryptedDocumentException; +import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.hssf.OldExcelFormatException; import org.apache.poi.hssf.record.BOFRecord; import org.apache.poi.hssf.record.CodepageRecord; @@ -58,7 +59,7 @@ import org.apache.poi.util.IOUtils; * by Apache Tika, but not really intended for display to the user. *

*/ -public class OldExcelExtractor implements Closeable { +public class OldExcelExtractor implements POITextExtractor { private final static int FILE_PASS_RECORD_SID = 0x2f; //arbitrarily selected; may need to increase @@ -295,24 +296,39 @@ public class OldExcelExtractor implements Closeable { } } - close(); ris = null; return text.toString(); } - @Override - public void close() { - // some cases require this close here - if(toClose != null) { - IOUtils.closeQuietly(toClose); - toClose = null; - } - } - protected void handleNumericCell(StringBuilder text, double value) { // TODO Need to fetch / use format strings text.append(value); text.append('\n'); } + + @Override + public POITextExtractor getMetadataTextExtractor() { + return null; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + + } + + @Override + public boolean isCloseFilesystem() { + return toClose != null; + } + + @Override + public Closeable getFilesystem() { + return toClose; + } + + @Override + public Object getDocument() { + return ris; + } } diff --git a/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java b/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java index d4a2645ca2..fa454501d6 100644 --- a/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java +++ b/src/java/org/apache/poi/sl/extractor/SlideShowExtractor.java @@ -54,14 +54,14 @@ import org.apache.poi.util.POILogger; public class SlideShowExtractor< S extends Shape, P extends TextParagraph -> extends POITextExtractor { +> implements POITextExtractor { private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class); // placeholder text for slide numbers private static final String SLIDE_NUMBER_PH = "‹#›"; - private SlideShow slideshow; + protected final SlideShow slideshow; private boolean slidesByDefault = true; private boolean notesByDefault; @@ -69,9 +69,9 @@ public class SlideShowExtractor< private boolean masterByDefault; private Predicate filter = o -> true; + private boolean doCloseFilesystem = true; public SlideShowExtractor(final SlideShow slideshow) { - setFilesystem(slideshow); this.slideshow = slideshow; } @@ -81,8 +81,8 @@ public class SlideShowExtractor< * @return the opened document */ @Override - public final Object getDocument() { - return slideshow.getPersistDocument(); + public SlideShow getDocument() { + return slideshow; } /** @@ -339,17 +339,17 @@ public class SlideShowExtractor< return raw; } - TextParagraph tp = tr.getParagraph(); - TextShape ps = (tp != null) ? tp.getParentShape() : null; - Sheet sh = (ps != null) ? ps.getSheet() : null; - String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide)sh).getSlideNumber() + 1) : ""; + TextParagraph tp = tr.getParagraph(); + TextShape ps = (tp != null) ? tp.getParentShape() : null; + Sheet sh = (ps != null) ? ps.getSheet() : null; + String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide)sh).getSlideNumber() + 1) : ""; return raw.replace(SLIDE_NUMBER_PH, slideNr); } private static String replaceTextCap(TextRun tr) { - final TextParagraph tp = tr.getParagraph(); - final TextShape sh = (tp != null) ? tp.getParentShape() : null; + final TextParagraph tp = tr.getParagraph(); + final TextShape sh = (tp != null) ? tp.getParentShape() : null; final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null; // 0xB acts like cariage return in page titles and like blank in the others @@ -438,4 +438,19 @@ public class SlideShowExtractor< (italic == null || tr.isItalic() == italic) && (bold == null || tr.isBold() == bold); } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public SlideShow getFilesystem() { + return getDocument(); + } } diff --git a/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java b/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java index 82a31592bf..9adc35cb84 100644 --- a/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java +++ b/src/java/org/apache/poi/ss/extractor/ExcelExtractor.java @@ -24,39 +24,39 @@ public interface ExcelExtractor { /** * Should sheet names be included? * Default is true - * + * * @param includeSheetNames {@code true} if the sheet names should be included */ - public void setIncludeSheetNames(boolean includeSheetNames); + void setIncludeSheetNames(boolean includeSheetNames); /** * Should we return the formula itself, and not the result it produces? * Default is false - * + * * @param formulasNotResults {@code true} if the formula itself is returned */ - public void setFormulasNotResults(boolean formulasNotResults); + void setFormulasNotResults(boolean formulasNotResults); /** * Should headers and footers be included in the output? * Default is true - * + * * @param includeHeadersFooters {@code true} if headers and footers should be included */ - public void setIncludeHeadersFooters(boolean includeHeadersFooters); + void setIncludeHeadersFooters(boolean includeHeadersFooters); /** * Should cell comments be included? * Default is false - * + * * @param includeCellComments {@code true} if cell comments should be included */ - public void setIncludeCellComments(boolean includeCellComments); + void setIncludeCellComments(boolean includeCellComments); /** * Retrieves the text contents of the file - * + * * @return the text contents of the file */ - public String getText(); + String getText(); } diff --git a/src/multimodule/ooxml/java9/module-info.class b/src/multimodule/ooxml/java9/module-info.class index 8dd7fcb574f36329ccb84eebadeac8048484eb6a..0895d4b496877476aabecd9fc83cf487727451ee 100644 GIT binary patch delta 144 zcmZ1_GFz1E)W2Q(7#J8#8QL~-wQ-oSGc+?YXyq5B>n9c@CTFDT7vyK^r&g2{B_@~T z7wNmgSOG=(Wtl0dMeGc%j0~oe`B_9Kv2beZ2l#tN`1-(Axq)?6HZU+S0_}hRMh0de Xw`cPbj(SF3CWbnoR0Bg3kZb_}=ddUu delta 53 zcmbO&x=MuW)W2Q(7#J8#8Jae7wQ-0vFfcFzxe&m}zzpQJZRY2!XJlt&r~^th003O> B3M&8r diff --git a/src/multimodule/ooxml/java9/module-info.java b/src/multimodule/ooxml/java9/module-info.java index 794e086e4e..876e6a3b1d 100644 --- a/src/multimodule/ooxml/java9/module-info.java +++ b/src/multimodule/ooxml/java9/module-info.java @@ -29,6 +29,7 @@ module org.apache.poi.ooxml { requires java.security.jgss; provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory; + provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory; exports org.apache.poi.xwpf.extractor; exports org.apache.poi.xwpf.usermodel; diff --git a/src/multimodule/ooxml/test9/module-info.class b/src/multimodule/ooxml/test9/module-info.class index 8d3730adb6d15a55b8b38c5afc8aa8a799b16ab4..7777b5f6815c43d46b442fd5fa3a192b4595e44e 100644 GIT binary patch delta 109 zcmeB|Zjj+R^>5cc1_lOGh7B9J+_~A<8P+l~XifI#kQ8^VC@D%zF3B$nD9SI(Oi3+b zXIRh3U^>~5MN~2sCZ`|Z?-}9i16MoQi(87(U~?+BG7~Qo!wLpQhE)t}faE#=LR%n1 delta 53 zcmZpW>6hj@^>5cc1_lOGhBX_x+_^;>7#J9VTnJ!fU}j)o(%O81TbYTSkzoZ;Y83!f CNeWT` diff --git a/src/multimodule/ooxml/test9/module-info.java b/src/multimodule/ooxml/test9/module-info.java index 36427adde4..bd8a23a725 100644 --- a/src/multimodule/ooxml/test9/module-info.java +++ b/src/multimodule/ooxml/test9/module-info.java @@ -29,6 +29,7 @@ module org.apache.poi.ooxml { requires java.security.jgss; provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory; + provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory; exports org.apache.poi.xwpf.extractor; exports org.apache.poi.xwpf.usermodel; diff --git a/src/multimodule/poi/java9/module-info.class b/src/multimodule/poi/java9/module-info.class index a8abd6c13b22fdeb9962d3ba55f99290a2cf98b1..ba2f2330ed860c9fc5c11cc23c4641012eb8e300 100644 GIT binary patch delta 108 zcmew+{!oJJ)W2Q(7#J8#85V5hy1~h)HF+MRgs8r2MM+U&a!GzsKv8~KW=d)iJHy<` z4>(QO*cs+CGU$R7@#*^}X6C^aPUhv3V!XClgX=J(FcZTJhFJ_Cl96E!kj2O_4*=lN BA#(r# delta 46 ycmaDT@lBlT)W2Q(7#J8#8Rl%{y1_YFh|6elIhQcwxy{{NhZ(sU8D=m*@GJn3hYooF diff --git a/src/multimodule/poi/java9/module-info.java b/src/multimodule/poi/java9/module-info.java index 1ae77363e7..2995a21864 100644 --- a/src/multimodule/poi/java9/module-info.java +++ b/src/multimodule/poi/java9/module-info.java @@ -28,8 +28,12 @@ module org.apache.poi.poi { requires jdk.unsupported; uses org.apache.poi.ss.usermodel.WorkbookProvider; + uses org.apache.poi.extractor.ExtractorProvider; + provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory; + provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory; + exports org.apache.poi; exports org.apache.poi.common; diff --git a/src/multimodule/poi/test9/module-info.class b/src/multimodule/poi/test9/module-info.class index 7695551235910ee70f64d82eafe1be242afbc3ad..9010295aa09fe706e8109f234dfa5f2a9527416f 100644 GIT binary patch delta 115 zcmew=@j#O6)W2Q(7#J8#8CGoMvgTscn!JZmLPFoQqNFG>xg@_RpeVmAGbOc%ona{> zgW=?jETWU6xn$Vb8J06L=z`Sp>H8*T=E2oZF5r@4{It27>pr6}6T0Fmt@PXGV_ delta 67 zcmaDL`Bj4J)W2Q(7#J8#8J2A1vgTqmoV<}mbaFSBj7S3m10zrZ0vH*Xf!tS{-*Me% N`HnpUmw1car`6=jSf~I1jb}n*{VnanPvg>9Agj1@tb^T@khj*cY3nl1h|c z)ukj3OQ~;^^{a^rick>H_f0ibsjO06ggya79ntti5&8uTP*|YTWg90&7!)w*$59m5 z(^8~t^{j*)0*d&smzDDHL9ZI=iiBMPh8e*NbW=i6zp%;jg5reT|$(#CM-9s zmN3lJ4wkU?KBhJbS?y5)qn)*#m|CM%luFpoRCWPvi<=NI#^u2F>sA{~azMa17oorV zIpmcoZxteZf4$fDb=tc&g}uX@nd8h4b1?oBQS|F0m7pRjq0EyM(matvS0fOZ#_aiTT3e`n*ILd_U2FkV&F3 zAlr5iFC{DsIN0gw`k18Fz+4gIri;toFg%queCVB;?|cDP1Wa`l%KP=^{v8NUP23Cv zZ3>VInC%+x-bin8es1PPe%|FN?$OEeV+}yy5t{u8*o?O?p@(jJpm)t(Y=ueWHg^dF z#51s+es>awh?2O6xR*FW93zesCy57%hlxjt$B9$KQ^XSSG;xM_j(CB1iFk!LOT13J zNxV(GOT0&XKzu};Cq5xQB|ay^U9 literal 2099 zcmaKt*|Oydbf(i$PH*%V`)n==Z`P2rEfGP-$t-L5}yhwX!G>kjwDota*P)#|;jKAWN%^$DU{Oxb|XOtw3wF(l8 zH={O1Gh*A#X;xAl^Y$^Dtk*mpr9Ril!9JUTSE^Q)SSL zhmpLnvZ7;-w1eK1XQnG=e>KrD&yg5FS`TO{W2OVKRXU)ZCQAi{dci;q=V_SUprzp9e1+T+%YwnLCsOwjT0ohaQTeQdOVDtUuoQ-)f4! zQO(Tv#yE&n%y)@1%``)^TI;0Cj}9i=tJZ1|m8-UT8KVw&!bpu~t^*GLpw<2yY~t4i zY?jw9bgi3*Etp_-n@98rXRuZN_X@WO`-Izt{lXo>ox)wh-NFH3PDsK%!o9*l;gE1x zxKFrWI3gSsjtR$w6T$<+N#Q}^A>m=+5#dqcl<=7FxbTGVq;Oi87rMez!WrSLa87ty Oct&_ucusg;DEJFC;XZBv diff --git a/src/multimodule/scratchpad/java9/module-info.java b/src/multimodule/scratchpad/java9/module-info.java index df7ea261e5..717730fdd4 100644 --- a/src/multimodule/scratchpad/java9/module-info.java +++ b/src/multimodule/scratchpad/java9/module-info.java @@ -20,6 +20,8 @@ module org.apache.poi.scratchpad { requires java.desktop; requires commons.math3; + provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory; + exports org.apache.poi.hmef; exports org.apache.poi.hmef.dev; exports org.apache.poi.hmef.extractor; diff --git a/src/multimodule/scratchpad/test9/module-info.class b/src/multimodule/scratchpad/test9/module-info.class index 5809da74f2c1a22a93e58626e20a224185af7474..489d1d1ab92275167d33a3cb01f534a3e45de336 100644 GIT binary patch literal 2393 zcmZ{l>2}mc5QT4(V27|7s}pu`2r*k25d%peEMYMamaqi^WJ4phJ+en5(drr7N#OV= z&y*i|03Irb>e4WghZ>*bPm*uBy6SdG{pP=a{snLvug0-XVPlpj-L^TNcII-0u{-<} zV7H=(|dVHQNj~7#yYXk}b{UNN%2q|gU%FLX` zCkopIAxun9W4pp$Ni@sOMzPDS#?BETk#ja}cAe4KEv%i))7pE5wMnSf-mkEGcx^A1 z&YCDIH9iwo_5roxK3CWy<)HRE&R9zFg~C25!Ys$8a4ET~aY$j86ce|)okh}*2>mS; z)Za0oKN70`jw|dK&g#Xou$e|EH0RYg@AQnN+BLo!5mM{_|hcIuO8I3E#HWf|UrdeT|NT{~y`umAnUL0iC_(8DHhx9GHr7$&Y&_E7f#Ld)^ zLwAHC%67kqyMldQd}rT%p-8XSihL*xdKL2jpg$Usrsng{!bbTt&30O7JQbYqYTt=p z6b=tNF_1$?)fP4{XuMNDGrx^ks_Rmk+drgQ%QY`ZBy4z@(T2y8*M8dkH_u3ky%@Qu zKh5(aLTYxWlHHW0{#{uVzHn4uSVWfIPjVm=k{Al8wmqa<4LWNM412mZrf7F5*9*Z- zA6LC$rO=UAYdhann3fi5QJi0*K5R>p^KGhjMG)vaFpj?zjt*rXxEapAJ8tgTyq6}% zj^m}mxsmbyeGl*6o}GC(IB5qduJO6GvI;0VysICD59skVKIF$SjIH{ck1$32*xzCk za|?5Vc?y$!)yz+sJD9tedzkx}Q_KU*gUo5>Vdhchm&^w91apRYiupD3Tjp8jIpziC z_sq-8tITW68_b)`+sq%C_m~fuKQSLOpD>>>e`Ws8e9mk#6XqPV#av*vnH^@qv`j}- a*n(dFy4=66gYQgX0=_GM_5725|9%Th^^!dR literal 2273 zcmaKt*LKrD5QcwI5?W{)AR&bidb1#e-h1!8Bg^YpiDfC$+D^F2;jy^j0eC15vttt+ zYmF{GN&D~a)Rlhy{_zvQDSYe4We0ta;!?e;TwHN+)x|Xz*InFjanr>u!fnDG!d=2W z!hOO6!b8F%!ehb{!c)RC!gIn4!b`#{!fV1C!dt>S!h6C8!bidecGzs&B1qbUR@%&g=E5{Co;&i&; zCw@>-g(MCOS&;fVs3d;L!P*(W;TMX2rW|bTn#Xe;Y>){hmCfonaWD|XQ54s*Lgeep zNe8QE>a|d_MYA@FgV|8CMVH^amPLzX~!*(*)k?D1&J-f*`)qhvZ&ew10P8MVWk5!-4Gv!+$b%g1c8nh#W>L(ZIoowf-< zrCyt5F&9cRv4z5;txyKLH#16>q>&{KBL#7-p;Gp=hu#rqt}14IJyR)nm#VhuFVI{Vkz~TwxC$*aJoCFrFgx#swOS@ zPl~qu^JtIOHOkVamLwUyouu*ZEG%i>Tu=-(ry+M{vTECKu8WNFH$mM|R*cx;Rjx-{ z`oA8vZqSj&Tm{Sq#cIf(2=lpF))q!;M|`lS=4o~>N#lf@Gc*lgwNTd~H(uVkY*}OH z&QKQFYRIEKYAA}Y{dOfTk+01=!jDf^Y`7Cjyxnd|*Tb1^4f=0i{@c9 z#+iNQ5&gn7!nMLFtdskC;RfM=aHDXOaINote 1 - will fail for many file formats if the POI Scratchpad jar is - * not present on the runtime classpath

- *

Note 2 - rather than using this, for most cases you would be better - * off switching to Apache Tika instead!

- */ -@SuppressWarnings("WeakerAccess") -public final class ExtractorFactory { - private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class); - - public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; - private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; - private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; - - private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{ - XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE, - XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE, - XSLFRelation.PRESENTATION_MACRO - }; - - private ExtractorFactory() { - } - - /** - * Should this thread prefer event based over usermodel based extractors? - * (usermodel extractors tend to be more accurate, but use more memory) - * Default is false. - */ - public static boolean getThreadPrefersEventExtractors() { - return OLE2ExtractorFactory.getThreadPrefersEventExtractors(); - } - - /** - * Should all threads prefer event based over usermodel based extractors? - * (usermodel extractors tend to be more accurate, but use more memory) - * Default is to use the thread level setting, which defaults to false. - */ - public static Boolean getAllThreadsPreferEventExtractors() { - return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors(); - } - - /** - * Should this thread prefer event based over usermodel based extractors? - * Will only be used if the All Threads setting is null. - */ - public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { - OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); - } - - /** - * Should all threads prefer event based over usermodel based extractors? - * If set, will take preference over the Thread level setting. - */ - public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { - OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); - } - - /** - * Should this thread use event based extractors is available? - * Checks the all-threads one first, then thread specific. - */ - public static boolean getPreferEventExtractor() { - return OLE2ExtractorFactory.getPreferEventExtractor(); - } - - @SuppressWarnings("unchecked") - public static T createExtractor(File f) throws IOException, OpenXML4JException, XmlException { - POIFSFileSystem fs = null; - try { - fs = new POIFSFileSystem(f); - if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { - return (T)createEncryptedOOXMLExtractor(fs); - } - POITextExtractor extractor = createExtractor(fs); - extractor.setFilesystem(fs); - return (T)extractor; - } catch (OfficeXmlFileException e) { - // ensure file-handle release - IOUtils.closeQuietly(fs); - OPCPackage pkg = OPCPackage.open(f.toString(), PackageAccess.READ); - T t = (T)createExtractor(pkg); - t.setFilesystem(pkg); - return t; - } catch (NotOLE2FileException ne) { - // ensure file-handle release - IOUtils.closeQuietly(fs); - throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne); - } catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR - // ensure file-handle release - IOUtils.closeQuietly(fs); - throw e; - } - } - - public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException { - InputStream is = FileMagic.prepareToCheckMagic(inp); - - FileMagic fm = FileMagic.valueOf(is); - - switch (fm) { - case OLE2: - POIFSFileSystem fs = new POIFSFileSystem(is); - boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY); - return isEncrypted ? createEncryptedOOXMLExtractor(fs) : createExtractor(fs); - case OOXML: - return createExtractor(OPCPackage.open(is)); - default: - throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream, found type: " + fm); - } - } - - /** - * Tries to determine the actual type of file and produces a matching text-extractor for it. - * - * @param pkg An {@link OPCPackage}. - * @return A {@link POIXMLTextExtractor} for the given file. - * @throws IOException If an error occurs while reading the file - * @throws OpenXML4JException If an error parsing the OpenXML file format is found. - * @throws XmlException If an XML parsing error occurs. - * @throws IllegalArgumentException If no matching file type could be found. - */ - public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { - try { - // Check for the normal Office core document - PackageRelationshipCollection core; - core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL); - - // If nothing was found, try some of the other OOXML-based core types - if (core.size() == 0) { - // Could it be an OOXML-Strict one? - core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); - } - if (core.size() == 0) { - // Could it be a visio one? - core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); - if (core.size() == 1) - return new XDGFVisioExtractor(pkg); - } - - // Should just be a single core document, complain if not - if (core.size() != 1) { - throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); - } - - // Grab the core document part, and try to identify from that - final PackagePart corePart = pkg.getPart(core.getRelationship(0)); - final String contentType = corePart.getContentType(); - - // Is it XSSF? - for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { - if ( rel.getContentType().equals( contentType ) ) { - if (getPreferEventExtractor()) { - return new XSSFEventBasedExcelExtractor(pkg); - } - return new XSSFExcelExtractor(pkg); - } - } - - // Is it XWPF? - for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { - if ( rel.getContentType().equals( contentType ) ) { - return new XWPFWordExtractor(pkg); - } - } - - // Is it XSLF? - for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) { - if ( rel.getContentType().equals( contentType ) ) { - return new SlideShowExtractor<>(new XMLSlideShow(pkg)); - } - } - - // special handling for SlideShow-Theme-files, - if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) { - return new SlideShowExtractor<>(new XMLSlideShow(pkg)); - } - - // How about xlsb? - for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) { - if (rel.getContentType().equals(contentType)) { - return new XSSFBEventBasedExcelExtractor(pkg); - } - } - - throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")"); - - } catch (IOException | Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR - // ensure that we close the package again if there is an error opening it, however - // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! - pkg.revert(); - throw e; - } - } - - public static T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { - return createExtractor(fs.getRoot()); - } - - @SuppressWarnings("unchecked") - public static T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException - { - // First, check for OOXML - for (String entryName : poifsDir.getEntryNames()) { - if (entryName.equals("Package")) { - OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package")); - return (T)createExtractor(pkg); - } - } - - // If not, ask the OLE2 code to check, with Scratchpad if possible - return (T)OLE2ExtractorFactory.createExtractor(poifsDir); - } - - /** - * Returns an array of text extractors, one for each of - * the embedded documents in the file (if there are any). - * If there are no embedded documents, you'll get back an - * empty array. Otherwise, you'll get one open - * {@link POITextExtractor} for each embedded file. - */ - public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException { - // All the embedded directories we spotted - ArrayList dirs = new ArrayList<>(); - // For anything else not directly held in as a POIFS directory - ArrayList nonPOIFS = new ArrayList<>(); - - // Find all the embedded directories - DirectoryEntry root = ext.getRoot(); - if (root == null) { - throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); - } - - // provide ExcelExtractor also in OOXML module, because scratchpad is not necessary for it - if (ext instanceof ExcelExtractor) { - // These are in MBD... under the root - Iterator it = root.getEntries(); - while (it.hasNext()) { - Entry entry = it.next(); - if (entry.getName().startsWith("MBD")) { - dirs.add(entry); - } - } - } else { - try { - Class clazz = Class.forName("org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory"); - Method m = clazz.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class); - m.invoke(null, ext, dirs, nonPOIFS); - } catch (ReflectiveOperationException e) { - logger.log(POILogger.WARN, "POI Scratchpad jar not included ", e.getLocalizedMessage()); - return new POITextExtractor[0]; - } - } - - // Create the extractors - if (dirs.size() == 0 && nonPOIFS.size() == 0){ - return new POITextExtractor[0]; - } - - ArrayList textExtractors = new ArrayList<>(); - for (Entry dir : dirs) { - textExtractors.add(createExtractor((DirectoryNode) dir)); - } - for (InputStream nonPOIF : nonPOIFS) { - try { - textExtractors.add(createExtractor(nonPOIF)); - } catch (IllegalArgumentException e) { - // Ignore, just means it didn't contain - // a format we support as yet - logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage()); - } catch (XmlException | OpenXML4JException e) { - throw new IOException(e.getMessage(), e); - } - } - return textExtractors.toArray(new POITextExtractor[0]); - } - - /** - * Returns an array of text extractors, one for each of - * the embedded documents in the file (if there are any). - * If there are no embedded documents, you'll get back an - * empty array. Otherwise, you'll get one open - * {@link POITextExtractor} for each embedded file. - */ - @NotImplemented - @SuppressWarnings({"UnusedParameters", "UnusedReturnValue"}) - public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) { - throw new IllegalStateException("Not yet supported"); - } - - private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs) - throws IOException { - String pass = Biff8EncryptionKey.getCurrentUserPassword(); - if (pass == null) { - pass = Decryptor.DEFAULT_PASSWORD; - } - - EncryptionInfo ei = new EncryptionInfo(fs); - Decryptor dec = ei.getDecryptor(); - InputStream is = null; - try { - if (!dec.verifyPassword(pass)) { - throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor"); - } - is = dec.getDataStream(fs); - return createExtractor(OPCPackage.open(is)); - } catch (IOException e) { - throw e; - } catch (Exception e) { - throw new EncryptedDocumentException(e); - } finally { - IOUtils.closeQuietly(is); - - // also close the POIFSFileSystem here as we read all the data - // while decrypting - fs.close(); - } - } -} diff --git a/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java b/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java new file mode 100644 index 0000000000..ed3067869f --- /dev/null +++ b/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLExtractorFactory.java @@ -0,0 +1,281 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.ooxml.extractor; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.extractor.ExtractorProvider; +import org.apache.poi.extractor.POITextExtractor; +import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackageAccess; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; +import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; +import org.apache.poi.poifs.crypt.Decryptor; +import org.apache.poi.poifs.crypt.EncryptionInfo; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; +import org.apache.poi.xslf.extractor.XSLFExtractor; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFRelation; +import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; +import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; +import org.apache.poi.xssf.extractor.XSSFExcelExtractor; +import org.apache.poi.xssf.usermodel.XSSFRelation; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; +import org.apache.poi.xwpf.usermodel.XWPFRelation; +import org.apache.xmlbeans.XmlException; + +/** + * Figures out the correct POITextExtractor for your supplied + * document, and returns it. + * + *

Note 1 - will fail for many file formats if the POI Scratchpad jar is + * not present on the runtime classpath

+ *

Note 2 - rather than using this, for most cases you would be better + * off switching to Apache Tika instead!

+ */ +@SuppressWarnings("WeakerAccess") +public final class POIXMLExtractorFactory implements ExtractorProvider { + private static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; + private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; + private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; + + private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{ + XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE, + XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE, + XSLFRelation.PRESENTATION_MACRO + }; + + @Override + public boolean accepts(FileMagic fm) { + return fm == FileMagic.OOXML; + } + + /** + * Should this thread prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is false. + */ + public static boolean getThreadPrefersEventExtractors() { + return ExtractorFactory.getThreadPrefersEventExtractors(); + } + + /** + * Should all threads prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is to use the thread level setting, which defaults to false. + */ + public static Boolean getAllThreadsPreferEventExtractors() { + return ExtractorFactory.getAllThreadsPreferEventExtractors(); + } + + /** + * Should this thread prefer event based over usermodel based extractors? + * Will only be used if the All Threads setting is null. + */ + public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { + ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); + } + + /** + * Should all threads prefer event based over usermodel based extractors? + * If set, will take preference over the Thread level setting. + */ + public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { + ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); + } + + /** + * Should this thread use event based extractors is available? + * Checks the all-threads one first, then thread specific. + */ + public static boolean getPreferEventExtractor() { + return ExtractorFactory.getPreferEventExtractor(); + } + + @Override + public POITextExtractor create(File f, String password) throws IOException { + if (FileMagic.valueOf(f) != FileMagic.OOXML) { + return ExtractorFactory.createExtractor(f, password); + } + + + OPCPackage pkg = null; + try { + pkg = OPCPackage.open(f.toString(), PackageAccess.READ); + POIXMLTextExtractor ex = create(pkg); + if (ex == null) { + pkg.revert(); + } + return ex; + } catch (InvalidFormatException ife) { + throw new IOException(ife); + } catch (IOException e) { + pkg.revert(); + throw e; + } + } + + public POITextExtractor create(InputStream inp, String password) throws IOException { + InputStream is = FileMagic.prepareToCheckMagic(inp); + + if (FileMagic.valueOf(is) != FileMagic.OOXML) { + return ExtractorFactory.createExtractor(is, password); + } + + OPCPackage pkg = null; + try { + pkg = OPCPackage.open(is); + POIXMLTextExtractor ex = create(pkg); + if (ex == null) { + pkg.revert(); + } + return ex; + } catch (InvalidFormatException e) { + throw new IOException(e); + } catch (RuntimeException | IOException e) { + if (pkg != null) { + pkg.revert(); + } + throw e; + } + } + + /** + * Tries to determine the actual type of file and produces a matching text-extractor for it. + * + * @param pkg An {@link OPCPackage}. + * @return A {@link POIXMLTextExtractor} for the given file. + * @throws IOException If an error occurs while reading the file + * @throws IllegalArgumentException If no matching file type could be found. + */ + public POIXMLTextExtractor create(OPCPackage pkg) throws IOException { + try { + // Check for the normal Office core document + PackageRelationshipCollection core; + core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL); + + // If nothing was found, try some of the other OOXML-based core types + if (core.size() == 0) { + // Could it be an OOXML-Strict one? + core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); + } + if (core.size() == 0) { + // Could it be a visio one? + core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); + if (core.size() == 1) { + return new XDGFVisioExtractor(pkg); + } + } + + // Should just be a single core document, complain if not + if (core.size() != 1) { + throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); + } + + // Grab the core document part, and try to identify from that + final PackagePart corePart = pkg.getPart(core.getRelationship(0)); + final String contentType = corePart.getContentType(); + + // Is it XSSF? + for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { + if (rel.getContentType().equals(contentType)) { + if (getPreferEventExtractor()) { + return new XSSFEventBasedExcelExtractor(pkg); + } + return new XSSFExcelExtractor(pkg); + } + } + + // Is it XWPF? + for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { + if (rel.getContentType().equals(contentType)) { + return new XWPFWordExtractor(pkg); + } + } + + // Is it XSLF? + for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) { + if (rel.getContentType().equals(contentType)) { + return new XSLFExtractor(new XMLSlideShow(pkg)); + } + } + + // special handling for SlideShow-Theme-files, + if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) { + return new XSLFExtractor(new XMLSlideShow(pkg)); + } + + // How about xlsb? + for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) { + if (rel.getContentType().equals(contentType)) { + return new XSSFBEventBasedExcelExtractor(pkg); + } + } + + return null; + } catch (IOException e) { + throw e; + } catch (Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR + throw new IOException(e); + } + // we used to close (revert()) the package here, but this is the callers responsibility + // and we can't reuse the package + } + + public POITextExtractor create(POIFSFileSystem fs) throws IOException { + return create(fs.getRoot(), Biff8EncryptionKey.getCurrentUserPassword()); + } + + @Override + public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException { + // First, check for plain OOXML package + if (poifsDir.hasEntry("Package")) { + try (InputStream is = poifsDir.createDocumentInputStream("Package")) { + return create(is, password); + } + } + + if (poifsDir.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { + EncryptionInfo ei = new EncryptionInfo(poifsDir); + Decryptor dec = ei.getDecryptor(); + try { + if (!dec.verifyPassword(password)) { + throw new IOException("Invalid password specified"); + } + try (InputStream is = dec.getDataStream(poifsDir)) { + return create(is, password); + } + } catch (IOException e) { + throw e; + } catch (Exception e) { + throw new IOException(e); + } + } + + throw new IOException("The OLE2 file neither contained a plain OOXML package node (\"Package\") nor an encrypted one (\"EncryptedPackage\")."); + } +} diff --git a/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java b/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java index 7eda767022..6df03370e2 100644 --- a/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java +++ b/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLPropertiesTextExtractor.java @@ -36,9 +36,10 @@ import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProper * content of the OOXML file properties, eg author * and title. */ -public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { - +public class POIXMLPropertiesTextExtractor implements POIXMLTextExtractor { + private final POIXMLDocument doc; private final DateFormat dateFormat; + private boolean doCloseFilesystem = true; /** * Creates a new POIXMLPropertiesTextExtractor for the given open document. @@ -46,7 +47,7 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { * @param doc the given open document */ public POIXMLPropertiesTextExtractor(POIXMLDocument doc) { - super(doc); + this.doc = doc; DateFormatSymbols dfs = DateFormatSymbols.getInstance(Locale.ROOT); dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", dfs); dateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); @@ -242,7 +243,7 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { } /*else if (property.isSetArray()) { - // TODO Fetch the array values and output + // TODO Fetch the array values and output } else if (property.isSetVector()) { // TODO Fetch the vector values and output @@ -281,4 +282,24 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!"); } + + @Override + public POIXMLDocument getDocument() { + return doc; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public POIXMLDocument getFilesystem() { + return null; + } } diff --git a/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLTextExtractor.java b/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLTextExtractor.java index ada32a1cc0..eb235fa6aa 100644 --- a/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLTextExtractor.java +++ b/src/ooxml/java/org/apache/poi/ooxml/extractor/POIXMLTextExtractor.java @@ -27,61 +27,48 @@ import org.apache.poi.ooxml.POIXMLProperties.ExtendedProperties; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.util.ZipSecureFile; -public abstract class POIXMLTextExtractor extends POITextExtractor { - /** The POIXMLDocument that's open */ - private final POIXMLDocument _document; - - /** - * Creates a new text extractor for the given document - * - * @param document the document to extract from - */ - public POIXMLTextExtractor(POIXMLDocument document) { - _document = document; - } - +public interface POIXMLTextExtractor extends POITextExtractor { /** * Returns the core document properties - * + * * @return the core document properties */ - public CoreProperties getCoreProperties() { - return _document.getProperties().getCoreProperties(); + default CoreProperties getCoreProperties() { + return getDocument().getProperties().getCoreProperties(); } /** * Returns the extended document properties - * + * * @return the extended document properties */ - public ExtendedProperties getExtendedProperties() { - return _document.getProperties().getExtendedProperties(); + default ExtendedProperties getExtendedProperties() { + return getDocument().getProperties().getExtendedProperties(); } /** * Returns the custom document properties - * + * * @return the custom document properties */ - public CustomProperties getCustomProperties() { - return _document.getProperties().getCustomProperties(); + default CustomProperties getCustomProperties() { + return getDocument().getProperties().getCustomProperties(); } /** * Returns opened document - * + * * @return the opened document */ @Override - public final POIXMLDocument getDocument() { - return _document; - } + POIXMLDocument getDocument(); /** * Returns the opened OPCPackage that contains the document - * + * * @return the opened OPCPackage */ - public OPCPackage getPackage() { - return _document.getPackage(); + default OPCPackage getPackage() { + POIXMLDocument doc = getDocument(); + return doc != null ? doc.getPackage() : null; } /** @@ -89,25 +76,24 @@ public abstract class POIXMLTextExtractor extends POITextExtractor { * document properties metadata, such as title and author. */ @Override - public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { - return new POIXMLPropertiesTextExtractor(_document); + default POIXMLPropertiesTextExtractor getMetadataTextExtractor() { + return new POIXMLPropertiesTextExtractor(getDocument()); } @Override - public void close() throws IOException { + default void close() throws IOException { // e.g. XSSFEventBaseExcelExtractor passes a null-document - if(_document != null) { + if (isCloseFilesystem()) { @SuppressWarnings("resource") - OPCPackage pkg = _document.getPackage(); - if(pkg != null) { + OPCPackage pkg = getPackage(); + if (pkg != null) { // revert the package to not re-write the file, which is very likely not wanted for a TextExtractor! pkg.revert(); } } - super.close(); } - protected void checkMaxTextSize(CharSequence text, String string) { + default void checkMaxTextSize(CharSequence text, String string) { if(string == null) { return; } diff --git a/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java b/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java index 6ebdf2f703..7ffbf62b63 100644 --- a/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java +++ b/src/ooxml/java/org/apache/poi/xdgf/extractor/XDGFVisioExtractor.java @@ -18,7 +18,6 @@ package org.apache.poi.xdgf.extractor; import java.io.IOException; -import org.apache.poi.ooxml.POIXMLDocument; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xdgf.usermodel.XDGFPage; @@ -28,12 +27,12 @@ import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor; /** * Helper class to extract text from an OOXML Visio File */ -public class XDGFVisioExtractor extends POIXMLTextExtractor { +public class XDGFVisioExtractor implements POIXMLTextExtractor { protected final XmlVisioDocument document; - + private boolean doCloseFilesystem = true; + public XDGFVisioExtractor(XmlVisioDocument document) { - super(document); this.document = document; } @@ -43,25 +42,31 @@ public class XDGFVisioExtractor extends POIXMLTextExtractor { public String getText() { ShapeTextVisitor visitor = new ShapeTextVisitor(); - + for (XDGFPage page: document.getPages()) { page.getContent().visitShapes(visitor); } - + return visitor.getText(); } - - public static void main(String [] args) throws IOException { - if (args.length < 1) { - System.err.println("Use:"); - System.err.println(" XDGFVisioExtractor "); - System.exit(1); - } - POIXMLTextExtractor extractor = - new XDGFVisioExtractor(POIXMLDocument.openPackage( - args[0] - )); - System.out.println(extractor.getText()); - extractor.close(); + + @Override + public XmlVisioDocument getDocument() { + return document; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public XmlVisioDocument getFilesystem() { + return document; } } diff --git a/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFExtractor.java b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFExtractor.java new file mode 100644 index 0000000000..fbdeede6a3 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFExtractor.java @@ -0,0 +1,45 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.xslf.extractor; + +import org.apache.poi.ooxml.extractor.POIXMLPropertiesTextExtractor; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; +import org.apache.poi.sl.extractor.SlideShowExtractor; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFShape; +import org.apache.poi.xslf.usermodel.XSLFTextParagraph; + + +/** + * Helper class to extract text from an OOXML Powerpoint file + */ +public class XSLFExtractor extends SlideShowExtractor implements POIXMLTextExtractor { + public XSLFExtractor(XMLSlideShow slideshow) { + super(slideshow); + } + + @Override + public XMLSlideShow getDocument() { + return (XMLSlideShow)slideshow; + } + + @Override + public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { + return POIXMLTextExtractor.super.getMetadataTextExtractor(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFBEventBasedExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFBEventBasedExcelExtractor.java index 5a96cfef25..9375572ab8 100644 --- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFBEventBasedExcelExtractor.java +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFBEventBasedExcelExtractor.java @@ -19,7 +19,6 @@ package org.apache.poi.xssf.extractor; import java.io.IOException; import java.io.InputStream; -import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.usermodel.DataFormatter; @@ -43,8 +42,7 @@ import org.xml.sax.SAXException; * * @since 3.16-beta3 */ -public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor - implements org.apache.poi.ss.extractor.ExcelExtractor { +public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor { private static final POILogger LOGGER = POILogFactory.getLogger(XSSFBEventBasedExcelExtractor.class); @@ -62,18 +60,6 @@ public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor super(container); } - public static void main(String[] args) throws Exception { - if (args.length < 1) { - System.err.println("Use:"); - System.err.println(" XSSFBEventBasedExcelExtractor "); - System.exit(1); - } - POIXMLTextExtractor extractor = - new XSSFBEventBasedExcelExtractor(args[0]); - System.out.println(extractor.getText()); - extractor.close(); - } - public void setHandleHyperlinksInCells(boolean handleHyperlinksInCells) { this.handleHyperlinksInCells = handleHyperlinksInCells; } diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java index 998037b59b..cfd295c2b9 100644 --- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java @@ -25,6 +25,7 @@ import java.util.Map; import javax.xml.parsers.ParserConfigurationException; +import org.apache.poi.ooxml.POIXMLDocument; import org.apache.poi.ooxml.POIXMLProperties; import org.apache.poi.ooxml.POIXMLProperties.CoreProperties; import org.apache.poi.ooxml.POIXMLProperties.CustomProperties; @@ -57,13 +58,13 @@ import org.xml.sax.XMLReader; * Implementation of a text extractor from OOXML Excel * files that uses SAX event based parsing. */ -public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor - implements org.apache.poi.ss.extractor.ExcelExtractor { +public class XSSFEventBasedExcelExtractor + implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor { private static final POILogger LOGGER = POILogFactory.getLogger(XSSFEventBasedExcelExtractor.class); - protected OPCPackage container; - protected POIXMLProperties properties; + protected final OPCPackage container; + protected final POIXMLProperties properties; protected Locale locale; protected boolean includeTextBoxes = true; @@ -73,29 +74,17 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor protected boolean formulasNotResults; protected boolean concatenatePhoneticRuns = true; + private boolean doCloseFilesystem = true; + public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { this(OPCPackage.open(path)); } public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { - super(null); this.container = container; - properties = new POIXMLProperties(container); } - public static void main(String[] args) throws Exception { - if (args.length < 1) { - System.err.println("Use:"); - System.err.println(" XSSFEventBasedExcelExtractor "); - System.exit(1); - } - POIXMLTextExtractor extractor = - new XSSFEventBasedExcelExtractor(args[0]); - System.out.println(extractor.getText()); - extractor.close(); - } - /** * Should sheet names be included? Default is true */ @@ -319,12 +308,23 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor } @Override - public void close() throws IOException { - if (container != null) { - container.close(); - container = null; - } - super.close(); + public POIXMLDocument getDocument() { + return null; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public OPCPackage getFilesystem() { + return container; } protected class SheetTextExtractor implements SheetContentsHandler { diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java index 17a38f5c56..43c9dcf101 100644 --- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java @@ -20,8 +20,8 @@ import java.io.IOException; import java.util.Iterator; import java.util.Locale; -import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.usermodel.Cell; @@ -44,8 +44,8 @@ import org.apache.xmlbeans.XmlException; /** * Helper class to extract text from an OOXML Excel file */ -public class XSSFExcelExtractor extends POIXMLTextExtractor - implements org.apache.poi.ss.extractor.ExcelExtractor { +public class XSSFExcelExtractor + implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor { public static final XSSFRelation[] SUPPORTED_TYPES = new XSSFRelation[] { XSSFRelation.WORKBOOK, XSSFRelation.MACRO_TEMPLATE_WORKBOOK, XSSFRelation.MACRO_ADDIN_WORKBOOK, XSSFRelation.TEMPLATE_WORKBOOK, @@ -53,34 +53,21 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor }; private Locale locale; - private XSSFWorkbook workbook; + private final XSSFWorkbook workbook; private boolean includeSheetNames = true; private boolean formulasNotResults; private boolean includeCellComments; private boolean includeHeadersFooters = true; private boolean includeTextBoxes = true; + private boolean doCloseFilesystem = true; public XSSFExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { this(new XSSFWorkbook(container)); } public XSSFExcelExtractor(XSSFWorkbook workbook) { - super(workbook); this.workbook = workbook; } - public static void main(String[] args) throws Exception { - if(args.length < 1) { - System.err.println("Use:"); - System.err.println(" XSSFExcelExtractor "); - System.exit(1); - } - - try (OPCPackage pkg = OPCPackage.create(args[0]); - POIXMLTextExtractor extractor = new XSSFExcelExtractor(pkg)) { - System.out.println(extractor.getText()); - } - } - /** * Should sheet names be included? Default is true */ @@ -194,7 +181,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor } text.append("\n"); } - + // add textboxes if (includeTextBoxes){ XSSFDrawing drawing = sheet.getDrawingPatriarch(); @@ -262,4 +249,24 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor private String extractHeaderFooter(HeaderFooter hf) { return ExcelExtractor._extractHeaderFooter(hf); } + + @Override + public XSSFWorkbook getDocument() { + return workbook; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public XSSFWorkbook getFilesystem() { + return workbook; + } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index 4b61c09d19..578d6f2828 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -19,9 +19,7 @@ package org.apache.poi.xwpf.extractor; import java.io.IOException; import java.util.List; -import org.apache.poi.ooxml.POIXMLDocument; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; -import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xwpf.model.XWPFCommentsDecorator; import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; @@ -39,46 +37,31 @@ import org.apache.poi.xwpf.usermodel.XWPFSDTCell; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; -import org.apache.xmlbeans.XmlException; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; /** * Helper class to extract text from an OOXML Word file */ -public class XWPFWordExtractor extends POIXMLTextExtractor { +public class XWPFWordExtractor implements POIXMLTextExtractor { public static final XWPFRelation[] SUPPORTED_TYPES = { XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE, XWPFRelation.MACRO_DOCUMENT, XWPFRelation.MACRO_TEMPLATE_DOCUMENT }; - private XWPFDocument document; + private final XWPFDocument document; private boolean fetchHyperlinks; private boolean concatenatePhoneticRuns = true; + private boolean doCloseFilesystem = true; - public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { + public XWPFWordExtractor(OPCPackage container) throws IOException { this(new XWPFDocument(container)); } public XWPFWordExtractor(XWPFDocument document) { - super(document); this.document = document; } - public static void main(String[] args) throws Exception { - if (args.length < 1) { - System.err.println("Use:"); - System.err.println(" XWPFWordExtractor "); - System.exit(1); - } - POIXMLTextExtractor extractor = - new XWPFWordExtractor(POIXMLDocument.openPackage( - args[0] - )); - System.out.println(extractor.getText()); - extractor.close(); - } - /** * Should we also fetch the hyperlinks, when fetching * the text content? Default is to only output the @@ -217,4 +200,24 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { text.append(hfPolicy.getDefaultHeader().getText()); } } + + @Override + public XWPFDocument getDocument() { + return document; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public XWPFDocument getFilesystem() { + return document; + } } diff --git a/src/ooxml/testcases/org/apache/poi/extractor/ooxml/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/ooxml/TestExtractorFactory.java index 9fec43d358..3e570e2b51 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/ooxml/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/ooxml/TestExtractorFactory.java @@ -31,23 +31,25 @@ import java.util.Locale; import org.apache.poi.POIDataSamples; import org.apache.poi.UnsupportedFileFormatException; +import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.hssf.HSSFTestDataSamples; -import org.apache.poi.hssf.OldExcelFormatException; import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; -import org.apache.poi.ooxml.extractor.ExtractorFactory; -import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; +import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.poifs.filesystem.NotOLE2FileException; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.xmlbeans.XmlException; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; /** * Test that the extractor factory plays nicely @@ -89,6 +91,8 @@ public class TestExtractorFactory { private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); private static File pub = getFileAndCheck(pubTests, "Simple.pub"); + private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory(); + private static File getFileAndCheck(POIDataSamples samples, String name) { File file = samples.getFile(name); @@ -110,7 +114,7 @@ public class TestExtractorFactory { "Word 6", doc6, "Word6Extractor", 20, "Word 95", doc95, "Word6Extractor", 120, "PowerPoint", ppt, "SlideShowExtractor", 120, - "PowerPoint - pptx", pptx, "SlideShowExtractor", 120, + "PowerPoint - pptx", pptx, "XSLFExtractor", 120, "Visio", vsd, "VisioTextExtractor", 50, "Visio - vsdx", vsdx, "XDGFVisioExtractor", 20, "Publisher", pub, "PublisherTextExtractor", 50, @@ -125,6 +129,8 @@ public class TestExtractorFactory { R apply(T t) throws IOException, OpenXML4JException, XmlException; } + @Rule + public ExpectedException thrown = ExpectedException.none(); @Test public void testFile() throws Exception { @@ -135,12 +141,12 @@ public class TestExtractorFactory { } } - @Test(expected = IllegalArgumentException.class) + @Test public void testFileInvalid() throws Exception { + thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN"); + thrown.expect(IOException.class); // Text - try (POITextExtractor ignored = ExtractorFactory.createExtractor(txt)) { - fail("extracting from invalid package"); - } + ExtractorFactory.createExtractor(txt); } @Test @@ -148,8 +154,10 @@ public class TestExtractorFactory { testStream(ExtractorFactory::createExtractor, true); } - @Test(expected = IllegalArgumentException.class) + @Test public void testInputStreamInvalid() throws Exception { + thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN"); + thrown.expect(IOException.class); testInvalid(ExtractorFactory::createExtractor); } @@ -158,8 +166,10 @@ public class TestExtractorFactory { testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false); } - @Test(expected = IOException.class) + @Test public void testPOIFSInvalid() throws Exception { + thrown.expectMessage("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0"); + thrown.expect(NotOLE2FileException.class); testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f))); } @@ -195,9 +205,7 @@ public class TestExtractorFactory { POITextExtractor ignored = poifs.apply(fis)) { fail("extracting from invalid package"); } catch (IllegalArgumentException e) { - assertTrue("Had: " + e, - e.getMessage().contains(FileMagic.UNKNOWN.name())); - + assertTrue("Had: " + e, e.getMessage().contains(FileMagic.UNKNOWN.name())); throw e; } } @@ -211,7 +219,7 @@ public class TestExtractorFactory { } try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ); - final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) { + final POITextExtractor ext = xmlFactory.create(pkg)) { testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]); pkg.revert(); } @@ -222,7 +230,7 @@ public class TestExtractorFactory { public void testPackageInvalid() throws Exception { // Text try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ); - final POITextExtractor ignored = ExtractorFactory.createExtractor(pkg)) { + final POITextExtractor ignored = xmlFactory.create(pkg)) { fail("extracting from invalid package"); } } @@ -251,61 +259,45 @@ public class TestExtractorFactory { assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); + try { + // Check we get the right extractors now + try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { + assertTrue(extractor instanceof EventBasedExcelExtractor); + } + try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { + assertTrue(extractor.getText().length() > 200); + } + + try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) { + assertTrue(extractor instanceof XSSFEventBasedExcelExtractor); + } + + try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) { + assertTrue(extractor.getText().length() > 200); + } + } finally { + // Put back to normal + ExtractorFactory.setThreadPrefersEventExtractors(false); + } - // Check we get the right extractors now - POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); - assertTrue( - extractor - instanceof EventBasedExcelExtractor - ); - extractor.close(); - extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); - assertTrue(extractor instanceof XSSFEventBasedExcelExtractor); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - - // Put back to normal - ExtractorFactory.setThreadPrefersEventExtractors(false); assertFalse(ExtractorFactory.getPreferEventExtractor()); assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); // And back - extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); - assertTrue( - extractor - instanceof ExcelExtractor - ); - extractor.close(); - extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); - - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); - assertTrue( - extractor - instanceof XSSFExcelExtractor - ); - extractor.close(); - extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())); - assertTrue( - extractor.getText().length() > 200 - ); - extractor.close(); + try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { + assertTrue(extractor instanceof ExcelExtractor); + } + try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { + assertTrue(extractor.getText().length() > 200); + } + + try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) { + assertTrue(extractor instanceof XSSFExcelExtractor); + } + try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) { + assertTrue(extractor.getText().length() > 200); + } } /** @@ -325,7 +317,7 @@ public class TestExtractorFactory { }; for (int i=0; i clazz = doc.getClass(); - if ("HSLFSlideShowImpl".equals(clazz.getSimpleName())) { + if ("HSLFSlideShow".equals(clazz.getSimpleName())) { try { clazz.getDeclaredMethod("getPictureData").invoke(doc); } catch (ReflectiveOperationException e) { diff --git a/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java b/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java index 0a893b1503..19222c7006 100644 --- a/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java +++ b/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFBugs.java @@ -522,7 +522,7 @@ public class TestXSLFBugs { private String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException { try (SlideShowExtractor extr = new SlideShowExtractor<>(ppt)) { // do not auto-close the slideshow - extr.setFilesystem(null); + extr.setCloseFilesystem(false); extr.setSlidesByDefault(true); extr.setNotesByDefault(false); extr.setMasterByDefault(false); diff --git a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java index 3f71533dd5..eb86a217f2 100644 --- a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java @@ -29,20 +29,18 @@ import java.io.IOException; import java.io.InputStream; import org.apache.poi.POIDataSamples; -import org.apache.poi.ooxml.extractor.ExtractorFactory; -import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFShape; import org.apache.poi.xslf.usermodel.XSLFTextParagraph; -import org.apache.xmlbeans.XmlException; import org.junit.Test; /** * Tests for XSLFPowerPointExtractor */ public class TestXSLFPowerPointExtractor { - private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); + private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); /** * Get text out of the simple file @@ -262,10 +260,11 @@ public class TestXSLFPowerPointExtractor { } @Test - public void test45541() throws IOException, OpenXML4JException, XmlException { + public void test45541() throws IOException { // extract text from a powerpoint that has a header in the notes-element final File headerFile = slTests.getFile("45541_Header.pptx"); - try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) { + //noinspection rawtypes + try (final SlideShowExtractor extr = (SlideShowExtractor) ExtractorFactory.createExtractor(headerFile)) { String text = extr.getText(); assertNotNull(text); assertFalse("Had: " + text, text.contains("testdoc")); @@ -280,7 +279,8 @@ public class TestXSLFPowerPointExtractor { // extract text from a powerpoint that has a footer in the master-slide final File footerFile = slTests.getFile("45541_Footer.pptx"); - try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) { + //noinspection rawtypes + try (SlideShowExtractor extr = (SlideShowExtractor)ExtractorFactory.createExtractor(footerFile)) { String text = extr.getText(); assertNotContained(text, "testdoc"); diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractorUsingFactory.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractorUsingFactory.java index 0c6d908ef5..73ff3992b9 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractorUsingFactory.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractorUsingFactory.java @@ -16,7 +16,7 @@ ==================================================================== */ package org.apache.poi.xssf.extractor; -import org.apache.poi.ooxml.extractor.ExtractorFactory; +import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.hssf.HSSFTestDataSamples; import org.junit.After; @@ -27,7 +27,7 @@ public class TestXSSFEventBasedExcelExtractorUsingFactory extends TestXSSFEventB ExtractorFactory.setAllThreadsPreferEventExtractors(true); return (XSSFEventBasedExcelExtractor) ExtractorFactory.createExtractor(HSSFTestDataSamples.openSampleFileStream(sampleName)); } - + @After public void tearDown() { // reset setting to not affect other tests diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractorUsingFactory.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractorUsingFactory.java index fd130ba0ba..cd1684e5a5 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractorUsingFactory.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractorUsingFactory.java @@ -17,8 +17,8 @@ package org.apache.poi.xssf.extractor; +import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.hssf.HSSFTestDataSamples; -import org.apache.poi.ooxml.extractor.ExtractorFactory; import org.junit.After; /** diff --git a/src/resources/main/META-INF/services/org.apache.poi.extractor.ExtractorProvider b/src/resources/main/META-INF/services/org.apache.poi.extractor.ExtractorProvider new file mode 100644 index 0000000000..c93493ce19 --- /dev/null +++ b/src/resources/main/META-INF/services/org.apache.poi.extractor.ExtractorProvider @@ -0,0 +1,18 @@ +# ==================================================================== +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ==================================================================== + +org.apache.poi.extractor.MainExtractorFactory \ No newline at end of file diff --git a/src/resources/ooxml/META-INF/services/org.apache.poi.extractor.ExtractorProvider b/src/resources/ooxml/META-INF/services/org.apache.poi.extractor.ExtractorProvider new file mode 100644 index 0000000000..bb22aa1dbb --- /dev/null +++ b/src/resources/ooxml/META-INF/services/org.apache.poi.extractor.ExtractorProvider @@ -0,0 +1,18 @@ +# ==================================================================== +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ==================================================================== + +org.apache.poi.ooxml.extractor.POIXMLExtractorFactory \ No newline at end of file diff --git a/src/resources/scratchpad/META-INF/services/org.apache.poi.extractor.ExtractorProvider b/src/resources/scratchpad/META-INF/services/org.apache.poi.extractor.ExtractorProvider new file mode 100644 index 0000000000..b4c821cdb6 --- /dev/null +++ b/src/resources/scratchpad/META-INF/services/org.apache.poi.extractor.ExtractorProvider @@ -0,0 +1,18 @@ +# ==================================================================== +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ==================================================================== + +org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory \ No newline at end of file diff --git a/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java b/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java index 73d9f74844..16711d0b11 100644 --- a/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java +++ b/src/scratchpad/src/org/apache/poi/extractor/ole2/OLE2ScratchpadExtractorFactory.java @@ -17,44 +17,66 @@ package org.apache.poi.extractor.ole2; import java.io.ByteArrayInputStream; +import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import java.util.Iterator; import java.util.List; +import java.util.stream.StreamSupport; +import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.extractor.ExtractorProvider; import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POITextExtractor; -import org.apache.poi.extractor.OLE2ExtractorFactory; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor; +import org.apache.poi.hslf.usermodel.HSLFShape; import org.apache.poi.hslf.usermodel.HSLFSlideShow; +import org.apache.poi.hslf.usermodel.HSLFTextParagraph; import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.extractor.OutlookTextExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.FileMagic; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.usermodel.SlideShowFactory; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; /** - * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and + * Scratchpad-specific logic for {@link ExtractorFactory} and * {@link org.apache.poi.extractor.ExtractorFactory}, which permit the other two to run with * no Scratchpad jar (though without functionality!) *

Note - should not be used standalone, always use via the other * two classes

*/ @SuppressWarnings("WeakerAccess") -public class OLE2ScratchpadExtractorFactory { +public class OLE2ScratchpadExtractorFactory implements ExtractorProvider { private static final POILogger logger = POILogFactory.getLogger(OLE2ScratchpadExtractorFactory.class); + @Override + public boolean accepts(FileMagic fm) { + return FileMagic.OLE2 == fm; + } + + @Override + public POITextExtractor create(File file, String password) throws IOException { + return create(new POIFSFileSystem(file, true).getRoot(), password); + } + + @Override + public POITextExtractor create(InputStream inputStream, String password) throws IOException { + return create(new POIFSFileSystem(inputStream).getRoot(), password); + } + /** * Look for certain entries in the stream, to figure it * out what format is desired @@ -66,48 +88,54 @@ public class OLE2ScratchpadExtractorFactory { * * @throws IOException when the format specific extraction fails because of invalid entires */ - public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException { - if (poifsDir.hasEntry("WordDocument")) { - // Old or new style word document? - try { - return new WordExtractor(poifsDir); - } catch (OldWordFileFormatException e) { - return new Word6Extractor(poifsDir); + public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException { + final String oldPW = Biff8EncryptionKey.getCurrentUserPassword(); + try { + Biff8EncryptionKey.setCurrentUserPassword(password); + if (poifsDir.hasEntry("WordDocument")) { + // Old or new style word document? + try { + return new WordExtractor(poifsDir); + } catch (OldWordFileFormatException e) { + return new Word6Extractor(poifsDir); + } } - } - if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { - return new SlideShowExtractor(SlideShowFactory.create(poifsDir)); - } + if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { + return new SlideShowExtractor(SlideShowFactory.create(poifsDir)); + } - if (poifsDir.hasEntry("VisioDocument")) { - return new VisioTextExtractor(poifsDir); - } + if (poifsDir.hasEntry("VisioDocument")) { + return new VisioTextExtractor(poifsDir); + } - if (poifsDir.hasEntry("Quill")) { - return new PublisherTextExtractor(poifsDir); - } + if (poifsDir.hasEntry("Quill")) { + return new PublisherTextExtractor(poifsDir); + } - final String[] outlookEntryNames = new String[] { - // message bodies, saved as plain text (PtypString) - // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) - // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry - // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx - // @see org.apache.poi.hsmf.Types.MAPIType - "__substg1.0_1000001E", //PidTagBody ASCII - "__substg1.0_1000001F", //PidTagBody Unicode - "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII - "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode - "__substg1.0_0037001E", //PidTagSubject ASCII - "__substg1.0_0037001F", //PidTagSubject Unicode - }; - for (String entryName : outlookEntryNames) { - if (poifsDir.hasEntry(entryName)) { - return new OutlookTextExtractor(poifsDir); + final String[] outlookEntryNames = new String[]{ + // message bodies, saved as plain text (PtypString) + // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf) + // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry + // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx + // @see org.apache.poi.hsmf.Types.MAPIType + "__substg1.0_1000001E", //PidTagBody ASCII + "__substg1.0_1000001F", //PidTagBody Unicode + "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII + "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode + "__substg1.0_0037001E", //PidTagSubject ASCII + "__substg1.0_0037001F", //PidTagSubject Unicode + }; + for (String entryName : outlookEntryNames) { + if (poifsDir.hasEntry(entryName)) { + return new OutlookTextExtractor(poifsDir); + } } + } finally { + Biff8EncryptionKey.setCurrentUserPassword(oldPW); } - throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); + return null; } /** @@ -120,10 +148,9 @@ public class OLE2ScratchpadExtractorFactory { * @param ext the extractor holding the directory to start parsing * @param dirs a list to be filled with directory references holding embedded * @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries - * - * @throws IOException when the format specific extraction fails because of invalid entires */ - public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List dirs, List nonPOIFS) throws IOException { + @Override + public void identifyEmbeddedResources(POIOLE2TextExtractor ext, List dirs, List nonPOIFS) { // Find all the embedded directories DirectoryEntry root = ext.getRoot(); if (root == null) { @@ -132,25 +159,16 @@ public class OLE2ScratchpadExtractorFactory { if (ext instanceof ExcelExtractor) { // These are in MBD... under the root - Iterator it = root.getEntries(); - while (it.hasNext()) { - Entry entry = it.next(); - if (entry.getName().startsWith("MBD")) { - dirs.add(entry); - } - } + StreamSupport.stream(root.spliterator(), false) + .filter(entry -> entry.getName().startsWith("MBD")) + .forEach(dirs::add); } else if (ext instanceof WordExtractor) { // These are in ObjectPool -> _... under the root try { - DirectoryEntry op = (DirectoryEntry) - root.getEntry("ObjectPool"); - Iterator it = op.getEntries(); - while(it.hasNext()) { - Entry entry = it.next(); - if(entry.getName().startsWith("_")) { - dirs.add(entry); - } - } + DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); + StreamSupport.stream(op.spliterator(), false) + .filter(entry -> entry.getName().startsWith("_")) + .forEach(dirs::add); } catch(FileNotFoundException e) { logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage()); // ignored here diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java index d21a7e5d6a..570eaacf4c 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java @@ -17,7 +17,6 @@ package org.apache.poi.hdgf.extractor; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -38,11 +37,11 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * Can operate on the command line (outputs to stdout), or * can return the text for you (example: for use with Lucene). */ -public final class VisioTextExtractor extends POIOLE2TextExtractor { +public final class VisioTextExtractor implements POIOLE2TextExtractor { private HDGFDiagram hdgf; + private boolean doCloseFilesystem = true; public VisioTextExtractor(HDGFDiagram hdgf) { - super(hdgf); this.hdgf = hdgf; } public VisioTextExtractor(POIFSFileSystem fs) throws IOException { @@ -91,9 +90,7 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor { // Capture the text, as long as it isn't // simply an empty string String str = cmd.getValue().toString(); - if(str.isEmpty() || "\n".equals(str)) { - // Ignore empty strings - } else { + if (!(str.isEmpty() || "\n".equals(str))) { text.add( str ); } } @@ -121,21 +118,23 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor { return text.toString(); } - public static void main(String[] args) throws Exception { - if(args.length == 0) { - System.err.println("Use:"); - System.err.println(" VisioTextExtractor "); - System.exit(1); - } + @Override + public HDGFDiagram getDocument() { + return hdgf; + } - try (FileInputStream fis = new FileInputStream(args[0])) { - VisioTextExtractor extractor = - new VisioTextExtractor(fis); + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } - // Print not PrintLn as already has \n added to it - System.out.print(extractor.getText()); + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } - extractor.close(); - } + @Override + public HDGFDiagram getFilesystem() { + return hdgf; } } diff --git a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java index bd442b8da6..ac7ed74153 100644 --- a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java @@ -17,35 +17,37 @@ package org.apache.poi.hpbf.extractor; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.hpbf.HPBFDocument; import org.apache.poi.hpbf.model.qcbits.QCBit; -import org.apache.poi.hpbf.model.qcbits.QCTextBit; import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12; +import org.apache.poi.hpbf.model.qcbits.QCTextBit; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * Extract text from HPBF Publisher files */ -public final class PublisherTextExtractor extends POIOLE2TextExtractor { - private HPBFDocument doc; +public final class PublisherTextExtractor implements POIOLE2TextExtractor { + private final HPBFDocument doc; private boolean hyperlinksByDefault; + private boolean doCloseFilesystem = true; public PublisherTextExtractor(HPBFDocument doc) { - super(doc); this.doc = doc; } + public PublisherTextExtractor(DirectoryNode dir) throws IOException { this(new HPBFDocument(dir)); } + public PublisherTextExtractor(POIFSFileSystem fs) throws IOException { this(new HPBFDocument(fs)); } + public PublisherTextExtractor(InputStream is) throws IOException { this(new POIFSFileSystem(is)); } @@ -66,7 +68,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { // Get the text from the Quill Contents QCBit[] bits = doc.getQuillContents().getBits(); for (QCBit bit1 : bits) { - if (bit1 != null && bit1 instanceof QCTextBit) { + if (bit1 instanceof QCTextBit) { QCTextBit t = (QCTextBit) bit1; text.append(t.getText().replace('\r', '\n')); } @@ -79,7 +81,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { // how to tie that together. if(hyperlinksByDefault) { for (QCBit bit : bits) { - if (bit != null && bit instanceof Type12) { + if (bit instanceof Type12) { Type12 hyperlinks = (Type12) bit; for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) { text.append("<"); @@ -96,19 +98,23 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor { return text.toString(); } + @Override + public HPBFDocument getDocument() { + return doc; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } - public static void main(String[] args) throws Exception { - if(args.length == 0) { - System.err.println("Use:"); - System.err.println(" PublisherTextExtractor "); - } + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } - for (String arg : args) { - try (FileInputStream fis = new FileInputStream(arg)) { - PublisherTextExtractor te = new PublisherTextExtractor(fis); - System.out.println(te.getText()); - te.close(); - } - } + @Override + public HPBFDocument getFilesystem() { + return doc; } } diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java deleted file mode 100644 index 650f809253..0000000000 --- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java +++ /dev/null @@ -1,279 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ - -package org.apache.poi.hslf.extractor; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.List; - -import org.apache.poi.EncryptedDocumentException; -import org.apache.poi.extractor.POIOLE2TextExtractor; -import org.apache.poi.hslf.usermodel.HSLFObjectShape; -import org.apache.poi.hslf.usermodel.HSLFShape; -import org.apache.poi.hslf.usermodel.HSLFSlideShow; -import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl; -import org.apache.poi.hslf.usermodel.HSLFTextParagraph; -import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; -import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.sl.extractor.SlideShowExtractor; -import org.apache.poi.sl.usermodel.SlideShow; -import org.apache.poi.sl.usermodel.SlideShowFactory; -import org.apache.poi.util.Removal; - -/** - * This class can be used to extract text from a PowerPoint file. Can optionally - * also get the notes from one. - * - * @deprecated in POI 4.0.0, use {@link SlideShowExtractor} instead - */ -@SuppressWarnings("WeakerAccess") -@Deprecated -@Removal(version="5.0.0") -public final class PowerPointExtractor extends POIOLE2TextExtractor { - private final SlideShowExtractor delegate; - - private boolean slidesByDefault = true; - private boolean notesByDefault; - private boolean commentsByDefault; - private boolean masterByDefault; - - /** - * Basic extractor. Returns all the text, and optionally all the notes - */ - public static void main(String[] args) throws IOException { - if (args.length < 1) { - System.err.println("Usage:"); - System.err.println("\tPowerPointExtractor [-notes] "); - System.exit(1); - } - - boolean notes = false; - boolean comments = false; - boolean master = true; - - String file; - if (args.length > 1) { - notes = true; - file = args[1]; - if (args.length > 2) { - comments = true; - } - } else { - file = args[0]; - } - - try (PowerPointExtractor ppe = new PowerPointExtractor(file)) { - System.out.println(ppe.getText(true, notes, comments, master)); - } - } - - public PowerPointExtractor(final HSLFSlideShow slideShow) { - super(slideShow.getSlideShowImpl()); - setFilesystem(slideShow); - delegate = new SlideShowExtractor<>(slideShow); - } - - /** - * Creates a PowerPointExtractor, from a file - * - * @param fileName The name of the file to extract from - */ - public PowerPointExtractor(String fileName) throws IOException { - this(createHSLF(new File(fileName), Biff8EncryptionKey.getCurrentUserPassword(), true)); - } - - /** - * Creates a PowerPointExtractor, from an Input Stream - * - * @param iStream The input stream containing the PowerPoint document - */ - public PowerPointExtractor(InputStream iStream) throws IOException { - this(createHSLF(iStream, Biff8EncryptionKey.getCurrentUserPassword())); - } - - /** - * Creates a PowerPointExtractor, from an open POIFSFileSystem - * - * @param fs the POIFSFileSystem containing the PowerPoint document - */ - public PowerPointExtractor(POIFSFileSystem fs) throws IOException { - this(createHSLF(fs, Biff8EncryptionKey.getCurrentUserPassword())); - } - - /** - * Creates a PowerPointExtractor, from a specific place - * inside an open {@link POIFSFileSystem} - * - * @param dir the POIFS Directory containing the PowerPoint document - */ - public PowerPointExtractor(DirectoryNode dir) throws IOException { - this(new HSLFSlideShow(dir)); - } - - /** - * Creates a PowerPointExtractor, from a HSLFSlideShow - * - * @param ss the HSLFSlideShow to extract text from - */ - public PowerPointExtractor(HSLFSlideShowImpl ss) { - this(new HSLFSlideShow(ss)); - } - - /** - * Should a call to getText() return slide text? Default is yes - */ - public void setSlidesByDefault(final boolean slidesByDefault) { - this.slidesByDefault = slidesByDefault; - delegate.setSlidesByDefault(slidesByDefault); - } - - /** - * Should a call to getText() return notes text? Default is no - */ - public void setNotesByDefault(final boolean notesByDefault) { - this.notesByDefault = notesByDefault; - delegate.setNotesByDefault(notesByDefault); - } - - /** - * Should a call to getText() return comments text? Default is no - */ - public void setCommentsByDefault(final boolean commentsByDefault) { - this.commentsByDefault = commentsByDefault; - delegate.setCommentsByDefault(commentsByDefault); - } - - /** - * Should a call to getText() return text from master? Default is no - */ - public void setMasterByDefault(final boolean masterByDefault) { - this.masterByDefault = masterByDefault; - delegate.setMasterByDefault(masterByDefault); - } - - /** - * Fetches all the slide text from the slideshow, but not the notes, unless - * you've called setSlidesByDefault() and setNotesByDefault() to change this - */ - @Override - public String getText() { - return delegate.getText(); - } - - /** - * Fetches text from the slideshow, be it slide text or note text. Because - * the final block of text in a TextRun normally have their last \n - * stripped, we add it back - * - * @param getSlideText fetch slide text - * @param getNoteText fetch note text - */ - public String getText(boolean getSlideText, boolean getNoteText) { - return getText(getSlideText,getNoteText,commentsByDefault,masterByDefault); - } - - public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) { - delegate.setSlidesByDefault(getSlideText); - delegate.setNotesByDefault(getNoteText); - delegate.setCommentsByDefault(getCommentText); - delegate.setMasterByDefault(getMasterText); - try { - return delegate.getText(); - } finally { - delegate.setSlidesByDefault(slidesByDefault); - delegate.setNotesByDefault(notesByDefault); - delegate.setCommentsByDefault(commentsByDefault); - delegate.setMasterByDefault(masterByDefault); - } - } - - /** - * Fetches all the notes text from the slideshow, but not the slide text - */ - public String getNotes() { - return getText(false, true, false, false); - } - - @SuppressWarnings("unchecked") - public List getOLEShapes() { - return (List)delegate.getOLEShapes(); - } - - /** - * Helper method to avoid problems with compiling code in Eclipse - * - * Eclipse javac has some bugs with complex casts, this method tries - * to work around this. - * - * @param fs The {@link POIFSFileSystem} to read the document from - * @param password The password that should be used or null if no password is necessary. - * - * @return The created SlideShow - * - * @throws IOException if an error occurs while reading the data - */ - private static HSLFSlideShow createHSLF(POIFSFileSystem fs, String password) throws IOException, EncryptedDocumentException { - // Note: don't change the code here, it is required for Eclipse to compile the code - SlideShow slideShowOrig = SlideShowFactory.create(fs, password); - return (HSLFSlideShow)slideShowOrig; - } - - /** - * Helper method to avoid problems with compiling code in Eclipse - * - * Eclipse javac has some bugs with complex casts, this method tries - * to work around this. - * - * @param inp The {@link InputStream} to read data from. - * @param password The password that should be used or null if no password is necessary. - * - * @return The created SlideShow - * - * @throws IOException if an error occurs while reading the data - * @throws EncryptedDocumentException If the wrong password is given for a protected file - */ - private static HSLFSlideShow createHSLF(InputStream inp, String password) throws IOException, EncryptedDocumentException { - // Note: don't change the code here, it is required for Eclipse to compile the code - SlideShow slideShowOrig = SlideShowFactory.create(inp, password); - return (HSLFSlideShow)slideShowOrig; - } - - /** - * Helper method to avoid problems with compiling code in Eclipse - * - * Eclipse javac has some bugs with complex casts, this method tries - * to work around this. - * - * @param file The file to read data from. - * @param password The password that should be used or null if no password is necessary. - * @param readOnly If the SlideShow should be opened in read-only mode to avoid writing back - * changes when the document is closed. - * - * @return The created SlideShow - * - * @throws IOException if an error occurs while reading the data - * @throws EncryptedDocumentException If the wrong password is given for a protected file - */ - private static HSLFSlideShow createHSLF(File file, String password, boolean readOnly) throws IOException, EncryptedDocumentException { - // Note: don't change the code here, it is required for Eclipse to compile the code - SlideShow slideShowOrig = SlideShowFactory.create(file, password, readOnly); - return (HSLFSlideShow)slideShowOrig; - } -} diff --git a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java index 150326b6d0..8370f6c282 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java +++ b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShow.java @@ -33,6 +33,7 @@ import java.util.List; import java.util.Map; import java.util.function.Supplier; +import org.apache.poi.POIDocument; import org.apache.poi.common.usermodel.GenericRecord; import org.apache.poi.common.usermodel.fonts.FontInfo; import org.apache.poi.ddf.EscherBSERecord; @@ -40,6 +41,9 @@ import org.apache.poi.ddf.EscherContainerRecord; import org.apache.poi.ddf.EscherOptRecord; import org.apache.poi.hpsf.ClassID; import org.apache.poi.hpsf.ClassIDPredefined; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.PropertySet; +import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException; import org.apache.poi.hslf.exceptions.HSLFException; @@ -47,6 +51,7 @@ import org.apache.poi.hslf.model.HeadersFooters; import org.apache.poi.hslf.model.MovieShape; import org.apache.poi.hslf.record.*; import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet; +import org.apache.poi.poifs.crypt.EncryptionInfo; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -66,7 +71,7 @@ import org.apache.poi.util.Units; * TODO: - figure out how to match notes to their correct sheet (will involve * understanding DocSlideList and DocNotesList) - handle Slide creation cleaner */ -public final class HSLFSlideShow implements SlideShow, Closeable, GenericRecord { +public final class HSLFSlideShow extends POIDocument implements SlideShow, Closeable, GenericRecord { //arbitrarily selected; may need to increase private static final int MAX_RECORD_LENGTH = 10_000_000; @@ -111,6 +116,8 @@ public final class HSLFSlideShow implements SlideShow olemap = new HashMap<>(); olemap.put(POWERPOINT_DOCUMENT, ClassIDPredefined.POWERPOINT_V8.getClassID()); // as per BIFF8 spec - olemap.put("Workbook", ClassIDPredefined.EXCEL_V8.getClassID()); + olemap.put("Workbook", ClassIDPredefined.EXCEL_V8.getClassID()); // Typically from third party programs olemap.put("WORKBOOK", ClassIDPredefined.EXCEL_V8.getClassID()); // Typically odd Crystal Reports exports @@ -1179,4 +1186,94 @@ public final class HSLFSlideShow implements SlideShow getGenericChildren() { return Arrays.asList(_hslfSlideShow.getRecords()); } + + @Override + public void write() throws IOException { + getSlideShowImpl().write(); + } + + @Override + public void write(File newFile) throws IOException { + getSlideShowImpl().write(newFile); + } + + @Override + public DocumentSummaryInformation getDocumentSummaryInformation() { + return getSlideShowImpl().getDocumentSummaryInformation(); + } + + @Override + public SummaryInformation getSummaryInformation() { + return getSlideShowImpl().getSummaryInformation(); + } + + @Override + public void createInformationProperties() { + getSlideShowImpl().createInformationProperties(); + } + + @Override + public void readProperties() { + getSlideShowImpl().readProperties(); + } + + @Override + protected PropertySet getPropertySet(String setName) throws IOException { + return getSlideShowImpl().getPropertySetImpl(setName); + } + + @Override + protected PropertySet getPropertySet(String setName, EncryptionInfo encryptionInfo) throws IOException { + return getSlideShowImpl().getPropertySetImpl(setName, encryptionInfo); + } + + @Override + protected void writeProperties() throws IOException { + getSlideShowImpl().writePropertiesImpl(); + } + + @Override + public void writeProperties(POIFSFileSystem outFS) throws IOException { + getSlideShowImpl().writeProperties(outFS); + } + + @Override + protected void writeProperties(POIFSFileSystem outFS, List writtenEntries) throws IOException { + getSlideShowImpl().writePropertiesImpl(outFS, writtenEntries); + } + + @Override + protected void validateInPlaceWritePossible() throws IllegalStateException { + getSlideShowImpl().validateInPlaceWritePossibleImpl(); + } + + @Override + public DirectoryNode getDirectory() { + return getSlideShowImpl().getDirectory(); + } + + @Override + protected void clearDirectory() { + getSlideShowImpl().clearDirectoryImpl(); + } + + @Override + protected boolean initDirectory() { + return getSlideShowImpl().initDirectoryImpl(); + } + + @Override + protected void replaceDirectory(DirectoryNode newDirectory) { + getSlideShowImpl().replaceDirectoryImpl(newDirectory); + } + + @Override + protected String getEncryptedPropertyStreamName() { + return getSlideShowImpl().getEncryptedPropertyStreamName(); + } + + @Override + public EncryptionInfo getEncryptionInfo() throws IOException { + return getSlideShowImpl().getEncryptionInfo(); + } } diff --git a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java index 6f1c633ea3..d616180245 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java +++ b/src/scratchpad/src/org/apache/poi/hslf/usermodel/HSLFSlideShowImpl.java @@ -36,6 +36,7 @@ import java.util.NavigableMap; import java.util.TreeMap; import org.apache.poi.POIDocument; +import org.apache.poi.hpsf.PropertySet; import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException; import org.apache.poi.hslf.exceptions.HSLFException; import org.apache.poi.hslf.exceptions.OldPowerPointFormatException; @@ -714,8 +715,6 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable { } - - /* ******************* adding methods follow ********************* */ /** @@ -850,6 +849,38 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable { return "EncryptedSummary"; } + void writePropertiesImpl() throws IOException { + super.writeProperties(); + } + + PropertySet getPropertySetImpl(String setName) throws IOException { + return super.getPropertySet(setName); + } + + PropertySet getPropertySetImpl(String setName, EncryptionInfo encryptionInfo) throws IOException { + return super.getPropertySet(setName, encryptionInfo); + } + + void writePropertiesImpl(POIFSFileSystem outFS, List writtenEntries) throws IOException { + super.writeProperties(outFS, writtenEntries); + } + + void validateInPlaceWritePossibleImpl() throws IllegalStateException { + super.validateInPlaceWritePossible(); + } + + void clearDirectoryImpl() { + super.clearDirectory(); + } + + boolean initDirectoryImpl() { + return super.initDirectory(); + } + + void replaceDirectoryImpl(DirectoryNode newDirectory) { + super.replaceDirectory(newDirectory); + } + private static class BufAccessBAOS extends ByteArrayOutputStream { public byte[] getBuf() { return buf; diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java deleted file mode 100644 index 09132f639b..0000000000 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java +++ /dev/null @@ -1,61 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hsmf.extractor; - -import org.apache.poi.hsmf.MAPIMessage; -import org.apache.poi.poifs.filesystem.DirectoryNode; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.util.Removal; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; - -/** - * A text extractor for HSMF (Outlook) .msg files. - * Outputs in a format somewhat like a plain text email. - * - * @deprecated use @{link OutlookTextExtractor} instead - */ -@Deprecated -@Removal(version = "5.0.0") -public class OutlookTextExtactor extends OutlookTextExtractor { - public OutlookTextExtactor(MAPIMessage msg) { - super(msg); - } - - public OutlookTextExtactor(DirectoryNode poifsDir) throws IOException { - super(new MAPIMessage(poifsDir)); - } - - public OutlookTextExtactor(POIFSFileSystem fs) throws IOException { - super(new MAPIMessage(fs)); - } - - public OutlookTextExtactor(InputStream inp) throws IOException { - super(new MAPIMessage(inp)); - } - - public static void main(String[] args) throws Exception { - for (String filename : args) { - try (POIFSFileSystem poifs = new POIFSFileSystem(new File(filename)); - OutlookTextExtractor extractor = new OutlookTextExtractor(poifs)) { - System.out.println(extractor.getText()); - } - } - } -} diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java index e11f005fe3..a818d03280 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtractor.java @@ -42,9 +42,12 @@ import org.apache.poi.util.LocaleUtil; * * @since 4.1.2 */ -public class OutlookTextExtractor extends POIOLE2TextExtractor { +public class OutlookTextExtractor implements POIOLE2TextExtractor { + private final MAPIMessage msg; + private boolean doCloseFilesystem = true; + public OutlookTextExtractor(MAPIMessage msg) { - super(msg); + this.msg = msg; } public OutlookTextExtractor(DirectoryNode poifsDir) throws IOException { @@ -76,14 +79,13 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor { * Returns the underlying MAPI message */ public MAPIMessage getMAPIMessage() { - return (MAPIMessage) document; + return msg; } /** * Outputs something a little like a RFC822 email */ public String getText() { - MAPIMessage msg = (MAPIMessage) document; StringBuilder s = new StringBuilder(); // See if we can get a suitable encoding for any @@ -201,4 +203,24 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor { } s.append("\n"); } + + @Override + public MAPIMessage getDocument() { + return msg; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public MAPIMessage getFilesystem() { + return msg; + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java index 1d509dd1c6..526d21be35 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java @@ -31,13 +31,14 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * Class to extract the text from old (Word 6 / Word 95) Word Documents. * * This should only be used on the older files, for most uses you - * should call {@link WordExtractor} which deals properly + * should call {@link WordExtractor} which deals properly * with HWPF. * * @author Nick Burch */ -public final class Word6Extractor extends POIOLE2TextExtractor { +public final class Word6Extractor implements POIOLE2TextExtractor { private HWPFOldDocument doc; + private boolean doCloseFilesystem = true; /** * Create a new Word Extractor @@ -49,12 +50,11 @@ public final class Word6Extractor extends POIOLE2TextExtractor { /** * Create a new Word Extractor - * + * * @param fs * POIFSFileSystem containing the word file */ - public Word6Extractor( POIFSFileSystem fs ) throws IOException - { + public Word6Extractor( POIFSFileSystem fs ) throws IOException { this( fs.getRoot() ); } @@ -62,14 +62,11 @@ public final class Word6Extractor extends POIOLE2TextExtractor { * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead */ @Deprecated - public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) - throws IOException - { + public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) throws IOException { this( dir ); } - public Word6Extractor( DirectoryNode dir ) throws IOException - { + public Word6Extractor( DirectoryNode dir ) throws IOException { this( new HWPFOldDocument( dir ) ); } @@ -78,7 +75,6 @@ public final class Word6Extractor extends POIOLE2TextExtractor { * @param doc The HWPFOldDocument to extract from */ public Word6Extractor(HWPFOldDocument doc) { - super(doc); this.doc = doc; } @@ -101,7 +97,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor { ret = new String[doc.getTextTable().getTextPieces().size()]; for(int i=0; i" ); - System.exit( 1 ); - } - - // Process the first argument as a file - InputStream fin = new FileInputStream( args[0] ); - try (WordExtractor extractor = new WordExtractor(fin)) { - System.out.println(extractor.getText()); - } - } - /** * Get the text from the word file, as an array with one String per * paragraph @@ -142,7 +122,7 @@ public final class WordExtractor extends POIOLE2TextExtractor { return getParagraphText( r ); } - protected static String[] getParagraphText( Range r ) { + static String[] getParagraphText( Range r ) { String[] ret; ret = new String[r.numParagraphs()]; for ( int i = 0; i < ret.length; i++ ) { @@ -287,8 +267,27 @@ public final class WordExtractor extends POIOLE2TextExtractor { /** * Removes any fields (eg macros, page markers etc) from the string. */ - public static String stripFields( String text ) - { + public static String stripFields( String text ) { return Range.stripFields( text ); } + + @Override + public HWPFDocument getDocument() { + return doc; + } + + @Override + public void setCloseFilesystem(boolean doCloseFilesystem) { + this.doCloseFilesystem = doCloseFilesystem; + } + + @Override + public boolean isCloseFilesystem() { + return doCloseFilesystem; + } + + @Override + public HWPFDocument getFilesystem() { + return doc; + } } diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java b/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java index a1db11c170..aaa1cb4be8 100644 --- a/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hdgf/extractor/TestVisioExtractor.java @@ -19,12 +19,9 @@ package org.apache.poi.hdgf.extractor; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.PrintStream; import org.apache.poi.POIDataSamples; import org.apache.poi.hdgf.HDGFDiagram; @@ -32,7 +29,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.junit.Test; public final class TestVisioExtractor { - private static POIDataSamples _dgTests = POIDataSamples.getDiagramInstance(); + private static final POIDataSamples _dgTests = POIDataSamples.getDiagramInstance(); private final String defFilename = "Test_Visio-Some_Random_Text.vsd"; private final int defTextChunks = 5; @@ -63,7 +60,7 @@ public final class TestVisioExtractor { is3.close(); HDGFDiagram hdgf3 = new HDGFDiagram(poifs3); - + VisioTextExtractor extractor3 = new VisioTextExtractor(hdgf3); assertNotNull(extractor3); assertNotNull(extractor3.getAllText()); @@ -97,7 +94,7 @@ public final class TestVisioExtractor { @Test public void testProblemFiles() throws Exception { String[] files = { - "44594.vsd", "44594-2.vsd", + "44594.vsd", "44594-2.vsd", "ShortChunk1.vsd", "ShortChunk2.vsd", "ShortChunk3.vsd", "NegativeChunkLength.vsd", "NegativeChunkLength2.vsd" }; @@ -108,31 +105,6 @@ public final class TestVisioExtractor { } } - @Test - public void testMain() throws Exception { - PrintStream oldOut = System.out; - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - PrintStream capture = new PrintStream(baos); - System.setOut(capture); - - String path = _dgTests.getFile(defFilename).getPath(); - VisioTextExtractor.main(new String[] {path}); - - // Put things back - System.setOut(oldOut); - - // Check - capture.flush(); - String text = baos.toString(); - // YK: stdout can contain lots of other stuff if logging is sent to console - // ( -Dorg.apache.poi.util.POILogger=org.apache.poi.util.SystemOutLogger) - assertTrue( text.contains( - "text\nView\n" + - "Test View\nI am a test view\n" + - "Some random text, on a page\n" - )); - } - private VisioTextExtractor openExtractor(String fileName) throws IOException { try (InputStream is = _dgTests.openResourceAsStream(fileName)) { return new VisioTextExtractor(is); diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java index e38ef007ad..007fff036a 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestFixedSizedProperties.java @@ -42,7 +42,6 @@ import org.apache.poi.hsmf.datatypes.PropertyValue; import org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue; import org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue; import org.apache.poi.hsmf.dev.HSMFDump; -import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hsmf.extractor.OutlookTextExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.LocaleUtil; @@ -74,23 +73,23 @@ public final class TestFixedSizedProperties { fsMessageFails = new POIFSFileSystem(samples.getFile(messageFails)); mapiMessageSucceeds = new MAPIMessage(fsMessageSucceeds); - mapiMessageFails = new MAPIMessage(fsMessageFails); - + mapiMessageFails = new MAPIMessage(fsMessageFails); + messageDateFormat = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss", Locale.ROOT); - messageDateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); + messageDateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); userTimeZone = LocaleUtil.getUserTimeZone(); LocaleUtil.setUserTimeZone(LocaleUtil.TIMEZONE_UTC); } - - + + @AfterClass public static void closeFS() throws Exception { LocaleUtil.setUserTimeZone(userTimeZone); fsMessageSucceeds.close(); fsMessageFails.close(); } - + /** * Check we can find a sensible number of properties on a few * of our test files @@ -98,21 +97,21 @@ public final class TestFixedSizedProperties { @Test public void testPropertiesFound() { Map> props; - + props = mapiMessageSucceeds.getMainChunks().getProperties(); assertTrue(props.toString(), props.size() > 10); - + props = mapiMessageFails.getMainChunks().getProperties(); assertTrue(props.toString(), props.size() > 10); } - + /** * Check we find properties of a variety of different types */ @Test public void testPropertyValueTypes() { Chunks mainChunks = mapiMessageSucceeds.getMainChunks(); - + // Ask to have the values looked up Map> props = mainChunks.getProperties(); HashSet> seenTypes = @@ -126,7 +125,7 @@ public final class TestFixedSizedProperties { assertTrue(seenTypes.toString(), seenTypes.contains(LongPropertyValue.class)); assertTrue(seenTypes.toString(), seenTypes.contains(TimePropertyValue.class)); assertFalse(seenTypes.toString(), seenTypes.contains(ChunkBasedPropertyValue.class)); - + // Ask for the raw values seenTypes.clear(); for (PropertyValue pv : mainChunks.getRawProperties().values()) { @@ -144,31 +143,21 @@ public final class TestFixedSizedProperties { @Test public void testReadMessageDateSucceedsWithOutlookTextExtractor() throws Exception { OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageSucceeds); - ext.setFilesystem(null); // Don't close re-used test resources here - + ext.setCloseFilesystem(false); + String text = ext.getText(); assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n"); ext.close(); } - @Test - public void testReadMessageDateSucceedsWithOutlookTextExtactor() throws Exception { - OutlookTextExtactor ext = new OutlookTextExtactor(mapiMessageSucceeds); - ext.setFilesystem(null); // Don't close re-used test resources here - - String text = ext.getText(); - assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n"); - ext.close(); - } - /** * Test to see if we can read the Date Chunk with OutlookTextExtractor. */ @Test public void testReadMessageDateFailsWithOutlookTextExtractor() throws Exception { OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageFails); - ext.setFilesystem(null); // Don't close re-used test resources here - + ext.setCloseFilesystem(false); + String text = ext.getText(); assertContains(text, "Date: Thu, 21 Jun 2012 14:14:04 +0000\n"); ext.close(); @@ -182,7 +171,7 @@ public final class TestFixedSizedProperties { PrintStream stream = new PrintStream(new ByteArrayOutputStream()); HSMFDump dump = new HSMFDump(fsMessageSucceeds); dump.dump(stream); - } + } /** * Test to see if we can read the Date Chunk with HSMFDump. @@ -202,19 +191,19 @@ public final class TestFixedSizedProperties { // Check via the message date Calendar clientSubmitTime = mapiMessageSucceeds.getMessageDate(); assertEquals( - "Fri, 22 Jun 2012 18:32:54", + "Fri, 22 Jun 2012 18:32:54", messageDateFormat.format(clientSubmitTime.getTime())); - + // Fetch the property value directly Map> props = mapiMessageSucceeds.getMainChunks().getProperties(); - List pv = props.get(MAPIProperty.CLIENT_SUBMIT_TIME); + List pv = props.get(MAPIProperty.CLIENT_SUBMIT_TIME); assertNotNull(pv); assertEquals(1, pv.size()); - + clientSubmitTime = (Calendar)pv.get(0).getValue(); assertEquals( - "Fri, 22 Jun 2012 18:32:54", + "Fri, 22 Jun 2012 18:32:54", messageDateFormat.format(clientSubmitTime.getTime())); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java index 4d8bfb693f..2767228501 100644 --- a/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hsmf/extractor/TestOutlookTextExtractor.java @@ -20,7 +20,6 @@ package org.apache.poi.hsmf.extractor; import static org.apache.poi.POITestCase.assertContains; import static org.apache.poi.POITestCase.assertNotContained; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import java.io.FileInputStream; import java.text.SimpleDateFormat; @@ -57,68 +56,62 @@ public final class TestOutlookTextExtractor { @Test public void testQuick() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - assertContains(text, "From: Kevin Roast\n"); - assertContains(text, "To: Kevin Roast \n"); - assertNotContained(text, "CC:"); - assertNotContained(text, "BCC:"); - assertNotContained(text, "Attachment:"); - assertContains(text, "Subject: Test the content transformer\n"); - Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55); - SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT); - f.setTimeZone(LocaleUtil.getUserTimeZone()); - String dateText = f.format(cal.getTime()); - assertContains(text, "Date: " + dateText + "\n"); - assertContains(text, "The quick brown fox jumps over the lazy dog"); - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); + + assertContains(text, "From: Kevin Roast\n"); + assertContains(text, "To: Kevin Roast \n"); + assertNotContained(text, "CC:"); + assertNotContained(text, "BCC:"); + assertNotContained(text, "Attachment:"); + assertContains(text, "Subject: Test the content transformer\n"); + Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55); + SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT); + f.setTimeZone(LocaleUtil.getUserTimeZone()); + String dateText = f.format(cal.getTime()); + assertContains(text, "Date: " + dateText + "\n"); + assertContains(text, "The quick brown fox jumps over the lazy dog"); + } } @Test public void testSimple() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - assertContains(text, "From: Travis Ferguson\n"); - assertContains(text, "To: travis@overwrittenstack.com\n"); - assertNotContained(text, "CC:"); - assertNotContained(text, "BCC:"); - assertContains(text, "Subject: test message\n"); - assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n"); - assertContains(text, "This is a test message."); - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); + + assertContains(text, "From: Travis Ferguson\n"); + assertContains(text, "To: travis@overwrittenstack.com\n"); + assertNotContained(text, "CC:"); + assertNotContained(text, "BCC:"); + assertContains(text, "Subject: test message\n"); + assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n"); + assertContains(text, "This is a test message."); + } } @Test public void testConstructors() throws Exception { - FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); - OutlookTextExtractor ext = new OutlookTextExtractor(fis); - String inp = ext.getText(); - ext.close(); - fis.close(); - - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); - ext = new OutlookTextExtractor(poifs); - String poifsTxt = ext.getText(); - ext.close(); - poifs.close(); - - fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); - ext = new OutlookTextExtractor(new MAPIMessage(fis)); - String mapi = ext.getText(); - ext.close(); - fis.close(); + String inp; + try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); + OutlookTextExtractor ext = new OutlookTextExtractor(fis)) { + inp = ext.getText(); + } + + String poifsTxt; + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); + OutlookTextExtractor ext = new OutlookTextExtractor(poifs)){ + poifsTxt = ext.getText(); + } + + String mapi; + try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); + OutlookTextExtractor ext = new OutlookTextExtractor(new MAPIMessage(fis))) { + mapi = ext.getText(); + } assertEquals(inp, poifsTxt); assertEquals(inp, mapi); @@ -142,25 +135,22 @@ public final class TestOutlookTextExtractor { "example_sent_regular.msg", "example_sent_unicode.msg" }; for (String file : files) { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); - MAPIMessage msg = new MAPIMessage(poifs); - - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - assertContains(text, "From: Mike Farman\n"); - assertContains(text, "To: 'Ashutosh Dandavate' ; " + - "'Paul Holmes-Higgin' ; 'Mike Farman' \n"); - assertContains(text, "CC: 'nickb@alfresco.com' ; " + - "'nick.burch@alfresco.com' ; 'Roy Wetherall' \n"); - assertContains(text, "BCC: 'David Caruana' ; " + - "'Vonka Jan' \n"); - assertContains(text, "Subject: This is a test message please ignore\n"); - assertContains(text, "Date:"); - assertContains(text, "The quick brown fox jumps over the lazy dog"); - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); + + assertContains(text, "From: Mike Farman\n"); + assertContains(text, "To: 'Ashutosh Dandavate' ; " + + "'Paul Holmes-Higgin' ; 'Mike Farman' \n"); + assertContains(text, "CC: 'nickb@alfresco.com' ; " + + "'nick.burch@alfresco.com' ; 'Roy Wetherall' \n"); + assertContains(text, "BCC: 'David Caruana' ; " + + "'Vonka Jan' \n"); + assertContains(text, "Subject: This is a test message please ignore\n"); + assertContains(text, "Date:"); + assertContains(text, "The quick brown fox jumps over the lazy dog"); + } } } @@ -182,25 +172,21 @@ public final class TestOutlookTextExtractor { "example_received_regular.msg", "example_received_unicode.msg" }; for (String file : files) { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); - MAPIMessage msg = new MAPIMessage(poifs); - - - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - assertContains(text, "From: Mike Farman\n"); - assertContains(text, "To: 'Ashutosh Dandavate' ; " + - "'Paul Holmes-Higgin' ; 'Mike Farman' \n"); - assertContains(text, "CC: nickb@alfresco.com; " + - "nick.burch@alfresco.com; 'Roy Wetherall' \n"); - assertNotContained(text, "BCC:"); - assertContains(text, "Subject: This is a test message please ignore\n"); - assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly - assertContains(text, "The quick brown fox jumps over the lazy dog"); - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); + + assertContains(text, "From: Mike Farman\n"); + assertContains(text, "To: 'Ashutosh Dandavate' ; " + + "'Paul Holmes-Higgin' ; 'Mike Farman' \n"); + assertContains(text, "CC: nickb@alfresco.com; " + + "nick.burch@alfresco.com; 'Roy Wetherall' \n"); + assertNotContained(text, "BCC:"); + assertContains(text, "Subject: This is a test message please ignore\n"); + assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly + assertContains(text, "The quick brown fox jumps over the lazy dog"); + } } } @@ -210,85 +196,59 @@ public final class TestOutlookTextExtractor { @SuppressWarnings("JavadocReference") @Test public void testWithAttachments() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - - // Check the normal bits - String text = ext.getText(); - - assertContains(text, "From: Nicolas1"); - assertContains(text, "To: 'nicolas1.23456@free.fr'"); - assertNotContained(text, "CC:"); - assertNotContained(text, "BCC:"); - assertContains(text, "Subject: test"); - assertContains(text, "Date: Wed, 22 Apr"); - assertContains(text, "Attachment: test-unicode.doc\n"); - assertContains(text, "Attachment: pj1.txt\n"); - assertContains(text, "contenu"); - - // Embeded bits are checked in - // TestExtractorFactory - - ext.close(); - poifs.close(); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + + // Check the normal bits + String text = ext.getText(); + + assertContains(text, "From: Nicolas1"); + assertContains(text, "To: 'nicolas1.23456@free.fr'"); + assertNotContained(text, "CC:"); + assertNotContained(text, "BCC:"); + assertContains(text, "Subject: test"); + assertContains(text, "Date: Wed, 22 Apr"); + assertContains(text, "Attachment: test-unicode.doc\n"); + assertContains(text, "Attachment: pj1.txt\n"); + assertContains(text, "contenu"); + + // Embeded bits are checked in + // TestExtractorFactory + } } @Test public void testWithAttachedMessage() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - // Check we got bits from the main message - assertContains(text, "Master mail"); - assertContains(text, "ante in lacinia euismod"); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); - // But not the attached message - assertNotContained(text, "Test mail attachment"); - assertNotContained(text, "Lorem ipsum dolor sit"); + // Check we got bits from the main message + assertContains(text, "Master mail"); + assertContains(text, "ante in lacinia euismod"); - ext.close(); - poifs.close(); + // But not the attached message + assertNotContained(text, "Test mail attachment"); + assertNotContained(text, "Lorem ipsum dolor sit"); + } } @Test public void testEncodings() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - OutlookTextExtractor ext = new OutlookTextExtractor(msg); - String text = ext.getText(); - - // Check the english bits - assertContains(text, "From: Tests Chang@FT"); - assertContains(text, "tests.chang@fengttt.com"); - - // And check some chinese bits - assertContains(text, "(\u5f35\u6bd3\u502b)"); - assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); + try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); + MAPIMessage msg = new MAPIMessage(poifs); + OutlookTextExtractor ext = new OutlookTextExtractor(msg)) { + String text = ext.getText(); - ext.close(); - poifs.close(); - } + // Check the english bits + assertContains(text, "From: Tests Chang@FT"); + assertContains(text, "tests.chang@fengttt.com"); - @Test - public void testEncodingsDeprecatedClass() throws Exception { - POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); - MAPIMessage msg = new MAPIMessage(poifs); - OutlookTextExtactor ext = new OutlookTextExtactor(msg); - assertTrue("OutlookTextExtactor instanceof OutlookTextExtractor", ext instanceof OutlookTextExtractor); - String text = ext.getText(); - - // Check the english bits - assertContains(text, "From: Tests Chang@FT"); - assertContains(text, "tests.chang@fengttt.com"); - - // And check some chinese bits - assertContains(text, "(\u5f35\u6bd3\u502b)"); - assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); - - ext.close(); - poifs.close(); + // And check some chinese bits + assertContains(text, "(\u5f35\u6bd3\u502b)"); + assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); + } } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java index c605130a67..1962f2facf 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java @@ -17,16 +17,16 @@ package org.apache.poi.hwpf.extractor; -import org.apache.poi.POIDataSamples; -import org.apache.poi.extractor.POITextExtractor; -import org.apache.poi.extractor.OLE2ExtractorFactory; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.junit.Test; +import static org.junit.Assert.assertNotNull; import java.io.IOException; import java.io.InputStream; -import static org.junit.Assert.assertNotNull; +import org.apache.poi.POIDataSamples; +import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.extractor.POITextExtractor; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.junit.Test; /** * Tests for bugs with the WordExtractor @@ -61,7 +61,7 @@ public final class TestWordExtractorBugs { @Test public void testBug60374() throws Exception { POIFSFileSystem fs = new POIFSFileSystem(SAMPLES.openResourceAsStream("cn.orthodox.www_divenbog_APRIL_30-APRIL.DOC")); - final POITextExtractor extractor = OLE2ExtractorFactory.createExtractor(fs); + final POITextExtractor extractor = ExtractorFactory.createExtractor(fs); // Check it gives text without error assertNotNull(extractor.getText()); diff --git a/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java index 19f54bbf63..9ea2fdd074 100644 --- a/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java +++ b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java @@ -25,7 +25,7 @@ import java.io.IOException; import java.io.InputStream; import org.apache.poi.POIDataSamples; -import org.apache.poi.hpsf.*; +import org.apache.poi.hpsf.Thumbnail; import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.usermodel.HSSFWorkbook; @@ -101,42 +101,31 @@ public final class TestHPSFPropertiesExtractor { @Test public void testConstructors() throws IOException { - POIFSFileSystem fs; - HSSFWorkbook wb; - try { - fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls")); - wb = new HSSFWorkbook(fs); - } catch (IOException e) { - throw new RuntimeException(e); - } - ExcelExtractor excelExt = new ExcelExtractor(wb); - final String fsText; - HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs); - fsExt.setFilesystem(null); // Don't close re-used test resources! - try { - fsText = fsExt.getText(); - } finally { - fsExt.close(); - } - final String hwText; - HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb); - hwExt.setFilesystem(null); // Don't close re-used test resources! - try { - hwText = hwExt.getText(); - } finally { - hwExt.close(); - } - final String eeText; - HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt); - eeExt.setFilesystem(null); // Don't close re-used test resources! - try { - eeText = eeExt.getText(); - } finally { - eeExt.close(); - wb.close(); + + try (POIFSFileSystem fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls")); + HSSFWorkbook wb = new HSSFWorkbook(fs); + ExcelExtractor excelExt = new ExcelExtractor(wb)) { + + try (HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs)) { + // Don't close re-used test resources! + fsExt.setCloseFilesystem(false); + fsText = fsExt.getText(); + } + + try (HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb)) { + // Don't close re-used test resources! + hwExt.setCloseFilesystem(false); + hwText = hwExt.getText(); + } + + try (HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt)) { + // Don't close re-used test resources! + eeExt.setCloseFilesystem(false); + eeText = eeExt.getText(); + } } assertEquals(fsText, hwText); diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java index ce4f2b7be8..139b03fb03 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java @@ -43,9 +43,7 @@ public final class TestExcelExtractor { private static ExcelExtractor createExtractor(String sampleFileName) throws IOException { File file = HSSFTestDataSamples.getSampleFile(sampleFileName); POIFSFileSystem fs = new POIFSFileSystem(file); - ExcelExtractor extractor = new ExcelExtractor(fs); - extractor.setFilesystem(fs); - return extractor; + return new ExcelExtractor(fs); } @Test @@ -223,16 +221,16 @@ public final class TestExcelExtractor { extractor.setIncludeBlankCells(false); extractor.setIncludeSheetNames(false); String text = extractor.getText(); - + // Note - not all the formats in the file // actually quite match what they claim to // be, as some are auto-local builtins... - + assertStartsWith(text, "Dates, all 24th November 2006\n"); assertContains(text, "yyyy/mm/dd\t2006/11/24\n"); assertContains(text, "yyyy-mm-dd\t2006-11-24\n"); assertContains(text, "dd-mm-yy\t24-11-06\n"); - + assertContains(text, "nn.nn\t10.52\n"); assertContains(text, "nn.nnn\t10.520\n"); assertContains(text, "\u00a3nn.nn\t\u00a310.52\n"); @@ -247,7 +245,7 @@ public final class TestExcelExtractor { @Test public void testWithEmbeded() throws Exception { POIFSFileSystem fs = null; - + HSSFWorkbook wbA = null, wbB = null; ExcelExtractor exA = null, exB = null; @@ -257,7 +255,7 @@ public final class TestExcelExtractor { DirectoryNode objPool = (DirectoryNode) fs.getRoot().getEntry("ObjectPool"); DirectoryNode dirA = (DirectoryNode) objPool.getEntry("_1269427460"); DirectoryNode dirB = (DirectoryNode) objPool.getEntry("_1269427461"); - + wbA = new HSSFWorkbook(dirA, fs, true); exA = new ExcelExtractor(wbA); wbB = new HSSFWorkbook(dirB, fs, true); @@ -299,10 +297,10 @@ public final class TestExcelExtractor { exB = new ExcelExtractor(wbB); assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", exA.getText()); assertEquals("Sample Excel", exA.getSummaryInformation().getTitle()); - + assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", exB.getText()); assertEquals("Sample Excel 2", exB.getSummaryInformation().getTitle()); - + // And the base file too ex = new ExcelExtractor(fs); assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n", ex.getText()); -- 2.39.5