123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.ooxml.extractor;
-
- import java.io.File;
- import java.io.IOException;
- import java.io.InputStream;
- import java.lang.reflect.Method;
- import java.util.ArrayList;
- import java.util.Iterator;
- import java.util.List;
-
- import org.apache.poi.EncryptedDocumentException;
- import org.apache.poi.extractor.OLE2ExtractorFactory;
- import org.apache.poi.extractor.POIOLE2TextExtractor;
- import org.apache.poi.extractor.POITextExtractor;
- import org.apache.poi.hssf.extractor.ExcelExtractor;
- import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
- import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.openxml4j.opc.PackageAccess;
- import org.apache.poi.openxml4j.opc.PackagePart;
- import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
- import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
- import org.apache.poi.poifs.crypt.Decryptor;
- import org.apache.poi.poifs.crypt.EncryptionInfo;
- import org.apache.poi.poifs.filesystem.DirectoryEntry;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.Entry;
- import org.apache.poi.poifs.filesystem.FileMagic;
- import org.apache.poi.poifs.filesystem.NotOLE2FileException;
- import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.sl.extractor.SlideShowExtractor;
- import org.apache.poi.util.IOUtils;
- import org.apache.poi.util.NotImplemented;
- import org.apache.poi.util.POILogFactory;
- import org.apache.poi.util.POILogger;
- import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
- import org.apache.poi.xslf.usermodel.XMLSlideShow;
- import org.apache.poi.xslf.usermodel.XSLFRelation;
- import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
- import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
- import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
- import org.apache.poi.xssf.usermodel.XSSFRelation;
- import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
- import org.apache.poi.xwpf.usermodel.XWPFRelation;
- import org.apache.xmlbeans.XmlException;
-
- /**
- * Figures out the correct POITextExtractor for your supplied
- * document, and returns it.
- *
- * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
- * not present on the runtime classpath</p>
- * <p>Note 2 - rather than using this, for most cases you would be better
- * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
- */
- @SuppressWarnings("WeakerAccess")
- public final class ExtractorFactory {
- private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class);
-
- public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
- private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
- private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
-
- private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{
- XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
- XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
- XSLFRelation.PRESENTATION_MACRO
- };
-
- private ExtractorFactory() {
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is false.
- */
- public static boolean getThreadPrefersEventExtractors() {
- return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is to use the thread level setting, which defaults to false.
- */
- public static Boolean getAllThreadsPreferEventExtractors() {
- return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * Will only be used if the All Threads setting is null.
- */
- public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
- OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * If set, will take preference over the Thread level setting.
- */
- public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
- OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
- }
-
- /**
- * Should this thread use event based extractors is available?
- * Checks the all-threads one first, then thread specific.
- */
- public static boolean getPreferEventExtractor() {
- return OLE2ExtractorFactory.getPreferEventExtractor();
- }
-
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
- POIFSFileSystem fs = null;
- try {
- fs = new POIFSFileSystem(f);
- if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
- return (T)createEncryptedOOXMLExtractor(fs);
- }
- POITextExtractor extractor = createExtractor(fs);
- extractor.setFilesystem(fs);
- return (T)extractor;
- } catch (OfficeXmlFileException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- OPCPackage pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
- T t = (T)createExtractor(pkg);
- t.setFilesystem(pkg);
- return t;
- } catch (NotOLE2FileException ne) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne);
- } catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw e;
- }
- }
-
- public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException {
- InputStream is = FileMagic.prepareToCheckMagic(inp);
-
- FileMagic fm = FileMagic.valueOf(is);
-
- switch (fm) {
- case OLE2:
- POIFSFileSystem fs = new POIFSFileSystem(is);
- boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY);
- return isEncrypted ? createEncryptedOOXMLExtractor(fs) : createExtractor(fs);
- case OOXML:
- return createExtractor(OPCPackage.open(is));
- default:
- throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream, found type: " + fm);
- }
- }
-
- /**
- * Tries to determine the actual type of file and produces a matching text-extractor for it.
- *
- * @param pkg An {@link OPCPackage}.
- * @return A {@link POIXMLTextExtractor} for the given file.
- * @throws IOException If an error occurs while reading the file
- * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
- * @throws XmlException If an XML parsing error occurs.
- * @throws IllegalArgumentException If no matching file type could be found.
- */
- public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
- try {
- // Check for the normal Office core document
- PackageRelationshipCollection core;
- core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
-
- // If nothing was found, try some of the other OOXML-based core types
- if (core.size() == 0) {
- // Could it be an OOXML-Strict one?
- core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
- }
- if (core.size() == 0) {
- // Could it be a visio one?
- core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
- if (core.size() == 1)
- return new XDGFVisioExtractor(pkg);
- }
-
- // Should just be a single core document, complain if not
- if (core.size() != 1) {
- throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
- }
-
- // Grab the core document part, and try to identify from that
- final PackagePart corePart = pkg.getPart(core.getRelationship(0));
- final String contentType = corePart.getContentType();
-
- // Is it XSSF?
- for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- if (getPreferEventExtractor()) {
- return new XSSFEventBasedExcelExtractor(pkg);
- }
- return new XSSFExcelExtractor(pkg);
- }
- }
-
- // Is it XWPF?
- for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- return new XWPFWordExtractor(pkg);
- }
- }
-
- // Is it XSLF?
- for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- return new SlideShowExtractor<>(new XMLSlideShow(pkg));
- }
- }
-
- // special handling for SlideShow-Theme-files,
- if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
- return new SlideShowExtractor<>(new XMLSlideShow(pkg));
- }
-
- // How about xlsb?
- for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
- if (rel.getContentType().equals(contentType)) {
- return new XSSFBEventBasedExcelExtractor(pkg);
- }
- }
-
- throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")");
-
- } catch (IOException | Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
- throw e;
- }
- }
-
- public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return createExtractor(fs.getRoot());
- }
-
- @SuppressWarnings("unchecked")
- public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
- {
- // First, check for OOXML
- for (String entryName : poifsDir.getEntryNames()) {
- if (entryName.equals("Package")) {
- OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
- return (T)createExtractor(pkg);
- }
- }
-
- // If not, ask the OLE2 code to check, with Scratchpad if possible
- return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
- // All the embedded directories we spotted
- ArrayList<Entry> dirs = new ArrayList<>();
- // For anything else not directly held in as a POIFS directory
- ArrayList<InputStream> nonPOIFS = new ArrayList<>();
-
- // Find all the embedded directories
- DirectoryEntry root = ext.getRoot();
- if (root == null) {
- throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
- }
-
- // provide ExcelExtractor also in OOXML module, because scratchpad is not necessary for it
- if (ext instanceof ExcelExtractor) {
- // These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while (it.hasNext()) {
- Entry entry = it.next();
- if (entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
- } else {
- try {
- Class<?> clazz = Class.forName("org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory");
- Method m = clazz.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
- m.invoke(null, ext, dirs, nonPOIFS);
- } catch (ReflectiveOperationException e) {
- logger.log(POILogger.WARN, "POI Scratchpad jar not included ", e.getLocalizedMessage());
- return new POITextExtractor[0];
- }
- }
-
- // Create the extractors
- if (dirs.size() == 0 && nonPOIFS.size() == 0){
- return new POITextExtractor[0];
- }
-
- ArrayList<POITextExtractor> textExtractors = new ArrayList<>();
- for (Entry dir : dirs) {
- textExtractors.add(createExtractor((DirectoryNode) dir));
- }
- for (InputStream nonPOIF : nonPOIFS) {
- try {
- textExtractors.add(createExtractor(nonPOIF));
- } catch (IllegalArgumentException e) {
- // Ignore, just means it didn't contain
- // a format we support as yet
- logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
- } catch (XmlException | OpenXML4JException e) {
- throw new IOException(e.getMessage(), e);
- }
- }
- return textExtractors.toArray(new POITextExtractor[0]);
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- @NotImplemented
- @SuppressWarnings({"UnusedParameters", "UnusedReturnValue"})
- public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) {
- throw new IllegalStateException("Not yet supported");
- }
-
- private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs)
- throws IOException {
- String pass = Biff8EncryptionKey.getCurrentUserPassword();
- if (pass == null) {
- pass = Decryptor.DEFAULT_PASSWORD;
- }
-
- EncryptionInfo ei = new EncryptionInfo(fs);
- Decryptor dec = ei.getDecryptor();
- InputStream is = null;
- try {
- if (!dec.verifyPassword(pass)) {
- throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor");
- }
- is = dec.getDataStream(fs);
- return createExtractor(OPCPackage.open(is));
- } catch (IOException e) {
- throw e;
- } catch (Exception e) {
- throw new EncryptedDocumentException(e);
- } finally {
- IOUtils.closeQuietly(is);
-
- // also close the POIFSFileSystem here as we read all the data
- // while decrypting
- fs.close();
- }
- }
- }
|