]> source.dussan.org Git - poi.git/commitdiff
Start to pull out some of the OLE2 logic, so we can then split some Scratchpad parts out
authorNick Burch <nick@apache.org>
Mon, 11 Jul 2016 22:20:51 +0000 (22:20 +0000)
committerNick Burch <nick@apache.org>
Mon, 11 Jul 2016 22:20:51 +0000 (22:20 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752223 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java [new file with mode: 0644]

diff --git a/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java b/src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
new file mode 100644 (file)
index 0000000..ea78385
--- /dev/null
@@ -0,0 +1,208 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.extractor;
+
+import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.xmlbeans.XmlException;
+
+/**
+ * Figures out the correct POIOLE2TextExtractor for your supplied
+ *  document, and returns it.
+ *  
+ * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
+ *  not present on the runtime classpath</p>
+ * <p>Note 2 - for text extractor creation across all formats, use
+ *  {@link org.apache.poi.extractor.ExtractorFactory} contained within
+ *  the OOXML jar.</p>
+ * <p>Note 3 - rather than using this, for most cases you would be better
+ *  off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
+ */
+@SuppressWarnings("WeakerAccess")
+public class OLE2ExtractorFactory {
+    /** Should this thread prefer event based over usermodel based extractors? */
+    private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
+        @Override
+        protected Boolean initialValue() { return Boolean.FALSE; }
+    };
+
+    /** Should all threads prefer event based over usermodel based extractors? */
+    private static Boolean allPreferEventExtractors;
+
+    /**
+     * Should this thread prefer event based over usermodel based extractors?
+     * (usermodel extractors tend to be more accurate, but use more memory)
+     * Default is false.
+     */
+    public static boolean getThreadPrefersEventExtractors() {
+        return threadPreferEventExtractors.get();
+    }
+
+    /**
+     * Should all threads prefer event based over usermodel based extractors?
+     * (usermodel extractors tend to be more accurate, but use more memory)
+     * Default is to use the thread level setting, which defaults to false.
+     */
+    public static Boolean getAllThreadsPreferEventExtractors() {
+        return allPreferEventExtractors;
+    }
+
+    /**
+     * Should this thread prefer event based over usermodel based extractors?
+     * Will only be used if the All Threads setting is null.
+     */
+    public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
+        threadPreferEventExtractors.set(preferEventExtractors);
+    }
+
+    /**
+     * Should all threads prefer event based over usermodel based extractors?
+     * If set, will take preference over the Thread level setting.
+     */
+    public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
+        allPreferEventExtractors = preferEventExtractors;
+    }
+
+    /**
+     * Should this thread use event based extractors is available?
+     * Checks the all-threads one first, then thread specific.
+     */
+    protected static boolean getPreferEventExtractor() {
+        if(allPreferEventExtractors != null) {
+            return allPreferEventExtractors;
+        }
+        return threadPreferEventExtractors.get();
+    }
+
+    public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        // Only ever an OLE2 one from the root of the FS
+        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    }
+    public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        // Only ever an OLE2 one from the root of the FS
+        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    }
+    public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+        // Only ever an OLE2 one from the root of the FS
+        return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+    }
+
+    public static POITextExtractor createExtractor(InputStream input) {
+        // TODO Something nasty with reflection...
+        return null;
+    }
+
+    /**
+     * Create the Extractor, if possible. Generally needs the Scratchpad jar.
+     * Note that this won't check for embedded OOXML resources either, use
+     *  {@link org.apache.poi.extractor.ExtractorFactory} for that.
+     */
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir)
+            throws IOException, OpenXML4JException, XmlException
+    {
+        // Look for certain entries in the stream, to figure it
+        // out from
+        for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
+            if (poifsDir.hasEntry(workbookName)) {
+                if (getPreferEventExtractor()) {
+                    return new EventBasedExcelExtractor(poifsDir);
+                }
+                return new ExcelExtractor(poifsDir);
+            }
+        }
+
+        // TODO Try to ask the Scratchpad
+
+        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+    }
+
+    /**
+     * Returns an array of text extractors, one for each of
+     *  the embedded documents in the file (if there are any).
+     * If there are no embedded documents, you'll get back an
+     *  empty array. Otherwise, you'll get one open
+     *  {@link POITextExtractor} for each embedded file.
+     */
+    public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext)
+            throws IOException, OpenXML4JException, XmlException
+    {
+        // All the embedded directories we spotted
+        ArrayList<Entry> dirs = new ArrayList<Entry>();
+        // For anything else not directly held in as a POIFS directory
+        ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+
+        // Find all the embedded directories
+        DirectoryEntry root = ext.getRoot();
+        if(root == null) {
+            throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+        }
+
+        if(ext instanceof ExcelExtractor) {
+            // These are in MBD... under the root
+            Iterator<Entry> it = root.getEntries();
+            while(it.hasNext()) {
+                Entry entry = it.next();
+                if(entry.getName().startsWith("MBD")) {
+                    dirs.add(entry);
+                }
+            }
+        } else {
+            // TODO Ask scratchpad
+        }
+
+        // Create the extractors
+        if(dirs.size() == 0 && nonPOIFS.size() == 0){
+            return new POITextExtractor[0];
+        }
+
+        ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
+        for (Entry dir : dirs) {
+            e.add(createExtractor(
+                    (DirectoryNode) dir
+            ));
+        }
+        for (InputStream nonPOIF : nonPOIFS) {
+            try {
+                e.add(createExtractor(nonPOIF));
+            } catch (IllegalArgumentException ie) {
+                // Ignore, just means it didn't contain
+                //  a format we support as yet
+                // TODO Should we log this?
+            } catch (Exception xe) {
+                // Ignore, invalid format
+                // TODO Should we log this?
+            }
+        }
+        return e.toArray(new POITextExtractor[e.size()]);
+    }
+}