source.dussan.org Git - poi.git/commitdiff
Start moving logic over into the main and scratchpad jars for OLE2
author Nick Burch <nick@apache.org>
Mon, 11 Jul 2016 22:47:02 +0000 (22:47 +0000)
committer Nick Burch <nick@apache.org>
Mon, 11 Jul 2016 22:47:02 +0000 (22:47 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752226 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/extractor/OLE2ExtractorFactory.java
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
src/scratchpad/src/org/apache/poi/extractor/OLE2ScrachpadExtractorFactory.java

index 0db450aaf06f6b205c1cff418b509f3615997faf..9facd397979ba571fbf7c392f5bb66c03cd20ac5 100644 (file)
@@ -20,8 +20,10 @@ import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAME
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.POITextExtractor;
@@ -33,6 +35,8 @@ import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 
 /**
  * Figures out the correct POIOLE2TextExtractor for your supplied
@@ -48,6 +52,8 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  */
 @SuppressWarnings("WeakerAccess")
 public class OLE2ExtractorFactory {
+    private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class); 
+    
     /** Should this thread prefer event based over usermodel based extractors? */
     private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
         @Override
@@ -115,11 +121,38 @@ public class OLE2ExtractorFactory {
         return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
     }
 
-    public static POITextExtractor createExtractor(InputStream input) {
-        // TODO Something nasty with reflection...
-        return null;
+    public static POITextExtractor createExtractor(InputStream input) throws IOException {
+        Class<?> cls = getOOXMLClass();
+        if (cls != null) {
+            // TODO Reflection
+            throw new IllegalArgumentException("TODO Reflection");
+        } else {
+            // Best hope it's OLE2....
+            return createExtractor(new NPOIFSFileSystem(input));
+        }
     }
 
+    private static Class<?> getOOXMLClass() {
+        try {
+            return OLE2ExtractorFactory.class.getClassLoader().loadClass(
+                    "org.apache.poi.extractor.ExtractorFactory"
+            );
+        } catch (ClassNotFoundException e) {
+            LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
+            return null;
+        }
+    }
+    private static Class<?> getScratchpadClass() {
+        try {
+            return OLE2ExtractorFactory.class.getClassLoader().loadClass(
+                    "org.apache.poi.extractor.OLE2ScrachpadExtractorFactory"
+            );
+        } catch (ClassNotFoundException e) {
+            LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
+            throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory");
+        }
+    }
+    
     /**
      * Create the Extractor, if possible. Generally needs the Scratchpad jar.
      * Note that this won't check for embedded OOXML resources either, use
@@ -138,8 +171,16 @@ public class OLE2ExtractorFactory {
                 return new ExcelExtractor(poifsDir);
             }
         }
-
-        // TODO Try to ask the Scratchpad
+        
+        // Ask Scratchpad, or fail trying
+        Class<?> cls = getScratchpadClass();
+        try {
+            Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
+            POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
+            if (ext != null) return ext;
+        } catch (Exception e) {
+            throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
+        }
 
         throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
     }
@@ -155,9 +196,9 @@ public class OLE2ExtractorFactory {
             throws IOException
     {
         // All the embedded directories we spotted
-        ArrayList<Entry> dirs = new ArrayList<Entry>();
+        List<Entry> dirs = new ArrayList<Entry>();
         // For anything else not directly held in as a POIFS directory
-        ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+        List<InputStream> nonPOIFS = new ArrayList<InputStream>();
 
         // Find all the embedded directories
         DirectoryEntry root = ext.getRoot();
@@ -175,7 +216,15 @@ public class OLE2ExtractorFactory {
                 }
             }
         } else {
-            // TODO Ask scratchpad
+            // Ask Scratchpad, or fail trying
+            Class<?> cls = getScratchpadClass();
+            try {
+                Method m = cls.getDeclaredMethod(
+                        "identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
+                m.invoke(null, ext, dirs, nonPOIFS);
+            } catch (Exception e) {
+                throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
+            }
         }
 
         // Create the extractors
@@ -195,10 +244,10 @@ public class OLE2ExtractorFactory {
             } catch (IllegalArgumentException ie) {
                 // Ignore, just means it didn't contain
                 //  a format we support as yet
-                // TODO Should we log this?
+                LOGGER.log(POILogger.WARN, ie);
             } catch (Exception xe) {
                 // Ignore, invalid format
-                // TODO Should we log this?
+                LOGGER.log(POILogger.WARN, xe);
             }
         }
         return e.toArray(new POITextExtractor[e.size()]);
index 4ba8d8f2f854e80a0ad896384cbd7e377f3217f7..830a4d82d87738d2a4f924736311005a63c77ea2 100644 (file)
@@ -78,23 +78,13 @@ public class ExtractorFactory {
        protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
        protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
 
-
-       /** Should this thread prefer event based over usermodel based extractors? */
-       private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
-               @Override
-               protected Boolean initialValue() { return Boolean.FALSE; }
-       };
-
-       /** Should all threads prefer event based over usermodel based extractors? */
-       private static Boolean allPreferEventExtractors;
-
    /**
     * Should this thread prefer event based over usermodel based extractors?
     * (usermodel extractors tend to be more accurate, but use more memory)
     * Default is false.
     */
        public static boolean getThreadPrefersEventExtractors() {
-          return threadPreferEventExtractors.get();
+          return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
        }
 
    /**
@@ -103,7 +93,7 @@ public class ExtractorFactory {
     * Default is to use the thread level setting, which defaults to false.
     */
        public static Boolean getAllThreadsPreferEventExtractors() {
-          return allPreferEventExtractors;
+          return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
        }
 
    /**
@@ -111,7 +101,7 @@ public class ExtractorFactory {
     * Will only be used if the All Threads setting is null.
     */
    public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
-      threadPreferEventExtractors.set(preferEventExtractors);
+       OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
    }
 
    /**
@@ -119,7 +109,7 @@ public class ExtractorFactory {
     * If set, will take preference over the Thread level setting.
     */
    public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
-      allPreferEventExtractors = preferEventExtractors;
+       OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
    }
 
    /**
@@ -127,10 +117,7 @@ public class ExtractorFactory {
     * Checks the all-threads one first, then thread specific.
     */
    protected static boolean getPreferEventExtractor() {
-      if(allPreferEventExtractors != null) {
-         return allPreferEventExtractors;
-      }
-      return threadPreferEventExtractors.get();
+       return OLE2ExtractorFactory.getPreferEventExtractor();
    }
 
        public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
@@ -281,83 +268,28 @@ public class ExtractorFactory {
        }
 
        public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
-          // Only ever an OLE2 one from the root of the FS
-               return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+           return OLE2ExtractorFactory.createExtractor(fs);
        }
     public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
-        // Only ever an OLE2 one from the root of the FS
-         return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+        return OLE2ExtractorFactory.createExtractor(fs);
      }
     public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
-        // Only ever an OLE2 one from the root of the FS
-         return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+        return OLE2ExtractorFactory.createExtractor(fs);
      }
 
     public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
             OpenXML4JException, XmlException
     {
-        // Look for certain entries in the stream, to figure it
-        // out from
-        for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
-            if (poifsDir.hasEntry(workbookName)) {
-                if (getPreferEventExtractor()) {
-                    return new EventBasedExcelExtractor(poifsDir);
-                }
-                return new ExcelExtractor(poifsDir);
-            }
-        }
-        if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
-            throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
-                    + "found. Please call OldExcelExtractor directly for basic text extraction");
-        }
-
-        if (poifsDir.hasEntry("WordDocument")) {
-            // Old or new style word document?
-            try {
-                return new WordExtractor(poifsDir);
-            } catch (OldWordFileFormatException e) {
-                return new Word6Extractor(poifsDir);
-            }
-        }
-
-        if (poifsDir.hasEntry("PowerPoint Document")) {
-            return new PowerPointExtractor(poifsDir);
-        }
-
-        if (poifsDir.hasEntry("VisioDocument")) {
-            return new VisioTextExtractor(poifsDir);
-        }
-
-        if (poifsDir.hasEntry("Quill")) {
-            return new PublisherTextExtractor(poifsDir);
-        }
-
-        final String[] outlookEntryNames = new String[] {
-                // message bodies, saved as plain text (PtypString)
-                // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
-                // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
-                // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx
-                // @see org.apache.poi.hsmf.Types.MAPIType
-                "__substg1.0_1000001E", //PidTagBody ASCII
-                "__substg1.0_1000001F", //PidTagBody Unicode
-                "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
-                "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
-                "__substg1.0_0037001E", //PidTagSubject ASCII
-                "__substg1.0_0037001F", //PidTagSubject Unicode
-        };
-        for (String entryName : outlookEntryNames) {
-            if (poifsDir.hasEntry(entryName)) {
-                return new OutlookTextExtactor(poifsDir);
-            }
-        }
-
+        // First, check for OOXML
         for (String entryName : poifsDir.getEntryNames()) {
             if (entryName.equals("Package")) {
                 OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
                 return createExtractor(pkg);
             }
         }
-        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+        
+        // If not, ask the OLE2 code to check, with Scratchpad if possible
+        return OLE2ExtractorFactory.createExtractor(poifsDir);
     }
 
        /**
index 467bb31fdf5be0429e380867cc6488d870ef10bf..0aa022332e230a426fac6fa2232064b7723c96e5 100644 (file)
@@ -150,6 +150,7 @@ public class TestExtractorFactory {
 
         POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
         assertTrue(
+                extractor.getClass().getName(),
                 extractor
                 instanceof XSSFExcelExtractor
         );
@@ -163,6 +164,7 @@ public class TestExtractorFactory {
 
         extractor = ExtractorFactory.createExtractor(xltx);
         assertTrue(
+                extractor.getClass().getName(),
                 extractor
                 instanceof XSSFExcelExtractor
         );
@@ -340,6 +342,7 @@ public class TestExtractorFactory {
 
         extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
         assertTrue(
+                extractor.getClass().getName(),
                 extractor
                 instanceof XSSFExcelExtractor
         );
@@ -359,6 +362,7 @@ public class TestExtractorFactory {
         // Word
         extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
         assertTrue(
+                extractor.getClass().getName(),
                 extractor
                 instanceof WordExtractor
         );
@@ -369,6 +373,7 @@ public class TestExtractorFactory {
 
         extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
         assertTrue(
+                extractor.getClass().getName(),
                 extractor
                 instanceof Word6Extractor
         );
@@ -379,6 +384,7 @@ public class TestExtractorFactory {
 
         extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
         assertTrue(
+                extractor.getClass().getName(),
                 extractor
                 instanceof Word6Extractor
         );
index 90ede6236723898c7044d9ae7f421e80748181ec..b1e52f4d75ceda88f384e589aed569b843e6f939 100644 (file)
@@ -20,8 +20,8 @@ import java.io.ByteArrayInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.POITextExtractor;
@@ -108,7 +108,7 @@ public class OLE2ScrachpadExtractorFactory {
         *  empty array. Otherwise, you'll get one open
         *  {@link POITextExtractor} for each embedded file.
         */
-       public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, ArrayList<Entry> dirs, ArrayList<InputStream> nonPOIFS) throws IOException {
+       public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
       // Find all the embedded directories
                DirectoryEntry root = ext.getRoot();
                if(root == null) {