import java.io.IOException;
import java.io.InputStream;
+import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Iterator;
+import java.util.List;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
/**
* Figures out the correct POIOLE2TextExtractor for your supplied
*/
@SuppressWarnings("WeakerAccess")
public class OLE2ExtractorFactory {
+ private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
+
/** Should this thread prefer event based over usermodel based extractors? */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
@Override
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
}
- public static POITextExtractor createExtractor(InputStream input) {
- // TODO Something nasty with reflection...
- return null;
+ /**
+  * Creates a text extractor for the document held in the supplied stream.
+  * <p>
+  * If the OOXML-aware ExtractorFactory class can be loaded, the call is
+  * forwarded to its static createExtractor(InputStream) method through
+  * reflection. Otherwise the stream is opened as an OLE2 file system.
+  *
+  * @param input the stream holding the document
+  * @return the extractor created for the document
+  * @throws IOException if the underlying extractor code reports an I/O failure
+  * @throws IllegalArgumentException if the reflective call fails for any
+  *         other checked reason
+  */
+ public static POITextExtractor createExtractor(InputStream input) throws IOException {
+     Class<?> cls = getOOXMLClass();
+     if (cls == null) {
+         // Best hope it's OLE2....
+         return createExtractor(new NPOIFSFileSystem(input));
+     }
+
+     // The OOXML factory is only looked up by name, so it has to be called reflectively
+     try {
+         Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
+         return (POITextExtractor) m.invoke(null, input);
+     } catch (java.lang.reflect.InvocationTargetException e) {
+         // Surface what the target method actually threw, not the reflection wrapper
+         Throwable cause = e.getCause();
+         if (cause instanceof IOException) {
+             throw (IOException) cause;
+         }
+         if (cause instanceof RuntimeException) {
+             throw (RuntimeException) cause;
+         }
+         throw new IllegalArgumentException("Error creating Extractor for InputStream", cause);
+     } catch (Exception e) {
+         throw new IllegalArgumentException("Error creating Extractor for InputStream", e);
+     }
}
+ /**
+  * Looks up the OOXML-aware ExtractorFactory through this class's class loader.
+  *
+  * @return the class, or null (after logging a warning) when it cannot be found
+  */
+ private static Class<?> getOOXMLClass() {
+     Class<?> found;
+     try {
+         final ClassLoader loader = OLE2ExtractorFactory.class.getClassLoader();
+         found = loader.loadClass("org.apache.poi.extractor.ExtractorFactory");
+     } catch (ClassNotFoundException notFound) {
+         LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
+         found = null;
+     }
+     return found;
+ }
+ /**
+  * Looks up the Scratchpad extractor factory through this class's class loader.
+  *
+  * @return the Scratchpad factory class, never null
+  * @throws IllegalStateException if the class cannot be found; the
+  *         ClassNotFoundException is attached as the cause
+  */
+ private static Class<?> getScratchpadClass() {
+     try {
+         // NOTE(review): "Scrachpad" looks like a misspelling of "Scratchpad" - confirm it
+         // matches the real class name, as a mismatch would always land in the catch below
+         return OLE2ExtractorFactory.class.getClassLoader().loadClass(
+             "org.apache.poi.extractor.OLE2ScrachpadExtractorFactory"
+         );
+     } catch (ClassNotFoundException e) {
+         LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
+         // Keep the lookup failure attached so the stack trace names the class that was not found
+         throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory", e);
+     }
+ }
+
/**
* Create the Extractor, if possible. Generally needs the Scratchpad jar.
* Note that this won't check for embedded OOXML resources either, use
return new ExcelExtractor(poifsDir);
}
}
-
- // TODO Try to ask the Scratchpad
+
+ // Ask Scratchpad, or fail trying
+ Class<?> cls = getScratchpadClass();
+ try {
+ Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
+ POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
+ if (ext != null) return ext;
+ } catch (Exception e) {
+ throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
+ }
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
throws IOException
{
// All the embedded directories we spotted
- ArrayList<Entry> dirs = new ArrayList<Entry>();
+ List<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
- ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+ List<InputStream> nonPOIFS = new ArrayList<InputStream>();
// Find all the embedded directories
DirectoryEntry root = ext.getRoot();
}
}
} else {
- // TODO Ask scratchpad
+ // Ask Scratchpad, or fail trying
+ Class<?> cls = getScratchpadClass();
+ try {
+ Method m = cls.getDeclaredMethod(
+ "identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
+ m.invoke(null, ext, dirs, nonPOIFS);
+ } catch (Exception e) {
+ throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
+ }
}
// Create the extractors
} catch (IllegalArgumentException ie) {
// Ignore, just means it didn't contain
// a format we support as yet
- // TODO Should we log this?
+ LOGGER.log(POILogger.WARN, ie);
} catch (Exception xe) {
// Ignore, invalid format
- // TODO Should we log this?
+ LOGGER.log(POILogger.WARN, xe);
}
}
return e.toArray(new POITextExtractor[e.size()]);
protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
-
- /** Should this thread prefer event based over usermodel based extractors? */
- private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
- @Override
- protected Boolean initialValue() { return Boolean.FALSE; }
- };
-
- /** Should all threads prefer event based over usermodel based extractors? */
- private static Boolean allPreferEventExtractors;
-
/**
* Should this thread prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory)
* Default is false.
*/
public static boolean getThreadPrefersEventExtractors() {
- return threadPreferEventExtractors.get();
+ return OLE2ExtractorFactory.getThreadPrefersEventExtractors(); // the per-thread flag is no longer kept in this class
}
/**
* Default is to use the thread level setting, which defaults to false.
*/
public static Boolean getAllThreadsPreferEventExtractors() {
- return allPreferEventExtractors;
+ return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors(); // the all-threads flag is no longer kept in this class
}
/**
* Will only be used if the All Threads setting is null.
*/
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
- threadPreferEventExtractors.set(preferEventExtractors);
+ OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); // forwarded: the per-thread flag is no longer kept in this class
}
/**
* If set, will take preference over the Thread level setting.
*/
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
- allPreferEventExtractors = preferEventExtractors;
+ OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); // forwarded: the all-threads flag is no longer kept in this class
}
/**
* Checks the all-threads one first, then thread specific.
*/
protected static boolean getPreferEventExtractor() {
- if(allPreferEventExtractors != null) {
- return allPreferEventExtractors;
- }
- return threadPreferEventExtractors.get();
+ return OLE2ExtractorFactory.getPreferEventExtractor(); // the all-threads / per-thread decision is made by OLE2ExtractorFactory
}
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- // Only ever an OLE2 one from the root of the FS
- return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+ return OLE2ExtractorFactory.createExtractor(fs); // hand the whole file system to the OLE2 factory
}
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- // Only ever an OLE2 one from the root of the FS
- return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+ return OLE2ExtractorFactory.createExtractor(fs); // hand the whole file system to the OLE2 factory
}
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- // Only ever an OLE2 one from the root of the FS
- return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
+ return OLE2ExtractorFactory.createExtractor(fs); // hand the whole file system to the OLE2 factory
}
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
OpenXML4JException, XmlException
{
- // Look for certain entries in the stream, to figure it
- // out from
- for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
- if (poifsDir.hasEntry(workbookName)) {
- if (getPreferEventExtractor()) {
- return new EventBasedExcelExtractor(poifsDir);
- }
- return new ExcelExtractor(poifsDir);
- }
- }
- if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
- throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
- + "found. Please call OldExcelExtractor directly for basic text extraction");
- }
-
- if (poifsDir.hasEntry("WordDocument")) {
- // Old or new style word document?
- try {
- return new WordExtractor(poifsDir);
- } catch (OldWordFileFormatException e) {
- return new Word6Extractor(poifsDir);
- }
- }
-
- if (poifsDir.hasEntry("PowerPoint Document")) {
- return new PowerPointExtractor(poifsDir);
- }
-
- if (poifsDir.hasEntry("VisioDocument")) {
- return new VisioTextExtractor(poifsDir);
- }
-
- if (poifsDir.hasEntry("Quill")) {
- return new PublisherTextExtractor(poifsDir);
- }
-
- final String[] outlookEntryNames = new String[] {
- // message bodies, saved as plain text (PtypString)
- // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
- // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
- // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx
- // @see org.apache.poi.hsmf.Types.MAPIType
- "__substg1.0_1000001E", //PidTagBody ASCII
- "__substg1.0_1000001F", //PidTagBody Unicode
- "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
- "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
- "__substg1.0_0037001E", //PidTagSubject ASCII
- "__substg1.0_0037001F", //PidTagSubject Unicode
- };
- for (String entryName : outlookEntryNames) {
- if (poifsDir.hasEntry(entryName)) {
- return new OutlookTextExtactor(poifsDir);
- }
- }
-
+ // First, check for OOXML: an entry named "Package" is opened as an OPC package and handled by the OPCPackage overload. NOTE(review): the stream given to OPCPackage.open is not closed in this method - confirm who owns it
for (String entryName : poifsDir.getEntryNames()) {
if (entryName.equals("Package")) {
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
return createExtractor(pkg);
}
}
- throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+
+ // If not, ask the OLE2 code to check, with Scratchpad if possible; the format detection formerly done inline here is now left to OLE2ExtractorFactory
return OLE2ExtractorFactory.createExtractor(poifsDir);
}
/**
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(
+ extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
extractor = ExtractorFactory.createExtractor(xltx);
assertTrue(
+ extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
assertTrue(
+ extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
// Word
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
assertTrue(
+ extractor.getClass().getName(),
extractor
instanceof WordExtractor
);
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
assertTrue(
+ extractor.getClass().getName(),
extractor
instanceof Word6Extractor
);
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
assertTrue(
+ extractor.getClass().getName(),
extractor
instanceof Word6Extractor
);
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
import java.util.Iterator;
+import java.util.List;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
* empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embedded file.
*/
- public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, ArrayList<Entry> dirs, ArrayList<InputStream> nonPOIFS) throws IOException {
+ public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
// Find all the embedded directories
DirectoryEntry root = ext.getRoot();
if(root == null) {