aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJaven O'Neal <onealj@apache.org>2016-07-17 08:26:51 +0000
committerJaven O'Neal <onealj@apache.org>2016-07-17 08:26:51 +0000
commit8fc36c73672dd2726845831029782637776fcc25 (patch)
tree8b1390ecc1c19c09ba7da5dddc35e084b0adde6c
parent9e5c4d9c9a39620ca6dadeb4e92f24069f0ee649 (diff)
downloadpoi-8fc36c73672dd2726845831029782637776fcc25.tar.gz
poi-8fc36c73672dd2726845831029782637776fcc25.zip
whitespace
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1753028 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r--src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java504
1 files changed, 250 insertions, 254 deletions
diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
index 05cad8af93..6cffd39fdf 100644
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -72,180 +72,180 @@ import org.apache.xmlbeans.XmlException;
*/
@SuppressWarnings("WeakerAccess")
public class ExtractorFactory {
- public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
- protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
- protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is false.
- */
- public static boolean getThreadPrefersEventExtractors() {
- return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is to use the thread level setting, which defaults to false.
- */
- public static Boolean getAllThreadsPreferEventExtractors() {
- return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * Will only be used if the All Threads setting is null.
- */
- public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
- OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * If set, will take preference over the Thread level setting.
- */
- public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
- OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
- }
-
- /**
- * Should this thread use event based extractors is available?
- * Checks the all-threads one first, then thread specific.
- */
- protected static boolean getPreferEventExtractor() {
- return OLE2ExtractorFactory.getPreferEventExtractor();
- }
-
- public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
- NPOIFSFileSystem fs = null;
+ public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
+ protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
+ protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
+
+ /**
+ * Should this thread prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is false.
+ */
+ public static boolean getThreadPrefersEventExtractors() {
+ return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
+ }
+
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is to use the thread level setting, which defaults to false.
+ */
+ public static Boolean getAllThreadsPreferEventExtractors() {
+ return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
+ }
+
+ /**
+ * Should this thread prefer event based over usermodel based extractors?
+ * Will only be used if the All Threads setting is null.
+ */
+ public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
+ OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
+ }
+
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * If set, will take preference over the Thread level setting.
+ */
+ public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
+ OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
+ }
+
+ /**
+ * Should this thread use event based extractors is available?
+ * Checks the all-threads one first, then thread specific.
+ */
+ protected static boolean getPreferEventExtractor() {
+ return OLE2ExtractorFactory.getPreferEventExtractor();
+ }
+
+ public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+ NPOIFSFileSystem fs = null;
try {
fs = new NPOIFSFileSystem(f);
POIOLE2TextExtractor extractor = createExtractor(fs);
extractor.setFilesystem(fs);
return extractor;
+
} catch (OfficeXmlFileException e) {
// ensure file-handle release
- IOUtils.closeQuietly(fs);
-
+ IOUtils.closeQuietly(fs);
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
+
} catch (NotOLE2FileException ne) {
// ensure file-handle release
- IOUtils.closeQuietly(fs);
-
+ IOUtils.closeQuietly(fs);
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
- } catch (OpenXML4JException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw e;
- } catch (XmlException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
+ } catch (OpenXML4JException e) {
+ // ensure file-handle release
+ IOUtils.closeQuietly(fs);
+ throw e;
- throw e;
- } catch (IOException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
+ } catch (XmlException e) {
+ // ensure file-handle release
+ IOUtils.closeQuietly(fs);
+ throw e;
+
+ } catch (IOException e) {
+ // ensure file-handle release
+ IOUtils.closeQuietly(fs);
+ throw e;
- throw e;
} catch (RuntimeException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
+ // ensure file-handle release
+ IOUtils.closeQuietly(fs);
+ throw e;
+ }
+ }
- throw e;
- }
+ public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+ // Figure out the kind of stream
+ // If clearly doesn't do mark/reset, wrap up
+ if (! inp.markSupported()) {
+ inp = new PushbackInputStream(inp, 8);
+ }
+
+ if (NPOIFSFileSystem.hasPOIFSHeader(inp)) {
+ return createExtractor(new NPOIFSFileSystem(inp));
+ }
+ if (DocumentFactoryHelper.hasOOXMLHeader(inp)) {
+ return createExtractor(OPCPackage.open(inp));
+ }
+ throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
}
- public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
- // Figure out the kind of stream
- // If clearly doesn't do mark/reset, wrap up
- if(! inp.markSupported()) {
- inp = new PushbackInputStream(inp, 8);
- }
-
- if(NPOIFSFileSystem.hasPOIFSHeader(inp)) {
- return createExtractor(new NPOIFSFileSystem(inp));
- }
- if(DocumentFactoryHelper.hasOOXMLHeader(inp)) {
- return createExtractor(OPCPackage.open(inp));
- }
- throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
- }
-
- /**
- * Tries to determine the actual type of file and produces a matching text-extractor for it.
- *
- * @param pkg An {@link OPCPackage}.
- * @return A {@link POIXMLTextExtractor} for the given file.
- * @throws IOException If an error occurs while reading the file
- * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
- * @throws XmlException If an XML parsing error occurs.
- * @throws IllegalArgumentException If no matching file type could be found.
- */
- public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
+ /**
+ * Tries to determine the actual type of file and produces a matching text-extractor for it.
+ *
+ * @param pkg An {@link OPCPackage}.
+ * @return A {@link POIXMLTextExtractor} for the given file.
+ * @throws IOException If an error occurs while reading the file
+ * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
+ * @throws XmlException If an XML parsing error occurs.
+ * @throws IllegalArgumentException If no matching file type could be found.
+ */
+ public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
try {
- // Check for the normal Office core document
- PackageRelationshipCollection core =
- pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
-
- // If nothing was found, try some of the other OOXML-based core types
- if (core.size() == 0) {
- // Could it be an OOXML-Strict one?
- core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
- }
- if (core.size() == 0) {
- // Could it be a visio one?
- core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
- if (core.size() == 1)
- return new XDGFVisioExtractor(pkg);
- }
-
- // Should just be a single core document, complain if not
- if (core.size() != 1) {
- throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
- }
-
- // Grab the core document part, and try to identify from that
- PackagePart corePart = pkg.getPart(core.getRelationship(0));
-
- // Is it XSSF?
- for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
- if(corePart.getContentType().equals(rel.getContentType())) {
- if(getPreferEventExtractor()) {
- return new XSSFEventBasedExcelExtractor(pkg);
- }
-
- return new XSSFExcelExtractor(pkg);
- }
- }
-
- // Is it XWPF?
- for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
- if(corePart.getContentType().equals(rel.getContentType())) {
- return new XWPFWordExtractor(pkg);
- }
- }
-
- // Is it XSLF?
- for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
- if(corePart.getContentType().equals(rel.getContentType())) {
- return new XSLFPowerPointExtractor(pkg);
- }
- }
-
- // special handling for SlideShow-Theme-files,
- if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
- return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
- }
-
- throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
- } catch (IOException e) {
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
- throw e;
+ // Check for the normal Office core document
+ PackageRelationshipCollection core;
+ core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
+
+ // If nothing was found, try some of the other OOXML-based core types
+ if (core.size() == 0) {
+ // Could it be an OOXML-Strict one?
+ core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
+ }
+ if (core.size() == 0) {
+ // Could it be a visio one?
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+ if (core.size() == 1)
+ return new XDGFVisioExtractor(pkg);
+ }
+
+ // Should just be a single core document, complain if not
+ if (core.size() != 1) {
+ throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
+ }
+
+ // Grab the core document part, and try to identify from that
+ PackagePart corePart = pkg.getPart(core.getRelationship(0));
+
+ // Is it XSSF?
+ for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
+ if (corePart.getContentType().equals(rel.getContentType())) {
+ if (getPreferEventExtractor()) {
+ return new XSSFEventBasedExcelExtractor(pkg);
+ }
+ return new XSSFExcelExtractor(pkg);
+ }
+ }
+
+ // Is it XWPF?
+ for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
+ if (corePart.getContentType().equals(rel.getContentType())) {
+ return new XWPFWordExtractor(pkg);
+ }
+ }
+
+ // Is it XSLF?
+ for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
+ if (corePart.getContentType().equals(rel.getContentType())) {
+ return new XSLFPowerPointExtractor(pkg);
+ }
+ }
+
+ // special handling for SlideShow-Theme-files,
+ if (XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
+ return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ }
+
+ throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
+
+ } catch (IOException e) {
+ // ensure that we close the package again if there is an error opening it, however
+ // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+ pkg.revert();
+ throw e;
} catch (OpenXML4JException e) {
// ensure that we close the package again if there is an error opening it, however
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
@@ -256,27 +256,25 @@ public class ExtractorFactory {
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
pkg.revert();
throw e;
- } catch (RuntimeException e) {
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
-
- throw e;
- }
- }
-
- public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return OLE2ExtractorFactory.createExtractor(fs);
- }
+ } catch (RuntimeException e) {
+ // ensure that we close the package again if there is an error opening it, however
+ // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+ pkg.revert();
+ throw e;
+ }
+ }
+
+ public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
+ return OLE2ExtractorFactory.createExtractor(fs);
+ }
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs);
- }
+ }
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs);
- }
+ }
- public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
- OpenXML4JException, XmlException
+ public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
{
// First, check for OOXML
for (String entryName : poifsDir.getEntryNames()) {
@@ -285,104 +283,102 @@ public class ExtractorFactory {
return createExtractor(pkg);
}
}
-
+
// If not, ask the OLE2 code to check, with Scratchpad if possible
return OLE2ExtractorFactory.createExtractor(poifsDir);
}
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
- // All the embedded directories we spotted
- ArrayList<Entry> dirs = new ArrayList<Entry>();
- // For anything else not directly held in as a POIFS directory
- ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
-
- // Find all the embedded directories
- DirectoryEntry root = ext.getRoot();
- if(root == null) {
- throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
- }
-
- if(ext instanceof ExcelExtractor) {
- // These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while(it.hasNext()) {
- Entry entry = it.next();
- if(entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
- } else if(ext instanceof WordExtractor) {
- // These are in ObjectPool -> _... under the root
- try {
- DirectoryEntry op = (DirectoryEntry)
- root.getEntry("ObjectPool");
- Iterator<Entry> it = op.getEntries();
- while(it.hasNext()) {
- Entry entry = it.next();
- if(entry.getName().startsWith("_")) {
- dirs.add(entry);
- }
- }
- } catch(FileNotFoundException e) {
+ /**
+ * Returns an array of text extractors, one for each of
+ * the embedded documents in the file (if there are any).
+ * If there are no embedded documents, you'll get back an
+ * empty array. Otherwise, you'll get one open
+ * {@link POITextExtractor} for each embedded file.
+ */
+ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
+ // All the embedded directories we spotted
+ ArrayList<Entry> dirs = new ArrayList<Entry>();
+ // For anything else not directly held in as a POIFS directory
+ ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+
+ // Find all the embedded directories
+ DirectoryEntry root = ext.getRoot();
+ if (root == null) {
+ throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+ }
+
+ if (ext instanceof ExcelExtractor) {
+ // These are in MBD... under the root
+ Iterator<Entry> it = root.getEntries();
+ while (it.hasNext()) {
+ Entry entry = it.next();
+ if (entry.getName().startsWith("MBD")) {
+ dirs.add(entry);
+ }
+ }
+ } else if (ext instanceof WordExtractor) {
+ // These are in ObjectPool -> _... under the root
+ try {
+ DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
+ Iterator<Entry> it = op.getEntries();
+ while (it.hasNext()) {
+ Entry entry = it.next();
+ if (entry.getName().startsWith("_")) {
+ dirs.add(entry);
+ }
+ }
+ } catch (FileNotFoundException e) {
// ignored here
}
- //} else if(ext instanceof PowerPointExtractor) {
- // Tricky, not stored directly in poifs
- // TODO
- } else if(ext instanceof OutlookTextExtactor) {
- // Stored in the Attachment blocks
- MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
- for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
- if(attachment.attachData != null) {
- byte[] data = attachment.attachData.getValue();
- nonPOIFS.add( new ByteArrayInputStream(data) );
- } else if(attachment.attachmentDirectory != null) {
- dirs.add(attachment.attachmentDirectory.getDirectory());
- }
- }
- }
-
- // Create the extractors
- if(dirs.size() == 0 && nonPOIFS.size() == 0){
- return new POITextExtractor[0];
- }
-
- ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
+ //} else if(ext instanceof PowerPointExtractor) {
+ // Tricky, not stored directly in poifs
+ // TODO
+ } else if (ext instanceof OutlookTextExtactor) {
+ // Stored in the Attachment blocks
+ MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
+ for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ if (attachment.attachData != null) {
+ byte[] data = attachment.attachData.getValue();
+ nonPOIFS.add( new ByteArrayInputStream(data) );
+ } else if (attachment.attachmentDirectory != null) {
+ dirs.add(attachment.attachmentDirectory.getDirectory());
+ }
+ }
+ }
+
+ // Create the extractors
+ if (dirs.size() == 0 && nonPOIFS.size() == 0){
+ return new POITextExtractor[0];
+ }
+
+ ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
for (Entry dir : dirs) {
- e.add(createExtractor(
- (DirectoryNode) dir
- ));
+ e.add(createExtractor((DirectoryNode) dir));
}
for (InputStream nonPOIF : nonPOIFS) {
try {
- e.add(createExtractor(nonPOIF));
+ e.add(createExtractor(nonPOIF));
} catch (IllegalArgumentException ie) {
// Ignore, just means it didn't contain
// a format we support as yet
} catch (XmlException xe) {
- throw new IOException(xe.getMessage());
+ throw new IOException(xe.getMessage());
} catch (OpenXML4JException oe) {
- throw new IOException(oe.getMessage());
+ throw new IOException(oe.getMessage());
}
}
- return e.toArray(new POITextExtractor[e.size()]);
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- public static POITextExtractor[] getEmbededDocsTextExtractors(@SuppressWarnings("UnusedParameters") POIXMLTextExtractor ext) {
- throw new IllegalStateException("Not yet supported");
- }
+ return e.toArray(new POITextExtractor[e.size()]);
+ }
+
+ /**
+ * Returns an array of text extractors, one for each of
+ * the embedded documents in the file (if there are any).
+ * If there are no embedded documents, you'll get back an
+ * empty array. Otherwise, you'll get one open
+ * {@link POITextExtractor} for each embedded file.
+ */
+ @SuppressWarnings("UnusedParameters")
+ public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
+ throw new IllegalStateException("Not yet supported");
+ }
}