return threadPreferEventExtractors.get();
}
-
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
- InputStream inp = null;
+ NPOIFSFileSystem fs = null;
try {
- try {
- NPOIFSFileSystem fs = new NPOIFSFileSystem(f);
- return createExtractor(fs);
- } catch (OfficeXmlFileException e) {
- return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
- } catch (NotOLE2FileException ne) {
- throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
+ fs = new NPOIFSFileSystem(f);
+ POIOLE2TextExtractor extractor = createExtractor(fs);
+ extractor.setFilesystem(fs);
+ return extractor;
+ } catch (OfficeXmlFileException e) {
+ // ensure file-handle release
+ if(fs != null) {
+ fs.close();
}
- } finally {
- if(inp != null) inp.close();
+ return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
+ } catch (NotOLE2FileException ne) {
+ // ensure file-handle release
+ if(fs != null) {
+ fs.close();
+ }
+ throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
}
}
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
}
+ /**
+ * Tries to determine the actual type of file and produces a matching text-extractor for it.
+ *
+ * @param pkg An {@link OPCPackage}.
+ * @return A {@link POIXMLTextExtractor} for the given file.
+ * @throws IOException If an error occurs while reading the file
+ * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
+ * @throws XmlException If an XML parsing error occurs.
+ * @throws IllegalArgumentException If no matching file type could be found.
+ */
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
- // Check for the normal Office core document
- PackageRelationshipCollection core =
- pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
-
- // If nothing was found, try some of the other OOXML-based core types
- if (core.size() == 0) {
- // Could it be an OOXML-Strict one?
- core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
- }
- if (core.size() == 0) {
- // Could it be a visio one?
- core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
- if (core.size() == 1)
- return new XDGFVisioExtractor(pkg);
- }
-
- // Should just be a single core document, complain if not
- if (core.size() != 1) {
- throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
- }
-
- // Grab the core document part, and try to identify from that
- PackagePart corePart = pkg.getPart(core.getRelationship(0));
-
- // Is it XSSF?
- for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
- if(corePart.getContentType().equals(rel.getContentType())) {
- if(getPreferEventExtractor()) {
- return new XSSFEventBasedExcelExtractor(pkg);
- }
-
- return new XSSFExcelExtractor(pkg);
- }
- }
-
- // Is it XWPF?
- for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
- if(corePart.getContentType().equals(rel.getContentType())) {
- return new XWPFWordExtractor(pkg);
- }
- }
-
- // Is it XSLF?
- for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
- if(corePart.getContentType().equals(rel.getContentType())) {
- return new XSLFPowerPointExtractor(pkg);
- }
- }
-
- // special handling for SlideShow-Theme-files,
- if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
- return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
- }
-
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
- throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
+ try {
+ // Check for the normal Office core document
+ PackageRelationshipCollection core =
+ pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
+
+ // If nothing was found, try some of the other OOXML-based core types
+ if (core.size() == 0) {
+ // Could it be an OOXML-Strict one?
+ core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
+ }
+ if (core.size() == 0) {
+ // Could it be a visio one?
+ core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+ if (core.size() == 1)
+ return new XDGFVisioExtractor(pkg);
+ }
+
+ // Should just be a single core document, complain if not
+ if (core.size() != 1) {
+ throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
+ }
+
+ // Grab the core document part, and try to identify from that
+ PackagePart corePart = pkg.getPart(core.getRelationship(0));
+
+ // Is it XSSF?
+ for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
+ if(corePart.getContentType().equals(rel.getContentType())) {
+ if(getPreferEventExtractor()) {
+ return new XSSFEventBasedExcelExtractor(pkg);
+ }
+
+ return new XSSFExcelExtractor(pkg);
+ }
+ }
+
+ // Is it XWPF?
+ for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
+ if(corePart.getContentType().equals(rel.getContentType())) {
+ return new XWPFWordExtractor(pkg);
+ }
+ }
+
+ // Is it XSLF?
+ for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
+ if(corePart.getContentType().equals(rel.getContentType())) {
+ return new XSLFPowerPointExtractor(pkg);
+ }
+ }
+
+ // special handling for SlideShow-Theme-files,
+ if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
+ return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+ }
+
+ throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
+ } catch (IOException e) {
+ // ensure that we close the package again if there is an error opening it, however
+ // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+ pkg.revert();
+ throw e;
+ } catch (OpenXML4JException e) {
+ // ensure that we close the package again if there is an error opening it, however
+ // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+ pkg.revert();
+ throw e;
+ } catch (XmlException e) {
+ // ensure that we close the package again if there is an error opening it, however
+ // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+ pkg.revert();
+ throw e;
+ } catch (RuntimeException e) {
+ // ensure that we close the package again if there is an error opening it, however
+ // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+ pkg.revert();
+
+ throw e;
+ }
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// Word
+ extractor = ExtractorFactory.createExtractor(doc);
assertTrue(
- ExtractorFactory.createExtractor(doc)
+ extractor
instanceof WordExtractor
);
assertTrue(
- ExtractorFactory.createExtractor(doc).getText().length() > 120
+ extractor.getText().length() > 120
);
+ extractor.close();
+ extractor = ExtractorFactory.createExtractor(doc6);
assertTrue(
- ExtractorFactory.createExtractor(doc6)
+ extractor
instanceof Word6Extractor
);
assertTrue(
- ExtractorFactory.createExtractor(doc6).getText().length() > 20
+ extractor.getText().length() > 20
);
+ extractor.close();
+ extractor = ExtractorFactory.createExtractor(doc95);
assertTrue(
- ExtractorFactory.createExtractor(doc95)
+ extractor
instanceof Word6Extractor
);
assertTrue(
- ExtractorFactory.createExtractor(doc95).getText().length() > 120
+ extractor.getText().length() > 120
);
+ extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(
);
extractor.close();
- // PowerPoint
+ // PowerPoint (PPT)
+ extractor = ExtractorFactory.createExtractor(ppt);
assertTrue(
- ExtractorFactory.createExtractor(ppt)
+ extractor
instanceof PowerPointExtractor
);
assertTrue(
- ExtractorFactory.createExtractor(ppt).getText().length() > 120
+ extractor.getText().length() > 120
);
+ extractor.close();
+ // PowerPoint (PPTX)
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(
extractor
instanceof XSLFPowerPointExtractor
);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// Visio - binary
+ extractor = ExtractorFactory.createExtractor(vsd);
assertTrue(
- ExtractorFactory.createExtractor(vsd)
+ extractor
instanceof VisioTextExtractor
);
assertTrue(
- ExtractorFactory.createExtractor(vsd).getText().length() > 50
+ extractor.getText().length() > 50
);
+ extractor.close();
+
// Visio - vsdx
+ extractor = ExtractorFactory.createExtractor(vsdx);
assertTrue(
- ExtractorFactory.createExtractor(vsdx)
+ extractor
instanceof XDGFVisioExtractor
);
assertTrue(
- ExtractorFactory.createExtractor(vsdx).getText().length() > 20
+ extractor.getText().length() > 20
);
+ extractor.close();
// Publisher
+ extractor = ExtractorFactory.createExtractor(pub);
assertTrue(
- ExtractorFactory.createExtractor(pub)
+ extractor
instanceof PublisherTextExtractor
);
assertTrue(
- ExtractorFactory.createExtractor(pub).getText().length() > 50
+ extractor.getText().length() > 50
);
+ extractor.close();
// Outlook msg
+ extractor = ExtractorFactory.createExtractor(msg);
assertTrue(
- ExtractorFactory.createExtractor(msg)
+ extractor
instanceof OutlookTextExtactor
);
assertTrue(
- ExtractorFactory.createExtractor(msg).getText().length() > 50
+ extractor.getText().length() > 50
);
+ extractor.close();
// Text
try {
extractor.close();
// Visio
+ extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
assertTrue(
- ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
+ extractor
instanceof XDGFVisioExtractor
);
assertTrue(
extractor.getText().length() > 20
);
+ extractor.close();
// Text
try {
ExtractorFactory.createExtractor(xls);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(0, embeds.length);
+ ext.close();
// Excel
ext = (POIOLE2TextExtractor)
assertEquals(2, numXls);
assertEquals(2, numWord);
assertEquals(0, numMsg);
+ ext.close();
// Word
ext = (POIOLE2TextExtractor)
assertEquals(2, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
+ ext.close();
// Word which contains an OOXML file
ext = (POIOLE2TextExtractor)
assertEquals(0, numWord);
assertEquals(1, numWordX);
assertEquals(0, numMsg);
+ ext.close();
// Outlook
ext = (OutlookTextExtactor)
assertEquals(0, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
+ ext.close();
// Outlook with another outlook file in it
ext = (OutlookTextExtactor)
assertEquals(0, numXls);
assertEquals(0, numWord);
assertEquals(1, numMsg);
-
+ ext.close();
// TODO - PowerPoint
// TODO - Publisher