if(core.size() != 1) {
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
}
-
- PackagePart corePart = pkg.getPart(core.getRelationship(0));
- if(corePart.getContentType().equals(XSSFRelation.WORKBOOK.getContentType())) {
- return new XSSFExcelExtractor(pkg);
- }
- if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType())) {
+
+ PackagePart corePart = pkg.getPart(core.getRelationship(0));
+ if (corePart.getContentType().equals(XSSFRelation.WORKBOOK.getContentType()) ||
+ corePart.getContentType().equals(XSSFRelation.MACRO_TEMPLATE_WORKBOOK.getContentType()) ||
+ corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) ||
+ corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
+ corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
+ return new XSSFExcelExtractor(pkg);
+ }
+
+ if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) ||
+ corePart.getContentType().equals(XWPFRelation.TEMPLATE.getContentType()) ||
+ corePart.getContentType().equals(XWPFRelation.MACRO_DOCUMENT.getContentType()) ||
+ corePart.getContentType().equals(XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType()) ) {
return new XWPFWordExtractor(pkg);
}
+
if(corePart.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
return new XSLFPowerPointExtractor(pkg);
}
- throw new IllegalArgumentException("No supported documents found in the OOXML package");
+ throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
private File xls;
private File xlsx;
-
+ private File xltx;
+
private File doc;
private File docx;
+ private File dotx;
private File ppt;
private File pptx;
xls = new File(excel_dir, "SampleSS.xls");
xlsx = new File(excel_dir, "SampleSS.xlsx");
-
+ xltx = new File(excel_dir, "test.xltx");
+
doc = new File(word_dir, "SampleDoc.doc");
docx = new File(word_dir, "SampleDoc.docx");
-
+ dotx = new File(word_dir, "test.dotx");
+
ppt = new File(powerpoint_dir, "SampleShow.ppt");
pptx = new File(powerpoint_dir, "SampleShow.pptx");
assertTrue(
ExtractorFactory.createExtractor(xlsx).getText().length() > 200
);
+
+ assertTrue(
+ ExtractorFactory.createExtractor(xltx)
+ instanceof XSSFExcelExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(xltx).getText().contains("test")
+ );
+
// Word
assertTrue(
assertTrue(
ExtractorFactory.createExtractor(docx).getText().length() > 120
);
-
+
+ assertTrue(
+ ExtractorFactory.createExtractor(dotx)
+ instanceof XWPFWordExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(dotx).getText().contains("Test")
+ );
+
// PowerPoint
assertTrue(
ExtractorFactory.createExtractor(ppt)