import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
-import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.poifs.filesystem.*;
+import org.apache.poi.util.IOUtils;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XSLFRelation;
* Figures out the correct POITextExtractor for your supplied
* document, and returns it.
*/
+@SuppressWarnings("WeakerAccess")
public class ExtractorFactory {
public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
return extractor;
} catch (OfficeXmlFileException e) {
// ensure file-handle release
- if(fs != null) {
- fs.close();
- }
+ IOUtils.closeQuietly(fs);
+
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
} catch (NotOLE2FileException ne) {
// ensure file-handle release
- if(fs != null) {
- fs.close();
- }
+ IOUtils.closeQuietly(fs);
+
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
} catch (OpenXML4JException e) {
// ensure file-handle release
- if(fs != null) {
- fs.close();
- }
+ IOUtils.closeQuietly(fs);
+
throw e;
} catch (XmlException e) {
// ensure file-handle release
- if(fs != null) {
- fs.close();
- }
+ IOUtils.closeQuietly(fs);
+
throw e;
} catch (IOException e) {
// ensure file-handle release
- if(fs != null) {
- fs.close();
- }
+ IOUtils.closeQuietly(fs);
+
throw e;
} catch (RuntimeException e) {
// ensure file-handle release
- if(fs != null) {
- fs.close();
- }
+ IOUtils.closeQuietly(fs);
+
throw e;
}
}
}
}
+ /**
+  * Creates the text extractor for the document held in the given filesystem.
+  * Delegates to {@link #createExtractor(DirectoryNode)} with the filesystem's
+  * root directory and casts the result to {@link POIOLE2TextExtractor}.
+  * The checked exceptions are those declared by the DirectoryNode overload.
+  *
+  * @param fs the filesystem whose root directory is handed to the delegate
+  * @return the extractor the delegate created for the root directory
+  */
- public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+ public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
}
+ /**
+  * Creates the text extractor for the document held in the given
+  * {@link NPOIFSFileSystem}. Delegates to {@link #createExtractor(DirectoryNode)}
+  * with the filesystem's root directory and casts the result to
+  * {@link POIOLE2TextExtractor}. The checked exceptions are those declared by
+  * the DirectoryNode overload.
+  *
+  * @param fs the filesystem whose root directory is handed to the delegate
+  * @return the extractor the delegate created for the root directory
+  */
- public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+ public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
}
+ /**
+  * Creates the text extractor for the document held in the given
+  * {@link OPOIFSFileSystem}. Delegates to {@link #createExtractor(DirectoryNode)}
+  * with the filesystem's root directory and casts the result to
+  * {@link POIOLE2TextExtractor}. The checked exceptions are those declared by
+  * the DirectoryNode overload.
+  *
+  * @param fs the filesystem whose root directory is handed to the delegate
+  * @return the extractor the delegate created for the root directory
+  */
- public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+ public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
}
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
- InvalidFormatException, OpenXML4JException, XmlException
+ OpenXML4JException, XmlException
{
// Look for certain entries in the stream, to figure it
// out from
* empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embedded file.
*/
- public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
+ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
// All the embded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
dirs.add(entry);
}
}
- } catch(FileNotFoundException e) {}
+ } catch(FileNotFoundException e) {
+ // ignored here
+ }
} else if(ext instanceof PowerPointExtractor) {
// Tricky, not stored directly in poifs
// TODO
}
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
- for(int i=0; i<dirs.size(); i++) {
- e.add( createExtractor(
- (DirectoryNode)dirs.get(i)
- ) );
- }
- for(int i=0; i<nonPOIFS.size(); i++) {
- try {
- e.add( createExtractor(nonPOIFS.get(i)) );
- } catch(IllegalArgumentException ie) {
- // Ignore, just means it didn't contain
- // a format we support as yet
- } catch(XmlException xe) {
- throw new IOException(xe.getMessage());
- } catch(OpenXML4JException oe) {
- throw new IOException(oe.getMessage());
- }
- }
+ for (Entry dir : dirs) {
+ e.add(createExtractor(
+ (DirectoryNode) dir
+ ));
+ }
+ for (InputStream nonPOIF : nonPOIFS) {
+ try {
+ e.add(createExtractor(nonPOIF));
+ } catch (IllegalArgumentException ie) {
+ // Ignore, just means it didn't contain
+ // a format we support as yet
+ } catch (XmlException xe) {
+ throw new IOException(xe.getMessage());
+ } catch (OpenXML4JException oe) {
+ throw new IOException(oe.getMessage());
+ }
+ }
return e.toArray(new POITextExtractor[e.size()]);
}
import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
// TODO Support OOXML-Strict, see bug #57699
try {
- extractor = ExtractorFactory.createExtractor(xlsxStrict);
+ /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
fail("OOXML-Strict isn't yet supported");
} catch (POIXMLException e) {
// Expected, for now
ExtractorFactory.createExtractor(stream);
fail();
} finally {
- stream.close();
+ IOUtils.closeQuietly(stream);
}
} catch(IllegalArgumentException e) {
// Good
}
}
+
+    /**
+     * Runs each OLE2 sample through the {@link OPOIFSFileSystem} overload of
+     * {@code ExtractorFactory.createExtractor}. Every sample is opened twice:
+     * once to check the concrete extractor type and once to check that more
+     * than a minimum amount of text comes back. The text sample must be
+     * rejected with an {@link IOException}.
+     */
+    @Test
+    public void testOPOIFS() throws Exception {
+        // Excel
+        POITextExtractor xlsTyped = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(xls)));
+        assertTrue(xlsTyped instanceof ExcelExtractor);
+        POITextExtractor xlsText = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(xls)));
+        assertTrue(xlsText.getText().length() > 200);
+
+        // Word
+        POITextExtractor docTyped = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(doc)));
+        assertTrue(docTyped instanceof WordExtractor);
+        POITextExtractor docText = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(doc)));
+        assertTrue(docText.getText().length() > 120);
+
+        POITextExtractor doc6Typed = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(doc6)));
+        assertTrue(doc6Typed instanceof Word6Extractor);
+        POITextExtractor doc6Text = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(doc6)));
+        assertTrue(doc6Text.getText().length() > 20);
+
+        POITextExtractor doc95Typed = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(doc95)));
+        assertTrue(doc95Typed instanceof Word6Extractor);
+        POITextExtractor doc95Text = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(doc95)));
+        assertTrue(doc95Text.getText().length() > 120);
+
+        // PowerPoint
+        POITextExtractor pptTyped = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(ppt)));
+        assertTrue(pptTyped instanceof PowerPointExtractor);
+        POITextExtractor pptText = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(ppt)));
+        assertTrue(pptText.getText().length() > 120);
+
+        // Visio
+        POITextExtractor vsdTyped = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(vsd)));
+        assertTrue(vsdTyped instanceof VisioTextExtractor);
+        POITextExtractor vsdText = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(vsd)));
+        assertTrue(vsdText.getText().length() > 50);
+
+        // Publisher
+        POITextExtractor pubTyped = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(pub)));
+        assertTrue(pubTyped instanceof PublisherTextExtractor);
+        POITextExtractor pubText = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(pub)));
+        assertTrue(pubText.getText().length() > 50);
+
+        // Outlook msg
+        POITextExtractor msgTyped = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(msg)));
+        assertTrue(msgTyped instanceof OutlookTextExtactor);
+        POITextExtractor msgText = ExtractorFactory.createExtractor(
+                new OPOIFSFileSystem(new FileInputStream(msg)));
+        assertTrue(msgText.getText().length() > 50);
+
+        // Text: must not yield an extractor
+        boolean rejected = false;
+        try {
+            ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
+        } catch (IOException e) {
+            // Good
+            rejected = true;
+        }
+        if (!rejected) {
+            fail();
+        }
+    }
+
@Test
public void testPackage() throws Exception {
// Excel
assertEquals(6, embeds.length);
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
+ for (POITextExtractor embed : embeds) {
+ assertTrue(embed.getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
+ if (embed instanceof PowerPointExtractor) numPpt++;
+ else if (embed instanceof ExcelExtractor) numXls++;
+ else if (embed instanceof WordExtractor) numWord++;
+ else if (embed instanceof OutlookTextExtactor) numMsg++;
}
assertEquals(2, numPpt);
assertEquals(2, numXls);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(4, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
+ for (POITextExtractor embed : embeds) {
+ assertTrue(embed.getText().length() > 20);
+ if (embed instanceof PowerPointExtractor) numPpt++;
+ else if (embed instanceof ExcelExtractor) numXls++;
+ else if (embed instanceof WordExtractor) numWord++;
+ else if (embed instanceof OutlookTextExtactor) numMsg++;
}
assertEquals(1, numPpt);
assertEquals(2, numXls);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
assertEquals(3, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
- else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
+ for (POITextExtractor embed : embeds) {
+ assertTrue(embed.getText().length() > 20);
+ if (embed instanceof PowerPointExtractor) numPpt++;
+ else if (embed instanceof ExcelExtractor) numXls++;
+ else if (embed instanceof WordExtractor) numWord++;
+ else if (embed instanceof OutlookTextExtactor) numMsg++;
+ else if (embed instanceof XWPFWordExtractor) numWordX++;
}
assertEquals(1, numPpt);
assertEquals(1, numXls);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(1, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
+ for (POITextExtractor embed : embeds) {
+ assertTrue(embed.getText().length() > 20);
+ if (embed instanceof PowerPointExtractor) numPpt++;
+ else if (embed instanceof ExcelExtractor) numXls++;
+ else if (embed instanceof WordExtractor) numWord++;
+ else if (embed instanceof OutlookTextExtactor) numMsg++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(1, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
+ for (POITextExtractor embed : embeds) {
+ assertTrue(embed.getText().length() > 20);
+ if (embed instanceof PowerPointExtractor) numPpt++;
+ else if (embed instanceof ExcelExtractor) numXls++;
+ else if (embed instanceof WordExtractor) numWord++;
+ else if (embed instanceof OutlookTextExtactor) numMsg++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
* "No supported documents found in the OLE2 stream"
*/
@Test
- public void a() throws Exception {
+ public void bug59074() throws Exception {
try {
ExtractorFactory.createExtractor(
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
+ // only reached when createExtractor returned without throwing
fail("Old excel formats not supported via ExtractorFactory");
- } catch (OldExcelFormatException e) {}
+ } catch (OldExcelFormatException e) {
+ // expected here
+ }
+ }
+
+    /**
+     * Checks that asking for embedded-document extractors through the
+     * {@link POIXMLTextExtractor} overload is answered with an
+     * {@link IllegalStateException}.
+     */
+    @Test
+    public void testGetEmbeddedFromXMLExtractor() {
+        boolean rejected = false;
+        try {
+            // currently not implemented
+            ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
+        } catch (IllegalStateException e) {
+            // expected here
+            rejected = true;
+        }
+        if (!rejected) {
+            fail("Unsupported currently");
+        }
}
}