123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.extractor;
-
- import java.io.ByteArrayInputStream;
- import java.io.File;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.PushbackInputStream;
- import java.util.ArrayList;
- import java.util.Iterator;
-
- import org.apache.poi.POIOLE2TextExtractor;
- import org.apache.poi.POITextExtractor;
- import org.apache.poi.POIXMLTextExtractor;
- import org.apache.poi.hsmf.MAPIMessage;
- import org.apache.poi.hsmf.datatypes.AttachmentChunks;
- import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
- import org.apache.poi.hssf.extractor.ExcelExtractor;
- import org.apache.poi.hwpf.extractor.WordExtractor;
- import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.openxml4j.opc.PackageAccess;
- import org.apache.poi.openxml4j.opc.PackagePart;
- import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
- import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
- import org.apache.poi.poifs.filesystem.DirectoryEntry;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.DocumentFactoryHelper;
- import org.apache.poi.poifs.filesystem.Entry;
- import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
- import org.apache.poi.poifs.filesystem.NotOLE2FileException;
- import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
- import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.IOUtils;
- import org.apache.poi.util.NotImplemented;
- import org.apache.poi.util.POILogFactory;
- import org.apache.poi.util.POILogger;
- import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
- import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
- import org.apache.poi.xslf.usermodel.XSLFRelation;
- import org.apache.poi.xslf.usermodel.XSLFSlideShow;
- import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
- import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
- import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
- import org.apache.poi.xssf.usermodel.XSSFRelation;
- import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
- import org.apache.poi.xwpf.usermodel.XWPFRelation;
- import org.apache.xmlbeans.XmlException;
-
- /**
- * Figures out the correct POITextExtractor for your supplied
- * document, and returns it.
- *
- * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
- * not present on the runtime classpath</p>
- * <p>Note 2 - rather than using this, for most cases you would be better
- * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
- */
- @SuppressWarnings("WeakerAccess")
- public class ExtractorFactory {
- private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class);
-
- public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
- protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
- protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is false.
- */
- public static boolean getThreadPrefersEventExtractors() {
- return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is to use the thread level setting, which defaults to false.
- */
- public static Boolean getAllThreadsPreferEventExtractors() {
- return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
- }
-
- /**
- * Should this thread prefer event based over usermodel based extractors?
- * Will only be used if the All Threads setting is null.
- */
- public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
- OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
- }
-
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * If set, will take preference over the Thread level setting.
- */
- public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
- OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
- }
-
- /**
- * Should this thread use event based extractors is available?
- * Checks the all-threads one first, then thread specific.
- */
- protected static boolean getPreferEventExtractor() {
- return OLE2ExtractorFactory.getPreferEventExtractor();
- }
-
- public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
- NPOIFSFileSystem fs = null;
- try {
- fs = new NPOIFSFileSystem(f);
- POIOLE2TextExtractor extractor = createExtractor(fs);
- extractor.setFilesystem(fs);
- return extractor;
-
- } catch (OfficeXmlFileException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
-
- } catch (NotOLE2FileException ne) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
-
- } catch (OpenXML4JException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw e;
-
- } catch (XmlException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw e;
-
- } catch (IOException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw e;
-
- } catch (RuntimeException e) {
- // ensure file-handle release
- IOUtils.closeQuietly(fs);
- throw e;
- }
- }
-
- public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException {
- // Figure out the kind of stream
- // If clearly doesn't do mark/reset, wrap up
- if (! inp.markSupported()) {
- inp = new PushbackInputStream(inp, 8);
- }
-
- if (NPOIFSFileSystem.hasPOIFSHeader(inp)) {
- return createExtractor(new NPOIFSFileSystem(inp));
- }
- if (DocumentFactoryHelper.hasOOXMLHeader(inp)) {
- return createExtractor(OPCPackage.open(inp));
- }
- throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
- }
-
- /**
- * Tries to determine the actual type of file and produces a matching text-extractor for it.
- *
- * @param pkg An {@link OPCPackage}.
- * @return A {@link POIXMLTextExtractor} for the given file.
- * @throws IOException If an error occurs while reading the file
- * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
- * @throws XmlException If an XML parsing error occurs.
- * @throws IllegalArgumentException If no matching file type could be found.
- */
- public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
- try {
- // Check for the normal Office core document
- PackageRelationshipCollection core;
- core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
-
- // If nothing was found, try some of the other OOXML-based core types
- if (core.size() == 0) {
- // Could it be an OOXML-Strict one?
- core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
- }
- if (core.size() == 0) {
- // Could it be a visio one?
- core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
- if (core.size() == 1)
- return new XDGFVisioExtractor(pkg);
- }
-
- // Should just be a single core document, complain if not
- if (core.size() != 1) {
- throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
- }
-
- // Grab the core document part, and try to identify from that
- final PackagePart corePart = pkg.getPart(core.getRelationship(0));
- final String contentType = corePart.getContentType();
-
- // Is it XSSF?
- for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- if (getPreferEventExtractor()) {
- return new XSSFEventBasedExcelExtractor(pkg);
- }
- return new XSSFExcelExtractor(pkg);
- }
- }
-
- // Is it XWPF?
- for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- return new XWPFWordExtractor(pkg);
- }
- }
-
- // Is it XSLF?
- for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
- if ( rel.getContentType().equals( contentType ) ) {
- return new XSLFPowerPointExtractor(pkg);
- }
- }
-
- // special handling for SlideShow-Theme-files,
- if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
- return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
- }
-
- // How about xlsb?
- for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
- if (rel.getContentType().equals(contentType)) {
- return new XSSFBEventBasedExcelExtractor(pkg);
- }
- }
-
- throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")");
-
- } catch (IOException e) {
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
- throw e;
- } catch (OpenXML4JException e) {
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
- throw e;
- } catch (XmlException e) {
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
- throw e;
- } catch (RuntimeException e) {
- // ensure that we close the package again if there is an error opening it, however
- // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
- pkg.revert();
- throw e;
- }
- }
-
- public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return OLE2ExtractorFactory.createExtractor(fs);
- }
- public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return OLE2ExtractorFactory.createExtractor(fs);
- }
- public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
- return OLE2ExtractorFactory.createExtractor(fs);
- }
-
- public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
- {
- // First, check for OOXML
- for (String entryName : poifsDir.getEntryNames()) {
- if (entryName.equals("Package")) {
- OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
- return createExtractor(pkg);
- }
- }
-
- // If not, ask the OLE2 code to check, with Scratchpad if possible
- return OLE2ExtractorFactory.createExtractor(poifsDir);
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
- // All the embedded directories we spotted
- ArrayList<Entry> dirs = new ArrayList<Entry>();
- // For anything else not directly held in as a POIFS directory
- ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
-
- // Find all the embedded directories
- DirectoryEntry root = ext.getRoot();
- if (root == null) {
- throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
- }
-
- if (ext instanceof ExcelExtractor) {
- // These are in MBD... under the root
- Iterator<Entry> it = root.getEntries();
- while (it.hasNext()) {
- Entry entry = it.next();
- if (entry.getName().startsWith("MBD")) {
- dirs.add(entry);
- }
- }
- } else if (ext instanceof WordExtractor) {
- // These are in ObjectPool -> _... under the root
- try {
- DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
- Iterator<Entry> it = op.getEntries();
- while (it.hasNext()) {
- Entry entry = it.next();
- if (entry.getName().startsWith("_")) {
- dirs.add(entry);
- }
- }
- } catch (FileNotFoundException e) {
- logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage());
- // ignored here
- }
- //} else if(ext instanceof PowerPointExtractor) {
- // Tricky, not stored directly in poifs
- // TODO
- } else if (ext instanceof OutlookTextExtactor) {
- // Stored in the Attachment blocks
- MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
- for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
- if (attachment.getAttachData() != null) {
- byte[] data = attachment.getAttachData().getValue();
- nonPOIFS.add( new ByteArrayInputStream(data) );
- } else if (attachment.getAttachmentDirectory() != null) {
- dirs.add(attachment.getAttachmentDirectory().getDirectory());
- }
- }
- }
-
- // Create the extractors
- if (dirs.size() == 0 && nonPOIFS.size() == 0){
- return new POITextExtractor[0];
- }
-
- ArrayList<POITextExtractor> textExtractors = new ArrayList<POITextExtractor>();
- for (Entry dir : dirs) {
- textExtractors.add(createExtractor((DirectoryNode) dir));
- }
- for (InputStream nonPOIF : nonPOIFS) {
- try {
- textExtractors.add(createExtractor(nonPOIF));
- } catch (IllegalArgumentException e) {
- // Ignore, just means it didn't contain
- // a format we support as yet
- logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
- } catch (XmlException e) {
- throw new IOException(e.getMessage(), e);
- } catch (OpenXML4JException e) {
- throw new IOException(e.getMessage(), e);
- }
- }
- return textExtractors.toArray(new POITextExtractor[textExtractors.size()]);
- }
-
- /**
- * Returns an array of text extractors, one for each of
- * the embedded documents in the file (if there are any).
- * If there are no embedded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
- * {@link POITextExtractor} for each embedded file.
- */
- @NotImplemented
- @SuppressWarnings("UnusedParameters")
- public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
- throw new IllegalStateException("Not yet supported");
- }
- }
|