]> source.dussan.org Git - poi.git/commitdiff
Handle some cases better where file handles were left open by the ExtractorFactory...
authorDominik Stadler <centic@apache.org>
Sun, 20 Dec 2015 20:39:01 +0000 (20:39 +0000)
committerDominik Stadler <centic@apache.org>
Sun, 20 Dec 2015 20:39:01 +0000 (20:39 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1721064 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/POITextExtractor.java
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java

index 6514ad5d1889b9fbb7812e97586b75777a94a07b..542b4d30dac19f3e1a2624bb629c674aacccc48f 100644 (file)
@@ -31,6 +31,8 @@ import java.io.IOException;
  * @see org.apache.poi.hwpf.extractor.WordExtractor
  */
 public abstract class POITextExtractor implements Closeable {
+    private Closeable fsToClose = null;
+    
        /**
         * Retrieves all the text from the document.
         * How cells, paragraphs etc are separated in the text
@@ -46,6 +48,13 @@ public abstract class POITextExtractor implements Closeable {
         *  metadata / properties, such as author and title.
         */
        public abstract POITextExtractor getMetadataTextExtractor();
+
+       /**
+        * Used to ensure file handle cleanup.
+        */
+       public void setFilesystem(Closeable fs) {
+           fsToClose = fs;
+       }
        
        /**
         * Allows to free resources of the Extractor as soon as
@@ -55,6 +64,8 @@ public abstract class POITextExtractor implements Closeable {
         * The Extractor cannot be used after close has been called.
         */
        public void close() throws IOException {
-               // nothing to do in abstract class, derived classes may perform actions.
+               if(fsToClose != null) {
+                   fsToClose.close();
+               }
        }
 }
index 906c0263754e8709ec7c261d60a8cf845d5a465b..d7c6e3d5d0a95cd66fa43c9b556db266f02679d5 100644 (file)
@@ -128,20 +128,25 @@ public class ExtractorFactory {
       return threadPreferEventExtractors.get();
    }
 
-
        public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
-               InputStream inp = null;
+           NPOIFSFileSystem fs = null;
         try {
-            try {
-                NPOIFSFileSystem fs = new NPOIFSFileSystem(f);
-                return createExtractor(fs);
-            } catch (OfficeXmlFileException e) {
-                return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
-            } catch (NotOLE2FileException ne) {
-                throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
+            fs = new NPOIFSFileSystem(f);
+            POIOLE2TextExtractor extractor = createExtractor(fs);
+            extractor.setFilesystem(fs);
+            return extractor;
+        } catch (OfficeXmlFileException e) {
+            // ensure file-handle release
+            if(fs != null) {
+                fs.close();
             }
-        } finally {
-            if(inp != null) inp.close();
+            return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
+        } catch (NotOLE2FileException ne) {
+            // ensure file-handle release
+            if(fs != null) {
+                fs.close();
+            }
+            throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
         }
     }
 
@@ -161,65 +166,95 @@ public class ExtractorFactory {
                throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
        }
 
+       /**
+        * Tries to determine the actual type of file and produces a matching text-extractor for it.
+        *
+        * @param pkg An {@link OPCPackage}.
+        * @return A {@link POIXMLTextExtractor} for the given file.
+        * @throws IOException If an error occurs while reading the file 
+        * @throws OpenXML4JException If an error parsing the OpenXML file format is found. 
+        * @throws XmlException If an XML parsing error occurs.
+        * @throws IllegalArgumentException If no matching file type could be found.
+        */
        public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
-          // Check for the normal Office core document
-       PackageRelationshipCollection core =
-            pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
-       
-       // If nothing was found, try some of the other OOXML-based core types
-       if (core.size() == 0) {
-           // Could it be an OOXML-Strict one?
-           core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
-       }
-       if (core.size() == 0) {
-           // Could it be a visio one?
-           core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
-           if (core.size() == 1)
-               return new XDGFVisioExtractor(pkg);
-       }
-       
-       // Should just be a single core document, complain if not
-       if (core.size() != 1) {
-           throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
-       }
-
-       // Grab the core document part, and try to identify from that
-       PackagePart corePart = pkg.getPart(core.getRelationship(0));
-
-       // Is it XSSF?
-       for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
-          if(corePart.getContentType().equals(rel.getContentType())) {
-             if(getPreferEventExtractor()) {
-                return new XSSFEventBasedExcelExtractor(pkg);
-             }
-
-             return new XSSFExcelExtractor(pkg);
-          }
-       }
-
-       // Is it XWPF?
-       for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
-          if(corePart.getContentType().equals(rel.getContentType())) {
-             return new XWPFWordExtractor(pkg);
-          }
-       }
-
-       // Is it XSLF?
-       for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
-          if(corePart.getContentType().equals(rel.getContentType())) {
-             return new XSLFPowerPointExtractor(pkg);
-          }
-       }
-
-       // special handling for SlideShow-Theme-files, 
-       if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
-           return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
-       }
-
-       // ensure that we close the package again if there is an error opening it, however
-       // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
-       pkg.revert();
-       throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
+        try {
+          // Check for the normal Office core document
+           PackageRelationshipCollection core =
+                pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
+           
+           // If nothing was found, try some of the other OOXML-based core types
+           if (core.size() == 0) {
+               // Could it be an OOXML-Strict one?
+               core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
+           }
+           if (core.size() == 0) {
+               // Could it be a visio one?
+               core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
+               if (core.size() == 1)
+                   return new XDGFVisioExtractor(pkg);
+           }
+           
+           // Should just be a single core document, complain if not
+           if (core.size() != 1) {
+               throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
+           }
+    
+           // Grab the core document part, and try to identify from that
+           PackagePart corePart = pkg.getPart(core.getRelationship(0));
+    
+           // Is it XSSF?
+           for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
+              if(corePart.getContentType().equals(rel.getContentType())) {
+                 if(getPreferEventExtractor()) {
+                    return new XSSFEventBasedExcelExtractor(pkg);
+                 }
+    
+                 return new XSSFExcelExtractor(pkg);
+              }
+           }
+    
+           // Is it XWPF?
+           for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
+              if(corePart.getContentType().equals(rel.getContentType())) {
+                 return new XWPFWordExtractor(pkg);
+              }
+           }
+    
+           // Is it XSLF?
+           for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
+              if(corePart.getContentType().equals(rel.getContentType())) {
+                 return new XSLFPowerPointExtractor(pkg);
+              }
+           }
+    
+           // special handling for SlideShow-Theme-files, 
+           if(XSLFRelation.THEME_MANAGER.getContentType().equals(corePart.getContentType())) {
+               return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
+           }
+           
+           throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
+           } catch (IOException e) {
+               // ensure that we close the package again if there is an error opening it, however
+               // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+               pkg.revert();
+               throw e;
+        } catch (OpenXML4JException e) {
+            // ensure that we close the package again if there is an error opening it, however
+            // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+            pkg.revert();
+            throw e;
+        } catch (XmlException e) {
+            // ensure that we close the package again if there is an error opening it, however
+            // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+            pkg.revert();
+            throw e;
+           } catch (RuntimeException e) {
+           // ensure that we close the package again if there is an error opening it, however
+           // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
+           pkg.revert();
+           
+           throw e;
+           }
        }
 
        public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
index fba530757a79d9eefb17f106a7caa6d696d95412..acf0d46614ff4c0226c52860497b568f82c191e5 100644 (file)
@@ -193,29 +193,35 @@ public class TestExtractorFactory {
 
 
         // Word
+        extractor = ExtractorFactory.createExtractor(doc);
         assertTrue(
-                ExtractorFactory.createExtractor(doc)
+                extractor
                 instanceof WordExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(doc).getText().length() > 120
+                extractor.getText().length() > 120
         );
+        extractor.close();
 
+        extractor = ExtractorFactory.createExtractor(doc6);
         assertTrue(
-                ExtractorFactory.createExtractor(doc6)
+                extractor
                 instanceof Word6Extractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(doc6).getText().length() > 20
+                extractor.getText().length() > 20
         );
+        extractor.close();
 
+        extractor = ExtractorFactory.createExtractor(doc95);
         assertTrue(
-                ExtractorFactory.createExtractor(doc95)
+                extractor
                 instanceof Word6Extractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(doc95).getText().length() > 120
+                extractor.getText().length() > 120
         );
+        extractor.close();
 
         extractor = ExtractorFactory.createExtractor(docx);
         assertTrue(
@@ -241,62 +247,71 @@ public class TestExtractorFactory {
         );
         extractor.close();
 
-        // PowerPoint
+        // PowerPoint (PPT)
+        extractor = ExtractorFactory.createExtractor(ppt);
         assertTrue(
-                ExtractorFactory.createExtractor(ppt)
+                extractor
                 instanceof PowerPointExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(ppt).getText().length() > 120
+                extractor.getText().length() > 120
         );
+        extractor.close();
 
+        // PowerPoint (PPTX)
         extractor = ExtractorFactory.createExtractor(pptx);
         assertTrue(
                 extractor
                 instanceof XSLFPowerPointExtractor
         );
-        extractor.close();
-
-        extractor = ExtractorFactory.createExtractor(pptx);
         assertTrue(
                 extractor.getText().length() > 120
         );
         extractor.close();
 
         // Visio - binary
+        extractor = ExtractorFactory.createExtractor(vsd);
         assertTrue(
-                ExtractorFactory.createExtractor(vsd)
+                extractor
                 instanceof VisioTextExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(vsd).getText().length() > 50
+                extractor.getText().length() > 50
         );
+        extractor.close();
+
         // Visio - vsdx
+        extractor = ExtractorFactory.createExtractor(vsdx);
         assertTrue(
-                ExtractorFactory.createExtractor(vsdx)
+                extractor
                 instanceof XDGFVisioExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(vsdx).getText().length() > 20
+                extractor.getText().length() > 20
         );
+        extractor.close();
 
         // Publisher
+        extractor = ExtractorFactory.createExtractor(pub);
         assertTrue(
-                ExtractorFactory.createExtractor(pub)
+                extractor
                 instanceof PublisherTextExtractor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(pub).getText().length() > 50
+                extractor.getText().length() > 50
         );
+        extractor.close();
 
         // Outlook msg
+        extractor = ExtractorFactory.createExtractor(msg);
         assertTrue(
-                ExtractorFactory.createExtractor(msg)
+                extractor
                 instanceof OutlookTextExtactor
         );
         assertTrue(
-                ExtractorFactory.createExtractor(msg).getText().length() > 50
+                extractor.getText().length() > 50
         );
+        extractor.close();
 
         // Text
         try {
@@ -557,13 +572,15 @@ public class TestExtractorFactory {
         extractor.close();
         
         // Visio
+        extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
         assertTrue(
-                ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
+                extractor
                 instanceof XDGFVisioExtractor
         );
         assertTrue(
                 extractor.getText().length() > 20
         );
+        extractor.close();
 
         // Text
         try {
@@ -670,6 +687,7 @@ public class TestExtractorFactory {
                 ExtractorFactory.createExtractor(xls);
         embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
         assertEquals(0, embeds.length);
+        ext.close();
 
         // Excel
         ext = (POIOLE2TextExtractor)
@@ -690,6 +708,7 @@ public class TestExtractorFactory {
         assertEquals(2, numXls);
         assertEquals(2, numWord);
         assertEquals(0, numMsg);
+        ext.close();
 
         // Word
         ext = (POIOLE2TextExtractor)
@@ -709,6 +728,7 @@ public class TestExtractorFactory {
         assertEquals(2, numXls);
         assertEquals(1, numWord);
         assertEquals(0, numMsg);
+        ext.close();
 
         // Word which contains an OOXML file
         ext = (POIOLE2TextExtractor)
@@ -730,6 +750,7 @@ public class TestExtractorFactory {
         assertEquals(0, numWord);
         assertEquals(1, numWordX);
         assertEquals(0, numMsg);
+        ext.close();
 
         // Outlook
         ext = (OutlookTextExtactor)
@@ -749,6 +770,7 @@ public class TestExtractorFactory {
         assertEquals(0, numXls);
         assertEquals(1, numWord);
         assertEquals(0, numMsg);
+        ext.close();
 
         // Outlook with another outlook file in it
         ext = (OutlookTextExtactor)
@@ -768,7 +790,7 @@ public class TestExtractorFactory {
         assertEquals(0, numXls);
         assertEquals(0, numWord);
         assertEquals(1, numMsg);
-
+        ext.close();
 
         // TODO - PowerPoint
         // TODO - Publisher