source.dussan.org Git - poi.git/commitdiff
* Add text-extraction verification to integration-tests via a new abstract base FileH...
author: Dominik Stadler <centic@apache.org>
Fri, 27 Feb 2015 09:59:14 +0000 (09:59 +0000)
committer: Dominik Stadler <centic@apache.org>
Fri, 27 Feb 2015 09:59:14 +0000 (09:59 +0000)
* Fix NullPointerException found in some documents when running against the test-data
* Add support for extracting text from Dir-Entries WORKBOOK and BOOK to support some old/strangely formatted XLS files.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1662652 13f79535-47bb-0310-9956-ffa450edef68

12 files changed:
src/integrationtest/org/apache/poi/TestAllFiles.java
src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java [new file with mode: 0644]
src/integrationtest/org/apache/poi/stress/FileHandler.java
src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java
src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java
src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java
src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java
src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java

index 46083035370cbbca9d2771ae48cae2f1b5838a60..d0439b40fd51a40814cb8b9ac785cd0786b2afcd 100644 (file)
@@ -253,20 +253,26 @@ public class TestAllFiles {
     @Test
     public void testAllFiles() throws Exception {
                assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler);
-               InputStream stream = new BufferedInputStream(new FileInputStream(new File(ROOT_DIR, file)),100);
+               File inputFile = new File(ROOT_DIR, file);
+               
                try {
-                       handler.handleFile(stream);
-                       
-                       assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", 
-                               EXPECTED_FAILURES.contains(file));
-               } catch (Exception e) {
-                   // check if we expect failure for this file
-                       if(!EXPECTED_FAILURES.contains(file)) {
-                           throw new Exception("While handling " + file, e);
-                       }
-               } finally {
-                       stream.close();
-               }
+            InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100);
+               try {
+                       handler.handleFile(stream);
+    
+                       assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", 
+                               EXPECTED_FAILURES.contains(file));
+               } finally {
+                       stream.close();
+               }
+
+            handler.handleExtracting(inputFile);
+        } catch (Exception e) {
+            // check if we expect failure for this file
+            if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
+                throw new Exception("While handling " + file, e);
+            }
+        }
        }
 
        private static String getExtension(String file) {
@@ -282,5 +288,9 @@ public class TestAllFiles {
                @Override
         public void handleFile(InputStream stream) throws Exception {
                }
+
+               @Override
+        public void handleExtracting(File file) throws Exception {
+        }
        }
 }
diff --git a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
new file mode 100644 (file)
index 0000000..85ebb1b
--- /dev/null
@@ -0,0 +1,55 @@
+package org.apache.poi.stress;\r
+\r
+import static org.junit.Assert.assertFalse;\r
+import static org.junit.Assert.assertNotNull;\r
+\r
+import java.io.File;\r
+import java.util.HashSet;\r
+import java.util.Set;\r
+\r
+import org.apache.poi.POITextExtractor;\r
+import org.apache.poi.extractor.ExtractorFactory;\r
+\r
+public abstract class AbstractFileHandler implements FileHandler {\r
+    public static final Set<String> EXPECTED_EXTRACTOR_FAILURES = new HashSet<String>();\r
+    static {\r
+        // password protected files\r
+        EXPECTED_EXTRACTOR_FAILURES.add("document/bug53475-password-is-pass.docx");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/extenxls_pwd123.xlsx");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protect.xlsx");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_agile.docx");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_sha512.xlsx");\r
+        \r
+        // unsupported file-types, no supported OLE2 parts\r
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/quick-winmail.dat");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/winmail-sample1.dat");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-simple.dat");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-with-attachments.dat");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("hpsf/Test0313rur.adm");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("hsmf/attachment_msg_pdf.msg");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/Notes.ole2");\r
+        EXPECTED_EXTRACTOR_FAILURES.add("slideshow/testPPT.thmx");\r
+    }\r
+\r
+    public void handleExtracting(File file) throws Exception {\r
+        POITextExtractor extractor = ExtractorFactory.createExtractor(file);\r
+        try  {\r
+            assertNotNull(extractor);\r
+\r
+            assertNotNull(extractor.getText());\r
+            \r
+            // also try metadata\r
+            POITextExtractor metadataExtractor = extractor.getMetadataTextExtractor();\r
+            assertNotNull(metadataExtractor.getText());\r
+\r
+            assertFalse("Expected Extraction to fail for file " + file + " and handler " + this + ", but did not fail!", \r
+                    EXPECTED_EXTRACTOR_FAILURES.contains(file));\r
+        } catch (IllegalArgumentException e) {\r
+            if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) {\r
+                throw new Exception("While handling " + file, e);\r
+            }\r
+        } finally {\r
+            extractor.close();\r
+        }\r
+    }\r
+}\r
index e6f3385f02a114ab65ee0b75bd35a63cdb91f99a..ce2991b0bc3b8e7f9a9cd93901e79fde87c08cd6 100644 (file)
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.stress;
 
+import java.io.File;
 import java.io.InputStream;
 
 /**
@@ -34,4 +35,10 @@ public interface FileHandler {
         * @throws Exception
         */
        void handleFile(InputStream stream) throws Exception;
+       
+       /**
+        * Ensures that extracting text from the given file
+        * is returning some text. 
+        */
+       void handleExtracting(File file) throws Exception;
 }
index dfa875005847b0ebc46e7f39ae9182ed96b03608..9f492bf0edda1ab00d0c37b7b62ffcac28d05e02 100644 (file)
@@ -26,7 +26,7 @@ import org.apache.poi.hmef.attribute.MAPIAttribute;
 import org.apache.poi.hmef.attribute.MAPIStringAttribute;
 import org.junit.Test;
 
-public class HMEFFileHandler implements FileHandler {
+public class HMEFFileHandler extends AbstractFileHandler {
 
        @Override
     public void handleFile(InputStream stream) throws Exception {
index b7d846ae62f0f3d687e6fa661475de424628ef3f..477ee859cb9ed848cd61d9cd81614a1f8af867b4 100644 (file)
@@ -25,7 +25,7 @@ import org.apache.poi.hpsf.HPSFPropertiesOnlyDocument;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.junit.Test;
 
-public class HPSFFileHandler implements FileHandler {
+public class HPSFFileHandler extends AbstractFileHandler {
        @Override
     public void handleFile(InputStream stream) throws Exception {
                HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream));
index 19dbd97a0eb737a282097a9e4dbe2cdb10f3b40a..dd579c4dba891a98a0ff9e3f3e89bad5672ff78b 100644 (file)
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.stress;
 
+import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
 
@@ -49,4 +50,10 @@ public class HSSFFileHandler extends SpreadsheetHandler {
                        stream.close();
                }
        }
+
+       // a test-case to test this locally without executing the full TestAllFiles
+    @Test
+    public void testExtractor() throws Exception {
+        handleExtracting(new File("test-data/spreadsheet/BOOK_in_capitals.xls"));
+    }
 }
\ No newline at end of file
index 31deac7106bc9145396221cfa33b72a9cf07e159..5c4a36e3caef0212f41e3d6443db2f7a32524fbe 100644 (file)
@@ -25,7 +25,7 @@ import java.io.InputStream;
 import org.apache.poi.POIDocument;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
-public class POIFSFileHandler implements FileHandler {
+public class POIFSFileHandler extends AbstractFileHandler {
 
        @Override
     public void handleFile(InputStream stream) throws Exception {
index aad703ce98d5830adf21fb305b4d7a850ea56b9b..f12bbd2de55bc55acfa80b06388752b2c2d1d04b 100644 (file)
@@ -30,7 +30,7 @@ import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.ss.usermodel.WorkbookFactory;
 
-public abstract class SpreadsheetHandler implements FileHandler {
+public abstract class SpreadsheetHandler extends AbstractFileHandler {
        public void handleWorkbook(Workbook wb, String extension) throws IOException {
                // try to access some of the content
                readContent(wb);
index 3464218fd9d26c349b81f2f158ba8af489c9efa6..e6cbb184b207eff7d27e2bf75e4a3de562ec9bab 100644 (file)
@@ -25,7 +25,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.junit.Test;
 
-public class XSLFFileHandler implements FileHandler {
+public class XSLFFileHandler extends AbstractFileHandler {
        @Override
     public void handleFile(InputStream stream) throws Exception {
         // ignore password protected files
index a96d46da31ebbd7e82e370643e64a2a42888f8f8..47c18d8aa051f0da9712acfe701e8b88dbb25e94 100644 (file)
@@ -22,7 +22,7 @@ import java.io.InputStream;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.junit.Test;
 
-public class XWPFFileHandler implements FileHandler {
+public class XWPFFileHandler extends AbstractFileHandler {
        @Override
     public void handleFile(InputStream stream) throws Exception {
         // ignore password protected files
index 65d1e3d693847b06833c0a0c7ce6277a8e52cc80..a0b6b5db17480330fd264d5fccf135e5ecec76e6 100644 (file)
@@ -213,7 +213,9 @@ public class ExtractorFactory {
     {
         // Look for certain entries in the stream, to figure it
         // out from
-        if (poifsDir.hasEntry("Workbook")) {
+        if (poifsDir.hasEntry("Workbook") ||
+                // some XLS files have different entry-names
+                poifsDir.hasEntry("WORKBOOK") || poifsDir.hasEntry("BOOK")) {
             if (getPreferEventExtractor()) {
                 return new EventBasedExcelExtractor(poifsDir);
             }
index 6f43ba126b8cf8f8488ae9c3bc01689484d19ebc..39ef5be8a4bb90fb0a5a246f671146e02481910d 100644 (file)
@@ -80,7 +80,11 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
         }
         POIXMLTextExtractor extractor =
                 new XSSFExcelExtractor(args[0]);
-        System.out.println(extractor.getText());
+        try {
+            System.out.println(extractor.getText());
+        } finally {
+            extractor.close();
+        }
     }
 
     /**
@@ -237,7 +241,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
         if (type == Cell.CELL_TYPE_NUMERIC) {
             CellStyle cs = cell.getCellStyle();
 
-            if (cs.getDataFormatString() != null) {
+            if (cs != null && cs.getDataFormatString() != null) {
                 text.append(formatter.formatRawCellContents(
                         cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString()
                         ));