#62886 - Regression extracting text from corrupted docx files

author Andreas Beeker <kiwiwings@apache.org>

Tue, 18 Dec 2018 23:55:58 +0000 (23:55 +0000)

committer Andreas Beeker <kiwiwings@apache.org>

Tue, 18 Dec 2018 23:55:58 +0000 (23:55 +0000)
author Andreas Beeker <kiwiwings@apache.org>
Tue, 18 Dec 2018 23:55:58 +0000 (23:55 +0000)
committer Andreas Beeker <kiwiwings@apache.org>
Tue, 18 Dec 2018 23:55:58 +0000 (23:55 +0000)
diff --git a/src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java b/src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java

index 725c7a4835f8d54f14defa114fabd4493180f58c..728bf9849a0bfaa27c0583b90738204ce67c54b2 100644 (file)
--- a/src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java
+++ b/src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java
@@ -20,6 +20,7 @@ package org.apache.poi.openxml4j.util;
  import static org.apache.poi.openxml4j.util.ZipSecureFile.MAX_ENTRY_SIZE;
  import static org.apache.poi.openxml4j.util.ZipSecureFile.MIN_INFLATE_RATIO;
  
+import java.io.EOFException;
  import java.io.FilterInputStream;
  import java.io.IOException;
  import java.io.InputStream;
@@ -144,6 +145,8 @@ public class ZipArchiveThresholdInputStream extends FilterInputStream {
                          "No valid entries or contents found, this is not a valid OOXML (Office Open XML) file", ze);
              }
              throw ze;
+        } catch (EOFException e) {
+            return null;
          }
      }
  
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java b/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java

index 42d675c28cb8fc9e007c4b6abe615291102ec44f..a77eedc99993a8953309c554ac8cc2e6267439de 100644 (file)
--- a/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java
@@ -43,13 +43,24 @@ import org.junit.Test;
  import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
  
  public class TestXWPFBugs {
+    private static final POIDataSamples samples = POIDataSamples.getDocumentInstance();
+
+    @Test
+    public void truncatedDocx() throws Exception {
+        try (InputStream fis = samples.openResourceAsStream("truncated62886.docx");
+            OPCPackage opc = OPCPackage.open(fis);
+            XWPFWordExtractor ext = new XWPFWordExtractor(opc)) {
+            assertNotNull(ext.getText());
+        }
+    }
+
      /**
       * A word document that's encrypted with non-standard
       * Encryption options, and no cspname section. See bug 53475
       */
      @Test
      public void bug53475NoCSPName() throws Exception {
-        File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-solrcell.docx");
+        File file = samples.getFile("bug53475-password-is-solrcell.docx");
          POIFSFileSystem filesystem = new POIFSFileSystem(file, true);
  
          // Check the encryption details
@@ -84,7 +95,7 @@ public class TestXWPFBugs {
          int maxKeyLen = Cipher.getMaxAllowedKeyLength("AES");
          Assume.assumeTrue("Please install JCE Unlimited Strength Jurisdiction Policy files for AES 256", maxKeyLen == 2147483647);
  
-        File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-pass.docx");
+        File file = samples.getFile("bug53475-password-is-pass.docx");
          POIFSFileSystem filesystem = new POIFSFileSystem(file, true);
  
          // Check the encryption details
@@ -117,7 +128,7 @@ public class TestXWPFBugs {
      public void bug59058() throws IOException, XmlException {
          String files[] = { "bug57031.docx", "bug59058.docx" };
          for (String f : files) {
-            ZipFile zf = new ZipFile(POIDataSamples.getDocumentInstance().getFile(f));
+            ZipFile zf = new ZipFile(samples.getFile(f));
              ZipArchiveEntry entry = zf.getEntry("word/document.xml");
              DocumentDocument document = DocumentDocument.Factory.parse(zf.getInputStream(entry));
              assertNotNull(document);
diff --git a/test-data/document/truncated62886.docx b/test-data/document/truncated62886.docx

new file mode 100644 (file)

index 0000000..4998b45

Binary files /dev/null and b/test-data/document/truncated62886.docx differ
author	Andreas Beeker <kiwiwings@apache.org>
	Tue, 18 Dec 2018 23:55:58 +0000 (23:55 +0000)
committer	Andreas Beeker <kiwiwings@apache.org>
	Tue, 18 Dec 2018 23:55:58 +0000 (23:55 +0000)
src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java		patch | blob | history
src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java		patch | blob | history
test-data/document/truncated62886.docx	[new file with mode: 0644]	patch | blob