From: Andreas Beeker
Date: Tue, 18 Dec 2018 23:55:58 +0000 (+0000)
Subject: #62886 - Regression extracting text from corrupted docx files
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=bd951d284af2b0990739222962999dbbbd01a871;p=poi.git

#62886 - Regression extracting text from corrupted docx files

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1849252 13f79535-47bb-0310-9956-ffa450edef68
---

diff --git a/src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java b/src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java
index 725c7a4835..728bf9849a 100644
--- a/src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java
+++ b/src/ooxml/java/org/apache/poi/openxml4j/util/ZipArchiveThresholdInputStream.java
@@ -20,6 +20,7 @@ package org.apache.poi.openxml4j.util;
 import static org.apache.poi.openxml4j.util.ZipSecureFile.MAX_ENTRY_SIZE;
 import static org.apache.poi.openxml4j.util.ZipSecureFile.MIN_INFLATE_RATIO;
 
+import java.io.EOFException;
 import java.io.FilterInputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -144,6 +145,8 @@ public class ZipArchiveThresholdInputStream extends FilterInputStream {
                         "No valid entries or contents found, this is not a valid OOXML (Office Open XML) file", ze);
             }
             throw ze;
+        } catch (EOFException e) {
+            return null;
         }
     }
 
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java b/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java
index 42d675c28c..a77eedc999 100644
--- a/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFBugs.java
@@ -43,13 +43,24 @@ import org.junit.Test;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
 
 public class TestXWPFBugs {
+    private static final POIDataSamples samples = POIDataSamples.getDocumentInstance();
+
+    @Test
+    public void truncatedDocx() throws Exception {
+        try (InputStream fis = samples.openResourceAsStream("truncated62886.docx");
+             OPCPackage opc = OPCPackage.open(fis);
+             XWPFWordExtractor ext = new XWPFWordExtractor(opc)) {
+            assertNotNull(ext.getText());
+        }
+    }
+
     /**
      * A word document that's encrypted with non-standard
      * Encryption options, and no cspname section. See bug 53475
      */
     @Test
     public void bug53475NoCSPName() throws Exception {
-        File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-solrcell.docx");
+        File file = samples.getFile("bug53475-password-is-solrcell.docx");
         POIFSFileSystem filesystem = new POIFSFileSystem(file, true);
 
         // Check the encryption details
@@ -84,7 +95,7 @@ public class TestXWPFBugs {
         int maxKeyLen = Cipher.getMaxAllowedKeyLength("AES");
         Assume.assumeTrue("Please install JCE Unlimited Strength Jurisdiction Policy files for AES 256", maxKeyLen == 2147483647);
 
-        File file = POIDataSamples.getDocumentInstance().getFile("bug53475-password-is-pass.docx");
+        File file = samples.getFile("bug53475-password-is-pass.docx");
         POIFSFileSystem filesystem = new POIFSFileSystem(file, true);
 
         // Check the encryption details
@@ -117,7 +128,7 @@ public class TestXWPFBugs {
     public void bug59058() throws IOException, XmlException {
         String files[] = { "bug57031.docx", "bug59058.docx" };
         for (String f : files) {
-            ZipFile zf = new ZipFile(POIDataSamples.getDocumentInstance().getFile(f));
+            ZipFile zf = new ZipFile(samples.getFile(f));
             ZipArchiveEntry entry = zf.getEntry("word/document.xml");
             DocumentDocument document = DocumentDocument.Factory.parse(zf.getInputStream(entry));
             assertNotNull(document);
diff --git a/test-data/document/truncated62886.docx b/test-data/document/truncated62886.docx
new file mode 100644
index 0000000000..4998b45ad6
Binary files /dev/null and b/test-data/document/truncated62886.docx differ