From b39c87b884892b37b71b2dde3984d0bd24b0a6ab Mon Sep 17 00:00:00 2001 From: Dominik Stadler Date: Sun, 22 Mar 2015 21:47:44 +0000 Subject: [PATCH] Integration tests: Expect exception for old word documents and still run the text extraction for them. Also add executing HPSFPropertiesExtractor where possible git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1668483 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/TestAllFiles.java | 37 +++++++++++++----- .../poi/stress/AbstractFileHandler.java | 15 +++++++ .../apache/poi/stress/HWPFFileHandler.java | 10 +++-- test-data/document/52117.doc | Bin 0 -> 7168 bytes 4 files changed, 50 insertions(+), 12 deletions(-) create mode 100644 test-data/document/52117.doc diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java index e8de685f9c..8a66024f7b 100644 --- a/src/integrationtest/org/apache/poi/TestAllFiles.java +++ b/src/integrationtest/org/apache/poi/TestAllFiles.java @@ -31,6 +31,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.stress.*; import org.apache.tools.ant.DirectoryScanner; import org.junit.Test; @@ -162,6 +163,20 @@ public class TestAllFiles { HANDLERS.put("spreadsheet/test_properties1", new NullFileHandler()); } + // Old Word Documents where we can at least extract some text + private static final Set OLD_FILES = new HashSet(); + static { + OLD_FILES.add("document/Bug49933.doc"); + OLD_FILES.add("document/Bug51944.doc"); + OLD_FILES.add("document/Word6.doc"); + OLD_FILES.add("document/Word6_sections.doc"); + OLD_FILES.add("document/Word6_sections2.doc"); + OLD_FILES.add("document/Word95.doc"); + OLD_FILES.add("document/word95err.doc"); + OLD_FILES.add("hpsf/TestMickey.doc"); + OLD_FILES.add("document/52117.doc"); + } + private static final Set EXPECTED_FAILURES = new HashSet(); static { // password protected files @@ -202,15 +217,7 @@ public class TestAllFiles { EXPECTED_FAILURES.add("spreadsheet/43493.xls"); EXPECTED_FAILURES.add("spreadsheet/46904.xls"); EXPECTED_FAILURES.add("document/56880.doc"); - EXPECTED_FAILURES.add("document/Bug49933.doc"); EXPECTED_FAILURES.add("document/Bug50955.doc"); - EXPECTED_FAILURES.add("document/Bug51944.doc"); - EXPECTED_FAILURES.add("document/Word6.doc"); - EXPECTED_FAILURES.add("document/Word6_sections.doc"); - EXPECTED_FAILURES.add("document/Word6_sections2.doc"); - EXPECTED_FAILURES.add("document/Word95.doc"); - EXPECTED_FAILURES.add("document/word95err.doc"); - EXPECTED_FAILURES.add("hpsf/TestMickey.doc"); EXPECTED_FAILURES.add("slideshow/PPT95.ppt"); EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx"); EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx"); @@ -269,17 +276,29 @@ public class TestAllFiles { File inputFile = new File(ROOT_DIR, file); try { - InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100); + InputStream stream = new BufferedInputStream(new FileInputStream(inputFile), 64*1024); try { handler.handleFile(stream); assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", EXPECTED_FAILURES.contains(file)); + assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", + OLD_FILES.contains(file)); } finally { stream.close(); } handler.handleExtracting(inputFile); + } catch (OldWordFileFormatException e) { + // for old word files we should still support extracting text + if(OLD_FILES.contains(file)) { + handler.handleExtracting(inputFile); + } else { + // check if we expect failure for this file + if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) { + throw new Exception("While handling " + file, e); + } + } } catch (Exception e) { // check if we expect failure for this file if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) { diff --git a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java index 8a27e6d0e9..8819083771 100644 --- a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java @@ -28,8 +28,10 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POITextExtractor; import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.xmlbeans.XmlException; @@ -89,6 +91,19 @@ public abstract class AbstractFileHandler implements FileHandler { assertEquals("File should not be modified by extractor", modified, file.lastModified()); handleExtractingAsStream(file); + + if(extractor instanceof POIOLE2TextExtractor) { + HPSFPropertiesExtractor hpsfExtractor = new HPSFPropertiesExtractor((POIOLE2TextExtractor)extractor); + try { + assertNotNull(hpsfExtractor.getDocumentSummaryInformationText()); + assertNotNull(hpsfExtractor.getSummaryInformationText()); + String text = hpsfExtractor.getText(); + //System.out.println(text); + assertNotNull(text); + } finally { + hpsfExtractor.close(); + } + } } catch (IllegalArgumentException e) { if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) { throw new Exception("While handling " + file, e); diff --git a/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java b/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java index 3a223674cd..a56ddd2dc6 100644 --- a/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java @@ -63,12 +63,10 @@ public class HWPFFileHandler extends POIFSFileHandler { docTextWriter.close(); } - - // a test-case to test this locally without executing the full TestAllFiles @Test public void test() throws Exception { - File file = new File("test-data/document/51921-Word-Crash067.doc"); + File file = new File("test-data/document/52117.doc"); InputStream stream = new FileInputStream(file); try { @@ -91,4 +89,10 @@ public class HWPFFileHandler extends POIFSFileHandler { stream.close(); } } + + @Test + public void testExtractingOld() throws Exception { + File file = new File("test-data/document/52117.doc"); + handleExtracting(file); + } } diff --git a/test-data/document/52117.doc b/test-data/document/52117.doc new file mode 100644 index 0000000000000000000000000000000000000000..4f966c01c501293cf20f3b0d133c958cdbca30c5 GIT binary patch literal 7168 zcmeI0ZBSHI7{||rU1Y^2R6tP6nv zZJenY$`5_WnZ_9{8=Iy~N2i9y4^GWEHeE7jtm(t5L!kolVl|Ef+yA+{3s*o{urwwP z{Pw&&&vRa$bM8I&9OZ|(!F^vWJkLgj4J@7wg$yh~Q_dls6wRkF7Kb#I4TVCXXfcvA zVH$a0Fw$3=Lmehj4`|M3fka>cvjNHHMB-^QyqT^};DKu9$Dho2R>FLU1FQwJ9D7K| ziqSR^i>3B`5sUsrIy3E$cwi!5kzDKl==(d?=idnXWN;tA7Qp6$d0;+B1!*81EC3lG z6D$Oaz+!MecmON`Ja`Z+1rLFT!7`8qvcYoj2zV5%04qTbFo9f<2lBxxPykkg$G{r! zI9LnTf%RYmcmmKkpNhmIUi<&1ICvx;FL?3*GtiFC*a`KQmC}4!MOVNt`RinUvp?YG z*O>BhEdFhcUco1;`-0U*X2xeafMC6WNOFFMp=>au5Pq!z6D2)Oa(92d#ki!2N*s!Q+7T!qb5E!yuqNaWRZ( zU-TfRy|EcFeHXosnD)q4#MH)K#8m&K!SI@=vZ!<5Nm{z|{rMTPVe3568DS5kQB)G1C) z|KjQX6lp4sCH#6oU^yoYG;K`K8el>#m_!Fn$f#hUPE8aru@>q@G@d};f(fJdU##Id z=+oM9M&hWQ2l;)`%sq&W@E5Ix$%lA6JDmDlP2P6KVyYh4QsoFZyi%>vu-WIfi?T=H zdHzZMOkaPnyXV5?AV2m6uBuuENrEO+zYAU+=)25c?d`eP8|>=3!4E`iJJnad$z(Db z?A`{87_f;u1Q@K#%Zrv&IJ{J{(<@1G-S#?@Y%~;m-Q}W>dI5pM<&^}vv4PLE^J>Bb zJwDgQFO%?ZG*|*JS^e%vE#_jYt;GIJmC;ZwY;P0;LP0*&aWdH5-P1LI4t8GY`K_b7 zbBOQn?KnTs(cc&1F~(qsztGX!*BR0dRrI<2O_IrI+)^rf1ai+-^pUz^5}LQ7R!vqg z8lwG-2sK#Sq}-rwUSH~MzfGLVK?0BbgFicjbusI6jqdF)ovtP z2bt(2-8ClBM-6JS5QfbpGNT5gSxEeDr%Uuna-^S{EZQ3$QoT9|O}4?9%J%Ov9r)J# z(Qf_6yY&Zm>p$`7_Dpp_-fcU)=M7*1;K6E81U7*(Pz`LL1_+=Dw1PJ96Sx2_fy>|; z_yY`q5YS;)(Sszg1n^)vC;?9JGWY--17|=ln2kLr8_?eKJQy6jLI3ophH#R8AsPJ;oP}n>FiD(*6)W6ed`pF#d=4?ob=bLFofb}G^* zJ&w)eve>Lmg2OHPUZ5|(6qeS@(wkW#m%wUgG1p?ov!WEz!!Jg%C8mE2~ zaT!cs=5q_EP=^|m=^Gb;zKM0}TPt!F#PdY&FGTpKx0w##rD^GOwy}#|LEM1PiAyejIU!&*l0tEl