import java.util.Map;
import java.util.Set;
+import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.stress.*;
import org.apache.tools.ant.DirectoryScanner;
import org.junit.Test;
HANDLERS.put("spreadsheet/test_properties1", new NullFileHandler());
}
+ // Old Word documents: handleFile() is expected to fail on these with OldWordFileFormatException, but handleExtracting() must still succeed
+ private static final Set<String> OLD_FILES = new HashSet<String>();
+ static {
+ OLD_FILES.add("document/Bug49933.doc");
+ OLD_FILES.add("document/Bug51944.doc");
+ OLD_FILES.add("document/Word6.doc");
+ OLD_FILES.add("document/Word6_sections.doc");
+ OLD_FILES.add("document/Word6_sections2.doc");
+ OLD_FILES.add("document/Word95.doc");
+ OLD_FILES.add("document/word95err.doc");
+ OLD_FILES.add("hpsf/TestMickey.doc");
+ OLD_FILES.add("document/52117.doc");
+ }
+
private static final Set<String> EXPECTED_FAILURES = new HashSet<String>();
static {
// password protected files
EXPECTED_FAILURES.add("spreadsheet/43493.xls");
EXPECTED_FAILURES.add("spreadsheet/46904.xls");
EXPECTED_FAILURES.add("document/56880.doc");
- EXPECTED_FAILURES.add("document/Bug49933.doc");
EXPECTED_FAILURES.add("document/Bug50955.doc");
- EXPECTED_FAILURES.add("document/Bug51944.doc");
- EXPECTED_FAILURES.add("document/Word6.doc");
- EXPECTED_FAILURES.add("document/Word6_sections.doc");
- EXPECTED_FAILURES.add("document/Word6_sections2.doc");
- EXPECTED_FAILURES.add("document/Word95.doc");
- EXPECTED_FAILURES.add("document/word95err.doc");
- EXPECTED_FAILURES.add("hpsf/TestMickey.doc");
EXPECTED_FAILURES.add("slideshow/PPT95.ppt");
EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx");
EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx");
File inputFile = new File(ROOT_DIR, file);
try {
- InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100);
+ InputStream stream = new BufferedInputStream(new FileInputStream(inputFile), 64*1024);
try {
handler.handleFile(stream);
assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
EXPECTED_FAILURES.contains(file));
+ assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
+ OLD_FILES.contains(file));
} finally {
stream.close();
}
handler.handleExtracting(inputFile);
+ } catch (OldWordFileFormatException e) {
+ // for old word files we should still support extracting text
+ if(OLD_FILES.contains(file)) {
+ handler.handleExtracting(inputFile);
+ } else {
+ // check if we expect failure for this file
+ if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
+ throw new Exception("While handling " + file, e);
+ }
+ }
} catch (Exception e) {
// check if we expect failure for this file
if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
import java.util.HashSet;\r
import java.util.Set;\r
\r
+import org.apache.poi.POIOLE2TextExtractor;\r
import org.apache.poi.POITextExtractor;\r
import org.apache.poi.extractor.ExtractorFactory;\r
+import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;\r
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;\r
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;\r
import org.apache.xmlbeans.XmlException;\r
assertEquals("File should not be modified by extractor", modified, file.lastModified());\r
\r
handleExtractingAsStream(file);\r
+ \r
+ if(extractor instanceof POIOLE2TextExtractor) {\r
+ HPSFPropertiesExtractor hpsfExtractor = new HPSFPropertiesExtractor((POIOLE2TextExtractor)extractor);\r
+ try {\r
+ assertNotNull(hpsfExtractor.getDocumentSummaryInformationText());\r
+ assertNotNull(hpsfExtractor.getSummaryInformationText());\r
+ String text = hpsfExtractor.getText();\r
+ //System.out.println(text);\r
+ assertNotNull(text);\r
+ } finally {\r
+ hpsfExtractor.close();\r
+ }\r
+ }\r
} catch (IllegalArgumentException e) {\r
if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) {\r
throw new Exception("While handling " + file, e);\r