* Fix a NullPointerException found in some documents when running against the test data. * Add support for extracting text from the directory entries WORKBOOK and BOOK to support some old or strangely formatted XLS files. git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1662652 13f79535-47bb-0310-9956-ffa450edef68 (tags/REL_3_12_FINAL)
@@ -253,20 +253,26 @@ public class TestAllFiles { | |||
@Test | |||
public void testAllFiles() throws Exception { | |||
assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler); | |||
InputStream stream = new BufferedInputStream(new FileInputStream(new File(ROOT_DIR, file)),100); | |||
File inputFile = new File(ROOT_DIR, file); | |||
try { | |||
handler.handleFile(stream); | |||
assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", | |||
EXPECTED_FAILURES.contains(file)); | |||
} catch (Exception e) { | |||
// check if we expect failure for this file | |||
if(!EXPECTED_FAILURES.contains(file)) { | |||
throw new Exception("While handling " + file, e); | |||
} | |||
} finally { | |||
stream.close(); | |||
} | |||
InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100); | |||
try { | |||
handler.handleFile(stream); | |||
assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", | |||
EXPECTED_FAILURES.contains(file)); | |||
} finally { | |||
stream.close(); | |||
} | |||
handler.handleExtracting(inputFile); | |||
} catch (Exception e) { | |||
// check if we expect failure for this file | |||
if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) { | |||
throw new Exception("While handling " + file, e); | |||
} | |||
} | |||
} | |||
private static String getExtension(String file) { | |||
@@ -282,5 +288,9 @@ public class TestAllFiles { | |||
@Override | |||
public void handleFile(InputStream stream) throws Exception { | |||
} | |||
@Override | |||
public void handleExtracting(File file) throws Exception { | |||
} | |||
} | |||
} |
@@ -0,0 +1,55 @@ | |||
package org.apache.poi.stress; | |||
import static org.junit.Assert.assertFalse; | |||
import static org.junit.Assert.assertNotNull; | |||
import java.io.File; | |||
import java.util.HashSet; | |||
import java.util.Set; | |||
import org.apache.poi.POITextExtractor; | |||
import org.apache.poi.extractor.ExtractorFactory; | |||
public abstract class AbstractFileHandler implements FileHandler { | |||
public static final Set<String> EXPECTED_EXTRACTOR_FAILURES = new HashSet<String>(); | |||
static { | |||
// password protected files | |||
EXPECTED_EXTRACTOR_FAILURES.add("document/bug53475-password-is-pass.docx"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("poifs/extenxls_pwd123.xlsx"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("poifs/protect.xlsx"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_agile.docx"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_sha512.xlsx"); | |||
// unsupported file-types, no supported OLE2 parts | |||
EXPECTED_EXTRACTOR_FAILURES.add("hmef/quick-winmail.dat"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("hmef/winmail-sample1.dat"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-simple.dat"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-with-attachments.dat"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("hpsf/Test0313rur.adm"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("hsmf/attachment_msg_pdf.msg"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("poifs/Notes.ole2"); | |||
EXPECTED_EXTRACTOR_FAILURES.add("slideshow/testPPT.thmx"); | |||
} | |||
public void handleExtracting(File file) throws Exception { | |||
POITextExtractor extractor = ExtractorFactory.createExtractor(file); | |||
try { | |||
assertNotNull(extractor); | |||
assertNotNull(extractor.getText()); | |||
// also try metadata | |||
POITextExtractor metadataExtractor = extractor.getMetadataTextExtractor(); | |||
assertNotNull(metadataExtractor.getText()); | |||
assertFalse("Expected Extraction to fail for file " + file + " and handler " + this + ", but did not fail!", | |||
EXPECTED_EXTRACTOR_FAILURES.contains(file)); | |||
} catch (IllegalArgumentException e) { | |||
if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) { | |||
throw new Exception("While handling " + file, e); | |||
} | |||
} finally { | |||
extractor.close(); | |||
} | |||
} | |||
} |
@@ -16,6 +16,7 @@ | |||
==================================================================== */ | |||
package org.apache.poi.stress; | |||
import java.io.File; | |||
import java.io.InputStream; | |||
/** | |||
@@ -34,4 +35,10 @@ public interface FileHandler { | |||
* @throws Exception | |||
*/ | |||
void handleFile(InputStream stream) throws Exception; | |||
/** | |||
* Ensures that extracting text from the given file | |||
* is returning some text. | |||
*/ | |||
void handleExtracting(File file) throws Exception; | |||
} |
@@ -26,7 +26,7 @@ import org.apache.poi.hmef.attribute.MAPIAttribute; | |||
import org.apache.poi.hmef.attribute.MAPIStringAttribute; | |||
import org.junit.Test; | |||
public class HMEFFileHandler implements FileHandler { | |||
public class HMEFFileHandler extends AbstractFileHandler { | |||
@Override | |||
public void handleFile(InputStream stream) throws Exception { |
@@ -25,7 +25,7 @@ import org.apache.poi.hpsf.HPSFPropertiesOnlyDocument; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
import org.junit.Test; | |||
public class HPSFFileHandler implements FileHandler { | |||
public class HPSFFileHandler extends AbstractFileHandler { | |||
@Override | |||
public void handleFile(InputStream stream) throws Exception { | |||
HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream)); |
@@ -16,6 +16,7 @@ | |||
==================================================================== */ | |||
package org.apache.poi.stress; | |||
import java.io.File; | |||
import java.io.FileInputStream; | |||
import java.io.InputStream; | |||
@@ -49,4 +50,10 @@ public class HSSFFileHandler extends SpreadsheetHandler { | |||
stream.close(); | |||
} | |||
} | |||
// a test-case to test this locally without executing the full TestAllFiles | |||
@Test | |||
public void testExtractor() throws Exception { | |||
handleExtracting(new File("test-data/spreadsheet/BOOK_in_capitals.xls")); | |||
} | |||
} |
@@ -25,7 +25,7 @@ import java.io.InputStream; | |||
import org.apache.poi.POIDocument; | |||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | |||
public class POIFSFileHandler implements FileHandler { | |||
public class POIFSFileHandler extends AbstractFileHandler { | |||
@Override | |||
public void handleFile(InputStream stream) throws Exception { |
@@ -30,7 +30,7 @@ import org.apache.poi.ss.usermodel.Sheet; | |||
import org.apache.poi.ss.usermodel.Workbook; | |||
import org.apache.poi.ss.usermodel.WorkbookFactory; | |||
public abstract class SpreadsheetHandler implements FileHandler { | |||
public abstract class SpreadsheetHandler extends AbstractFileHandler { | |||
public void handleWorkbook(Workbook wb, String extension) throws IOException { | |||
// try to access some of the content | |||
readContent(wb); |
@@ -25,7 +25,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage; | |||
import org.apache.poi.xslf.XSLFSlideShow; | |||
import org.junit.Test; | |||
public class XSLFFileHandler implements FileHandler { | |||
public class XSLFFileHandler extends AbstractFileHandler { | |||
@Override | |||
public void handleFile(InputStream stream) throws Exception { | |||
// ignore password protected files |
@@ -22,7 +22,7 @@ import java.io.InputStream; | |||
import org.apache.poi.xwpf.usermodel.XWPFDocument; | |||
import org.junit.Test; | |||
public class XWPFFileHandler implements FileHandler { | |||
public class XWPFFileHandler extends AbstractFileHandler { | |||
@Override | |||
public void handleFile(InputStream stream) throws Exception { | |||
// ignore password protected files |
@@ -213,7 +213,9 @@ public class ExtractorFactory { | |||
{ | |||
// Look for certain entries in the stream, to figure it | |||
// out from | |||
if (poifsDir.hasEntry("Workbook")) { | |||
if (poifsDir.hasEntry("Workbook") || | |||
// some XLS files have different entry-names | |||
poifsDir.hasEntry("WORKBOOK") || poifsDir.hasEntry("BOOK")) { | |||
if (getPreferEventExtractor()) { | |||
return new EventBasedExcelExtractor(poifsDir); | |||
} |
@@ -80,7 +80,11 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor | |||
} | |||
POIXMLTextExtractor extractor = | |||
new XSSFExcelExtractor(args[0]); | |||
System.out.println(extractor.getText()); | |||
try { | |||
System.out.println(extractor.getText()); | |||
} finally { | |||
extractor.close(); | |||
} | |||
} | |||
/** | |||
@@ -237,7 +241,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor | |||
if (type == Cell.CELL_TYPE_NUMERIC) { | |||
CellStyle cs = cell.getCellStyle(); | |||
if (cs.getDataFormatString() != null) { | |||
if (cs != null && cs.getDataFormatString() != null) { | |||
text.append(formatter.formatRawCellContents( | |||
cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString() | |||
)); |