From: Nick Burch Date: Sun, 30 Nov 2014 14:22:06 +0000 (+0000) Subject: Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=101f66c7890f7684fe2ba156d43991abc7d545ea;p=poi.git Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642548 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java index d78e2268f3..366d8e4996 100644 --- a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java @@ -17,6 +17,8 @@ package org.apache.poi.hssf.extractor; +import java.io.BufferedInputStream; +import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -28,11 +30,15 @@ import org.apache.poi.hssf.record.OldLabelRecord; import org.apache.poi.hssf.record.OldStringRecord; import org.apache.poi.hssf.record.RKRecord; import org.apache.poi.hssf.record.RecordInputStream; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentNode; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.ss.usermodel.Cell; /** - * A text extractor for very old (pre-OLE2) Excel files, - * such as Excel 4 files. + * A text extractor for old Excel files, which are too old for + * HSSFWorkbook to handle. This includes Excel 95, and very old + * (pre-OLE2) Excel files, such as Excel 4 files. *

<p> * Returns much (but not all) of the textual content of the file, * suitable for indexing by something like Apache Lucene, or used @@ -40,13 +46,47 @@ import org.apache.poi.ss.usermodel.Cell; *
</p>
*/ public class OldExcelExtractor { - private InputStream input; + private RecordInputStream ris; + private Closeable input; - public OldExcelExtractor(InputStream input) { - this.input = input; + public OldExcelExtractor(InputStream input) throws IOException { + BufferedInputStream bstream = new BufferedInputStream(input, 8); + if (NPOIFSFileSystem.hasPOIFSHeader(bstream)) { + open(new NPOIFSFileSystem(bstream)); + } else { + open(bstream); + } } public OldExcelExtractor(File f) throws IOException { - this.input = new FileInputStream(f); + InputStream input = new FileInputStream(f); + if (NPOIFSFileSystem.hasPOIFSHeader(input)) { + open(new NPOIFSFileSystem(f)); + } else { + open(input); + } + } + public OldExcelExtractor(NPOIFSFileSystem fs) throws IOException { + open(fs); + } + public OldExcelExtractor(DirectoryNode directory) throws IOException { + open(directory); + } + + private void open(InputStream biffStream) { + input = biffStream; + ris = new RecordInputStream(biffStream); + } + private void open(NPOIFSFileSystem fs) throws IOException { + input = fs; + open(fs.getRoot()); + } + private void open(DirectoryNode directory) throws IOException { + DocumentNode book = (DocumentNode)directory.getEntry("Book"); + if (book == null) { + throw new IOException("No Excel 5/95 Book stream found"); + } + + ris = new RecordInputStream(directory.createDocumentInputStream(book)); } public static void main(String[] args) throws Exception { @@ -66,7 +106,6 @@ public class OldExcelExtractor { public String getText() { StringBuffer text = new StringBuffer(); - RecordInputStream ris = new RecordInputStream(input); while (ris.hasNextRecord()) { int sid = ris.getNextSid(); ris.nextRecord(); @@ -108,6 +147,14 @@ public class OldExcelExtractor { ris.readFully(new byte[ris.remaining()]); } } + + if (input != null) { + try { + input.close(); + } catch (IOException e) {} + input = null; + } + ris = null; return text.toString(); } diff --git 
a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java index b7013c1503..e5062a0241 100644 --- a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java +++ b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java @@ -38,7 +38,9 @@ public class TestBiffViewer extends BaseXLSIteratingTest { SILENT_EXCLUDED.add("46904.xls"); SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption - SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2 + SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2 + SILENT_EXCLUDED.add("testEXCEL_5.xls"); // Biff 5 / Excel 5 + SILENT_EXCLUDED.add("testEXCEL_95.xls"); // Biff 5 / Excel 95 } @Override diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java index f6c36e16c1..8c58f9e4f3 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java @@ -24,7 +24,8 @@ import junit.framework.TestCase; import org.apache.poi.hssf.HSSFTestDataSamples; /** - * Unit tests for the Excel 4 (and older) text extractor + * Unit tests for the Excel 5/95 and Excel 4 (and older) text + * extractor */ public final class TestOldExcelExtractor extends TestCase { private static OldExcelExtractor createExtractor(String sampleFileName) { @@ -37,7 +38,7 @@ public final class TestOldExcelExtractor extends TestCase { } } - public void testSimple() { + public void testSimpleExcel4() { OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); // Check we can call getText without error @@ -51,6 +52,22 @@ public final class TestOldExcelExtractor extends TestCase { assertTrue(text, text.contains("11")); assertTrue(text, text.contains("784")); } + public void 
DISABLEDtestSimpleExcel5() { + for (String ver : new String[] {"5", "95"}) { + OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls"); + + // Check we can call getText without error + String text = extractor.getText(); + + // Check we find a few words we expect in there + assertTrue(text, text.contains("Sample Excel")); + assertTrue(text, text.contains("Written and saved")); + + // Check we find a few numbers we expect in there + assertTrue(text, text.contains("15")); + assertTrue(text, text.contains("169")); + } + } public void testStrings() { OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); @@ -71,7 +88,7 @@ public final class TestOldExcelExtractor extends TestCase { // TODO Find some then test } - public void testFormattedNumbers() { + public void testFormattedNumbersExcel4() { OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); String text = extractor.getText(); @@ -88,4 +105,17 @@ public final class TestOldExcelExtractor extends TestCase { // assertTrue(text, text.contains("55,624")); // assertTrue(text, text.contains("11,743,477")); } + public void DISABLEDtestFormattedNumbersExcel5() { + for (String ver : new String[] {"5", "95"}) { + OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls"); + String text = extractor.getText(); + + // Simple numbers + assertTrue(text, text.contains("1")); + + // Numbers which come from formulas + assertTrue(text, text.contains("13")); + assertTrue(text, text.contains("169")); + } + } } diff --git a/test-data/spreadsheet/testEXCEL_5.xls b/test-data/spreadsheet/testEXCEL_5.xls new file mode 100644 index 0000000000..ed69b3d30f Binary files /dev/null and b/test-data/spreadsheet/testEXCEL_5.xls differ diff --git a/test-data/spreadsheet/testEXCEL_95.xls b/test-data/spreadsheet/testEXCEL_95.xls new file mode 100644 index 0000000000..49c65041c6 Binary files /dev/null and b/test-data/spreadsheet/testEXCEL_95.xls differ