about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author Nick Burch <nick@apache.org> 2014-11-30 14:22:06 +0000
committer Nick Burch <nick@apache.org> 2014-11-30 14:22:06 +0000
commit 37f001274abe9cf19683a7005912d81766878b78 (patch)
tree 5f5434c41641157dad735f0ca03e7941904154f0 /src
parent 63fd48d501df365e277e5779475c769edeea9bf0 (diff)
download poi-37f001274abe9cf19683a7005912d81766878b78.tar.gz
poi-37f001274abe9cf19683a7005912d81766878b78.zip
Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642548 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
-rw-r--r-- src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java 61
-rw-r--r-- src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java 4
-rw-r--r-- src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java 36
3 files changed, 90 insertions, 11 deletions
diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
index d78e2268f3..366d8e4996 100644
--- a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
@@ -17,6 +17,8 @@
package org.apache.poi.hssf.extractor;
+import java.io.BufferedInputStream;
+import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@@ -28,11 +30,15 @@ import org.apache.poi.hssf.record.OldLabelRecord;
import org.apache.poi.hssf.record.OldStringRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.RecordInputStream;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.ss.usermodel.Cell;
/**
- * A text extractor for very old (pre-OLE2) Excel files,
- * such as Excel 4 files.
+ * A text extractor for old Excel files, which are too old for
+ * HSSFWorkbook to handle. This includes Excel 95, and very old
+ * (pre-OLE2) Excel files, such as Excel 4 files.
* <p>
* Returns much (but not all) of the textual content of the file,
* suitable for indexing by something like Apache Lucene, or used
@@ -40,13 +46,47 @@ import org.apache.poi.ss.usermodel.Cell;
* </p>
*/
public class OldExcelExtractor {
- private InputStream input;
+ private RecordInputStream ris;
+ private Closeable input;
- public OldExcelExtractor(InputStream input) {
- this.input = input;
+ public OldExcelExtractor(InputStream input) throws IOException {
+ BufferedInputStream bstream = new BufferedInputStream(input, 8);
+ if (NPOIFSFileSystem.hasPOIFSHeader(bstream)) {
+ open(new NPOIFSFileSystem(bstream));
+ } else {
+ open(bstream);
+ }
}
public OldExcelExtractor(File f) throws IOException {
- this.input = new FileInputStream(f);
+ InputStream input = new FileInputStream(f);
+ if (NPOIFSFileSystem.hasPOIFSHeader(input)) {
+ open(new NPOIFSFileSystem(f));
+ } else {
+ open(input);
+ }
+ }
+ public OldExcelExtractor(NPOIFSFileSystem fs) throws IOException {
+ open(fs);
+ }
+ public OldExcelExtractor(DirectoryNode directory) throws IOException {
+ open(directory);
+ }
+
+ private void open(InputStream biffStream) {
+ input = biffStream;
+ ris = new RecordInputStream(biffStream);
+ }
+ private void open(NPOIFSFileSystem fs) throws IOException {
+ input = fs;
+ open(fs.getRoot());
+ }
+ private void open(DirectoryNode directory) throws IOException {
+ DocumentNode book = (DocumentNode)directory.getEntry("Book");
+ if (book == null) {
+ throw new IOException("No Excel 5/95 Book stream found");
+ }
+
+ ris = new RecordInputStream(directory.createDocumentInputStream(book));
}
public static void main(String[] args) throws Exception {
@@ -66,7 +106,6 @@ public class OldExcelExtractor {
public String getText() {
StringBuffer text = new StringBuffer();
- RecordInputStream ris = new RecordInputStream(input);
while (ris.hasNextRecord()) {
int sid = ris.getNextSid();
ris.nextRecord();
@@ -108,6 +147,14 @@ public class OldExcelExtractor {
ris.readFully(new byte[ris.remaining()]);
}
}
+
+ if (input != null) {
+ try {
+ input.close();
+ } catch (IOException e) {}
+ input = null;
+ }
+ ris = null;
return text.toString();
}
diff --git a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
index b7013c1503..e5062a0241 100644
--- a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
+++ b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
@@ -38,7 +38,9 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
SILENT_EXCLUDED.add("46904.xls");
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
- SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
+ SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
+ SILENT_EXCLUDED.add("testEXCEL_5.xls"); // Biff 5 / Excel 5
+ SILENT_EXCLUDED.add("testEXCEL_95.xls"); // Biff 5 / Excel 95
}
@Override
diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
index f6c36e16c1..8c58f9e4f3 100644
--- a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
+++ b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
@@ -24,7 +24,8 @@ import junit.framework.TestCase;
import org.apache.poi.hssf.HSSFTestDataSamples;
/**
- * Unit tests for the Excel 4 (and older) text extractor
+ * Unit tests for the Excel 5/95 and Excel 4 (and older) text
+ * extractor
*/
public final class TestOldExcelExtractor extends TestCase {
private static OldExcelExtractor createExtractor(String sampleFileName) {
@@ -37,7 +38,7 @@ public final class TestOldExcelExtractor extends TestCase {
}
}
- public void testSimple() {
+ public void testSimpleExcel4() {
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
// Check we can call getText without error
@@ -51,6 +52,22 @@ public final class TestOldExcelExtractor extends TestCase {
assertTrue(text, text.contains("11"));
assertTrue(text, text.contains("784"));
}
+ public void DISABLEDtestSimpleExcel5() {
+ for (String ver : new String[] {"5", "95"}) {
+ OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
+
+ // Check we can call getText without error
+ String text = extractor.getText();
+
+ // Check we find a few words we expect in there
+ assertTrue(text, text.contains("Sample Excel"));
+ assertTrue(text, text.contains("Written and saved"));
+
+ // Check we find a few numbers we expect in there
+ assertTrue(text, text.contains("15"));
+ assertTrue(text, text.contains("169"));
+ }
+ }
public void testStrings() {
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
@@ -71,7 +88,7 @@ public final class TestOldExcelExtractor extends TestCase {
// TODO Find some then test
}
- public void testFormattedNumbers() {
+ public void testFormattedNumbersExcel4() {
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
String text = extractor.getText();
@@ -88,4 +105,17 @@ public final class TestOldExcelExtractor extends TestCase {
// assertTrue(text, text.contains("55,624"));
// assertTrue(text, text.contains("11,743,477"));
}
+ public void DISABLEDtestFormattedNumbersExcel5() {
+ for (String ver : new String[] {"5", "95"}) {
+ OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
+ String text = extractor.getText();
+
+ // Simple numbers
+ assertTrue(text, text.contains("1"));
+
+ // Numbers which come from formulas
+ assertTrue(text, text.contains("13"));
+ assertTrue(text, text.contains("169"));
+ }
+ }
}