source.dussan.org Git - poi.git/commitdiff
Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490
author Nick Burch <nick@apache.org>
Sun, 30 Nov 2014 14:22:06 +0000 (14:22 +0000)
committer Nick Burch <nick@apache.org>
Sun, 30 Nov 2014 14:22:06 +0000 (14:22 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642548 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
test-data/spreadsheet/testEXCEL_5.xls [new file with mode: 0644]
test-data/spreadsheet/testEXCEL_95.xls [new file with mode: 0644]

index d78e2268f3c17af3d33ea81697ab28921845576f..366d8e499697e9b147dcd2aea02d494ac600a0f0 100644 (file)
@@ -17,6 +17,8 @@
 
 package org.apache.poi.hssf.extractor;
 
+import java.io.BufferedInputStream;
+import java.io.Closeable;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -28,11 +30,15 @@ import org.apache.poi.hssf.record.OldLabelRecord;
 import org.apache.poi.hssf.record.OldStringRecord;
 import org.apache.poi.hssf.record.RKRecord;
 import org.apache.poi.hssf.record.RecordInputStream;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.ss.usermodel.Cell;
 
 /**
- * A text extractor for very old (pre-OLE2) Excel files,
- *  such as Excel 4 files.
+ * A text extractor for old Excel files, which are too old for
+ *  HSSFWorkbook to handle. This includes Excel 95, and very old 
+ *  (pre-OLE2) Excel files, such as Excel 4 files.
  * <p>
  * Returns much (but not all) of the textual content of the file, 
  *  suitable for indexing by something like Apache Lucene, or used
@@ -40,13 +46,47 @@ import org.apache.poi.ss.usermodel.Cell;
  * </p>
  */
 public class OldExcelExtractor {
-    private InputStream input;
+    private RecordInputStream ris;
+    private Closeable input;
 
-    public OldExcelExtractor(InputStream input) {
-        this.input = input;
+    public OldExcelExtractor(InputStream input) throws IOException {
+        BufferedInputStream bstream = new BufferedInputStream(input, 8);
+        if (NPOIFSFileSystem.hasPOIFSHeader(bstream)) {
+            open(new NPOIFSFileSystem(bstream));
+        } else {
+            open(bstream);
+        }
     }
     public OldExcelExtractor(File f) throws IOException {
-        this.input = new FileInputStream(f);
+        InputStream input = new FileInputStream(f);
+        if (NPOIFSFileSystem.hasPOIFSHeader(input)) {
+            open(new NPOIFSFileSystem(f));
+        } else {
+            open(input);
+        }
+    }
+    public OldExcelExtractor(NPOIFSFileSystem fs) throws IOException {
+        open(fs);
+    }
+    public OldExcelExtractor(DirectoryNode directory) throws IOException {
+        open(directory);
+    }
+
+    private void open(InputStream biffStream) {
+        input = biffStream;
+        ris = new RecordInputStream(biffStream);
+    }
+    private void open(NPOIFSFileSystem fs) throws IOException {
+        input = fs;
+        open(fs.getRoot());
+    }
+    private void open(DirectoryNode directory) throws IOException {
+        DocumentNode book = (DocumentNode)directory.getEntry("Book");
+        if (book == null) {
+            throw new IOException("No Excel 5/95 Book stream found");
+        }
+        
+        ris = new RecordInputStream(directory.createDocumentInputStream(book));
     }
 
     public static void main(String[] args) throws Exception {
@@ -66,7 +106,6 @@ public class OldExcelExtractor {
     public String getText() {
         StringBuffer text = new StringBuffer();
 
-        RecordInputStream ris = new RecordInputStream(input);
         while (ris.hasNextRecord()) {
             int sid = ris.getNextSid();
             ris.nextRecord();
@@ -108,6 +147,14 @@ public class OldExcelExtractor {
                     ris.readFully(new byte[ris.remaining()]);
             }
         }
+        
+        if (input != null) {
+            try {
+                input.close();
+            } catch (IOException e) {}
+            input = null;
+        }
+        ris = null;
 
         return text.toString();
     }
index b7013c1503e9e4302f146698e32100478b370c25..e5062a0241bbddbe7406a64d18421b2a2d68fc90 100644 (file)
@@ -38,7 +38,9 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
                SILENT_EXCLUDED.add("46904.xls");
         SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header 
                SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
-        SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
+        SILENT_EXCLUDED.add("testEXCEL_4.xls");  // Biff 4 / Excel 4, pre-OLE2
+        SILENT_EXCLUDED.add("testEXCEL_5.xls");  // Biff 5 / Excel 5
+        SILENT_EXCLUDED.add("testEXCEL_95.xls"); // Biff 5 / Excel 95
        }
 
        @Override
index f6c36e16c1de51631bfa42a99b4051205be49825..8c58f9e4f3bd8521f357a83ff9145ebba76563a0 100644 (file)
@@ -24,7 +24,8 @@ import junit.framework.TestCase;
 import org.apache.poi.hssf.HSSFTestDataSamples;
 
 /**
- * Unit tests for the Excel 4 (and older) text extractor
+ * Unit tests for the Excel 5/95 and Excel 4 (and older) text 
+ *  extractor
  */
 public final class TestOldExcelExtractor extends TestCase {
     private static OldExcelExtractor createExtractor(String sampleFileName) {
@@ -37,7 +38,7 @@ public final class TestOldExcelExtractor extends TestCase {
         }
     }
 
-    public void testSimple() {
+    public void testSimpleExcel4() {
         OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
 
         // Check we can call getText without error
@@ -51,6 +52,22 @@ public final class TestOldExcelExtractor extends TestCase {
         assertTrue(text, text.contains("11"));
         assertTrue(text, text.contains("784"));
     }
+    public void DISABLEDtestSimpleExcel5() {
+        for (String ver : new String[] {"5", "95"}) {
+            OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
+    
+            // Check we can call getText without error
+            String text = extractor.getText();
+    
+            // Check we find a few words we expect in there
+            assertTrue(text, text.contains("Sample Excel"));
+            assertTrue(text, text.contains("Written and saved"));
+            
+            // Check we find a few numbers we expect in there
+            assertTrue(text, text.contains("15"));
+            assertTrue(text, text.contains("169"));
+        }
+    }
 
     public void testStrings() {
         OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
@@ -71,7 +88,7 @@ public final class TestOldExcelExtractor extends TestCase {
         // TODO Find some then test
     }
 
-    public void testFormattedNumbers() {
+    public void testFormattedNumbersExcel4() {
         OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
         String text = extractor.getText();
 
@@ -88,4 +105,17 @@ public final class TestOldExcelExtractor extends TestCase {
 //      assertTrue(text, text.contains("55,624"));
 //      assertTrue(text, text.contains("11,743,477"));
     }
+    public void DISABLEDtestFormattedNumbersExcel5() {
+        for (String ver : new String[] {"5", "95"}) {
+            OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
+            String text = extractor.getText();
+            
+            // Simple numbers
+            assertTrue(text, text.contains("1"));
+            
+            // Numbers which come from formulas
+            assertTrue(text, text.contains("13"));
+            assertTrue(text, text.contains("169"));
+        }
+    }
 }
diff --git a/test-data/spreadsheet/testEXCEL_5.xls b/test-data/spreadsheet/testEXCEL_5.xls
new file mode 100644 (file)
index 0000000..ed69b3d
Binary files /dev/null and b/test-data/spreadsheet/testEXCEL_5.xls differ
diff --git a/test-data/spreadsheet/testEXCEL_95.xls b/test-data/spreadsheet/testEXCEL_95.xls
new file mode 100644 (file)
index 0000000..49c6504
Binary files /dev/null and b/test-data/spreadsheet/testEXCEL_95.xls differ