]> source.dussan.org Git - poi.git/commitdiff
Further Excel 4 text extractor support, for TIKA-1490
author Nick Burch <nick@apache.org>
Sun, 30 Nov 2014 00:48:17 +0000 (00:48 +0000)
committer Nick Burch <nick@apache.org>
Sun, 30 Nov 2014 00:48:17 +0000 (00:48 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642491 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
src/java/org/apache/poi/hssf/record/FormulaRecord.java
src/java/org/apache/poi/hssf/record/OldLabelRecord.java
src/java/org/apache/poi/hssf/record/OldStringRecord.java [new file with mode: 0644]
src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java

index 3c2d067282a2939a2954d16d8591041544ddf235..e3705f159d3b951d6cd38f0de298417a1db7b023 100644 (file)
@@ -22,9 +22,13 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.NumberRecord;
 import org.apache.poi.hssf.record.OldLabelRecord;
+import org.apache.poi.hssf.record.OldStringRecord;
+import org.apache.poi.hssf.record.RKRecord;
 import org.apache.poi.hssf.record.RecordInputStream;
+import org.apache.poi.ss.usermodel.Cell;
 
 /**
  * A text extractor for very old (pre-OLE2) Excel files,
@@ -76,20 +80,44 @@ public class OldExcelExtractor {
             ris.nextRecord();
 
             switch (sid) {
-                case LabelRecord.sid:
+                // label - 5.63 - TODO Needs codepages
+                case OldLabelRecord.biff2_sid:
+                case OldLabelRecord.biff345_sid:
                     OldLabelRecord lr = new OldLabelRecord(ris);
                     text.append(lr.getValue());
                     text.append('\n');
                     break;
+                // string - 5.102 - TODO Needs codepages
+                case OldStringRecord.biff2_sid:
+                case OldStringRecord.biff345_sid:
+                    OldStringRecord sr = new OldStringRecord(ris);
+                    text.append(sr.getString());
+                    text.append('\n');
+                    break;
+                // number - 5.71 - TODO Needs format strings
+                case NumberRecord.sid:
+                    NumberRecord nr = new NumberRecord(ris);
+                    text.append(nr.getValue());
+                    text.append('\n');
+                    break;
+/*                    
+                case OldFormulaRecord.sid:
+                    FormulaRecord fr = new FormulaRecord(ris);
+System.out.println(fr.getCachedResultType());                    
+                    if (fr.getCachedResultType() == Cell.CELL_TYPE_NUMERIC) {
+                        text.append(fr.getValue());
+                        text.append('\n');
+                    }
+*/
+                case RKRecord.sid:
+                    RKRecord rr = new RKRecord(ris);
+                    text.append(rr.getRKNumber());
+                    text.append('\n');
+                    break;
                 default:
                     ris.readFully(new byte[ris.remaining()]);
+      //              text.append(" = " + ris.getSid() + " = \n");
             }
-
-            // label - 5.63 - TODO Needs codepages
-            // number - 5.71
-            // rk - 5.87
-            // string - 5.102
-
         }
 
         return text.toString();
index c0a63d3092aebbe4ea0b1b6bd2475b60b3f5fcb5..ccd3cb041291b3cffe42cbb76e7ccb2553c1013e 100644 (file)
@@ -36,6 +36,7 @@ import org.apache.poi.util.LittleEndianOutput;
 public final class FormulaRecord extends CellRecord {
 
        public static final short sid = 0x0006;   // docs say 406...because of a bug Microsoft support site article #Q184647)
+       public static final short olderSid = 0x0406; // older biff versions do manage 406! 
        private static int FIXED_SIZE = 14; // double + short + int
 
        private static final BitField alwaysCalc = BitFieldFactory.getInstance(0x0001);
index 0fcd1cb4c6ed20a09a5158dad349b0b91d099571..97d6bef9627280896af3c2dedf2f82c24d7182b4 100644 (file)
@@ -39,7 +39,7 @@ public final class OldLabelRecord extends Record implements CellValueRecordInter
     private short             field_3_xf_index;   // Biff 3+
     private short             field_4_string_len;
     private byte[]            field_5_bytes;
-    //private XXXXX           codepage; // TODO
+    //private XXXXX           codepage; // TODO Implement for this and OldStringRecord
 
     /**
      * @param in the RecordInputstream to read the record from
diff --git a/src/java/org/apache/poi/hssf/record/OldStringRecord.java b/src/java/org/apache/poi/hssf/record/OldStringRecord.java
new file mode 100644 (file)
index 0000000..42549e2
--- /dev/null
@@ -0,0 +1,78 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.record;
+
+
+/**
+ * Biff2 - Biff 4 String Record (0x0007 / 0x0207) - read only support for
+ *  formula string results.
+ */
+public final class OldStringRecord {
+    public final static short biff2_sid = 0x0007; // sid whose string length is read as one unsigned byte
+    public final static short biff345_sid = 0x0207; // handled by the constructor's two-byte (short) length branch
+
+    private short             sid; // sid reported by the input stream when this record was read
+    private short             field_1_string_len; // length prefix exactly as read from the record
+    private byte[]            field_2_bytes; // raw string bytes, kept undecoded
+    //private XXXXX           codepage; // TODO Implement for this and OldLabelRecord
+
+    /**
+     * @param in the RecordInputstream to read the record from; its current sid selects the length-prefix width
+     */
+    public OldStringRecord(RecordInputStream in) {
+        sid = in.getSid();
+        
+        if (in.getSid() == biff2_sid) {
+            field_1_string_len  = (short)in.readUByte();
+        } else {
+            field_1_string_len   = in.readShort(); // NOTE(review): held in a signed short, so a length above 32767 would be negative here - confirm this cannot occur
+        }
+
+        // Keep the raw bytes: they can only be decoded properly later, once the codepage is known
+        field_2_bytes = new byte[field_1_string_len];
+        in.read(field_2_bytes, 0, field_1_string_len); // NOTE(review): any result of read() is discarded - confirm the buffer is always filled completely
+    }
+
+    public boolean isBiff2() {
+        return sid == biff2_sid;
+    }
+
+    public short getSid() {
+        return sid;
+    }
+
+    /**
+     * @return The record's bytes decoded with the platform default charset (no codepage handling yet).
+     */
+    public String getString()
+    {
+        // We really need the codepage here to do this right; new String(byte[]) falls back to the default charset
+        return new String(field_2_bytes);
+    }
+
+    public String toString()
+    {
+        StringBuffer buffer = new StringBuffer();
+
+        buffer.append("[OLD STRING]\n");
+        buffer.append("    .string            = ")
+            .append(getString()).append("\n");
+        buffer.append("[/OLD STRING]\n");
+        return buffer.toString();
+    }
+}
index a5c7dbedc220572763271435b290e1b6485cb2fa..fd057cd63c74da340ef552cdfd1eb674b296d1cf 100644 (file)
@@ -46,7 +46,45 @@ public final class TestOldExcelExtractor extends TestCase {
         // Check we find a few words we expect in there
         assertTrue(text, text.contains("Size"));
         assertTrue(text, text.contains("Returns"));
+        
+        // Check we find a few numbers we expect in there
+        assertTrue(text, text.contains("11"));
+        assertTrue(text, text.contains("784"));
     }
 
-    // TODO Rest of the tests
+    public void testStrings() { // Checks that known cell text from testEXCEL_4.xls appears in the extracted output
+        OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
+        String text = extractor.getText();
+
+        // Simple strings - expected to appear verbatim in the extracted text
+        assertTrue(text, text.contains("Table 10 -- Examination Coverage:"));
+        assertTrue(text, text.contains("Recommended and Average Recommended Additional Tax After"));
+        assertTrue(text, text.contains("Individual income tax returns, total"));
+        
+        // More complicated strings - contain punctuation such as $ , [ ]
+        assertTrue(text, text.contains("$100,000 or more"));
+        assertTrue(text, text.contains("S corporation returns, Form 1120S [10,15]"));
+        // TODO Get these curly quotes working correctly (presumably needs codepage-aware decoding - confirm)
+//        assertTrue(text, text.contains("individual income tax return “short forms.”"));
+        
+        // Formula based strings - nothing is asserted for these yet
+        // TODO Find some then test
+    }
+
+    public void testFormattedNumbers() { // Currently only checks plain numbers; formula and formatted cases are still TODO
+        OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
+        String text = extractor.getText();
+
+        // Simple numbers - checked as substrings of the whole extracted text
+        assertTrue(text, text.contains("151"));
+        assertTrue(text, text.contains("784"));
+        
+        // Numbers which come from formulas - assertions disabled for now
+        // TODO
+//        assertTrue(text, text.contains("0.40"));
+//        assertTrue(text, text.contains("624"));
+        
+        // Formatted numbers - nothing is asserted for these yet
+        // TODO
+    }
 }