]> source.dussan.org Git - poi.git/commitdiff
Start on a Text Extractor for the pre-OLE2 Excel formats like Excel 4, for TIKA-1490
authorNick Burch <nick@apache.org>
Sun, 30 Nov 2014 00:16:23 +0000 (00:16 +0000)
committerNick Burch <nick@apache.org>
Sun, 30 Nov 2014 00:16:23 +0000 (00:16 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642490 13f79535-47bb-0310-9956-ffa450edef68

maven/mvn-deploy.sh
src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java [new file with mode: 0644]
src/java/org/apache/poi/hssf/record/LabelRecord.java
src/java/org/apache/poi/hssf/record/OldLabelRecord.java [new file with mode: 0644]
src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java [new file with mode: 0644]
test-data/spreadsheet/testEXCEL_4.xls [new file with mode: 0644]

index 2d609e491f5761e5730b1de56b1db699786ccaa9..9d1c18a37d1068e5d6dc49b353587b6cdf2b1841 100755 (executable)
@@ -39,7 +39,7 @@
 #   2. cd build/dist
 #   3. ./mvn-deploy.sh 
 
-M2_REPOSITORY=M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
+M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
 
 VERSION=@VERSION@
 DSTAMP=@DSTAMP@
diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
new file mode 100644 (file)
index 0000000..3c2d067
--- /dev/null
@@ -0,0 +1,97 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.OldLabelRecord;
+import org.apache.poi.hssf.record.RecordInputStream;
+
+/**
+ * A text extractor for very old (pre-OLE2) Excel files,
+ *  such as Excel 4 files.
+ * <p>
+ * Returns much (but not all) of the textual content of the file, 
+ *  suitable for indexing by something like Apache Lucene, or used
+ *  by Apache Tika, but not really intended for display to the user.
+ * </p>
+ */
+public class OldExcelExtractor {
+    /** Stream holding the raw record data that text is extracted from. */
+    private final InputStream input;
+
+    /**
+     * Value supplied through {@link #setIncludeSheetNames(boolean)}.
+     * It is only stored at the moment; {@link #getText()} does not
+     *  consult it yet.
+     */
+    private boolean _includeSheetNames = true;
+
+    /**
+     * Creates an extractor reading from the given stream. The stream
+     *  is not opened here, so the caller stays free to close it
+     *  directly or through {@link #close()}.
+     *
+     * @param input stream with the contents of the old Excel file
+     */
+    public OldExcelExtractor(InputStream input) {
+        this.input = input;
+    }
+
+    /**
+     * Creates an extractor reading from the given file. The file is
+     *  opened here, so call {@link #close()} when finished with the
+     *  extractor to release the file handle.
+     *
+     * @param f the old Excel file to read
+     * @throws IOException if the file cannot be opened for reading
+     */
+    public OldExcelExtractor(File f) throws IOException {
+        this.input = new FileInputStream(f);
+    }
+
+    /**
+     * Command line entry point: prints the text of the file named by
+     *  the first argument to standard output. Prints a usage message
+     *  and exits with status 1 when no argument is given.
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length < 1) {
+            System.err.println("Use:");
+            System.err.println("   OldExcelExtractor <filename>");
+            System.exit(1);
+        }
+        OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0]));
+        try {
+            System.out.println(extractor.getText());
+        } finally {
+            // The File constructor opened the stream, so release it here
+            extractor.close();
+        }
+    }
+
+    /**
+     * Should sheet names be included? Default is true
+     */
+    public void setIncludeSheetNames(boolean includeSheetNames) {
+        _includeSheetNames = includeSheetNames;
+    }
+
+    /**
+     * Retrieves the text contents of the file, as best we can
+     *  for these old file formats.
+     * <p>
+     * Only label records contribute text, one line per record; every
+     *  other record is read and discarded. Both the Biff 2 label
+     *  record id and the Biff 3+ one are recognised, matching what
+     *  {@link OldLabelRecord} is able to parse.
+     * </p>
+     *
+     * @return the text of all label records, each followed by a newline
+     */
+    public String getText() {
+        StringBuilder text = new StringBuilder();
+
+        RecordInputStream ris = new RecordInputStream(input);
+        while (ris.hasNextRecord()) {
+            int sid = ris.getNextSid();
+            ris.nextRecord();
+
+            switch (sid) {
+                case OldLabelRecord.biff2_sid:
+                case LabelRecord.sid:
+                    OldLabelRecord lr = new OldLabelRecord(ris);
+                    text.append(lr.getValue());
+                    text.append('\n');
+                    break;
+                default:
+                    // Not a record we extract text from - consume its
+                    //  contents so the stream can move to the next one
+                    ris.readFully(new byte[ris.remaining()]);
+                    break;
+            }
+
+            // label - 5.63 - TODO Needs codepages
+            // number - 5.71
+            // rk - 5.87
+            // string - 5.102
+
+        }
+
+        return text.toString();
+    }
+
+    /**
+     * Closes the stream this extractor reads from. Needed to release
+     *  the file opened by {@link #OldExcelExtractor(File)}; for a
+     *  caller supplied stream it simply closes that stream.
+     *
+     * @throws IOException if closing the underlying stream fails
+     */
+    public void close() throws IOException {
+        if (input != null) {
+            input.close();
+        }
+    }
+}
index 4d2570272b9d486d3bcb3ff0c2a8103e258b9305..c7a585672d5e6e947bc4d9a8a871a2b0f07f5ee0 100644 (file)
@@ -25,9 +25,7 @@ import org.apache.poi.util.POILogger;
  * Label Record (0x0204) - read only support for strings stored directly in the cell..  Don't
  * use this (except to read), use LabelSST instead <P>
  * REFERENCE:  PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
- * @author Andrew C. Oliver (acoliver at apache dot org)
- * @author Jason Height (jheight at chariot dot net dot au)
- * @version 2.0-pre
+ * 
  * @see org.apache.poi.hssf.record.LabelSSTRecord
  */
 public final class LabelRecord extends Record implements CellValueRecordInterface {
diff --git a/src/java/org/apache/poi/hssf/record/OldLabelRecord.java b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
new file mode 100644 (file)
index 0000000..0fcd1cb
--- /dev/null
@@ -0,0 +1,168 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.record;
+
+import org.apache.poi.util.HexDump;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+/**
+ * Biff2 - Biff 4 Label Record (0x0004 / 0x0204) - read only support for 
+ *  strings stored directly in the cell, from the older file formats that
+ *  didn't use {@link LabelSSTRecord}
+ */
+public final class OldLabelRecord extends Record implements CellValueRecordInterface {
+    private final static POILogger logger = POILogFactory.getLogger(OldLabelRecord.class);
+
+    /** Record id of the label record in Biff 2 files */
+    public final static short biff2_sid = 0x0004;
+    /** Record id of the label record in Biff 3, 4 and 5 files */
+    public final static short biff345_sid = 0x0204;
+
+    /**
+     * Charset used to decode the string bytes until codepage support
+     *  is added. ISO-8859-1 is available on every JVM and maps each
+     *  byte to exactly one character, so the result no longer depends
+     *  on the platform default charset and no byte is lost.
+     * TODO Replace with the codepage of the file once that is known
+     */
+    private final static java.nio.charset.Charset FALLBACK_CHARSET =
+            java.nio.charset.Charset.forName("ISO-8859-1");
+
+    private final short       sid;
+    private final int         field_1_row;
+    private final short       field_2_column;
+    private int               field_3_cell_attrs; // Biff 2
+    private short             field_3_xf_index;   // Biff 3+
+    private final short       field_4_string_len;
+    private final byte[]      field_5_bytes;
+    //private XXXXX           codepage; // TODO
+
+    /**
+     * Reads a label record in either the Biff 2 or the Biff 3+ layout,
+     *  chosen by the record id the stream reports.
+     *
+     * @param in the RecordInputstream to read the record from
+     * @throws RecordFormatException if the record declares a negative
+     *  string length
+     */
+    public OldLabelRecord(RecordInputStream in)
+    {
+        sid = in.getSid();
+
+        field_1_row          = in.readUShort();
+        field_2_column       = in.readShort();
+
+        if (sid == biff2_sid) {
+            // Biff 2 has three bytes of cell attributes and a one byte length
+            field_3_cell_attrs = in.readUShort() << 8;
+            field_3_cell_attrs += in.readUByte();
+            field_4_string_len  = (short)in.readUByte();
+        } else {
+            // Biff 3+ has a two byte XF index and a two byte length
+            field_3_xf_index     = in.readShort();
+            field_4_string_len   = in.readShort();
+        }
+
+        // A negative length can only come from a corrupt record; report
+        //  it clearly rather than failing on the array allocation below
+        if (field_4_string_len < 0) {
+            throw new RecordFormatException(
+                    "Old Label Record has invalid string length " + field_4_string_len);
+        }
+
+        // Can only decode properly later when you know the codepage
+        field_5_bytes = new byte[field_4_string_len];
+        in.read(field_5_bytes, 0, field_4_string_len);
+
+        if (in.remaining() > 0) {
+            logger.log(POILogger.INFO,
+                    "LabelRecord data remains: " + in.remaining() +
+                    " : " + HexDump.toHex(in.readRemainder())
+                    );
+        }
+    }
+
+    /**
+     * @return true if this record was read with the Biff 2 record id
+     */
+    public boolean isBiff2() {
+        return sid == biff2_sid;
+    }
+
+    /**
+     * @return the row value read from the record
+     */
+    public int getRow()
+    {
+        return field_1_row;
+    }
+
+    /**
+     * @return the column value read from the record
+     */
+    public short getColumn()
+    {
+        return field_2_column;
+    }
+
+    /**
+     * @return the XF index, which is only read for Biff 3+ records
+     *  and is left at zero for Biff 2 ones
+     */
+    public short getXFIndex()
+    {
+        return field_3_xf_index;
+    }
+
+    /**
+     * @return the three cell attribute bytes packed into an int, which
+     *  are only read for Biff 2 records and left at zero otherwise
+     */
+    public int getCellAttrs()
+    {
+        return field_3_cell_attrs;
+    }
+
+    /**
+     * get the number of characters this string contains
+     * @return number of characters
+     */
+    public short getStringLength()
+    {
+        return field_4_string_len;
+    }
+
+    /**
+     * Get the String of the cell. Until the codepage of the file is
+     *  available the bytes are decoded as ISO-8859-1, so the result
+     *  is the same on every platform.
+     */
+    public String getValue()
+    {
+        // We really need the codepage here to do this right...
+        return new String(field_5_bytes, FALLBACK_CHARSET);
+    }
+
+    /**
+     * Not supported
+     */
+    public int serialize(int offset, byte [] data) {
+        throw new RecordFormatException("Old Label Records are supported READ ONLY");
+    }
+
+    /**
+     * Not supported
+     */
+    public int getRecordSize() {
+        throw new RecordFormatException("Old Label Records are supported READ ONLY");
+    }
+
+    /**
+     * @return the record id this record was read with
+     */
+    public short getSid()
+    {
+        return sid;
+    }
+
+    /**
+     * Builds a multi-line dump of the record, showing the cell
+     *  attributes for Biff 2 records and the XF index otherwise.
+     */
+    @Override
+    public String toString()
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append("[OLD LABEL]\n");
+        sb.append("    .row       = ").append(HexDump.shortToHex(getRow())).append("\n");
+        sb.append("    .column    = ").append(HexDump.shortToHex(getColumn())).append("\n");
+        if (isBiff2()) {
+            sb.append("    .cellattrs = ").append(HexDump.shortToHex(getCellAttrs())).append("\n");
+        } else {
+            sb.append("    .xfindex   = ").append(HexDump.shortToHex(getXFIndex())).append("\n");
+        }
+        sb.append("    .string_len= ").append(HexDump.shortToHex(field_4_string_len)).append("\n");
+        sb.append("    .value       = ").append(getValue()).append("\n");
+        sb.append("[/OLD LABEL]\n");
+        return sb.toString();
+    }
+
+    /**
+     * NO-OP! The record is read only, so the column is left unchanged
+     */
+    public void setColumn(short col)
+    {
+    }
+
+    /**
+     * NO-OP! The record is read only, so the row is left unchanged
+     */
+    public void setRow(int row)
+    {
+    }
+
+    /**
+     * NO-OP! The record is read only, so the XF index is left unchanged
+     */
+    public void setXFIndex(short xf)
+    {
+    }
+}
index ff8e18937e3c6c02960784b2219414b2958aa948..b7013c1503e9e4302f146698e32100478b370c25 100644 (file)
@@ -38,6 +38,7 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
                SILENT_EXCLUDED.add("46904.xls");
         SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header 
                SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
+        SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
        }
 
        @Override
diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
new file mode 100644 (file)
index 0000000..a5c7dbe
--- /dev/null
@@ -0,0 +1,52 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.extractor;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.hssf.HSSFTestDataSamples;
+
+/**
+ * Unit tests for the Excel 4 (and older) text extractor
+ */
+public final class TestOldExcelExtractor extends TestCase {
+    /**
+     * Opens the named sample spreadsheet and wraps its stream in an
+     *  extractor. Any exception raised while building the extractor
+     *  is rethrown wrapped in a RuntimeException.
+     */
+    private static OldExcelExtractor createExtractor(String sampleFileName) {
+        InputStream sampleStream = HSSFTestDataSamples.openSampleFileStream(sampleFileName);
+
+        OldExcelExtractor extractor;
+        try {
+            extractor = new OldExcelExtractor(sampleStream);
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+        return extractor;
+    }
+
+    /**
+     * Extracts the text of the Excel 4 sample file and checks that
+     *  a couple of expected words show up in it.
+     */
+    public void testSimple() {
+        // Check we can call getText without error
+        String text = createExtractor("testEXCEL_4.xls").getText();
+
+        // Check we find a few words we expect in there
+        String[] expectedWords = { "Size", "Returns" };
+        for (String word : expectedWords) {
+            assertTrue(text, text.contains(word));
+        }
+    }
+
+    // TODO Rest of the tests
+}
diff --git a/test-data/spreadsheet/testEXCEL_4.xls b/test-data/spreadsheet/testEXCEL_4.xls
new file mode 100644 (file)
index 0000000..02f58f7
Binary files /dev/null and b/test-data/spreadsheet/testEXCEL_4.xls differ