From: Nick Burch Date: Sun, 30 Nov 2014 00:16:23 +0000 (+0000) Subject: Start on a Text Extractor for the pre-OLE2 Excel formats like Excel 4, for TIKA-1490 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=48ab7bf987133915ec3b70dff5987f9f0ac613fa;p=poi.git Start on a Text Extractor for the pre-OLE2 Excel formats like Excel 4, for TIKA-1490 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642490 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/maven/mvn-deploy.sh b/maven/mvn-deploy.sh index 2d609e491f..9d1c18a37d 100755 --- a/maven/mvn-deploy.sh +++ b/maven/mvn-deploy.sh @@ -39,7 +39,7 @@ # 2. cd build/dist # 3. ./mvn-deploy.sh -M2_REPOSITORY=M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2 +M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2 VERSION=@VERSION@ DSTAMP=@DSTAMP@ diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java new file mode 100644 index 0000000000..3c2d067282 --- /dev/null +++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java @@ -0,0 +1,97 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hssf.extractor; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.poi.hssf.record.LabelRecord; +import org.apache.poi.hssf.record.OldLabelRecord; +import org.apache.poi.hssf.record.RecordInputStream; + +/** + * A text extractor for very old (pre-OLE2) Excel files, + * such as Excel 4 files. + *

+ * Returns much (but not all) of the textual content of the file, + * suitable for indexing by something like Apache Lucene, or used + * by Apache Tika, but not really intended for display to the user. + *

+ */ +public class OldExcelExtractor { + private InputStream input; + private boolean _includeSheetNames = true; + + public OldExcelExtractor(InputStream input) { + this.input = input; + } + public OldExcelExtractor(File f) throws IOException { + this.input = new FileInputStream(f); + } + + public static void main(String[] args) throws Exception { + if (args.length < 1) { + System.err.println("Use:"); + System.err.println(" OldExcelExtractor "); + System.exit(1); + } + OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0])); + System.out.println(extractor.getText()); + } + + /** + * Should sheet names be included? Default is true + */ + public void setIncludeSheetNames(boolean includeSheetNames) { + _includeSheetNames = includeSheetNames; + } + + /** + * Retrieves the text contents of the file, as best we can + * for these old file formats + */ + public String getText() { + StringBuffer text = new StringBuffer(); + + RecordInputStream ris = new RecordInputStream(input); + while (ris.hasNextRecord()) { + int sid = ris.getNextSid(); + ris.nextRecord(); + + switch (sid) { + case LabelRecord.sid: + OldLabelRecord lr = new OldLabelRecord(ris); + text.append(lr.getValue()); + text.append('\n'); + break; + default: + ris.readFully(new byte[ris.remaining()]); + } + + // label - 5.63 - TODO Needs codepages + // number - 5.71 + // rk - 5.87 + // string - 5.102 + + } + + return text.toString(); + } +} diff --git a/src/java/org/apache/poi/hssf/record/LabelRecord.java b/src/java/org/apache/poi/hssf/record/LabelRecord.java index 4d2570272b..c7a585672d 100644 --- a/src/java/org/apache/poi/hssf/record/LabelRecord.java +++ b/src/java/org/apache/poi/hssf/record/LabelRecord.java @@ -25,9 +25,7 @@ import org.apache.poi.util.POILogger; * Label Record (0x0204) - read only support for strings stored directly in the cell.. Don't * use this (except to read), use LabelSST instead

* REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)

- * @author Andrew C. Oliver (acoliver at apache dot org) - * @author Jason Height (jheight at chariot dot net dot au) - * @version 2.0-pre + * * @see org.apache.poi.hssf.record.LabelSSTRecord */ public final class LabelRecord extends Record implements CellValueRecordInterface { diff --git a/src/java/org/apache/poi/hssf/record/OldLabelRecord.java b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java new file mode 100644 index 0000000000..0fcd1cb4c6 --- /dev/null +++ b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java @@ -0,0 +1,168 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hssf.record; + +import org.apache.poi.util.HexDump; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +/** + * Biff2 - Biff 4 Label Record (0x0004 / 0x0204) - read only support for + * strings stored directly in the cell, from the older file formats that + * didn't use {@link LabelSSTRecord} + */ +public final class OldLabelRecord extends Record implements CellValueRecordInterface { + private final static POILogger logger = POILogFactory.getLogger(OldLabelRecord.class); + + public final static short biff2_sid = 0x0004; + public final static short biff345_sid = 0x0204; + + private short sid; + private int field_1_row; + private short field_2_column; + private int field_3_cell_attrs; // Biff 2 + private short field_3_xf_index; // Biff 3+ + private short field_4_string_len; + private byte[] field_5_bytes; + //private XXXXX codepage; // TODO + + /** + * @param in the RecordInputstream to read the record from + */ + public OldLabelRecord(RecordInputStream in) + { + sid = in.getSid(); + + field_1_row = in.readUShort(); + field_2_column = in.readShort(); + + if (in.getSid() == biff2_sid) { + field_3_cell_attrs = in.readUShort() << 8; + field_3_cell_attrs += in.readUByte(); + field_4_string_len = (short)in.readUByte(); + } else { + field_3_xf_index = in.readShort(); + field_4_string_len = in.readShort(); + } + + // Can only decode properly later when you know the codepage + field_5_bytes = new byte[field_4_string_len]; + in.read(field_5_bytes, 0, field_4_string_len); + + if (in.remaining() > 0) { + logger.log(POILogger.INFO, + "LabelRecord data remains: " + in.remaining() + + " : " + HexDump.toHex(in.readRemainder()) + ); + } + } + + public boolean isBiff2() { + return sid == biff2_sid; + } + + public int getRow() + { + return field_1_row; + } + + public short getColumn() + { + return field_2_column; + } + + public short getXFIndex() + { + return field_3_xf_index; + } + public int getCellAttrs() + { + return field_3_cell_attrs; + } + + /** + * get the number of characters this string contains + * @return number of characters + */ + public short getStringLength() + { + return field_4_string_len; + } + + /** + * Get the String of the cell + */ + public String getValue() + { + // We really need the codepage here to do this right... + return new String(field_5_bytes); + } + + /** + * Not supported + */ + public int serialize(int offset, byte [] data) { + throw new RecordFormatException("Old Label Records are supported READ ONLY"); + } + public int getRecordSize() { + throw new RecordFormatException("Old Label Records are supported READ ONLY"); + } + + public short getSid() + { + return sid; + } + + public String toString() + { + StringBuffer sb = new StringBuffer(); + sb.append("[OLD LABEL]\n"); + sb.append(" .row = ").append(HexDump.shortToHex(getRow())).append("\n"); + sb.append(" .column = ").append(HexDump.shortToHex(getColumn())).append("\n"); + if (isBiff2()) { + sb.append(" .cellattrs = ").append(HexDump.shortToHex(getCellAttrs())).append("\n"); + } else { + sb.append(" .xfindex = ").append(HexDump.shortToHex(getXFIndex())).append("\n"); + } + sb.append(" .string_len= ").append(HexDump.shortToHex(field_4_string_len)).append("\n"); + sb.append(" .value = ").append(getValue()).append("\n"); + sb.append("[/OLD LABEL]\n"); + return sb.toString(); + } + + /** + * NO-OP! + */ + public void setColumn(short col) + { + } + + /** + * NO-OP! + */ + public void setRow(int row) + { + } + + /** + * no op! + */ + public void setXFIndex(short xf) + { + } +} diff --git a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java index ff8e18937e..b7013c1503 100644 --- a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java +++ b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java @@ -38,6 +38,7 @@ public class TestBiffViewer extends BaseXLSIteratingTest { SILENT_EXCLUDED.add("46904.xls"); SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption + SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2 } @Override diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java new file mode 100644 index 0000000000..a5c7dbedc2 --- /dev/null +++ b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java @@ -0,0 +1,52 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hssf.extractor; + +import java.io.InputStream; + +import junit.framework.TestCase; + +import org.apache.poi.hssf.HSSFTestDataSamples; + +/** + * Unit tests for the Excel 4 (and older) text extractor + */ +public final class TestOldExcelExtractor extends TestCase { + private static OldExcelExtractor createExtractor(String sampleFileName) { + InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleFileName); + + try { + return new OldExcelExtractor(is); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public void testSimple() { + OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); + + // Check we can call getText without error + String text = extractor.getText(); + + // Check we find a few words we expect in there + assertTrue(text, text.contains("Size")); + assertTrue(text, text.contains("Returns")); + } + + // TODO Rest of the tests +} diff --git a/test-data/spreadsheet/testEXCEL_4.xls b/test-data/spreadsheet/testEXCEL_4.xls new file mode 100644 index 0000000000..02f58f73bc Binary files /dev/null and b/test-data/spreadsheet/testEXCEL_4.xls differ