From ff4b0376c8e08bfbd2fe73d240069e65fbce8a5f Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sun, 30 Nov 2014 00:48:17 +0000 Subject: [PATCH] Further Excel 4 text extractor support, for TIKA-1490 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642491 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/hssf/extractor/OldExcelExtractor.java | 44 +++++++++-- .../apache/poi/hssf/record/FormulaRecord.java | 1 + .../poi/hssf/record/OldLabelRecord.java | 2 +- .../poi/hssf/record/OldStringRecord.java | 78 +++++++++++++++++++ .../hssf/extractor/TestOldExcelExtractor.java | 40 +++++++++- 5 files changed, 155 insertions(+), 10 deletions(-) create mode 100644 src/java/org/apache/poi/hssf/record/OldStringRecord.java diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java index 3c2d067282..e3705f159d 100644 --- a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java @@ -22,9 +22,13 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import org.apache.poi.hssf.record.LabelRecord; +import org.apache.poi.hssf.record.FormulaRecord; +import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.OldLabelRecord; +import org.apache.poi.hssf.record.OldStringRecord; +import org.apache.poi.hssf.record.RKRecord; import org.apache.poi.hssf.record.RecordInputStream; +import org.apache.poi.ss.usermodel.Cell; /** * A text extractor for very old (pre-OLE2) Excel files, @@ -76,20 +80,44 @@ public class OldExcelExtractor { ris.nextRecord(); switch (sid) { - case LabelRecord.sid: + // label - 5.63 - TODO Needs codepages + case OldLabelRecord.biff2_sid: + case OldLabelRecord.biff345_sid: OldLabelRecord lr = new OldLabelRecord(ris); text.append(lr.getValue()); text.append('\n'); break; + // string - 5.102 - TODO Needs codepages + case OldStringRecord.biff2_sid: + case 
OldStringRecord.biff345_sid: + OldStringRecord sr = new OldStringRecord(ris); + text.append(sr.getString()); + text.append('\n'); + break; + // number - 5.71 - TODO Needs format strings + case NumberRecord.sid: + NumberRecord nr = new NumberRecord(ris); + text.append(nr.getValue()); + text.append('\n'); + break; +/* + case OldFormulaRecord.sid: + FormulaRecord fr = new FormulaRecord(ris); +System.out.println(fr.getCachedResultType()); + if (fr.getCachedResultType() == Cell.CELL_TYPE_NUMERIC) { + text.append(fr.getValue()); + text.append('\n'); + } +*/ + case RKRecord.sid: + RKRecord rr = new RKRecord(ris); + text.append(rr.getRKNumber()); + text.append('\n'); + break; default: ris.readFully(new byte[ris.remaining()]); + // text.append(" = " + ris.getSid() + " = \n"); } - - // label - 5.63 - TODO Needs codepages - // number - 5.71 - // rk - 5.87 - // string - 5.102 - } return text.toString(); diff --git a/src/java/org/apache/poi/hssf/record/FormulaRecord.java b/src/java/org/apache/poi/hssf/record/FormulaRecord.java index c0a63d3092..ccd3cb0412 100644 --- a/src/java/org/apache/poi/hssf/record/FormulaRecord.java +++ b/src/java/org/apache/poi/hssf/record/FormulaRecord.java @@ -36,6 +36,7 @@ import org.apache.poi.util.LittleEndianOutput; public final class FormulaRecord extends CellRecord { public static final short sid = 0x0006; // docs say 406...because of a bug Microsoft support site article #Q184647) + public static final short olderSid = 0x0406; // older biff versions do manage 406! 
private static int FIXED_SIZE = 14; // double + short + int private static final BitField alwaysCalc = BitFieldFactory.getInstance(0x0001); diff --git a/src/java/org/apache/poi/hssf/record/OldLabelRecord.java b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java index 0fcd1cb4c6..97d6bef962 100644 --- a/src/java/org/apache/poi/hssf/record/OldLabelRecord.java +++ b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java @@ -39,7 +39,7 @@ public final class OldLabelRecord extends Record implements CellValueRecordInter private short field_3_xf_index; // Biff 3+ private short field_4_string_len; private byte[] field_5_bytes; - //private XXXXX codepage; // TODO + //private XXXXX codepage; // TODO Implement for this and OldStringRecord /** * @param in the RecordInputstream to read the record from diff --git a/src/java/org/apache/poi/hssf/record/OldStringRecord.java b/src/java/org/apache/poi/hssf/record/OldStringRecord.java new file mode 100644 index 0000000000..42549e2c2e --- /dev/null +++ b/src/java/org/apache/poi/hssf/record/OldStringRecord.java @@ -0,0 +1,78 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ + +package org.apache.poi.hssf.record; + + +/** + * Biff2 - Biff 4 String Record (0x0007 / 0x0207) - read only support for + * formula string results. + */ +public final class OldStringRecord { + public final static short biff2_sid = 0x0007; + public final static short biff345_sid = 0x0207; + + private short sid; + private short field_1_string_len; + private byte[] field_2_bytes; + //private XXXXX codepage; // TODO Implement for this and OldLabelRecord + + /** + * @param in the RecordInputstream to read the record from; the length is read as one byte for Biff2 and as a short otherwise, followed by the raw string bytes + */ + public OldStringRecord(RecordInputStream in) { + sid = in.getSid(); + + if (in.getSid() == biff2_sid) { + field_1_string_len = (short)in.readUByte(); + } else { + field_1_string_len = in.readShort(); + } + + // Can only decode properly later when you know the codepage + field_2_bytes = new byte[field_1_string_len]; + in.read(field_2_bytes, 0, field_1_string_len); + } + + public boolean isBiff2() { + return sid == biff2_sid; + } + + public short getSid() { + return sid; + } + + /** + * @return The string represented by this record, decoded as ISO-8859-1 until codepage support is implemented. + */ + public String getString() + { + // We really need the codepage here to do this right; until then use a fixed charset so the result does not depend on the platform default 
+ return new String(field_2_bytes, java.nio.charset.Charset.forName("ISO-8859-1")); + } + + public String toString() + { + StringBuilder buffer = new StringBuilder(); + + buffer.append("[OLD STRING]\n"); + buffer.append(" .string = ") + .append(getString()).append("\n"); + buffer.append("[/OLD STRING]\n"); + return buffer.toString(); + } +} diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java index a5c7dbedc2..fd057cd63c 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java @@ -46,7 +46,45 @@ public final class TestOldExcelExtractor extends TestCase { // Check we find a few words we expect in there assertTrue(text, text.contains("Size")); assertTrue(text, text.contains("Returns")); + + // Check we find a few numbers we expect in there + assertTrue(text, text.contains("11")); + assertTrue(text, text.contains("784")); } - // TODO Rest of the tests + public void testStrings() { + OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); + String text = extractor.getText(); + + // Simple strings + assertTrue(text, text.contains("Table 10 -- Examination Coverage:")); + assertTrue(text, text.contains("Recommended and Average Recommended Additional Tax After")); + assertTrue(text, text.contains("Individual income tax returns, total")); + + // More complicated strings + assertTrue(text, text.contains("$100,000 or more")); + assertTrue(text, text.contains("S corporation returns, Form 1120S [10,15]")); + // TODO Get these quotes working correctly +// assertTrue(text, text.contains("individual income tax return “short forms.”")); + + // Formula based strings + // TODO Find some then test + } + + public void testFormattedNumbers() { + OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); + String text = extractor.getText(); + + // Simple numbers + assertTrue(text, text.contains("151")); 
+ assertTrue(text, text.contains("784")); + + // Numbers which come from formulas + // TODO +// assertTrue(text, text.contains("0.40")); +// assertTrue(text, text.contains("624")); + + // Formatted numbers + // TODO + } } -- 2.39.5