about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x maven/mvn-deploy.sh 2
-rw-r--r-- src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java 97
-rw-r--r-- src/java/org/apache/poi/hssf/record/LabelRecord.java 4
-rw-r--r-- src/java/org/apache/poi/hssf/record/OldLabelRecord.java 168
-rw-r--r-- src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java 1
-rw-r--r-- src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java 52
-rw-r--r-- test-data/spreadsheet/testEXCEL_4.xls bin 0 -> 39942 bytes
7 files changed, 320 insertions, 4 deletions
diff --git a/maven/mvn-deploy.sh b/maven/mvn-deploy.sh
index 2d609e491f..9d1c18a37d 100755
--- a/maven/mvn-deploy.sh
+++ b/maven/mvn-deploy.sh
@@ -39,7 +39,7 @@
# 2. cd build/dist
# 3. ./mvn-deploy.sh
-M2_REPOSITORY=M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
+M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
VERSION=@VERSION@
DSTAMP=@DSTAMP@
diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
new file mode 100644
index 0000000000..3c2d067282
--- /dev/null
+++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
@@ -0,0 +1,97 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.OldLabelRecord;
+import org.apache.poi.hssf.record.RecordInputStream;
+
+/**
+ * A text extractor for very old (pre-OLE2) Excel files,
+ * such as Excel 4 files.
+ * <p>
+ * Returns much (but not all) of the textual content of the file,
+ * suitable for indexing by something like Apache Lucene, or used
+ * by Apache Tika, but not really intended for display to the user.
+ * </p>
+ */
+public class OldExcelExtractor {
+ private InputStream input;
+ private boolean _includeSheetNames = true;
+
+ public OldExcelExtractor(InputStream input) {
+ this.input = input;
+ }
+ public OldExcelExtractor(File f) throws IOException {
+ this.input = new FileInputStream(f);
+ }
+
+ public static void main(String[] args) throws Exception {
+ if (args.length < 1) {
+ System.err.println("Use:");
+ System.err.println(" OldExcelExtractor <filename>");
+ System.exit(1);
+ }
+ OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0]));
+ System.out.println(extractor.getText());
+ }
+
+ /**
+ * Should sheet names be included? Default is true
+ */
+ public void setIncludeSheetNames(boolean includeSheetNames) {
+ _includeSheetNames = includeSheetNames;
+ }
+
+ /**
+ * Retrieves the text contents of the file, as best we can
+ * for these old file formats
+ */
+ public String getText() {
+ StringBuffer text = new StringBuffer();
+
+ RecordInputStream ris = new RecordInputStream(input);
+ while (ris.hasNextRecord()) {
+ int sid = ris.getNextSid();
+ ris.nextRecord();
+
+ switch (sid) {
+ case LabelRecord.sid:
+ OldLabelRecord lr = new OldLabelRecord(ris);
+ text.append(lr.getValue());
+ text.append('\n');
+ break;
+ default:
+ ris.readFully(new byte[ris.remaining()]);
+ }
+
+ // label - 5.63 - TODO Needs codepages
+ // number - 5.71
+ // rk - 5.87
+ // string - 5.102
+
+ }
+
+ return text.toString();
+ }
+}
diff --git a/src/java/org/apache/poi/hssf/record/LabelRecord.java b/src/java/org/apache/poi/hssf/record/LabelRecord.java
index 4d2570272b..c7a585672d 100644
--- a/src/java/org/apache/poi/hssf/record/LabelRecord.java
+++ b/src/java/org/apache/poi/hssf/record/LabelRecord.java
@@ -25,9 +25,7 @@ import org.apache.poi.util.POILogger;
* Label Record (0x0204) - read only support for strings stored directly in the cell.. Don't
* use this (except to read), use LabelSST instead <P>
* REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
- * @author Andrew C. Oliver (acoliver at apache dot org)
- * @author Jason Height (jheight at chariot dot net dot au)
- * @version 2.0-pre
+ *
* @see org.apache.poi.hssf.record.LabelSSTRecord
*/
public final class LabelRecord extends Record implements CellValueRecordInterface {
diff --git a/src/java/org/apache/poi/hssf/record/OldLabelRecord.java b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
new file mode 100644
index 0000000000..0fcd1cb4c6
--- /dev/null
+++ b/src/java/org/apache/poi/hssf/record/OldLabelRecord.java
@@ -0,0 +1,168 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.record;
+
+import org.apache.poi.util.HexDump;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+/**
+ * Biff 2 - Biff 4 Label Record (0x0004 / 0x0204) - read only support for
+ * strings stored directly in the cell, from the older file formats that
+ * didn't use {@link LabelSSTRecord}
+ */
+public final class OldLabelRecord extends Record implements CellValueRecordInterface {
+ private final static POILogger logger = POILogFactory.getLogger(OldLabelRecord.class);
+
+ public final static short biff2_sid = 0x0004;
+ public final static short biff345_sid = 0x0204;
+
+ private short sid;
+ private int field_1_row;
+ private short field_2_column;
+ private int field_3_cell_attrs; // Biff 2
+ private short field_3_xf_index; // Biff 3+
+ private short field_4_string_len;
+ private byte[] field_5_bytes;
+ //private XXXXX codepage; // TODO
+
+ /**
+ * @param in the RecordInputStream to read the record from
+ */
+ public OldLabelRecord(RecordInputStream in)
+ {
+ sid = in.getSid();
+
+ field_1_row = in.readUShort();
+ field_2_column = in.readShort();
+
+ if (in.getSid() == biff2_sid) {
+ field_3_cell_attrs = in.readUShort() << 8;
+ field_3_cell_attrs += in.readUByte();
+ field_4_string_len = (short)in.readUByte();
+ } else {
+ field_3_xf_index = in.readShort();
+ field_4_string_len = in.readShort();
+ }
+
+ // Can only decode properly later when you know the codepage
+ field_5_bytes = new byte[field_4_string_len];
+ in.read(field_5_bytes, 0, field_4_string_len);
+
+ if (in.remaining() > 0) {
+ logger.log(POILogger.INFO,
+ "LabelRecord data remains: " + in.remaining() +
+ " : " + HexDump.toHex(in.readRemainder())
+ );
+ }
+ }
+
+ public boolean isBiff2() {
+ return sid == biff2_sid;
+ }
+
+ public int getRow()
+ {
+ return field_1_row;
+ }
+
+ public short getColumn()
+ {
+ return field_2_column;
+ }
+
+ public short getXFIndex()
+ {
+ return field_3_xf_index;
+ }
+ public int getCellAttrs()
+ {
+ return field_3_cell_attrs;
+ }
+
+ /**
+ * get the length of this string, as stored in the record (a byte count)
+ * @return the stored string length
+ */
+ public short getStringLength()
+ {
+ return field_4_string_len;
+ }
+
+ /**
+ * Get the String of the cell, decoded using the platform default charset
+ */
+ public String getValue()
+ {
+ // We really need the codepage here to do this right...
+ return new String(field_5_bytes);
+ }
+
+ /**
+ * Not supported
+ */
+ public int serialize(int offset, byte [] data) {
+ throw new RecordFormatException("Old Label Records are supported READ ONLY");
+ }
+ public int getRecordSize() {
+ throw new RecordFormatException("Old Label Records are supported READ ONLY");
+ }
+
+ public short getSid()
+ {
+ return sid;
+ }
+
+ public String toString()
+ {
+ StringBuffer sb = new StringBuffer();
+ sb.append("[OLD LABEL]\n");
+ sb.append(" .row = ").append(HexDump.shortToHex(getRow())).append("\n");
+ sb.append(" .column = ").append(HexDump.shortToHex(getColumn())).append("\n");
+ if (isBiff2()) {
+ sb.append(" .cellattrs = ").append(HexDump.shortToHex(getCellAttrs())).append("\n");
+ } else {
+ sb.append(" .xfindex = ").append(HexDump.shortToHex(getXFIndex())).append("\n");
+ }
+ sb.append(" .string_len= ").append(HexDump.shortToHex(field_4_string_len)).append("\n");
+ sb.append(" .value = ").append(getValue()).append("\n");
+ sb.append("[/OLD LABEL]\n");
+ return sb.toString();
+ }
+
+ /**
+ * NO-OP!
+ */
+ public void setColumn(short col)
+ {
+ }
+
+ /**
+ * NO-OP!
+ */
+ public void setRow(int row)
+ {
+ }
+
+ /**
+ * NO-OP!
+ */
+ public void setXFIndex(short xf)
+ {
+ }
+}
diff --git a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
index ff8e18937e..b7013c1503 100644
--- a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
+++ b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
@@ -38,6 +38,7 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
SILENT_EXCLUDED.add("46904.xls");
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
+ SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
}
@Override
diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
new file mode 100644
index 0000000000..a5c7dbedc2
--- /dev/null
+++ b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
@@ -0,0 +1,52 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hssf.extractor;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.hssf.HSSFTestDataSamples;
+
+/**
+ * Unit tests for the Excel 4 (and older) text extractor
+ */
+public final class TestOldExcelExtractor extends TestCase {
+ private static OldExcelExtractor createExtractor(String sampleFileName) {
+ InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleFileName);
+
+ try {
+ return new OldExcelExtractor(is);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public void testSimple() {
+ OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
+
+ // Check we can call getText without error
+ String text = extractor.getText();
+
+ // Check we find a few words we expect in there
+ assertTrue(text, text.contains("Size"));
+ assertTrue(text, text.contains("Returns"));
+ }
+
+ // TODO Rest of the tests
+}
diff --git a/test-data/spreadsheet/testEXCEL_4.xls b/test-data/spreadsheet/testEXCEL_4.xls
new file mode 100644
index 0000000000..02f58f73bc
--- /dev/null
+++ b/test-data/spreadsheet/testEXCEL_4.xls
Binary files differ