git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642490 13f79535-47bb-0310-9956-ffa450edef68 tags/REL_3_11_FINAL
@@ -39,7 +39,7 @@ | |||
# 2. cd build/dist | |||
# 3. ./mvn-deploy.sh | |||
M2_REPOSITORY=M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2 | |||
M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2 | |||
VERSION=@VERSION@ | |||
DSTAMP=@DSTAMP@ |
@@ -0,0 +1,97 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hssf.extractor; | |||
import java.io.File; | |||
import java.io.FileInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
import org.apache.poi.hssf.record.LabelRecord; | |||
import org.apache.poi.hssf.record.OldLabelRecord; | |||
import org.apache.poi.hssf.record.RecordInputStream; | |||
/** | |||
* A text extractor for very old (pre-OLE2) Excel files, | |||
* such as Excel 4 files. | |||
* <p> | |||
* Returns much (but not all) of the textual content of the file, | |||
* suitable for indexing by something like Apache Lucene, or used | |||
* by Apache Tika, but not really intended for display to the user. | |||
* </p> | |||
*/ | |||
public class OldExcelExtractor { | |||
private InputStream input; | |||
private boolean _includeSheetNames = true; | |||
public OldExcelExtractor(InputStream input) { | |||
this.input = input; | |||
} | |||
public OldExcelExtractor(File f) throws IOException { | |||
this.input = new FileInputStream(f); | |||
} | |||
public static void main(String[] args) throws Exception { | |||
if (args.length < 1) { | |||
System.err.println("Use:"); | |||
System.err.println(" OldExcelExtractor <filename>"); | |||
System.exit(1); | |||
} | |||
OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0])); | |||
System.out.println(extractor.getText()); | |||
} | |||
/** | |||
* Should sheet names be included? Default is true | |||
*/ | |||
public void setIncludeSheetNames(boolean includeSheetNames) { | |||
_includeSheetNames = includeSheetNames; | |||
} | |||
/** | |||
* Retrieves the text contents of the file, as best we can | |||
* for these old file formats | |||
*/ | |||
public String getText() { | |||
StringBuffer text = new StringBuffer(); | |||
RecordInputStream ris = new RecordInputStream(input); | |||
while (ris.hasNextRecord()) { | |||
int sid = ris.getNextSid(); | |||
ris.nextRecord(); | |||
switch (sid) { | |||
case LabelRecord.sid: | |||
OldLabelRecord lr = new OldLabelRecord(ris); | |||
text.append(lr.getValue()); | |||
text.append('\n'); | |||
break; | |||
default: | |||
ris.readFully(new byte[ris.remaining()]); | |||
} | |||
// label - 5.63 - TODO Needs codepages | |||
// number - 5.71 | |||
// rk - 5.87 | |||
// string - 5.102 | |||
} | |||
return text.toString(); | |||
} | |||
} |
@@ -25,9 +25,7 @@ import org.apache.poi.util.POILogger; | |||
* Label Record (0x0204) - read only support for strings stored directly in the cell. Don't | |||
* use this (except to read), use LabelSST instead <P> | |||
* REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P> | |||
* @author Andrew C. Oliver (acoliver at apache dot org) | |||
* @author Jason Height (jheight at chariot dot net dot au) | |||
* @version 2.0-pre | |||
* | |||
* @see org.apache.poi.hssf.record.LabelSSTRecord | |||
*/ | |||
public final class LabelRecord extends Record implements CellValueRecordInterface { |
@@ -0,0 +1,168 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hssf.record; | |||
import org.apache.poi.util.HexDump; | |||
import org.apache.poi.util.POILogFactory; | |||
import org.apache.poi.util.POILogger; | |||
/** | |||
* Biff2 - Biff 4 Label Record (0x0004 / 0x0204) - read only support for | |||
* strings stored directly in the cell, from the older file formats that | |||
* didn't use {@link LabelSSTRecord} | |||
*/ | |||
public final class OldLabelRecord extends Record implements CellValueRecordInterface { | |||
private final static POILogger logger = POILogFactory.getLogger(OldLabelRecord.class); | |||
public final static short biff2_sid = 0x0004; | |||
public final static short biff345_sid = 0x0204; | |||
private short sid; | |||
private int field_1_row; | |||
private short field_2_column; | |||
private int field_3_cell_attrs; // Biff 2 | |||
private short field_3_xf_index; // Biff 3+ | |||
private short field_4_string_len; | |||
private byte[] field_5_bytes; | |||
//private XXXXX codepage; // TODO | |||
/** | |||
* @param in the RecordInputstream to read the record from | |||
*/ | |||
public OldLabelRecord(RecordInputStream in) | |||
{ | |||
sid = in.getSid(); | |||
field_1_row = in.readUShort(); | |||
field_2_column = in.readShort(); | |||
if (in.getSid() == biff2_sid) { | |||
field_3_cell_attrs = in.readUShort() << 8; | |||
field_3_cell_attrs += in.readUByte(); | |||
field_4_string_len = (short)in.readUByte(); | |||
} else { | |||
field_3_xf_index = in.readShort(); | |||
field_4_string_len = in.readShort(); | |||
} | |||
// Can only decode properly later when you know the codepage | |||
field_5_bytes = new byte[field_4_string_len]; | |||
in.read(field_5_bytes, 0, field_4_string_len); | |||
if (in.remaining() > 0) { | |||
logger.log(POILogger.INFO, | |||
"LabelRecord data remains: " + in.remaining() + | |||
" : " + HexDump.toHex(in.readRemainder()) | |||
); | |||
} | |||
} | |||
public boolean isBiff2() { | |||
return sid == biff2_sid; | |||
} | |||
public int getRow() | |||
{ | |||
return field_1_row; | |||
} | |||
public short getColumn() | |||
{ | |||
return field_2_column; | |||
} | |||
public short getXFIndex() | |||
{ | |||
return field_3_xf_index; | |||
} | |||
public int getCellAttrs() | |||
{ | |||
return field_3_cell_attrs; | |||
} | |||
/** | |||
* get the number of characters this string contains | |||
* @return number of characters | |||
*/ | |||
public short getStringLength() | |||
{ | |||
return field_4_string_len; | |||
} | |||
/** | |||
* Get the String of the cell | |||
*/ | |||
public String getValue() | |||
{ | |||
// We really need the codepage here to do this right... | |||
return new String(field_5_bytes); | |||
} | |||
/** | |||
* Not supported | |||
*/ | |||
public int serialize(int offset, byte [] data) { | |||
throw new RecordFormatException("Old Label Records are supported READ ONLY"); | |||
} | |||
public int getRecordSize() { | |||
throw new RecordFormatException("Old Label Records are supported READ ONLY"); | |||
} | |||
public short getSid() | |||
{ | |||
return sid; | |||
} | |||
public String toString() | |||
{ | |||
StringBuffer sb = new StringBuffer(); | |||
sb.append("[OLD LABEL]\n"); | |||
sb.append(" .row = ").append(HexDump.shortToHex(getRow())).append("\n"); | |||
sb.append(" .column = ").append(HexDump.shortToHex(getColumn())).append("\n"); | |||
if (isBiff2()) { | |||
sb.append(" .cellattrs = ").append(HexDump.shortToHex(getCellAttrs())).append("\n"); | |||
} else { | |||
sb.append(" .xfindex = ").append(HexDump.shortToHex(getXFIndex())).append("\n"); | |||
} | |||
sb.append(" .string_len= ").append(HexDump.shortToHex(field_4_string_len)).append("\n"); | |||
sb.append(" .value = ").append(getValue()).append("\n"); | |||
sb.append("[/OLD LABEL]\n"); | |||
return sb.toString(); | |||
} | |||
/** | |||
* NO-OP! | |||
*/ | |||
public void setColumn(short col) | |||
{ | |||
} | |||
/** | |||
* NO-OP! | |||
*/ | |||
public void setRow(int row) | |||
{ | |||
} | |||
/** | |||
* no op! | |||
*/ | |||
public void setXFIndex(short xf) | |||
{ | |||
} | |||
} |
@@ -38,6 +38,7 @@ public class TestBiffViewer extends BaseXLSIteratingTest { | |||
SILENT_EXCLUDED.add("46904.xls"); | |||
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header | |||
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption | |||
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2 | |||
} | |||
@Override |
@@ -0,0 +1,52 @@ | |||
/* ==================================================================== | |||
Licensed to the Apache Software Foundation (ASF) under one or more | |||
contributor license agreements. See the NOTICE file distributed with | |||
this work for additional information regarding copyright ownership. | |||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||
(the "License"); you may not use this file except in compliance with | |||
the License. You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. | |||
==================================================================== */ | |||
package org.apache.poi.hssf.extractor; | |||
import java.io.InputStream; | |||
import junit.framework.TestCase; | |||
import org.apache.poi.hssf.HSSFTestDataSamples; | |||
/** | |||
* Unit tests for the Excel 4 (and older) text extractor | |||
*/ | |||
public final class TestOldExcelExtractor extends TestCase { | |||
private static OldExcelExtractor createExtractor(String sampleFileName) { | |||
InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleFileName); | |||
try { | |||
return new OldExcelExtractor(is); | |||
} catch (Exception e) { | |||
throw new RuntimeException(e); | |||
} | |||
} | |||
public void testSimple() { | |||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); | |||
// Check we can call getText without error | |||
String text = extractor.getText(); | |||
// Check we find a few words we expect in there | |||
assertTrue(text, text.contains("Size")); | |||
assertTrue(text, text.contains("Returns")); | |||
} | |||
// TODO Rest of the tests | |||
} |