git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642490 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_11_FINAL
# 2. cd build/dist | # 2. cd build/dist | ||||
# 3. ./mvn-deploy.sh | # 3. ./mvn-deploy.sh | ||||
M2_REPOSITORY=M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2 | |||||
M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2 | |||||
VERSION=@VERSION@ | VERSION=@VERSION@ | ||||
DSTAMP=@DSTAMP@ | DSTAMP=@DSTAMP@ |
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.hssf.extractor; | |||||
import java.io.File; | |||||
import java.io.FileInputStream; | |||||
import java.io.IOException; | |||||
import java.io.InputStream; | |||||
import org.apache.poi.hssf.record.LabelRecord; | |||||
import org.apache.poi.hssf.record.OldLabelRecord; | |||||
import org.apache.poi.hssf.record.RecordInputStream; | |||||
/** | |||||
* A text extractor for very old (pre-OLE2) Excel files, | |||||
* such as Excel 4 files. | |||||
* <p> | |||||
* Returns much (but not all) of the textual content of the file, | |||||
* suitable for indexing by something like Apache Lucene, or used | |||||
* by Apache Tika, but not really intended for display to the user. | |||||
* </p> | |||||
*/ | |||||
public class OldExcelExtractor { | |||||
private InputStream input; | |||||
private boolean _includeSheetNames = true; | |||||
public OldExcelExtractor(InputStream input) { | |||||
this.input = input; | |||||
} | |||||
public OldExcelExtractor(File f) throws IOException { | |||||
this.input = new FileInputStream(f); | |||||
} | |||||
public static void main(String[] args) throws Exception { | |||||
if (args.length < 1) { | |||||
System.err.println("Use:"); | |||||
System.err.println(" OldExcelExtractor <filename>"); | |||||
System.exit(1); | |||||
} | |||||
OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0])); | |||||
System.out.println(extractor.getText()); | |||||
} | |||||
/** | |||||
* Should sheet names be included? Default is true | |||||
*/ | |||||
public void setIncludeSheetNames(boolean includeSheetNames) { | |||||
_includeSheetNames = includeSheetNames; | |||||
} | |||||
/** | |||||
* Retrieves the text contents of the file, as best we can | |||||
* for these old file formats | |||||
*/ | |||||
public String getText() { | |||||
StringBuffer text = new StringBuffer(); | |||||
RecordInputStream ris = new RecordInputStream(input); | |||||
while (ris.hasNextRecord()) { | |||||
int sid = ris.getNextSid(); | |||||
ris.nextRecord(); | |||||
switch (sid) { | |||||
case LabelRecord.sid: | |||||
OldLabelRecord lr = new OldLabelRecord(ris); | |||||
text.append(lr.getValue()); | |||||
text.append('\n'); | |||||
break; | |||||
default: | |||||
ris.readFully(new byte[ris.remaining()]); | |||||
} | |||||
// label - 5.63 - TODO Needs codepages | |||||
// number - 5.71 | |||||
// rk - 5.87 | |||||
// string - 5.102 | |||||
} | |||||
return text.toString(); | |||||
} | |||||
} |
* Label Record (0x0204) - read only support for strings stored directly in the cell.. Don't | * Label Record (0x0204) - read only support for strings stored directly in the cell.. Don't | ||||
* use this (except to read), use LabelSST instead <P> | * use this (except to read), use LabelSST instead <P> | ||||
* REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P> | * REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P> | ||||
* @author Andrew C. Oliver (acoliver at apache dot org) | |||||
* @author Jason Height (jheight at chariot dot net dot au) | |||||
* @version 2.0-pre | |||||
* | |||||
* @see org.apache.poi.hssf.record.LabelSSTRecord | * @see org.apache.poi.hssf.record.LabelSSTRecord | ||||
*/ | */ | ||||
public final class LabelRecord extends Record implements CellValueRecordInterface { | public final class LabelRecord extends Record implements CellValueRecordInterface { |
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.hssf.record; | |||||
import org.apache.poi.util.HexDump; | |||||
import org.apache.poi.util.POILogFactory; | |||||
import org.apache.poi.util.POILogger; | |||||
/** | |||||
* Biff2 - Biff 4 Label Record (0x0004 / 0x0204) - read only support for | |||||
* strings stored directly in the cell, from the older file formats that | |||||
* didn't use {@link LabelSSTRecord} | |||||
*/ | |||||
public final class OldLabelRecord extends Record implements CellValueRecordInterface { | |||||
private final static POILogger logger = POILogFactory.getLogger(OldLabelRecord.class); | |||||
public final static short biff2_sid = 0x0004; | |||||
public final static short biff345_sid = 0x0204; | |||||
private short sid; | |||||
private int field_1_row; | |||||
private short field_2_column; | |||||
private int field_3_cell_attrs; // Biff 2 | |||||
private short field_3_xf_index; // Biff 3+ | |||||
private short field_4_string_len; | |||||
private byte[] field_5_bytes; | |||||
//private XXXXX codepage; // TODO | |||||
/** | |||||
* @param in the RecordInputstream to read the record from | |||||
*/ | |||||
public OldLabelRecord(RecordInputStream in) | |||||
{ | |||||
sid = in.getSid(); | |||||
field_1_row = in.readUShort(); | |||||
field_2_column = in.readShort(); | |||||
if (in.getSid() == biff2_sid) { | |||||
field_3_cell_attrs = in.readUShort() << 8; | |||||
field_3_cell_attrs += in.readUByte(); | |||||
field_4_string_len = (short)in.readUByte(); | |||||
} else { | |||||
field_3_xf_index = in.readShort(); | |||||
field_4_string_len = in.readShort(); | |||||
} | |||||
// Can only decode properly later when you know the codepage | |||||
field_5_bytes = new byte[field_4_string_len]; | |||||
in.read(field_5_bytes, 0, field_4_string_len); | |||||
if (in.remaining() > 0) { | |||||
logger.log(POILogger.INFO, | |||||
"LabelRecord data remains: " + in.remaining() + | |||||
" : " + HexDump.toHex(in.readRemainder()) | |||||
); | |||||
} | |||||
} | |||||
public boolean isBiff2() { | |||||
return sid == biff2_sid; | |||||
} | |||||
public int getRow() | |||||
{ | |||||
return field_1_row; | |||||
} | |||||
public short getColumn() | |||||
{ | |||||
return field_2_column; | |||||
} | |||||
public short getXFIndex() | |||||
{ | |||||
return field_3_xf_index; | |||||
} | |||||
public int getCellAttrs() | |||||
{ | |||||
return field_3_cell_attrs; | |||||
} | |||||
/** | |||||
* get the number of characters this string contains | |||||
* @return number of characters | |||||
*/ | |||||
public short getStringLength() | |||||
{ | |||||
return field_4_string_len; | |||||
} | |||||
/** | |||||
* Get the String of the cell | |||||
*/ | |||||
public String getValue() | |||||
{ | |||||
// We really need the codepage here to do this right... | |||||
return new String(field_5_bytes); | |||||
} | |||||
/** | |||||
* Not supported | |||||
*/ | |||||
public int serialize(int offset, byte [] data) { | |||||
throw new RecordFormatException("Old Label Records are supported READ ONLY"); | |||||
} | |||||
public int getRecordSize() { | |||||
throw new RecordFormatException("Old Label Records are supported READ ONLY"); | |||||
} | |||||
public short getSid() | |||||
{ | |||||
return sid; | |||||
} | |||||
public String toString() | |||||
{ | |||||
StringBuffer sb = new StringBuffer(); | |||||
sb.append("[OLD LABEL]\n"); | |||||
sb.append(" .row = ").append(HexDump.shortToHex(getRow())).append("\n"); | |||||
sb.append(" .column = ").append(HexDump.shortToHex(getColumn())).append("\n"); | |||||
if (isBiff2()) { | |||||
sb.append(" .cellattrs = ").append(HexDump.shortToHex(getCellAttrs())).append("\n"); | |||||
} else { | |||||
sb.append(" .xfindex = ").append(HexDump.shortToHex(getXFIndex())).append("\n"); | |||||
} | |||||
sb.append(" .string_len= ").append(HexDump.shortToHex(field_4_string_len)).append("\n"); | |||||
sb.append(" .value = ").append(getValue()).append("\n"); | |||||
sb.append("[/OLD LABEL]\n"); | |||||
return sb.toString(); | |||||
} | |||||
/** | |||||
* NO-OP! | |||||
*/ | |||||
public void setColumn(short col) | |||||
{ | |||||
} | |||||
/** | |||||
* NO-OP! | |||||
*/ | |||||
public void setRow(int row) | |||||
{ | |||||
} | |||||
/** | |||||
* no op! | |||||
*/ | |||||
public void setXFIndex(short xf) | |||||
{ | |||||
} | |||||
} |
SILENT_EXCLUDED.add("46904.xls"); | SILENT_EXCLUDED.add("46904.xls"); | ||||
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header | SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header | ||||
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption | SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption | ||||
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2 | |||||
} | } | ||||
@Override | @Override |
/* ==================================================================== | |||||
Licensed to the Apache Software Foundation (ASF) under one or more | |||||
contributor license agreements. See the NOTICE file distributed with | |||||
this work for additional information regarding copyright ownership. | |||||
The ASF licenses this file to You under the Apache License, Version 2.0 | |||||
(the "License"); you may not use this file except in compliance with | |||||
the License. You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
==================================================================== */ | |||||
package org.apache.poi.hssf.extractor; | |||||
import java.io.InputStream; | |||||
import junit.framework.TestCase; | |||||
import org.apache.poi.hssf.HSSFTestDataSamples; | |||||
/** | |||||
* Unit tests for the Excel 4 (and older) text extractor | |||||
*/ | |||||
public final class TestOldExcelExtractor extends TestCase { | |||||
private static OldExcelExtractor createExtractor(String sampleFileName) { | |||||
InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleFileName); | |||||
try { | |||||
return new OldExcelExtractor(is); | |||||
} catch (Exception e) { | |||||
throw new RuntimeException(e); | |||||
} | |||||
} | |||||
public void testSimple() { | |||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); | |||||
// Check we can call getText without error | |||||
String text = extractor.getText(); | |||||
// Check we find a few words we expect in there | |||||
assertTrue(text, text.contains("Size")); | |||||
assertTrue(text, text.contains("Returns")); | |||||
} | |||||
// TODO Rest of the tests | |||||
} |