123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.hssf.extractor;
-
- import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME;
- import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
-
- import java.io.BufferedInputStream;
- import java.io.Closeable;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStream;
-
- import org.apache.poi.EncryptedDocumentException;
- import org.apache.poi.extractor.POITextExtractor;
- import org.apache.poi.hssf.OldExcelFormatException;
- import org.apache.poi.hssf.record.BOFRecord;
- import org.apache.poi.hssf.record.CodepageRecord;
- import org.apache.poi.hssf.record.FormulaRecord;
- import org.apache.poi.hssf.record.NumberRecord;
- import org.apache.poi.hssf.record.OldFormulaRecord;
- import org.apache.poi.hssf.record.OldLabelRecord;
- import org.apache.poi.hssf.record.OldSheetRecord;
- import org.apache.poi.hssf.record.OldStringRecord;
- import org.apache.poi.hssf.record.RKRecord;
- import org.apache.poi.hssf.record.RecordInputStream;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.DocumentNode;
- import org.apache.poi.poifs.filesystem.FileMagic;
- import org.apache.poi.poifs.filesystem.NotOLE2FileException;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.ss.usermodel.CellType;
- import org.apache.poi.util.IOUtils;
-
- /**
- * A text extractor for old Excel files, which are too old for
- * HSSFWorkbook to handle. This includes Excel 95, and very old
- * (pre-OLE2) Excel files, such as Excel 4 files.
- * <p>
- * Returns much (but not all) of the textual content of the file,
- * suitable for indexing by something like Apache Lucene, or used
- * by Apache Tika, but not really intended for display to the user.
- * </p>
- */
- public class OldExcelExtractor implements POITextExtractor {
-
- private static final int FILE_PASS_RECORD_SID = 0x2f;
- //arbitrarily selected; may need to increase
- private static final int MAX_RECORD_LENGTH = 100_000;
-
-
- private RecordInputStream ris;
-
- // sometimes we hold the stream here and thus need to ensure it is closed at some point
- private Closeable toClose;
-
- private int biffVersion;
- private int fileType;
-
- public OldExcelExtractor(InputStream input) throws IOException {
- open(input);
- }
-
- public OldExcelExtractor(File f) throws IOException {
- POIFSFileSystem poifs = null;
- try {
- poifs = new POIFSFileSystem(f);
- open(poifs);
- toClose = poifs;
- return;
- } catch (OldExcelFormatException | NotOLE2FileException e) {
- // will be handled by workaround below
- } finally {
- if (toClose == null) {
- IOUtils.closeQuietly(poifs);
- }
- }
-
- @SuppressWarnings("resource")
- FileInputStream biffStream = new FileInputStream(f); // NOSONAR
- try {
- open(biffStream);
- } catch (IOException | RuntimeException e) {
- // ensure that the stream is properly closed here if an Exception
- // is thrown while opening
- biffStream.close();
-
- toClose.close();
-
- throw e;
- }
- }
-
- public OldExcelExtractor(POIFSFileSystem fs) throws IOException {
- toClose = fs;
-
- open(fs);
- }
-
- public OldExcelExtractor(DirectoryNode directory) throws IOException {
- toClose = directory.getFileSystem();
-
- open(directory);
- }
-
- private void open(InputStream biffStream) throws IOException {
- BufferedInputStream bis = (biffStream instanceof BufferedInputStream)
- ? (BufferedInputStream)biffStream
- : new BufferedInputStream(biffStream, 8);
-
- if (FileMagic.valueOf(bis) == FileMagic.OLE2) {
- POIFSFileSystem poifs = new POIFSFileSystem(bis);
- try {
- open(poifs);
- toClose = poifs; // Fixed by GR, we should not close it here
- } finally {
- if (toClose == null) {
- poifs.close();
- }
- }
- } else {
- ris = new RecordInputStream(bis);
- toClose = bis;
- prepare();
- }
- }
-
- private void open(POIFSFileSystem fs) throws IOException {
- open(fs.getRoot());
- }
-
- private void open(DirectoryNode directory) throws IOException {
- DocumentNode book;
- try {
- book = (DocumentNode)directory.getEntry(OLD_WORKBOOK_DIR_ENTRY_NAME);
- } catch (FileNotFoundException | IllegalArgumentException e) {
- // some files have "Workbook" instead
- book = (DocumentNode)directory.getEntry(WORKBOOK_DIR_ENTRY_NAMES[0]);
- }
-
- if (book == null) {
- throw new IOException("No Excel 5/95 Book stream found");
- }
-
- ris = new RecordInputStream(directory.createDocumentInputStream(book));
- prepare();
- }
-
- public static void main(String[] args) throws IOException {
- if (args.length < 1) {
- System.err.println("Use:");
- System.err.println(" OldExcelExtractor <filename>");
- System.exit(1);
- }
- OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0]));
- System.out.println(extractor.getText());
- extractor.close();
- }
-
- private void prepare() {
- if (! ris.hasNextRecord()) {
- throw new IllegalArgumentException("File contains no records!");
- }
- ris.nextRecord();
-
- // Work out what version we're dealing with
- int bofSid = ris.getSid();
- switch (bofSid) {
- case BOFRecord.biff2_sid:
- biffVersion = 2;
- break;
- case BOFRecord.biff3_sid:
- biffVersion = 3;
- break;
- case BOFRecord.biff4_sid:
- biffVersion = 4;
- break;
- case BOFRecord.biff5_sid:
- biffVersion = 5;
- break;
- default:
- throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid);
- }
-
- // Get the type
- BOFRecord bof = new BOFRecord(ris);
- fileType = bof.getType();
- }
-
- /**
- * The Biff version, largely corresponding to the Excel version
- *
- * @return the Biff version
- */
- public int getBiffVersion() {
- return biffVersion;
- }
-
- /**
- * The kind of the file, one of {@link BOFRecord#TYPE_WORKSHEET},
- * {@link BOFRecord#TYPE_CHART}, {@link BOFRecord#TYPE_EXCEL_4_MACRO}
- * or {@link BOFRecord#TYPE_WORKSPACE_FILE}
- *
- * @return the file type
- */
- public int getFileType() {
- return fileType;
- }
-
- /**
- * Retrieves the text contents of the file, as best we can
- * for these old file formats
- *
- * @return the text contents of the file
- */
- public String getText() {
- StringBuilder text = new StringBuilder();
-
- // To track formats and encodings
- CodepageRecord codepage = null;
- // TODO track the XFs and Format Strings
-
- // Process each record in turn, looking for interesting ones
- while (ris.hasNextRecord()) {
- int sid = ris.getNextSid();
- ris.nextRecord();
-
- switch (sid) {
- case FILE_PASS_RECORD_SID:
- throw new EncryptedDocumentException("Encryption not supported for Old Excel files");
-
- case OldSheetRecord.sid:
- OldSheetRecord shr = new OldSheetRecord(ris);
- shr.setCodePage(codepage);
- text.append("Sheet: ");
- text.append(shr.getSheetname());
- text.append('\n');
- break;
-
- case OldLabelRecord.biff2_sid:
- case OldLabelRecord.biff345_sid:
- OldLabelRecord lr = new OldLabelRecord(ris);
- lr.setCodePage(codepage);
- text.append(lr.getValue());
- text.append('\n');
- break;
- case OldStringRecord.biff2_sid:
- case OldStringRecord.biff345_sid:
- OldStringRecord sr = new OldStringRecord(ris);
- sr.setCodePage(codepage);
- text.append(sr.getString());
- text.append('\n');
- break;
-
- case NumberRecord.sid:
- NumberRecord nr = new NumberRecord(ris);
- handleNumericCell(text, nr.getValue());
- break;
- case OldFormulaRecord.biff2_sid:
- case OldFormulaRecord.biff3_sid:
- case OldFormulaRecord.biff4_sid:
- // Biff 2 and 5+ share the same SID, due to a bug...
- if (biffVersion == 5) {
- FormulaRecord fr = new FormulaRecord(ris);
- if (fr.getCachedResultTypeEnum() == CellType.NUMERIC) {
- handleNumericCell(text, fr.getValue());
- }
- } else {
- OldFormulaRecord fr = new OldFormulaRecord(ris);
- if (fr.getCachedResultTypeEnum() == CellType.NUMERIC) {
- handleNumericCell(text, fr.getValue());
- }
- }
- break;
- case RKRecord.sid:
- RKRecord rr = new RKRecord(ris);
- handleNumericCell(text, rr.getRKNumber());
- break;
-
- case CodepageRecord.sid:
- codepage = new CodepageRecord(ris);
- break;
-
- default:
- ris.readFully(IOUtils.safelyAllocate(ris.remaining(), MAX_RECORD_LENGTH));
- }
- }
-
- ris = null;
-
- return text.toString();
- }
-
- protected void handleNumericCell(StringBuilder text, double value) {
- // TODO Need to fetch / use format strings
- text.append(value);
- text.append('\n');
- }
-
- @Override
- public POITextExtractor getMetadataTextExtractor() {
- return null;
- }
-
- @Override
- public void setCloseFilesystem(boolean doCloseFilesystem) {
-
- }
-
- @Override
- public boolean isCloseFilesystem() {
- return toClose != null;
- }
-
- @Override
- public Closeable getFilesystem() {
- return toClose;
- }
-
- @Override
- public Object getDocument() {
- return ris;
- }
- }
|