From a54ece7864439a74d084a1ec2478aeccf691dce6 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sun, 13 Apr 2008 14:58:27 +0000 Subject: Start on a eventusermodel based excel text extractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@647574 13f79535-47bb-0310-9956-ffa450edef68 --- .../hssf/extractor/EventBasedExcelExtractor.java | 260 +++++++++++++++++++++ .../poi/hssf/extractor/TestExcelExtractor.java | 46 ++++ 2 files changed, 306 insertions(+) create mode 100644 src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java (limited to 'src') diff --git a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java new file mode 100644 index 0000000000..718c025168 --- /dev/null +++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java @@ -0,0 +1,260 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hssf.extractor; + +import java.io.IOException; +import java.text.DateFormat; +import java.text.DecimalFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +import org.apache.poi.POIOLE2TextExtractor; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; +import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; +import org.apache.poi.hssf.eventusermodel.HSSFListener; +import org.apache.poi.hssf.eventusermodel.HSSFRequest; +import org.apache.poi.hssf.model.FormulaParser; +import org.apache.poi.hssf.record.BOFRecord; +import org.apache.poi.hssf.record.BoundSheetRecord; +import org.apache.poi.hssf.record.CellValueRecordInterface; +import org.apache.poi.hssf.record.FormulaRecord; +import org.apache.poi.hssf.record.LabelRecord; +import org.apache.poi.hssf.record.LabelSSTRecord; +import org.apache.poi.hssf.record.NoteRecord; +import org.apache.poi.hssf.record.NumberRecord; +import org.apache.poi.hssf.record.Record; +import org.apache.poi.hssf.record.SSTRecord; +import org.apache.poi.hssf.usermodel.HSSFDateUtil; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * A text extractor for Excel files, that is based + * on the hssf eventusermodel api. + * It will typically use less memory than + * {@link ExcelExtractor}, but may not provide + * the same richness of formatting. + * Returns the textual content of the file, suitable for + * indexing by something like Lucene, but not really + * intended for display to the user. + * To turn an excel file into a CSV or similar, then see + * the XLS2CSVmra example + * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra + */ +public class EventBasedExcelExtractor extends POIOLE2TextExtractor { + private POIFSFileSystem fs; + private boolean includeSheetNames = true; + private boolean formulasNotResults = false; + + public EventBasedExcelExtractor(POIFSFileSystem fs) throws IOException { + super(null); + this.fs = fs; + } + + /** + * Would return the document information metadata for the document, + * if we supported it + */ + public DocumentSummaryInformation getDocSummaryInformation() { + throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor"); + } + /** + * Would return the summary information metadata for the document, + * if we supported it + */ + public SummaryInformation getSummaryInformation() { + throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor"); + } + + + /** + * Should sheet names be included? Default is true + */ + public void setIncludeSheetNames(boolean includeSheetNames) { + this.includeSheetNames = includeSheetNames; + } + /** + * Should we return the formula itself, and not + * the result it produces? Default is false + */ + public void setFormulasNotResults(boolean formulasNotResults) { + this.formulasNotResults = formulasNotResults; + } + + + /** + * Retreives the text contents of the file + */ + public String getText() { + String text = null; + try { + TextListener tl = triggerExtraction(); + + text = tl.text.toString(); + if(! text.endsWith("\n")) { + text = text + "\n"; + } + } catch(IOException e) { + throw new RuntimeException(e); + } + + return text; + } + + private TextListener triggerExtraction() throws IOException { + TextListener tl = new TextListener(); + FormatTrackingHSSFListener ft = new FormatTrackingHSSFListener(tl); + tl.ft = ft; + + // Register and process + HSSFEventFactory factory = new HSSFEventFactory(); + HSSFRequest request = new HSSFRequest(); + request.addListenerForAllRecords(ft); + + factory.processWorkbookEvents(request, fs); + + return tl; + } + + private class TextListener implements HSSFListener { + private FormatTrackingHSSFListener ft; + private SSTRecord sstRecord; + + private List sheetNames = new ArrayList(); + private StringBuffer text = new StringBuffer(); + private int sheetNum = -1; + private int rowNum; + + public void processRecord(Record record) { + String thisText = null; + int thisRow = -1; + + switch(record.getSid()) { + case BoundSheetRecord.sid: + BoundSheetRecord sr = (BoundSheetRecord)record; + sheetNames.add(sr.getSheetname()); + break; + case BOFRecord.sid: + BOFRecord bof = (BOFRecord)record; + if(bof.getType() == BOFRecord.TYPE_WORKSHEET) { + sheetNum++; + rowNum = -1; + + if(includeSheetNames) { + if(text.length() > 0) text.append("\n"); + text.append(sheetNames.get(sheetNum)); + } + } + break; + case SSTRecord.sid: + sstRecord = (SSTRecord)record; + break; + + case FormulaRecord.sid: + FormulaRecord frec = (FormulaRecord) record; + thisRow = frec.getRow(); + + if(formulasNotResults) { + thisText = FormulaParser.toFormulaString(null, frec.getParsedExpression()); + } else { + if(Double.isNaN( frec.getValue() )) { + thisText = "(todo - string formulas)"; + } else { + thisText = formatNumberDateCell(frec, frec.getValue()); + } + } + break; + case LabelRecord.sid: + LabelRecord lrec = (LabelRecord) record; + thisRow = lrec.getRow(); + thisText = lrec.getValue(); + break; + case LabelSSTRecord.sid: + LabelSSTRecord lsrec = (LabelSSTRecord) record; + thisRow = lsrec.getRow(); + if(sstRecord == null) { + throw new IllegalStateException("No SST record found"); + } + thisText = sstRecord.getString(lsrec.getSSTIndex()).toString(); + break; + case NoteRecord.sid: + NoteRecord nrec = (NoteRecord) record; + thisRow = nrec.getRow(); + // TODO: Find object to match nrec.getShapeId() + break; + case NumberRecord.sid: + NumberRecord numrec = (NumberRecord) record; + thisRow = numrec.getRow(); + thisText = formatNumberDateCell(numrec, numrec.getValue()); + break; + default: + break; + } + + if(thisText != null) { + if(thisRow != rowNum) { + rowNum = thisRow; + if(text.length() > 0) + text.append("\n"); + } else { + text.append("\t"); + } + text.append(thisText); + } + } + + /** + * Formats a number or date cell, be that a real number, or the + * answer to a formula + */ + private String formatNumberDateCell(CellValueRecordInterface cell, double value) { + // Get the built in format, if there is one + int formatIndex = ft.getFormatIndex(cell); + String formatString = ft.getFormatString(cell); + + if(formatString == null) { + return Double.toString(value); + } else { + // Is it a date? + if(HSSFDateUtil.isADateFormat(formatIndex,formatString) && + HSSFDateUtil.isValidExcelDate(value)) { + // Java wants M not m for month + formatString = formatString.replace('m','M'); + // Change \- into -, if it's there + formatString = formatString.replaceAll("\\\\-","-"); + + // Format as a date + Date d = HSSFDateUtil.getJavaDate(value, false); + DateFormat df = new SimpleDateFormat(formatString); + return df.format(d); + } else { + if(formatString == "General") { + // Some sort of wierd default + return Double.toString(value); + } + + // Format as a number + DecimalFormat df = new DecimalFormat(formatString); + return df.format(value); + } + } + } + } +} diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java index ad311eb271..b93bc65a24 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java @@ -122,6 +122,52 @@ public final class TestExcelExtractor extends TestCase { assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText()); } + + public void testEventExtractor() throws Exception { + EventBasedExcelExtractor extractor; + + // First up, a simple file with string + // based formulas in it + extractor = new EventBasedExcelExtractor( + new POIFSFileSystem( + HSSFTestDataSamples.openSampleFileStream("SimpleWithFormula.xls") + ) + ); + extractor.setIncludeSheetNames(true); + + String text = extractor.getText(); + // TODO + assertEquals("Sheet1\nreplaceme\nreplaceme\n(todo - string formulas)\nSheet2\nSheet3\n", text); +// assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", text); + + extractor.setIncludeSheetNames(false); + extractor.setFormulasNotResults(true); + + text = extractor.getText(); + assertEquals("replaceme\nreplaceme\nCONCATENATE(A1,A2)\n", text); + + + // Now, a slightly longer file with numeric formulas + extractor = new EventBasedExcelExtractor( + new POIFSFileSystem( + HSSFTestDataSamples.openSampleFileStream("sumifformula.xls") + ) + ); + extractor.setIncludeSheetNames(false); + extractor.setFormulasNotResults(true); + + text = extractor.getText(); + assertEquals( + "1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" + + "2000.0\t2.0\n" + + "3000.0\t3.0\n" + + "4000.0\t4.0\n" + + "5000.0\t5.0\n", + text + ); + } + + /** * Embded in a non-excel file */ -- cgit v1.2.3