summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorNick Burch <nick@apache.org>2008-04-13 14:58:27 +0000
committerNick Burch <nick@apache.org>2008-04-13 14:58:27 +0000
commita54ece7864439a74d084a1ec2478aeccf691dce6 (patch)
treea3b073f006a02fb401039e5a34dbfe1eb2e81603 /src
parent51859d65abb6b1d166c394260640fc756b1f4fd3 (diff)
downloadpoi-a54ece7864439a74d084a1ec2478aeccf691dce6.tar.gz
poi-a54ece7864439a74d084a1ec2478aeccf691dce6.zip
Start on a eventusermodel based excel text extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@647574 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
-rw-r--r--src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java260
-rw-r--r--src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java46
2 files changed, 306 insertions, 0 deletions
diff --git a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
new file mode 100644
index 0000000000..718c025168
--- /dev/null
+++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
@@ -0,0 +1,260 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.extractor;
+
+import java.io.IOException;
+import java.text.DateFormat;
+import java.text.DecimalFormat;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.model.FormulaParser;
+import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.BoundSheetRecord;
+import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NoteRecord;
+import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.hssf.usermodel.HSSFDateUtil;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * A text extractor for Excel files, that is based
+ * on the hssf eventusermodel api.
+ * It will typically use less memory than
+ * {@link ExcelExtractor}, but may not provide
+ * the same richness of formatting.
+ * Returns the textual content of the file, suitable for
+ * indexing by something like Lucene, but not really
+ * intended for display to the user.
+ * To turn an excel file into a CSV or similar, then see
+ * the XLS2CSVmra example
+ * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
+ */
+public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
+ private POIFSFileSystem fs;
+ private boolean includeSheetNames = true;
+ private boolean formulasNotResults = false;
+
+ public EventBasedExcelExtractor(POIFSFileSystem fs) throws IOException {
+ super(null);
+ this.fs = fs;
+ }
+
+ /**
+ * Would return the document information metadata for the document,
+ * if we supported it
+ */
+ public DocumentSummaryInformation getDocSummaryInformation() {
+ throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
+ }
+ /**
+ * Would return the summary information metadata for the document,
+ * if we supported it
+ */
+ public SummaryInformation getSummaryInformation() {
+ throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
+ }
+
+
+ /**
+ * Should sheet names be included? Default is true
+ */
+ public void setIncludeSheetNames(boolean includeSheetNames) {
+ this.includeSheetNames = includeSheetNames;
+ }
+ /**
+ * Should we return the formula itself, and not
+ * the result it produces? Default is false
+ */
+ public void setFormulasNotResults(boolean formulasNotResults) {
+ this.formulasNotResults = formulasNotResults;
+ }
+
+
+ /**
+ * Retreives the text contents of the file
+ */
+ public String getText() {
+ String text = null;
+ try {
+ TextListener tl = triggerExtraction();
+
+ text = tl.text.toString();
+ if(! text.endsWith("\n")) {
+ text = text + "\n";
+ }
+ } catch(IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ return text;
+ }
+
+ private TextListener triggerExtraction() throws IOException {
+ TextListener tl = new TextListener();
+ FormatTrackingHSSFListener ft = new FormatTrackingHSSFListener(tl);
+ tl.ft = ft;
+
+ // Register and process
+ HSSFEventFactory factory = new HSSFEventFactory();
+ HSSFRequest request = new HSSFRequest();
+ request.addListenerForAllRecords(ft);
+
+ factory.processWorkbookEvents(request, fs);
+
+ return tl;
+ }
+
+ private class TextListener implements HSSFListener {
+ private FormatTrackingHSSFListener ft;
+ private SSTRecord sstRecord;
+
+ private List sheetNames = new ArrayList();
+ private StringBuffer text = new StringBuffer();
+ private int sheetNum = -1;
+ private int rowNum;
+
+ public void processRecord(Record record) {
+ String thisText = null;
+ int thisRow = -1;
+
+ switch(record.getSid()) {
+ case BoundSheetRecord.sid:
+ BoundSheetRecord sr = (BoundSheetRecord)record;
+ sheetNames.add(sr.getSheetname());
+ break;
+ case BOFRecord.sid:
+ BOFRecord bof = (BOFRecord)record;
+ if(bof.getType() == BOFRecord.TYPE_WORKSHEET) {
+ sheetNum++;
+ rowNum = -1;
+
+ if(includeSheetNames) {
+ if(text.length() > 0) text.append("\n");
+ text.append(sheetNames.get(sheetNum));
+ }
+ }
+ break;
+ case SSTRecord.sid:
+ sstRecord = (SSTRecord)record;
+ break;
+
+ case FormulaRecord.sid:
+ FormulaRecord frec = (FormulaRecord) record;
+ thisRow = frec.getRow();
+
+ if(formulasNotResults) {
+ thisText = FormulaParser.toFormulaString(null, frec.getParsedExpression());
+ } else {
+ if(Double.isNaN( frec.getValue() )) {
+ thisText = "(todo - string formulas)";
+ } else {
+ thisText = formatNumberDateCell(frec, frec.getValue());
+ }
+ }
+ break;
+ case LabelRecord.sid:
+ LabelRecord lrec = (LabelRecord) record;
+ thisRow = lrec.getRow();
+ thisText = lrec.getValue();
+ break;
+ case LabelSSTRecord.sid:
+ LabelSSTRecord lsrec = (LabelSSTRecord) record;
+ thisRow = lsrec.getRow();
+ if(sstRecord == null) {
+ throw new IllegalStateException("No SST record found");
+ }
+ thisText = sstRecord.getString(lsrec.getSSTIndex()).toString();
+ break;
+ case NoteRecord.sid:
+ NoteRecord nrec = (NoteRecord) record;
+ thisRow = nrec.getRow();
+ // TODO: Find object to match nrec.getShapeId()
+ break;
+ case NumberRecord.sid:
+ NumberRecord numrec = (NumberRecord) record;
+ thisRow = numrec.getRow();
+ thisText = formatNumberDateCell(numrec, numrec.getValue());
+ break;
+ default:
+ break;
+ }
+
+ if(thisText != null) {
+ if(thisRow != rowNum) {
+ rowNum = thisRow;
+ if(text.length() > 0)
+ text.append("\n");
+ } else {
+ text.append("\t");
+ }
+ text.append(thisText);
+ }
+ }
+
+ /**
+ * Formats a number or date cell, be that a real number, or the
+ * answer to a formula
+ */
+ private String formatNumberDateCell(CellValueRecordInterface cell, double value) {
+ // Get the built in format, if there is one
+ int formatIndex = ft.getFormatIndex(cell);
+ String formatString = ft.getFormatString(cell);
+
+ if(formatString == null) {
+ return Double.toString(value);
+ } else {
+ // Is it a date?
+ if(HSSFDateUtil.isADateFormat(formatIndex,formatString) &&
+ HSSFDateUtil.isValidExcelDate(value)) {
+ // Java wants M not m for month
+ formatString = formatString.replace('m','M');
+ // Change \- into -, if it's there
+ formatString = formatString.replaceAll("\\\\-","-");
+
+ // Format as a date
+ Date d = HSSFDateUtil.getJavaDate(value, false);
+ DateFormat df = new SimpleDateFormat(formatString);
+ return df.format(d);
+ } else {
+ if(formatString == "General") {
+ // Some sort of wierd default
+ return Double.toString(value);
+ }
+
+ // Format as a number
+ DecimalFormat df = new DecimalFormat(formatString);
+ return df.format(value);
+ }
+ }
+ }
+ }
+}
diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
index ad311eb271..b93bc65a24 100644
--- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
+++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
@@ -122,6 +122,52 @@ public final class TestExcelExtractor extends TestCase {
assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
}
+
+ public void testEventExtractor() throws Exception {
+ EventBasedExcelExtractor extractor;
+
+ // First up, a simple file with string
+ // based formulas in it
+ extractor = new EventBasedExcelExtractor(
+ new POIFSFileSystem(
+ HSSFTestDataSamples.openSampleFileStream("SimpleWithFormula.xls")
+ )
+ );
+ extractor.setIncludeSheetNames(true);
+
+ String text = extractor.getText();
+ // TODO
+ assertEquals("Sheet1\nreplaceme\nreplaceme\n(todo - string formulas)\nSheet2\nSheet3\n", text);
+// assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", text);
+
+ extractor.setIncludeSheetNames(false);
+ extractor.setFormulasNotResults(true);
+
+ text = extractor.getText();
+ assertEquals("replaceme\nreplaceme\nCONCATENATE(A1,A2)\n", text);
+
+
+ // Now, a slightly longer file with numeric formulas
+ extractor = new EventBasedExcelExtractor(
+ new POIFSFileSystem(
+ HSSFTestDataSamples.openSampleFileStream("sumifformula.xls")
+ )
+ );
+ extractor.setIncludeSheetNames(false);
+ extractor.setFormulasNotResults(true);
+
+ text = extractor.getText();
+ assertEquals(
+ "1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
+ "2000.0\t2.0\n" +
+ "3000.0\t3.0\n" +
+ "4000.0\t4.0\n" +
+ "5000.0\t5.0\n",
+ text
+ );
+ }
+
+
/**
* Embded in a non-excel file
*/