From: Nick Burch Date: Sun, 13 Apr 2008 15:09:42 +0000 (+0000) Subject: Finish off eventusermodel based Excel Extractor, and update the xls to csv converter... X-Git-Tag: REL_3_0_3_BETA1~25 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=adb1c55e9606ea9e898712458949326b8e55c9cd;p=poi.git Finish off eventusermodel based Excel Extractor, and update the xls to csv converter (moved to correct place) based on discoveries for the text extractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@647576 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java b/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java new file mode 100644 index 0000000000..9bebd3a837 --- /dev/null +++ b/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java @@ -0,0 +1,323 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hssf.eventusermodel.examples; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintStream; +import java.text.DateFormat; +import java.text.DecimalFormat; +import java.text.SimpleDateFormat; +import java.util.Date; + +import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; +import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; +import org.apache.poi.hssf.eventusermodel.HSSFListener; +import org.apache.poi.hssf.eventusermodel.HSSFRequest; +import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener; +import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord; +import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord; +import org.apache.poi.hssf.model.FormulaParser; +import org.apache.poi.hssf.record.BlankRecord; +import org.apache.poi.hssf.record.BoolErrRecord; +import org.apache.poi.hssf.record.CellValueRecordInterface; +import org.apache.poi.hssf.record.FormulaRecord; +import org.apache.poi.hssf.record.LabelRecord; +import org.apache.poi.hssf.record.LabelSSTRecord; +import org.apache.poi.hssf.record.NoteRecord; +import org.apache.poi.hssf.record.NumberRecord; +import org.apache.poi.hssf.record.RKRecord; +import org.apache.poi.hssf.record.Record; +import org.apache.poi.hssf.record.SSTRecord; +import org.apache.poi.hssf.record.StringRecord; +import org.apache.poi.hssf.usermodel.HSSFDateUtil; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * A XLS -> CSV processor, that uses the MissingRecordAware + * EventModel code to ensure it outputs all columns and rows. + * @author Nick Burch + */ +public class XLS2CSVmra implements HSSFListener { + private int minColumns; + private POIFSFileSystem fs; + private PrintStream output; + + private int lastRowNumber; + private int lastColumnNumber; + + /** Should we output the formula, or the value it has? */ + private boolean outputFormulaValues = true; + + // Records we pick up as we process + private SSTRecord sstRecord; + private FormatTrackingHSSFListener formatListener; + + // For handling formulas with string results + private int nextRow; + private int nextColumn; + private boolean outputNextStringRecord; + + /** + * Creates a new XLS -> CSV converter + * @param fs The POIFSFileSystem to process + * @param output The PrintStream to output the CSV to + * @param minColumns The minimum number of columns to output, or -1 for no minimum + */ + public XLS2CSVmra(POIFSFileSystem fs, PrintStream output, int minColumns) { + this.fs = fs; + this.output = output; + this.minColumns = minColumns; + } + + /** + * Creates a new XLS -> CSV converter + * @param filename The file to process + * @param minColumns The minimum number of columns to output, or -1 for no minimum + * @throws IOException + * @throws FileNotFoundException + */ + public XLS2CSVmra(String filename, int minColumns) throws IOException, FileNotFoundException { + this( + new POIFSFileSystem(new FileInputStream(filename)), + System.out, minColumns + ); + } + + /** + * Initiates the processing of the XLS file to CSV + */ + public void process() throws IOException { + MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this); + formatListener = new FormatTrackingHSSFListener(listener); + + HSSFEventFactory factory = new HSSFEventFactory(); + HSSFRequest request = new HSSFRequest(); + request.addListenerForAllRecords(formatListener); + + factory.processWorkbookEvents(request, fs); + } + + /** + * Main HSSFListener method, processes events, and outputs the + * CSV as the file is processed. + */ + public void processRecord(Record record) { + int thisRow = -1; + int thisColumn = -1; + String thisStr = null; + + switch (record.getSid()) + { + case SSTRecord.sid: + sstRecord = (SSTRecord) record; + break; + + case BlankRecord.sid: + BlankRecord brec = (BlankRecord) record; + + thisRow = brec.getRow(); + thisColumn = brec.getColumn(); + thisStr = ""; + break; + case BoolErrRecord.sid: + BoolErrRecord berec = (BoolErrRecord) record; + + thisRow = berec.getRow(); + thisColumn = berec.getColumn(); + thisStr = ""; + break; + + case FormulaRecord.sid: + FormulaRecord frec = (FormulaRecord) record; + + thisRow = frec.getRow(); + thisColumn = frec.getColumn(); + + if(outputFormulaValues) { + if(Double.isNaN( frec.getValue() )) { + // Formula result is a string + // This is stored in the next record + outputNextStringRecord = true; + nextRow = frec.getRow(); + nextColumn = frec.getColumn(); + } else { + thisStr = formatNumberDateCell(frec, frec.getValue()); + } + } else { + thisStr = '"' + + FormulaParser.toFormulaString(null, frec.getParsedExpression()) + '"'; + } + break; + case StringRecord.sid: + if(outputNextStringRecord) { + // String for formula + StringRecord srec = (StringRecord)record; + thisStr = srec.getString(); + thisRow = nextRow; + thisColumn = nextColumn; + outputNextStringRecord = false; + } + break; + + case LabelRecord.sid: + LabelRecord lrec = (LabelRecord) record; + + thisRow = lrec.getRow(); + thisColumn = lrec.getColumn(); + thisStr = '"' + lrec.getValue() + '"'; + break; + case LabelSSTRecord.sid: + LabelSSTRecord lsrec = (LabelSSTRecord) record; + + thisRow = lsrec.getRow(); + thisColumn = lsrec.getColumn(); + if(sstRecord == null) { + thisStr = '"' + "(No SST Record, can't identify string)" + '"'; + } else { + thisStr = '"' + sstRecord.getString(lsrec.getSSTIndex()).toString() + '"'; + } + break; + case NoteRecord.sid: + NoteRecord nrec = (NoteRecord) record; + + thisRow = nrec.getRow(); + thisColumn = nrec.getColumn(); + // TODO: Find object to match nrec.getShapeId() + thisStr = '"' + "(TODO)" + '"'; + break; + case NumberRecord.sid: + NumberRecord numrec = (NumberRecord) record; + + thisRow = numrec.getRow(); + thisColumn = numrec.getColumn(); + + // Format + thisStr = formatNumberDateCell(numrec, numrec.getValue()); + break; + case RKRecord.sid: + RKRecord rkrec = (RKRecord) record; + + thisRow = rkrec.getRow(); + thisColumn = rkrec.getColumn(); + thisStr = '"' + "(TODO)" + '"'; + break; + default: + break; + } + + // Handle new row + if(thisRow != -1 && thisRow != lastRowNumber) { + lastColumnNumber = -1; + } + + // Handle missing column + if(record instanceof MissingCellDummyRecord) { + MissingCellDummyRecord mc = (MissingCellDummyRecord)record; + thisRow = mc.getRow(); + thisColumn = mc.getColumn(); + thisStr = ""; + } + + // If we got something to print out, do so + if(thisStr != null) { + if(thisColumn > 0) { + output.print(','); + } + output.print(thisStr); + } + + // Update column and row count + if(thisRow > -1) + lastRowNumber = thisRow; + if(thisColumn > -1) + lastColumnNumber = thisColumn; + + // Handle end of row + if(record instanceof LastCellOfRowDummyRecord) { + // Print out any missing commas if needed + if(minColumns > 0) { + // Columns are 0 based + if(lastColumnNumber == -1) { lastColumnNumber = 0; } + for(int i=lastColumnNumber; i<(minColumns); i++) { + output.print(','); + } + } + + // We're onto a new row + lastColumnNumber = -1; + + // End the row + output.println(); + } + } + + /** + * Formats a number or date cell, be that a real number, or the + * answer to a formula + */ + private String formatNumberDateCell(CellValueRecordInterface cell, double value) { + // Get the built in format, if there is one + int formatIndex = formatListener.getFormatIndex(cell); + String formatString = formatListener.getFormatString(cell); + + if(formatString == null) { + return Double.toString(value); + } else { + // Is it a date? + if(HSSFDateUtil.isADateFormat(formatIndex,formatString) && + HSSFDateUtil.isValidExcelDate(value)) { + // Java wants M not m for month + formatString = formatString.replace('m','M'); + // Change \- into -, if it's there + formatString = formatString.replaceAll("\\\\-","-"); + + // Format as a date + Date d = HSSFDateUtil.getJavaDate(value, false); + DateFormat df = new SimpleDateFormat(formatString); + return df.format(d); + } else { + if(formatString == "General") { + // Some sort of wierd default + return Double.toString(value); + } + + // Format as a number + DecimalFormat df = new DecimalFormat(formatString); + return df.format(value); + } + } + } + + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); + System.err.println(" XLS2CSVmra [min columns]"); + System.exit(1); + } + + int minColumns = -1; + if(args.length >= 2) { + minColumns = Integer.parseInt(args[1]); + } + + XLS2CSVmra xls2csv = new XLS2CSVmra(args[0], minColumns); + xls2csv.process(); + } +} diff --git a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java index 718c025168..8f3eebb2d3 100644 --- a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java @@ -42,6 +42,7 @@ import org.apache.poi.hssf.record.NoteRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; +import org.apache.poi.hssf.record.StringRecord; import org.apache.poi.hssf.usermodel.HSSFDateUtil; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -142,6 +143,9 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor { private int sheetNum = -1; private int rowNum; + private boolean outputNextStringValue = false; + private int nextRow = -1; + public void processRecord(Record record) { String thisText = null; int thisRow = -1; @@ -175,12 +179,24 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor { thisText = FormulaParser.toFormulaString(null, frec.getParsedExpression()); } else { if(Double.isNaN( frec.getValue() )) { - thisText = "(todo - string formulas)"; + // Formula result is a string + // This is stored in the next record + outputNextStringValue = true; + nextRow = frec.getRow(); } else { thisText = formatNumberDateCell(frec, frec.getValue()); } } break; + case StringRecord.sid: + if(outputNextStringValue) { + // String for formula + StringRecord srec = (StringRecord)record; + thisText = srec.getString(); + thisRow = nextRow; + outputNextStringValue = false; + } + break; case LabelRecord.sid: LabelRecord lrec = (LabelRecord) record; thisRow = lrec.getRow(); diff --git a/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java b/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java deleted file mode 100644 index 1c53f1ecc2..0000000000 --- a/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java +++ /dev/null @@ -1,303 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hssf.eventusermodel.examples; - -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintStream; -import java.text.DateFormat; -import java.text.DecimalFormat; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.Hashtable; -import java.util.List; -import java.util.Map; - -import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; -import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; -import org.apache.poi.hssf.eventusermodel.HSSFListener; -import org.apache.poi.hssf.eventusermodel.HSSFRequest; -import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener; -import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord; -import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord; -import org.apache.poi.hssf.record.BlankRecord; -import org.apache.poi.hssf.record.BoolErrRecord; -import org.apache.poi.hssf.record.CellValueRecordInterface; -import org.apache.poi.hssf.record.ExtendedFormatRecord; -import org.apache.poi.hssf.record.FormatRecord; -import org.apache.poi.hssf.record.FormulaRecord; -import org.apache.poi.hssf.record.LabelRecord; -import org.apache.poi.hssf.record.LabelSSTRecord; -import org.apache.poi.hssf.record.NoteRecord; -import org.apache.poi.hssf.record.NumberRecord; -import org.apache.poi.hssf.record.RKRecord; -import org.apache.poi.hssf.record.Record; -import org.apache.poi.hssf.record.SSTRecord; -import org.apache.poi.hssf.usermodel.HSSFDataFormat; -import org.apache.poi.hssf.usermodel.HSSFDateUtil; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; - -/** - * A XLS -> CSV processor, that uses the MissingRecordAware - * EventModel code to ensure it outputs all columns and rows. - * @author Nick Burch - */ -public class XLS2CSVmra implements HSSFListener { - private int minColumns; - private POIFSFileSystem fs; - private PrintStream output; - - private int lastRowNumber; - private int lastColumnNumber; - - /** Should we output the formula, or the value it has? */ - private boolean outputFormulaValues = true; - - // Records we pick up as we process - private SSTRecord sstRecord; - private FormatTrackingHSSFListener formatListener; - - /** - * Creates a new XLS -> CSV converter - * @param fs The POIFSFileSystem to process - * @param output The PrintStream to output the CSV to - * @param minColumns The minimum number of columns to output, or -1 for no minimum - */ - public XLS2CSVmra(POIFSFileSystem fs, PrintStream output, int minColumns) { - this.fs = fs; - this.output = output; - this.minColumns = minColumns; - } - - /** - * Creates a new XLS -> CSV converter - * @param filename The file to process - * @param minColumns The minimum number of columns to output, or -1 for no minimum - * @throws IOException - * @throws FileNotFoundException - */ - public XLS2CSVmra(String filename, int minColumns) throws IOException, FileNotFoundException { - this( - new POIFSFileSystem(new FileInputStream(filename)), - System.out, minColumns - ); - } - - /** - * Initiates the processing of the XLS file to CSV - */ - public void process() throws IOException { - MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this); - formatListener = new FormatTrackingHSSFListener(listener); - - HSSFEventFactory factory = new HSSFEventFactory(); - HSSFRequest request = new HSSFRequest(); - request.addListenerForAllRecords(formatListener); - - factory.processWorkbookEvents(request, fs); - } - - /** - * Main HSSFListener method, processes events, and outputs the - * CSV as the file is processed. - */ - public void processRecord(Record record) { - int thisRow = -1; - int thisColumn = -1; - String thisStr = null; - - switch (record.getSid()) - { - case SSTRecord.sid: - sstRecord = (SSTRecord) record; - break; - - case BlankRecord.sid: - BlankRecord brec = (BlankRecord) record; - - thisRow = brec.getRow(); - thisColumn = brec.getColumn(); - thisStr = ""; - break; - case BoolErrRecord.sid: - BoolErrRecord berec = (BoolErrRecord) record; - - thisRow = berec.getRow(); - thisColumn = berec.getColumn(); - thisStr = ""; - break; - case FormulaRecord.sid: - FormulaRecord frec = (FormulaRecord) record; - - thisRow = frec.getRow(); - thisColumn = frec.getColumn(); - - if(outputFormulaValues) { - thisStr = formatNumberDateCell(frec, frec.getValue()); - } else { - // TODO: Output the formula string - thisStr = '"' + frec.toString() + '"'; - } - break; - case LabelRecord.sid: - LabelRecord lrec = (LabelRecord) record; - - thisRow = lrec.getRow(); - thisColumn = lrec.getColumn(); - thisStr = '"' + lrec.getValue() + '"'; - break; - case LabelSSTRecord.sid: - LabelSSTRecord lsrec = (LabelSSTRecord) record; - - thisRow = lsrec.getRow(); - thisColumn = lsrec.getColumn(); - if(sstRecord == null) { - thisStr = '"' + "(No SST Record, can't identify string)" + '"'; - } else { - thisStr = '"' + sstRecord.getString(lsrec.getSSTIndex()).toString() + '"'; - } - break; - case NoteRecord.sid: - NoteRecord nrec = (NoteRecord) record; - - thisRow = nrec.getRow(); - thisColumn = nrec.getColumn(); - // TODO: Find object to match nrec.getShapeId() - thisStr = '"' + "(TODO)" + '"'; - break; - case NumberRecord.sid: - NumberRecord numrec = (NumberRecord) record; - - thisRow = numrec.getRow(); - thisColumn = numrec.getColumn(); - - // Format - thisStr = formatNumberDateCell(numrec, numrec.getValue()); - break; - case RKRecord.sid: - RKRecord rkrec = (RKRecord) record; - - thisRow = rkrec.getRow(); - thisColumn = rkrec.getColumn(); - thisStr = '"' + "(TODO)" + '"'; - break; - default: - break; - } - - // Handle new row - if(thisRow != -1 && thisRow != lastRowNumber) { - lastColumnNumber = -1; - } - - // Handle missing column - if(record instanceof MissingCellDummyRecord) { - MissingCellDummyRecord mc = (MissingCellDummyRecord)record; - thisRow = mc.getRow(); - thisColumn = mc.getColumn(); - thisStr = ""; - } - - // If we got something to print out, do so - if(thisStr != null) { - if(thisColumn > 0) { - output.print(','); - } - output.print(thisStr); - } - - // Update column and row count - if(thisRow > -1) - lastRowNumber = thisRow; - if(thisColumn > -1) - lastColumnNumber = thisColumn; - - // Handle end of row - if(record instanceof LastCellOfRowDummyRecord) { - // Print out any missing commas if needed - if(minColumns > 0) { - // Columns are 0 based - if(lastColumnNumber == -1) { lastColumnNumber = 0; } - for(int i=lastColumnNumber; i<(minColumns); i++) { - output.print(','); - } - } - - // We're onto a new row - lastColumnNumber = -1; - - // End the row - output.println(); - } - } - - /** - * Formats a number or date cell, be that a real number, or the - * answer to a formula - */ - private String formatNumberDateCell(CellValueRecordInterface cell, double value) { - // Get the built in format, if there is one - int formatIndex = formatListener.getFormatIndex(cell); - String formatString = formatListener.getFormatString(cell); - - if(formatString == null) { - return Double.toString(value); - } else { - // Is it a date? - if(HSSFDateUtil.isADateFormat(formatIndex,formatString) && - HSSFDateUtil.isValidExcelDate(value)) { - // Java wants M not m for month - formatString = formatString.replace('m','M'); - // Change \- into -, if it's there - formatString = formatString.replaceAll("\\\\-","-"); - - // Format as a date - Date d = HSSFDateUtil.getJavaDate(value, false); - DateFormat df = new SimpleDateFormat(formatString); - return df.format(d); - } else { - if(formatString == "General") { - // Some sort of wierd default - return Double.toString(value); - } - - // Format as a number - DecimalFormat df = new DecimalFormat(formatString); - return df.format(value); - } - } - } - - - public static void main(String[] args) throws Exception { - if(args.length < 1) { - System.err.println("Use:"); - System.err.println(" XLS2CSVmra [min columns]"); - System.exit(1); - } - - int minColumns = -1; - if(args.length >= 2) { - minColumns = Integer.parseInt(args[1]); - } - - XLS2CSVmra xls2csv = new XLS2CSVmra(args[0], minColumns); - xls2csv.process(); - } -} diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java index b93bc65a24..63d67ee771 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java @@ -136,9 +136,7 @@ public final class TestExcelExtractor extends TestCase { extractor.setIncludeSheetNames(true); String text = extractor.getText(); - // TODO - assertEquals("Sheet1\nreplaceme\nreplaceme\n(todo - string formulas)\nSheet2\nSheet3\n", text); -// assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", text); + assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", text); extractor.setIncludeSheetNames(false); extractor.setFormulasNotResults(true);