From 710aa84544b1be885c04ccc7f9ad37c2788d6e73 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 7 Oct 2015 15:19:04 +0000 Subject: [PATCH] Update the XSSF EventModel example, to show how to do it better using the SheetContentsHandler helper that does most of the work for you git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1707326 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/xssf/eventusermodel/XLSX2CSV.java | 332 ++++-------------- .../eventusermodel/examples/FromHowTo.java | 14 +- 2 files changed, 83 insertions(+), 263 deletions(-) diff --git a/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java b/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java index e15b4ec0be..8e0007edf2 100644 --- a/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java +++ b/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java @@ -23,32 +23,28 @@ import java.io.InputStream; import java.io.PrintStream; import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; -import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.util.CellReference; +import org.apache.poi.util.SAXHelper; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.model.StylesTable; -import org.apache.poi.xssf.usermodel.XSSFCellStyle; -import org.apache.poi.xssf.usermodel.XSSFRichTextString; -import org.xml.sax.Attributes; +import org.apache.poi.xssf.usermodel.XSSFComment; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; /** * A rudimentary XLSX -> CSV processor modeled on the - * POI sample program XLS2CSVmra by Nick Burch from the - * package org.apache.poi.hssf.eventusermodel.examples. - * Unlike the HSSF version, this one completely ignores - * missing rows. + * POI sample program XLS2CSVmra from the package + * org.apache.poi.hssf.eventusermodel.examples. + * As with the HSSF version, this tries to spot missing + * rows and cells, and output empty entries for them. *

* Data sheets are read using a SAX parser to keep the * memory footprint relatively small, so this should be @@ -61,263 +57,81 @@ import org.xml.sax.helpers.DefaultHandler; *

* For a more advanced implementation of SAX event parsing * of XLSX files, see {@link XSSFEventBasedExcelExtractor} - * and {@link XSSFSheetXMLHandler}. Note that some use cases, + * and {@link XSSFSheetXMLHandler}. Note that for many cases, * it may be possible to simply use those with a custom * {@link SheetContentsHandler} and no SAX code needed of * your own! */ public class XLSX2CSV { - /** - * The type of the data value is indicated by an attribute on the cell. - * The value is usually in a "v" element within the cell. + * Uses the XSSF Event SAX helpers to do most of the work + * of parsing the Sheet XML, and outputs the contents + * as a (basic) CSV. */ - enum xssfDataType { - BOOL, - ERROR, - FORMULA, - INLINESTR, - SSTINDEX, - NUMBER, - } - - - /** - * Derived from http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api - *

- * Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at - * http://www.ecma-international.org/publications/standards/Ecma-376.htm - *

- * A web-friendly version is http://openiso.org/Ecma/376/Part4 - */ - class MyXSSFSheetHandler extends DefaultHandler { - - /** - * Table with styles - */ - private StylesTable stylesTable; - - /** - * Table with unique strings - */ - private ReadOnlySharedStringsTable sharedStringsTable; - - // Set when V start element is seen - private boolean vIsOpen; - - // Set when cell start element is seen; - // used when cell close element is seen. - private xssfDataType nextDataType; - - // Used to format numeric cell values. - private short formatIndex; - private String formatString; - private final DataFormatter formatter; - - private int thisColumn = -1; - // The last column printed to the output stream - private int lastColumnNumber = -1; - - // Gathers characters as they are seen. - private StringBuffer value; - - /** - * Accepts objects needed while parsing. - * - * @param styles Table of styles - * @param strings Table of shared strings - * @param cols Minimum number of columns to show - * @param target Sink for output - */ - public MyXSSFSheetHandler( - StylesTable styles, - ReadOnlySharedStringsTable strings) { - this.stylesTable = styles; - this.sharedStringsTable = strings; - this.value = new StringBuffer(); - this.nextDataType = xssfDataType.NUMBER; - this.formatter = new DataFormatter(); - } - - /* - * (non-Javadoc) - * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) - */ - public void startElement(String uri, String localName, String name, - Attributes attributes) throws SAXException { - - if ("inlineStr".equals(name) || "v".equals(name)) { - vIsOpen = true; - // Clear contents cache - value.setLength(0); - } - // c => cell - else if ("c".equals(name)) { - // Get the cell reference - String r = attributes.getValue("r"); - int firstDigit = -1; - for (int c = 0; c < r.length(); ++c) { - if (Character.isDigit(r.charAt(c))) { - firstDigit = c; - break; - } - } - thisColumn = nameToColumn(r.substring(0, firstDigit)); - - // Set up defaults. - this.nextDataType = xssfDataType.NUMBER; - this.formatIndex = -1; - this.formatString = null; - String cellType = attributes.getValue("t"); - String cellStyleStr = attributes.getValue("s"); - if ("b".equals(cellType)) { - nextDataType = xssfDataType.BOOL; - } else if ("e".equals(cellType)) { - nextDataType = xssfDataType.ERROR; - } else if ("inlineStr".equals(cellType)) { - nextDataType = xssfDataType.INLINESTR; - } else if ("s".equals(cellType)) { - nextDataType = xssfDataType.SSTINDEX; - } else if ("str".equals(cellType)) { - nextDataType = xssfDataType.FORMULA; - } else if (cellStyleStr != null) { - // It's a number, but almost certainly one - // with a special style or format - int styleIndex = Integer.parseInt(cellStyleStr); - XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); - this.formatIndex = style.getDataFormat(); - this.formatString = style.getDataFormatString(); - if (this.formatString == null) { - this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex); - } + private class SheetToCSV implements SheetContentsHandler { + private boolean firstCellOfRow = false; + private int currentRow = -1; + private int currentCol = -1; + + private void outputMissingRows(int number) { + for (int i=0; i contents of a cell - if ("v".equals(name)) { - // Process the value contents as required. - // Do now, as characters() may be called more than once - switch (nextDataType) { - - case BOOL: - char first = value.charAt(0); - thisStr = first == '0' ? "FALSE" : "TRUE"; - break; - - case ERROR: - thisStr = "\"ERROR:" + value.toString() + '"'; - break; - - case FORMULA: - // A formula could result in a string value, - // so always add double-quote characters. - thisStr = '"' + value.toString() + '"'; - break; - - case INLINESTR: - // TODO: have seen an example of this, so it's untested. - XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); - thisStr = '"' + rtsi.toString() + '"'; - break; - - case SSTINDEX: - String sstIndex = value.toString(); - try { - int idx = Integer.parseInt(sstIndex); - XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx)); - thisStr = '"' + rtss.toString() + '"'; - } - catch (NumberFormatException ex) { - output.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString()); - } - break; - - case NUMBER: - String n = value.toString(); - if (this.formatString != null && n.length() > 0) - thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString); - else - thisStr = n; - break; - - default: - thisStr = "(TODO: Unexpected type: " + nextDataType + ")"; - break; - } - - // Output after we've seen the string contents - // Emit commas for any fields that were missing on this row - if (lastColumnNumber == -1) { - lastColumnNumber = 0; - } - for (int i = lastColumnNumber; i < thisColumn; ++i) - output.print(','); - - // Might be the empty string. - output.print(thisStr); - - // Update column - if (thisColumn > -1) - lastColumnNumber = thisColumn; - - } else if ("row".equals(name)) { - - // Print out any missing commas if needed - if (minColumns > 0) { - // Columns are 0 based - if (lastColumnNumber == -1) { - lastColumnNumber = 0; - } - for (int i = lastColumnNumber; i < minColumns; i++) { - output.print(','); - } - } - - // We're onto a new row - output.println(); - lastColumnNumber = -1; - } - + public void startRow(int rowNum) { + // If there were gaps, output the missing rows + outputMissingRows(rowNum-currentRow-1); + // Prepare for this row + firstCellOfRow = true; + currentRow = rowNum; + currentCol = -1; } - /** - * Captures characters only if a suitable element is open. - * Originally was just "v"; extended for inlineStr also. - */ - public void characters(char[] ch, int start, int length) - throws SAXException { - if (vIsOpen) - value.append(ch, start, length); + public void endRow(int rowNum) { + // Ensure the minimum number of columns + for (int i=currentCol; i