]> source.dussan.org Git - poi.git/commitdiff
Update the XSSF EventModel example, to show how to do it better using the SheetConten...
authorNick Burch <nick@apache.org>
Wed, 7 Oct 2015 15:19:04 +0000 (15:19 +0000)
committerNick Burch <nick@apache.org>
Wed, 7 Oct 2015 15:19:04 +0000 (15:19 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1707326 13f79535-47bb-0310-9956-ffa450edef68

src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java
src/examples/src/org/apache/poi/xssf/eventusermodel/examples/FromHowTo.java

index e15b4ec0be87736a24c946996b14f10af5a84481..8e0007edf2269d9e7e553ff57b01afed6ba6b363 100644 (file)
@@ -23,32 +23,28 @@ import java.io.InputStream;
 import java.io.PrintStream;
 
 import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
 
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackageAccess;
-import org.apache.poi.ss.usermodel.BuiltinFormats;
 import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.util.CellReference;
+import org.apache.poi.util.SAXHelper;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xssf.model.StylesTable;
-import org.apache.poi.xssf.usermodel.XSSFCellStyle;
-import org.apache.poi.xssf.usermodel.XSSFRichTextString;
-import org.xml.sax.Attributes;
+import org.apache.poi.xssf.usermodel.XSSFComment;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.XMLReader;
-import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * A rudimentary XLSX -> CSV processor modeled on the
- * POI sample program XLS2CSVmra by Nick Burch from the
- * package org.apache.poi.hssf.eventusermodel.examples.
- * Unlike the HSSF version, this one completely ignores
- * missing rows.
+ * POI sample program XLS2CSVmra from the package
+ * org.apache.poi.hssf.eventusermodel.examples.
+ * As with the HSSF version, this tries to spot missing
+ *  rows and cells, and output empty entries for them.
  * <p/>
  * Data sheets are read using a SAX parser to keep the
  * memory footprint relatively small, so this should be
@@ -61,263 +57,81 @@ import org.xml.sax.helpers.DefaultHandler;
  * <p/>
  * For a more advanced implementation of SAX event parsing
  * of XLSX files, see {@link XSSFEventBasedExcelExtractor}
- * and {@link XSSFSheetXMLHandler}. Note that some use cases,
+ * and {@link XSSFSheetXMLHandler}. Note that for many cases,
  * it may be possible to simply use those with a custom 
  * {@link SheetContentsHandler} and no SAX code needed of
  * your own!
  */
 public class XLSX2CSV {
-
     /**
-     * The type of the data value is indicated by an attribute on the cell.
-     * The value is usually in a "v" element within the cell.
+     * Uses the XSSF Event SAX helpers to do most of the work
+     *  of parsing the Sheet XML, and outputs the contents
+     *  as a (basic) CSV.
      */
-    enum xssfDataType {
-        BOOL,
-        ERROR,
-        FORMULA,
-        INLINESTR,
-        SSTINDEX,
-        NUMBER,
-    }
-
-
-    /**
-     * Derived from http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api
-     * <p/>
-     * Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at
-     * http://www.ecma-international.org/publications/standards/Ecma-376.htm
-     * <p/>
-     * A web-friendly version is http://openiso.org/Ecma/376/Part4
-     */
-    class MyXSSFSheetHandler extends DefaultHandler {
-
-        /**
-         * Table with styles
-         */
-        private StylesTable stylesTable;
-
-        /**
-         * Table with unique strings
-         */
-        private ReadOnlySharedStringsTable sharedStringsTable;
-
-        // Set when V start element is seen
-        private boolean vIsOpen;
-
-        // Set when cell start element is seen;
-        // used when cell close element is seen.
-        private xssfDataType nextDataType;
-
-        // Used to format numeric cell values.
-        private short formatIndex;
-        private String formatString;
-        private final DataFormatter formatter;
-
-        private int thisColumn = -1;
-        // The last column printed to the output stream
-        private int lastColumnNumber = -1;
-
-        // Gathers characters as they are seen.
-        private StringBuffer value;
-
-        /**
-         * Accepts objects needed while parsing.
-         *
-         * @param styles  Table of styles
-         * @param strings Table of shared strings
-         * @param cols    Minimum number of columns to show
-         * @param target  Sink for output
-         */
-        public MyXSSFSheetHandler(
-                StylesTable styles,
-                ReadOnlySharedStringsTable strings) {
-            this.stylesTable = styles;
-            this.sharedStringsTable = strings;
-            this.value = new StringBuffer();
-            this.nextDataType = xssfDataType.NUMBER;
-            this.formatter = new DataFormatter();
-        }
-
-        /*
-           * (non-Javadoc)
-           * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
-           */
-        public void startElement(String uri, String localName, String name,
-                                 Attributes attributes) throws SAXException {
-
-            if ("inlineStr".equals(name) || "v".equals(name)) {
-                vIsOpen = true;
-                // Clear contents cache
-                value.setLength(0);
-            }
-            // c => cell
-            else if ("c".equals(name)) {
-                // Get the cell reference
-                String r = attributes.getValue("r");
-                int firstDigit = -1;
-                for (int c = 0; c < r.length(); ++c) {
-                    if (Character.isDigit(r.charAt(c))) {
-                        firstDigit = c;
-                        break;
-                    }
-                }
-                thisColumn = nameToColumn(r.substring(0, firstDigit));
-
-                // Set up defaults.
-                this.nextDataType = xssfDataType.NUMBER;
-                this.formatIndex = -1;
-                this.formatString = null;
-                String cellType = attributes.getValue("t");
-                String cellStyleStr = attributes.getValue("s");
-                if ("b".equals(cellType)) {
-                    nextDataType = xssfDataType.BOOL;
-                } else if ("e".equals(cellType)) {
-                    nextDataType = xssfDataType.ERROR;
-                } else if ("inlineStr".equals(cellType)) {
-                    nextDataType = xssfDataType.INLINESTR;
-                } else if ("s".equals(cellType)) {
-                    nextDataType = xssfDataType.SSTINDEX;
-                } else if ("str".equals(cellType)) {
-                    nextDataType = xssfDataType.FORMULA;
-                } else if (cellStyleStr != null) {
-                    // It's a number, but almost certainly one
-                    //  with a special style or format 
-                    int styleIndex = Integer.parseInt(cellStyleStr);
-                    XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
-                    this.formatIndex = style.getDataFormat();
-                    this.formatString = style.getDataFormatString();
-                    if (this.formatString == null) {
-                        this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);
-                    }
+    private class SheetToCSV implements SheetContentsHandler {
+        private boolean firstCellOfRow = false;
+        private int currentRow = -1;
+        private int currentCol = -1;
+        
+        private void outputMissingRows(int number) {
+            for (int i=0; i<number; i++) {
+                for (int j=0; j<minColumns; j++) {
+                    output.append(',');
                 }
+                output.append('\n');
             }
-
         }
 
-        /*
-           * (non-Javadoc)
-           * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
-           */
-        public void endElement(String uri, String localName, String name)
-                throws SAXException {
-
-            String thisStr = null;
-
-            // v => contents of a cell
-            if ("v".equals(name)) {
-                // Process the value contents as required.
-                // Do now, as characters() may be called more than once
-                switch (nextDataType) {
-
-                    case BOOL:
-                        char first = value.charAt(0);
-                        thisStr = first == '0' ? "FALSE" : "TRUE";
-                        break;
-
-                    case ERROR:
-                        thisStr = "\"ERROR:" + value.toString() + '"';
-                        break;
-
-                    case FORMULA:
-                        // A formula could result in a string value,
-                        // so always add double-quote characters.
-                        thisStr = '"' + value.toString() + '"';
-                        break;
-
-                    case INLINESTR:
-                        // TODO: have seen an example of this, so it's untested.
-                        XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
-                        thisStr = '"' + rtsi.toString() + '"';
-                        break;
-
-                    case SSTINDEX:
-                        String sstIndex = value.toString();
-                        try {
-                            int idx = Integer.parseInt(sstIndex);
-                            XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
-                            thisStr = '"' + rtss.toString() + '"';
-                        }
-                        catch (NumberFormatException ex) {
-                            output.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString());
-                        }
-                        break;
-
-                    case NUMBER:
-                        String n = value.toString();
-                        if (this.formatString != null && n.length() > 0)
-                            thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString);
-                        else
-                            thisStr = n;
-                        break;
-
-                    default:
-                        thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
-                        break;
-                }
-
-                // Output after we've seen the string contents
-                // Emit commas for any fields that were missing on this row
-                if (lastColumnNumber == -1) {
-                    lastColumnNumber = 0;
-                }
-                for (int i = lastColumnNumber; i < thisColumn; ++i)
-                    output.print(',');
-
-                // Might be the empty string.
-                output.print(thisStr);
-
-                // Update column
-                if (thisColumn > -1)
-                    lastColumnNumber = thisColumn;
-
-            } else if ("row".equals(name)) {
-
-                // Print out any missing commas if needed
-                if (minColumns > 0) {
-                    // Columns are 0 based
-                    if (lastColumnNumber == -1) {
-                        lastColumnNumber = 0;
-                    }
-                    for (int i = lastColumnNumber; i < minColumns; i++) {
-                        output.print(',');
-                    }
-                }
-
-                // We're onto a new row
-                output.println();
-                lastColumnNumber = -1;
-            }
-
+        public void startRow(int rowNum) {
+            // If there were gaps, output the missing rows
+            outputMissingRows(rowNum-currentRow-1);
+            // Prepare for this row
+            firstCellOfRow = true;
+            currentRow = rowNum;
+            currentCol = -1;
         }
 
-        /**
-         * Captures characters only if a suitable element is open.
-         * Originally was just "v"; extended for inlineStr also.
-         */
-        public void characters(char[] ch, int start, int length)
-                throws SAXException {
-            if (vIsOpen)
-                value.append(ch, start, length);
+        public void endRow(int rowNum) {
+            // Ensure the minimum number of columns
+            for (int i=currentCol; i<minColumns; i++) {
+                output.append(',');
+            }
+            output.append('\n');
         }
 
-        /**
-         * Converts an Excel column name like "C" to a zero-based index.
-         *
-         * @param name
-         * @return Index corresponding to the specified name
-         */
-        private int nameToColumn(String name) {
-            int column = -1;
-            for (int i = 0; i < name.length(); ++i) {
-                int c = name.charAt(i);
-                column = (column + 1) * 26 + c - 'A';
+        public void cell(String cellReference, String formattedValue,
+                XSSFComment comment) {
+            if (firstCellOfRow) {
+                firstCellOfRow = false;
+            } else {
+                output.append(',');
+            }
+            
+            // Did we miss any cells?
+            int thisCol = (new CellReference(cellReference)).getCol();
+            int missedCols = thisCol - currentCol - 1;
+            for (int i=0; i<missedCols; i++) {
+                output.append(',');
+            }
+            currentCol = thisCol;
+            
+            // Number or string?
+            try {
+                Double.parseDouble(formattedValue);
+                output.append(formattedValue);
+            } catch (NumberFormatException e) {
+                output.append('"');
+                output.append(formattedValue);
+                output.append('"');
             }
-            return column;
         }
 
+        public void headerFooter(String text, boolean isHeader, String tagName) {
+            // Skip, no headers or footers in CSV
+        }
     }
 
+
     ///////////////////////////////////////
 
     private final OPCPackage xlsxPackage;
@@ -356,16 +170,20 @@ public class XLSX2CSV {
     public void processSheet(
             StylesTable styles,
             ReadOnlySharedStringsTable strings,
+            SheetContentsHandler sheetHandler, 
             InputStream sheetInputStream)
             throws IOException, ParserConfigurationException, SAXException {
-
+        DataFormatter formatter = new DataFormatter();
         InputSource sheetSource = new InputSource(sheetInputStream);
-        SAXParserFactory saxFactory = SAXParserFactory.newInstance();
-        SAXParser saxParser = saxFactory.newSAXParser();
-        XMLReader sheetParser = saxParser.getXMLReader();
-        ContentHandler handler = new MyXSSFSheetHandler(styles, strings);
-        sheetParser.setContentHandler(handler);
-        sheetParser.parse(sheetSource);
+        try {
+            XMLReader sheetParser = SAXHelper.newXMLReader();
+            ContentHandler handler = new XSSFSheetXMLHandler(
+                  styles, null, strings, sheetHandler, formatter, false);
+            sheetParser.setContentHandler(handler);
+            sheetParser.parse(sheetSource);
+         } catch(ParserConfigurationException e) {
+            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
+         }
     }
 
     /**
@@ -378,7 +196,6 @@ public class XLSX2CSV {
      */
     public void process()
             throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {
-
         ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage);
         XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
         StylesTable styles = xssfReader.getStylesTable();
@@ -389,7 +206,7 @@ public class XLSX2CSV {
             String sheetName = iter.getSheetName();
             this.output.println();
             this.output.println(sheetName + " [index=" + index + "]:");
-            processSheet(styles, strings, stream);
+            processSheet(styles, strings, new SheetToCSV(), stream);
             stream.close();
             ++index;
         }
@@ -418,5 +235,4 @@ public class XLSX2CSV {
                xlsx2csv.process();
                p.close();
        }
-
 }
index 4df63448ede05f127991d9d8ba7675444018cac5..cd102144e093a53134714b1f5e983c40948f0b9a 100644 (file)
@@ -19,6 +19,7 @@ package org.apache.poi.xssf.eventusermodel.examples;
 import java.io.InputStream;
 import java.util.Iterator;
 
+import org.apache.poi.xssf.eventusermodel.XLSX2CSV;
 import org.apache.poi.xssf.eventusermodel.XSSFReader;
 import org.apache.poi.xssf.model.SharedStringsTable;
 import org.apache.poi.xssf.usermodel.XSSFRichTextString;
@@ -32,18 +33,21 @@ import org.xml.sax.helpers.DefaultHandler;
 import org.xml.sax.helpers.XMLReaderFactory;
 
 /**
- * XSSF and SAX (Event API)
+ * XSSF and SAX (Event API) basic example.
+ * See {@link XLSX2CSV} for a fuller example of doing
+ *  XSLX processing with the XSSF Event code.
  */
 public class FromHowTo {
-       public void processOneSheet(String filename) throws Exception {
+       public void processFirstSheet(String filename) throws Exception {
                OPCPackage pkg = OPCPackage.open(filename);
                XSSFReader r = new XSSFReader( pkg );
                SharedStringsTable sst = r.getSharedStringsTable();
 
                XMLReader parser = fetchSheetParser(sst);
 
-               // rId2 found by processing the Workbook
-               // Seems to either be rId# or rSheet#
+               // To look up the Sheet Name / Sheet Order / rID,
+               //  you need to process the core Workbook stream.
+               // Normally it's of the form rId# or rSheet#
                InputStream sheet2 = r.getSheet("rId2");
                InputSource sheetSource = new InputSource(sheet2);
                parser.parse(sheetSource);
@@ -133,7 +137,7 @@ public class FromHowTo {
        
        public static void main(String[] args) throws Exception {
                FromHowTo howto = new FromHowTo();
-               howto.processOneSheet(args[0]);
+               howto.processFirstSheet(args[0]);
                howto.processAllSheets(args[0]);
        }
 }