]> source.dussan.org Git - poi.git/commitdiff
New event based xssf text extractor (XSSFEventBasedExcelExtractor)
author Nick Burch <nick@apache.org>
Tue, 26 Jan 2010 11:39:44 +0000 (11:39 +0000)
committer Nick Burch <nick@apache.org>
Tue, 26 Jan 2010 11:39:44 +0000 (11:39 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@903182 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
src/ooxml/testcases/org/apache/poi/xssf/XSSFTestDataSamples.java
src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java [new file with mode: 0644]

index 7842c6136f2e4ace54da8ab9592ad269175d31cd..79da11ad8213c51a5cd4268d020140278ca95d98 100644 (file)
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">New event based xssf text extractor (XSSFEventBasedExcelExtractor)</action>
            <action dev="POI-DEVELOPERS" type="add">ExtractorFactory can now be told to prefer Event Based extractors (current Excel only) on a per-thread or overall basis</action>
            <action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
            <action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>
index ed7f22ac8dc4845580fa0d6e4de515caec21e515..ee3da8c61423bb22c5e87835adc345545fe0c79f 100644 (file)
@@ -50,6 +50,7 @@ import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
@@ -161,8 +162,7 @@ public class ExtractorFactory {
             corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
             corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
            if(getPreferEventExtractor()) {
-              // TODO
-              return new XSSFExcelExtractor(pkg);
+              return new XSSFEventBasedExcelExtractor(pkg);
            } else {
               return new XSSFExcelExtractor(pkg);
            }
diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java
new file mode 100644 (file)
index 0000000..9be9711
--- /dev/null
@@ -0,0 +1,357 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.xssf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.ss.usermodel.BuiltinFormats;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFCellStyle;
+import org.apache.poi.xssf.usermodel.XSSFRichTextString;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Implementation of a text extractor from OOXML Excel
+ *  files that uses SAX event based parsing.
+ */
+public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
+   private OPCPackage container;
+       private boolean includeSheetNames = true;
+       private boolean formulasNotResults = false;
+
+   /**
+    * These are the different kinds of cells we support.
+    * We keep track of the current one between
+    *  the start and end.
+    * The sheet handler picks one when a cell (c) element
+    *  starts, based on its "t" attribute, and uses it to
+    *  decide how to render the value once the v element ends.
+    */
+   enum xssfDataType {
+       BOOLEAN, // t="b"
+       ERROR, // t="e"
+       FORMULA, // t="str", or an otherwise untyped cell holding an f element
+       INLINE_STRING, // t="inlineStr"
+       SST_STRING, // t="s", the value is an index into the shared strings table
+       NUMBER, // the default, used when none of the above applies
+   }
+   
+       /**
+        * Creates an extractor for the file at the given path, opening
+        *  it as an OPC package and handing that to the package constructor.
+        */
+       public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+               this(OPCPackage.open(path));
+       }
+       /**
+        * Creates an extractor over an already opened package. The package
+        *  is only stored here; it is read each time getText() is called.
+        */
+       public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+               // No document object is handed up to the superclass
+               // NOTE(review): inherited methods that rely on the document may not work - confirm
+               super(null);
+               this.container = container;
+       }
+
+       /**
+        * Command line entry point. Prints the text of the workbook
+        *  named by the first argument to standard out, or a usage
+        *  message to standard error if no argument was given.
+        */
+       public static void main(String[] args) throws Exception {
+               if(args.length == 0) {
+                       System.err.println("Use:");
+                       System.err.println("  XSSFEventBasedExcelExtractor <filename.xlsx>");
+                       System.exit(1);
+               }
+               String filename = args[0];
+               String text = new XSSFEventBasedExcelExtractor(filename).getText();
+               System.out.println(text);
+       }
+
+       /**
+        * Should sheet names be included? Default is true
+        * When set, getText() puts the name of each sheet on a
+        *  line of its own, ahead of the text of that sheet.
+        */
+       public void setIncludeSheetNames(boolean includeSheetNames) {
+               this.includeSheetNames = includeSheetNames;
+       }
+       /**
+        * Should we return the formula itself, and not
+        *  the result it produces? Default is false
+        * Only affects cells the sheet handler treats as formula
+        *  cells; the setting is read as each cell value ends.
+        */
+       public void setFormulasNotResults(boolean formulasNotResults) {
+               this.formulasNotResults = formulasNotResults;
+       }
+       
+       
+   /**
+    * Handler for sheets. Processes each row and cell,
+    *  formatting Cells as best as it can.
+    */
+   class MyXSSFSheetHandler extends DefaultHandler {
+       /**
+        * Table with the styles used for formatting
+        */
+       private StylesTable stylesTable;
+
+       private ReadOnlySharedStringsTable sharedStringsTable;
+
+       /**
+        * Where our text is going
+        */
+       private final StringBuffer output;
+
+       // Set when V start element is seen
+       private boolean vIsOpen;
+       // Set when F start element is seen
+       private boolean fIsOpen;
+
+       // Set when cell start element is seen;
+       // used when cell close element is seen.
+       private xssfDataType nextDataType;
+
+       // Used to format numeric cell values.
+       private short formatIndex;
+       private String formatString;
+       private final DataFormatter formatter;
+
+       // Gathers characters as they are seen.
+       private StringBuffer value = new StringBuffer();
+       private StringBuffer formula = new StringBuffer();
+       private boolean firstCellOfRow = true;
+
+       /**
+        * Accepts objects needed while parsing.
+        *
+        * @param styles  Table of styles
+        * @param strings Table of shared strings
+        * @param output  Sink for the extracted text
+        */
+       public MyXSSFSheetHandler(
+               StylesTable styles,
+               ReadOnlySharedStringsTable strings,
+               StringBuffer output) {
+           this.stylesTable = styles;
+           this.sharedStringsTable = strings;
+           this.output = output;
+           this.nextDataType = xssfDataType.NUMBER; // same default that each new cell is reset to
+           this.formatter = new DataFormatter();
+       }
+
+       /**
+        * Reacts to the start of the elements we care about:
+        *  row, c (cell), v (value) and f (formula). Everything
+        *  else is ignored.
+        */
+       public void startElement(String uri, String localName, String name,
+                                Attributes attributes) throws SAXException {
+
+           // row => the next cell written must not be preceded by a tab
+           if ("row".equals(name)) {
+               firstCellOfRow = true;
+               return;
+           }
+
+           // v (or inlineStr) => start collecting a fresh value
+           if ("v".equals(name) || "inlineStr".equals(name)) {
+               value.setLength(0);
+               vIsOpen = true;
+               return;
+           }
+
+           // f => formula
+           if ("f".equals(name)) {
+               formula.setLength(0);
+
+               // An otherwise untyped cell holding a formula is a formula cell
+               if (nextDataType == xssfDataType.NUMBER) {
+                   nextDataType = xssfDataType.FORMULA;
+               }
+
+               // The formula text is only collected when it isn't a shared one
+               boolean sharedFormula = "shared".equals(attributes.getValue("t"));
+               if (sharedFormula) {
+                   System.err.println("Warning - shared formulas not yet supported!");
+               } else {
+                   fIsOpen = true;
+               }
+               return;
+           }
+
+           // Anything other than c (cell) is of no interest
+           if (!"c".equals(name)) {
+               return;
+           }
+
+           // Start from the defaults, then refine from the attributes
+           this.nextDataType = xssfDataType.NUMBER;
+           this.formatIndex = -1;
+           this.formatString = null;
+
+           String typeAttr = attributes.getValue("t");
+           String styleAttr = attributes.getValue("s");
+           if ("s".equals(typeAttr)) {
+               nextDataType = xssfDataType.SST_STRING;
+           } else if ("str".equals(typeAttr)) {
+               nextDataType = xssfDataType.FORMULA;
+           } else if ("inlineStr".equals(typeAttr)) {
+               nextDataType = xssfDataType.INLINE_STRING;
+           } else if ("b".equals(typeAttr)) {
+               nextDataType = xssfDataType.BOOLEAN;
+           } else if ("e".equals(typeAttr)) {
+               nextDataType = xssfDataType.ERROR;
+           } else if (styleAttr != null) {
+               // Number, but almost certainly with a special style or format
+               XSSFCellStyle style = stylesTable.getStyleAt(Integer.parseInt(styleAttr));
+               this.formatIndex = style.getDataFormat();
+               this.formatString = style.getDataFormatString();
+               if (this.formatString == null) {
+                   // No format string on the style, so fall back to the builtin one
+                   this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);
+               }
+           }
+       }
+
+       /**
+        * Reacts to the end of an element. When a v (value) element
+        *  ends, the collected value is turned into text according to
+        *  the type of the current cell and appended to the output,
+        *  with a tab ahead of every cell but the first of a row.
+        * When a row ends, the line is finished with a newline.
+        */
+       public void endElement(String uri, String localName, String name)
+               throws SAXException {
+           String thisStr = null;
+
+           // v => contents of a cell
+           if ("v".equals(name)) {
+               vIsOpen = false;
+
+               // Process the value contents as required, now we have it all
+               switch (nextDataType) {
+                   case BOOLEAN:
+                       // An empty value has no first character to inspect
+                       if (value.length() == 0) {
+                           thisStr = "";
+                       } else {
+                           thisStr = value.charAt(0) == '0' ? "FALSE" : "TRUE";
+                       }
+                       break;
+
+                   case ERROR:
+                       thisStr = "ERROR:" + value.toString();
+                       break;
+
+                   case FORMULA:
+                       if(formulasNotResults) {
+                          thisStr = formula.toString();
+                       } else {
+                          thisStr = value.toString();
+                       }
+                       break;
+
+                   case INLINE_STRING:
+                       // TODO: have not seen an example of this, so it's untested.
+                       XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
+                       thisStr = rtsi.toString();
+                       break;
+
+                   case SST_STRING:
+                       // The value is the index of the string in the shared table
+                       String sstIndex = value.toString();
+                       try {
+                           int idx = Integer.parseInt(sstIndex);
+                           XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
+                           thisStr = rtss.toString();
+                       }
+                       catch (NumberFormatException ex) {
+                           System.err.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString());
+                       }
+                       break;
+
+                   case NUMBER:
+                       String n = value.toString();
+                       if (this.formatString != null) {
+                           try {
+                               thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString);
+                           } catch (NumberFormatException ignored) {
+                               // Not a parsable number, so output it as it was stored
+                               thisStr = n;
+                           }
+                       } else {
+                           thisStr = n;
+                       }
+                       break;
+
+                   default:
+                       thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
+                       break;
+               }
+
+               // Appending a null String would write the text "null", so
+               //  a cell we couldn't resolve is output as an empty cell
+               if(thisStr == null) {
+                  thisStr = "";
+               }
+
+               // Output
+               if(!firstCellOfRow) {
+                  output.append('\t');
+               }
+               firstCellOfRow = false;
+
+               output.append(thisStr);
+           } else if ("f".equals(name)) {
+              fIsOpen = false;
+           } else if ("row".equals(name)) {
+              // Finish the line
+              output.append('\n');
+           }
+       }
+
+       /**
+        * Captures characters only if a suitable element is open.
+        * Originally was just "v"; extended for inlineStr also.
+        * Text is added to the value buffer while a value element
+        *  is open, and to the formula buffer while a (non shared)
+        *  f element is open.
+        */
+       public void characters(char[] ch, int start, int length)
+               throws SAXException {
+           if (vIsOpen) {
+               value.append(ch, start, length);
+           }
+           if (fIsOpen) {
+              formula.append(ch, start, length);
+           }
+       }
+   }
+
+   /**
+    * Processes the given sheet, appending its text to the output.
+    *
+    * @param output Buffer that the text of the sheet is appended to
+    * @param styles Table of styles, used when formatting numeric cells
+    * @param strings Table of shared strings
+    * @param sheetInputStream The xml of the sheet. It is not closed here
+    * @throws IOException if the sheet can't be read
+    * @throws SAXException if the xml of the sheet can't be parsed
+    */
+   public void processSheet(
+           StringBuffer output,
+           StylesTable styles,
+           ReadOnlySharedStringsTable strings,
+           InputStream sheetInputStream)
+           throws IOException, SAXException {
+
+       InputSource sheetSource = new InputSource(sheetInputStream);
+       // NOTE(review): the parser is created with the factory defaults. If
+       //  untrusted files are processed, consider restricting DTDs and
+       //  external entities here.
+       SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+       try {
+          SAXParser saxParser = saxFactory.newSAXParser();
+          XMLReader sheetParser = saxParser.getXMLReader();
+          ContentHandler handler = new MyXSSFSheetHandler(styles, strings, output);
+          sheetParser.setContentHandler(handler);
+          sheetParser.parse(sheetSource);
+       } catch(ParserConfigurationException e) {
+          // Keep the original exception as the cause, so its stack trace isn't lost
+          throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
+       }
+   }
+
+   /**
+    * Processes the file and returns the text.
+    * Sheets are handled in the order the reader supplies them,
+    *  each one optionally preceded by a line holding its name.
+    *
+    * @return the text of all the sheets, or null if the file
+    *  couldn't be processed (the problem is reported on System.err)
+    */
+   public String getText() {
+       try {
+          ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container);
+          XSSFReader xssfReader = new XSSFReader(container);
+          StylesTable styles = xssfReader.getStylesTable();
+          XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+
+          StringBuffer text = new StringBuffer();
+          while (iter.hasNext()) {
+              InputStream stream = iter.next();
+              try {
+                 if(includeSheetNames) {
+                    text.append(iter.getSheetName());
+                    text.append('\n');
+                 }
+                 processSheet(text, styles, strings, stream);
+              } finally {
+                 // Release the sheet stream even if processing it failed
+                 stream.close();
+              }
+          }
+
+          return text.toString();
+       } catch(IOException e) {
+          System.err.println(e);
+          return null;
+       } catch(SAXException se) {
+          System.err.println(se);
+          return null;
+       } catch(OpenXML4JException o4je) {
+          System.err.println(o4je);
+          return null;
+       }
+   }
+}
index b4b3c9d78f77e6e1696d3d70f64610c4f301d992..0f089e2db186ed6c6e2da8ebd6f2d331f2cef472 100644 (file)
@@ -56,7 +56,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
        public static void main(String[] args) throws Exception {
                if(args.length < 1) {
                        System.err.println("Use:");
-                       System.err.println("  HXFExcelExtractor <filename.xlsx>");
+                       System.err.println("  XSSFExcelExtractor <filename.xlsx>");
                        System.exit(1);
                }
                POIXMLTextExtractor extractor =
index 81f55cc9f7e96b7069419209a4e654997b20b757..f4f178f2272ebe49d38ca3a9ac7cd6c3a92eb998 100644 (file)
@@ -32,6 +32,7 @@ import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 
@@ -427,7 +428,7 @@ public class TestExtractorFactory extends TestCase {
       
       assertTrue(
             ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
-            instanceof XSSFExcelExtractor // TODO
+            instanceof XSSFEventBasedExcelExtractor
       );
       assertTrue(
             ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
index 166c3b35e52ca3d4558806bd7e54aea22c791e5a..0d52565e775cb547c06b2bf10b22ee26dee1f8ae 100644 (file)
@@ -19,18 +19,14 @@ package org.apache.poi.xssf;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.poi.hssf.HSSFTestDataSamples;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.apache.poi.util.TempFile;
 
 /**
  * Centralises logic for finding/opening sample files in the src/testcases/org/apache/poi/hssf/hssf/data folder. 
@@ -39,6 +35,15 @@ import org.apache.poi.util.TempFile;
  */
 public class XSSFTestDataSamples {
 
+   /**
+    * Opens the named sample file as an OPCPackage. Any failure
+    *  is rethrown wrapped in a RuntimeException.
+    */
+   public static OPCPackage openSamplePackage(String sampleName) {
+      try {
+         return OPCPackage.open(
+               HSSFTestDataSamples.openSampleFileStream(sampleName)
+         );
+      } catch(Exception e) {
+         throw new RuntimeException(e);
+      }
+   }
        public static XSSFWorkbook openSampleWorkbook(String sampleName) {
                InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleName);
                try {
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java
new file mode 100644 (file)
index 0000000..c1b0247
--- /dev/null
@@ -0,0 +1,141 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xssf.extractor;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.HSSFTestDataSamples;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.xssf.XSSFTestDataSamples;
+
+/**
+ * Tests for {@link XSSFEventBasedExcelExtractor}
+ */
+public final class TestXSSFEventBasedExcelExtractor extends TestCase {
+
+
+       /**
+        * Builds an event based extractor over the named sample file
+        */
+       private static final XSSFEventBasedExcelExtractor getExtractor(String sampleName) throws Exception {
+               return new XSSFEventBasedExcelExtractor(XSSFTestDataSamples.
+                     openSamplePackage(sampleName));
+       }
+
+       /**
+        * Get text out of the simple file, with and without the
+        *  sheet names, and with formulas in place of their results
+        */
+       public void testGetSimpleText() throws Exception {
+               // a very simple file
+               XSSFEventBasedExcelExtractor extractor = getExtractor("sample.xlsx");
+               // NOTE(review): result is discarded; presumably this checks
+               //  that getText() can be called more than once - confirm
+               extractor.getText();
+
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+
+               // Check sheet names
+               assertTrue(text.startsWith("Sheet1"));
+               assertTrue(text.endsWith("Sheet3\n"));
+
+               // Now without, will have text
+               extractor.setIncludeSheetNames(false);
+               text = extractor.getText();
+               String CHUNK1 =
+                       "Lorem\t111\n" +
+                       "ipsum\t222\n" +
+                       "dolor\t333\n" +
+                       "sit\t444\n" +
+                       "amet\t555\n" +
+                       "consectetuer\t666\n" +
+                       "adipiscing\t777\n" +
+                       "elit\t888\n" +
+                       "Nunc\t999\n";
+               // Cells are separated by a tab. It is written as an escape, so
+               //  that it can't be turned into spaces by an editor or viewer
+               String CHUNK2 =
+                       "The quick brown fox jumps over the lazy dog\n" +
+                       "hello, xssf\thello, xssf\n" +
+                       "hello, xssf\thello, xssf\n" +
+                       "hello, xssf\thello, xssf\n" +
+                       "hello, xssf\thello, xssf\n";
+               assertEquals(
+                               CHUNK1 +
+                               "at\t4995\n" +
+                               CHUNK2
+                               , text);
+
+               // Now get formulas not their values
+               extractor.setFormulasNotResults(true);
+               text = extractor.getText();
+               assertEquals(
+                               CHUNK1 +
+                               "at\tSUM(B1:B9)\n" +
+                               CHUNK2, text);
+
+               // With sheet names too
+               extractor.setIncludeSheetNames(true);
+               text = extractor.getText();
+               assertEquals(
+                               "Sheet1\n" +
+                               CHUNK1 +
+                               "at\tSUM(B1:B9)\n" +
+                               "rich test\n" +
+                               CHUNK2 +
+                               "Sheet3\n"
+                               , text);
+       }
+       
+       /**
+        * Get text out of a more complex file, checking only
+        *  that the start of it is as expected
+        */
+       public void testGetComplexText() throws Exception {
+               // A fairly complex file
+          XSSFEventBasedExcelExtractor extractor = getExtractor("AverageTaxRates.xlsx");
+               extractor.getText();
+               
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+               
+               // Might not have all formatting it should do!
+               assertTrue(text.startsWith(
+                                               "Avgtxfull\n" +
+                                               "(iii) AVERAGE TAX RATES ON ANNUAL"     
+               ));
+       }
+       
+       /**
+        * Checks that our text is pretty much what ExcelExtractor
+        *  gives for the same spreadsheet, once saved as xlsx
+        *  and once saved as xls
+        */
+       public void testComparedToOLE2() throws Exception {
+               // A fairly simple file - ooxml
+               POITextExtractor ooxmlExtractor = getExtractor("SampleSS.xlsx");
+               // The same content, saved as ole2
+               POITextExtractor ole2Extractor =
+                       new ExcelExtractor(HSSFTestDataSamples.openSampleWorkbook("SampleSS.xls"));
+
+               // The last value may or may not have decimal places, depending on the extractor
+               Pattern endOfText = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
+
+               for (POITextExtractor extractor : new POITextExtractor[] { ooxmlExtractor, ole2Extractor }) {
+                       // Ignore differences in tabs and carriage returns
+                       String text = extractor.getText().replaceAll("[\r\t]", "");
+                       assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
+                       assertTrue(endOfText.matcher(text).matches());
+               }
+       }
+}