--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.xssf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.ss.usermodel.BuiltinFormats;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFCellStyle;
+import org.apache.poi.xssf.usermodel.XSSFRichTextString;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Implementation of a text extractor from OOXML Excel
+ * files that uses SAX event based parsing.
+ */
+public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
+ private OPCPackage container;
+ private boolean includeSheetNames = true;
+ private boolean formulasNotResults = false;
+
+ /**
+ * These are the different kinds of cells we support.
+ * We keep track of the current one between
+ * the start and end.
+ */
+ enum xssfDataType {
+ BOOLEAN,
+ ERROR,
+ FORMULA,
+ INLINE_STRING,
+ SST_STRING,
+ NUMBER,
+ }
+
+ public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+ this(OPCPackage.open(path));
+ }
+ public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+ super(null);
+ this.container = container;
+ }
+
+ public static void main(String[] args) throws Exception {
+ if(args.length < 1) {
+ System.err.println("Use:");
+ System.err.println(" XSSFEventBasedExcelExtractor <filename.xlsx>");
+ System.exit(1);
+ }
+ POIXMLTextExtractor extractor =
+ new XSSFEventBasedExcelExtractor(args[0]);
+ System.out.println(extractor.getText());
+ }
+
+ /**
+ * Should sheet names be included? Default is true
+ */
+ public void setIncludeSheetNames(boolean includeSheetNames) {
+ this.includeSheetNames = includeSheetNames;
+ }
+ /**
+ * Should we return the formula itself, and not
+ * the result it produces? Default is false
+ */
+ public void setFormulasNotResults(boolean formulasNotResults) {
+ this.formulasNotResults = formulasNotResults;
+ }
+
+
+ /**
+ * Handler for sheets. Processes each row and cell,
+ * formatting Cells as best as it can.
+ */
+ class MyXSSFSheetHandler extends DefaultHandler {
+ /**
+ * Table with the styles used for formatting
+ */
+ private StylesTable stylesTable;
+
+ private ReadOnlySharedStringsTable sharedStringsTable;
+
+ /**
+ * Where our text is going
+ */
+ private final StringBuffer output;
+
+ // Set when V start element is seen
+ private boolean vIsOpen;
+ // Set when F start element is seen
+ private boolean fIsOpen;
+
+ // Set when cell start element is seen;
+ // used when cell close element is seen.
+ private xssfDataType nextDataType;
+
+ // Used to format numeric cell values.
+ private short formatIndex;
+ private String formatString;
+ private final DataFormatter formatter;
+
+ // Gathers characters as they are seen.
+ private StringBuffer value = new StringBuffer();
+ private StringBuffer formula = new StringBuffer();
+ private boolean firstCellOfRow = true;
+
+ /**
+ * Accepts objects needed while parsing.
+ *
+ * @param styles Table of styles
+ * @param strings Table of shared strings
+ * @param cols Minimum number of columns to show
+ * @param target Sink for output
+ */
+ public MyXSSFSheetHandler(
+ StylesTable styles,
+ ReadOnlySharedStringsTable strings,
+ StringBuffer output) {
+ this.stylesTable = styles;
+ this.sharedStringsTable = strings;
+ this.output = output;
+ this.nextDataType = xssfDataType.NUMBER;
+ this.formatter = new DataFormatter();
+ }
+
+ public void startElement(String uri, String localName, String name,
+ Attributes attributes) throws SAXException {
+
+ if ("inlineStr".equals(name) || "v".equals(name)) {
+ vIsOpen = true;
+ // Clear contents cache
+ value.setLength(0);
+ } else if ("f".equals(name)) {
+ // Clear contents cache
+ formula.setLength(0);
+
+ // Mark us as being a formula if not already
+ if(nextDataType == xssfDataType.NUMBER) {
+ nextDataType = xssfDataType.FORMULA;
+ }
+
+ // Decide where to get the formula string from
+ String type = attributes.getValue("t");
+ if(type != null && type.equals("shared")) {
+ System.err.println("Warning - shared formulas not yet supported!");
+ } else {
+ fIsOpen = true;
+ }
+ }
+ else if("row".equals(name)) {
+ firstCellOfRow = true;
+ }
+ // c => cell
+ else if ("c".equals(name)) {
+ // Set up defaults.
+ this.nextDataType = xssfDataType.NUMBER;
+ this.formatIndex = -1;
+ this.formatString = null;
+ String cellType = attributes.getValue("t");
+ String cellStyleStr = attributes.getValue("s");
+ if ("b".equals(cellType))
+ nextDataType = xssfDataType.BOOLEAN;
+ else if ("e".equals(cellType))
+ nextDataType = xssfDataType.ERROR;
+ else if ("inlineStr".equals(cellType))
+ nextDataType = xssfDataType.INLINE_STRING;
+ else if ("s".equals(cellType))
+ nextDataType = xssfDataType.SST_STRING;
+ else if ("str".equals(cellType))
+ nextDataType = xssfDataType.FORMULA;
+ else if (cellStyleStr != null) {
+ // Number, but almost certainly with a special style or format
+ int styleIndex = Integer.parseInt(cellStyleStr);
+ XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
+ this.formatIndex = style.getDataFormat();
+ this.formatString = style.getDataFormatString();
+ if (this.formatString == null)
+ this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex);
+ }
+ }
+ }
+
+ public void endElement(String uri, String localName, String name)
+ throws SAXException {
+ String thisStr = null;
+
+ // v => contents of a cell
+ if ("v".equals(name)) {
+ vIsOpen = false;
+
+ // Process the value contents as required, now we have it all
+ switch (nextDataType) {
+ case BOOLEAN:
+ char first = value.charAt(0);
+ thisStr = first == '0' ? "FALSE" : "TRUE";
+ break;
+
+ case ERROR:
+ thisStr = "ERROR:" + value.toString();
+ break;
+
+ case FORMULA:
+ if(formulasNotResults) {
+ thisStr = formula.toString();
+ } else {
+ thisStr = value.toString();
+ }
+ break;
+
+ case INLINE_STRING:
+ // TODO: have seen an example of this, so it's untested.
+ XSSFRichTextString rtsi = new XSSFRichTextString(value.toString());
+ thisStr = rtsi.toString();
+ break;
+
+ case SST_STRING:
+ String sstIndex = value.toString();
+ try {
+ int idx = Integer.parseInt(sstIndex);
+ XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
+ thisStr = rtss.toString();
+ }
+ catch (NumberFormatException ex) {
+ System.err.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString());
+ }
+ break;
+
+ case NUMBER:
+ String n = value.toString();
+ if (this.formatString != null)
+ thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString);
+ else
+ thisStr = n;
+ break;
+
+ default:
+ thisStr = "(TODO: Unexpected type: " + nextDataType + ")";
+ break;
+ }
+
+ // Output
+ if(!firstCellOfRow) {
+ output.append('\t');
+ }
+ firstCellOfRow = false;
+
+ output.append(thisStr);
+ } else if ("f".equals(name)) {
+ fIsOpen = false;
+ } else if ("row".equals(name)) {
+ // Finish the line
+ output.append('\n');
+ }
+ }
+
+ /**
+ * Captures characters only if a suitable element is open.
+ * Originally was just "v"; extended for inlineStr also.
+ */
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if (vIsOpen) {
+ value.append(ch, start, length);
+ }
+ if (fIsOpen) {
+ formula.append(ch, start, length);
+ }
+ }
+ }
+
+ /**
+ * Processes the given sheet
+ */
+ public void processSheet(
+ StringBuffer output,
+ StylesTable styles,
+ ReadOnlySharedStringsTable strings,
+ InputStream sheetInputStream)
+ throws IOException, SAXException {
+
+ InputSource sheetSource = new InputSource(sheetInputStream);
+ SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+ try {
+ SAXParser saxParser = saxFactory.newSAXParser();
+ XMLReader sheetParser = saxParser.getXMLReader();
+ ContentHandler handler = new MyXSSFSheetHandler(styles, strings, output);
+ sheetParser.setContentHandler(handler);
+ sheetParser.parse(sheetSource);
+ } catch(ParserConfigurationException e) {
+ throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
+ }
+ }
+
+ /**
+ * Processes the file and returns the text
+ */
+ public String getText() {
+ try {
+ ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container);
+ XSSFReader xssfReader = new XSSFReader(container);
+ StylesTable styles = xssfReader.getStylesTable();
+ XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+
+ StringBuffer text = new StringBuffer();
+ while (iter.hasNext()) {
+ InputStream stream = iter.next();
+ if(includeSheetNames) {
+ text.append(iter.getSheetName());
+ text.append('\n');
+ }
+ processSheet(text, styles, strings, stream);
+ stream.close();
+ }
+
+ return text.toString();
+ } catch(IOException e) {
+ System.err.println(e);
+ return null;
+ } catch(SAXException se) {
+ System.err.println(se);
+ return null;
+ } catch(OpenXML4JException o4je) {
+ System.err.println(o4je);
+ return null;
+ }
+ }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xssf.extractor;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hssf.HSSFTestDataSamples;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.xssf.XSSFTestDataSamples;
+
+/**
+ * Tests for {@link XSSFEventBasedExcelExtractor}
+ */
+public final class TestXSSFEventBasedExcelExtractor extends TestCase {
+
+
+ private static final XSSFEventBasedExcelExtractor getExtractor(String sampleName) throws Exception {
+ return new XSSFEventBasedExcelExtractor(XSSFTestDataSamples.
+ openSamplePackage(sampleName));
+ }
+
+ /**
+ * Get text out of the simple file
+ */
+ public void testGetSimpleText() throws Exception {
+ // a very simple file
+ XSSFEventBasedExcelExtractor extractor = getExtractor("sample.xlsx");
+ extractor.getText();
+
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
+
+ // Check sheet names
+ assertTrue(text.startsWith("Sheet1"));
+ assertTrue(text.endsWith("Sheet3\n"));
+
+ // Now without, will have text
+ extractor.setIncludeSheetNames(false);
+ text = extractor.getText();
+ String CHUNK1 =
+ "Lorem\t111\n" +
+ "ipsum\t222\n" +
+ "dolor\t333\n" +
+ "sit\t444\n" +
+ "amet\t555\n" +
+ "consectetuer\t666\n" +
+ "adipiscing\t777\n" +
+ "elit\t888\n" +
+ "Nunc\t999\n";
+ String CHUNK2 =
+ "The quick brown fox jumps over the lazy dog\n" +
+ "hello, xssf hello, xssf\n" +
+ "hello, xssf hello, xssf\n" +
+ "hello, xssf hello, xssf\n" +
+ "hello, xssf hello, xssf\n";
+ assertEquals(
+ CHUNK1 +
+ "at\t4995\n" +
+ CHUNK2
+ , text);
+
+ // Now get formulas not their values
+ extractor.setFormulasNotResults(true);
+ text = extractor.getText();
+ assertEquals(
+ CHUNK1 +
+ "at\tSUM(B1:B9)\n" +
+ CHUNK2, text);
+
+ // With sheet names too
+ extractor.setIncludeSheetNames(true);
+ text = extractor.getText();
+ assertEquals(
+ "Sheet1\n" +
+ CHUNK1 +
+ "at\tSUM(B1:B9)\n" +
+ "rich test\n" +
+ CHUNK2 +
+ "Sheet3\n"
+ , text);
+ }
+
+ public void testGetComplexText() throws Exception {
+ // A fairly complex file
+ XSSFEventBasedExcelExtractor extractor = getExtractor("AverageTaxRates.xlsx");
+ extractor.getText();
+
+ String text = extractor.getText();
+ assertTrue(text.length() > 0);
+
+ // Might not have all formatting it should do!
+ assertTrue(text.startsWith(
+ "Avgtxfull\n" +
+ "(iii) AVERAGE TAX RATES ON ANNUAL"
+ ));
+ }
+
+ /**
+ * Test that we return pretty much the same as
+ * ExcelExtractor does, when we're both passed
+ * the same file, just saved as xls and xlsx
+ */
+ public void testComparedToOLE2() throws Exception {
+ // A fairly simple file - ooxml
+ XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("SampleSS.xlsx");
+
+ ExcelExtractor ole2Extractor =
+ new ExcelExtractor(HSSFTestDataSamples.openSampleWorkbook("SampleSS.xls"));
+
+ POITextExtractor[] extractors =
+ new POITextExtractor[] { ooxmlExtractor, ole2Extractor };
+ for (int i = 0; i < extractors.length; i++) {
+ POITextExtractor extractor = extractors[i];
+
+ String text = extractor.getText().replaceAll("[\r\t]", "");
+ assertTrue(text.startsWith("First Sheet\nTest spreadsheet\n2nd row2nd row 2nd column\n"));
+ Pattern pattern = Pattern.compile(".*13(\\.0+)?\\s+Sheet3.*", Pattern.DOTALL);
+ Matcher m = pattern.matcher(text);
+ assertTrue(m.matches());
+ }
+ }
+}