|
|
@@ -26,20 +26,16 @@ import javax.xml.parsers.SAXParserFactory; |
|
|
|
import org.apache.poi.POIXMLTextExtractor; |
|
|
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; |
|
|
|
import org.apache.poi.openxml4j.opc.OPCPackage; |
|
|
|
import org.apache.poi.ss.usermodel.BuiltinFormats; |
|
|
|
import org.apache.poi.ss.usermodel.DataFormatter; |
|
|
|
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; |
|
|
|
import org.apache.poi.xssf.eventusermodel.XSSFReader; |
|
|
|
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; |
|
|
|
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; |
|
|
|
import org.apache.poi.xssf.model.StylesTable; |
|
|
|
import org.apache.poi.xssf.usermodel.XSSFCellStyle; |
|
|
|
import org.apache.poi.xssf.usermodel.XSSFRichTextString; |
|
|
|
import org.apache.xmlbeans.XmlException; |
|
|
|
import org.xml.sax.Attributes; |
|
|
|
import org.xml.sax.ContentHandler; |
|
|
|
import org.xml.sax.InputSource; |
|
|
|
import org.xml.sax.SAXException; |
|
|
|
import org.xml.sax.XMLReader; |
|
|
|
import org.xml.sax.helpers.DefaultHandler; |
|
|
|
|
|
|
|
/** |
|
|
|
* Implementation of a text extractor from OOXML Excel |
|
|
@@ -50,20 +46,6 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { |
|
|
|
private boolean includeSheetNames = true; |
|
|
|
private boolean formulasNotResults = false; |
|
|
|
|
|
|
|
/** |
|
|
|
* These are the different kinds of cells we support. |
|
|
|
* We keep track of the current one between |
|
|
|
* the start and end. |
|
|
|
*/ |
|
|
|
enum xssfDataType { |
|
|
|
BOOLEAN, |
|
|
|
ERROR, |
|
|
|
FORMULA, |
|
|
|
INLINE_STRING, |
|
|
|
SST_STRING, |
|
|
|
NUMBER, |
|
|
|
} |
|
|
|
|
|
|
|
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { |
|
|
|
this(OPCPackage.open(path)); |
|
|
|
} |
|
|
@@ -97,212 +79,11 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { |
|
|
|
this.formulasNotResults = formulasNotResults; |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
* Handler for sheets. Processes each row and cell, |
|
|
|
* formatting Cells as best as it can. |
|
|
|
*/ |
|
|
|
class MyXSSFSheetHandler extends DefaultHandler { |
|
|
|
/** |
|
|
|
* Table with the styles used for formatting |
|
|
|
*/ |
|
|
|
private StylesTable stylesTable; |
|
|
|
|
|
|
|
private ReadOnlySharedStringsTable sharedStringsTable; |
|
|
|
|
|
|
|
/** |
|
|
|
* Where our text is going |
|
|
|
*/ |
|
|
|
private final StringBuffer output; |
|
|
|
|
|
|
|
// Set when V start element is seen |
|
|
|
private boolean vIsOpen; |
|
|
|
// Set when F start element is seen |
|
|
|
private boolean fIsOpen; |
|
|
|
|
|
|
|
// Set when cell start element is seen; |
|
|
|
// used when cell close element is seen. |
|
|
|
private xssfDataType nextDataType; |
|
|
|
|
|
|
|
// Used to format numeric cell values. |
|
|
|
private short formatIndex; |
|
|
|
private String formatString; |
|
|
|
private final DataFormatter formatter; |
|
|
|
|
|
|
|
// Gathers characters as they are seen. |
|
|
|
private StringBuffer value = new StringBuffer(); |
|
|
|
private StringBuffer formula = new StringBuffer(); |
|
|
|
private boolean firstCellOfRow = true; |
|
|
|
|
|
|
|
/** |
|
|
|
* Accepts objects needed while parsing. |
|
|
|
* |
|
|
|
* @param styles Table of styles |
|
|
|
* @param strings Table of shared strings |
|
|
|
* @param cols Minimum number of columns to show |
|
|
|
* @param target Sink for output |
|
|
|
*/ |
|
|
|
public MyXSSFSheetHandler( |
|
|
|
StylesTable styles, |
|
|
|
ReadOnlySharedStringsTable strings, |
|
|
|
StringBuffer output) { |
|
|
|
this.stylesTable = styles; |
|
|
|
this.sharedStringsTable = strings; |
|
|
|
this.output = output; |
|
|
|
this.nextDataType = xssfDataType.NUMBER; |
|
|
|
this.formatter = new DataFormatter(); |
|
|
|
} |
|
|
|
|
|
|
|
public void startElement(String uri, String localName, String name, |
|
|
|
Attributes attributes) throws SAXException { |
|
|
|
|
|
|
|
if ("inlineStr".equals(name) || "v".equals(name)) { |
|
|
|
vIsOpen = true; |
|
|
|
// Clear contents cache |
|
|
|
value.setLength(0); |
|
|
|
} else if ("f".equals(name)) { |
|
|
|
// Clear contents cache |
|
|
|
formula.setLength(0); |
|
|
|
|
|
|
|
// Mark us as being a formula if not already |
|
|
|
if(nextDataType == xssfDataType.NUMBER) { |
|
|
|
nextDataType = xssfDataType.FORMULA; |
|
|
|
} |
|
|
|
|
|
|
|
// Decide where to get the formula string from |
|
|
|
String type = attributes.getValue("t"); |
|
|
|
if(type != null && type.equals("shared")) { |
|
|
|
System.err.println("Warning - shared formulas not yet supported!"); |
|
|
|
} else { |
|
|
|
fIsOpen = true; |
|
|
|
} |
|
|
|
} |
|
|
|
else if("row".equals(name)) { |
|
|
|
firstCellOfRow = true; |
|
|
|
} |
|
|
|
// c => cell |
|
|
|
else if ("c".equals(name)) { |
|
|
|
// Set up defaults. |
|
|
|
this.nextDataType = xssfDataType.NUMBER; |
|
|
|
this.formatIndex = -1; |
|
|
|
this.formatString = null; |
|
|
|
String cellType = attributes.getValue("t"); |
|
|
|
String cellStyleStr = attributes.getValue("s"); |
|
|
|
if ("b".equals(cellType)) |
|
|
|
nextDataType = xssfDataType.BOOLEAN; |
|
|
|
else if ("e".equals(cellType)) |
|
|
|
nextDataType = xssfDataType.ERROR; |
|
|
|
else if ("inlineStr".equals(cellType)) |
|
|
|
nextDataType = xssfDataType.INLINE_STRING; |
|
|
|
else if ("s".equals(cellType)) |
|
|
|
nextDataType = xssfDataType.SST_STRING; |
|
|
|
else if ("str".equals(cellType)) |
|
|
|
nextDataType = xssfDataType.FORMULA; |
|
|
|
else if (cellStyleStr != null) { |
|
|
|
// Number, but almost certainly with a special style or format |
|
|
|
int styleIndex = Integer.parseInt(cellStyleStr); |
|
|
|
XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); |
|
|
|
this.formatIndex = style.getDataFormat(); |
|
|
|
this.formatString = style.getDataFormatString(); |
|
|
|
if (this.formatString == null) |
|
|
|
this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
public void endElement(String uri, String localName, String name) |
|
|
|
throws SAXException { |
|
|
|
String thisStr = null; |
|
|
|
|
|
|
|
// v => contents of a cell |
|
|
|
if ("v".equals(name)) { |
|
|
|
vIsOpen = false; |
|
|
|
|
|
|
|
// Process the value contents as required, now we have it all |
|
|
|
switch (nextDataType) { |
|
|
|
case BOOLEAN: |
|
|
|
char first = value.charAt(0); |
|
|
|
thisStr = first == '0' ? "FALSE" : "TRUE"; |
|
|
|
break; |
|
|
|
|
|
|
|
case ERROR: |
|
|
|
thisStr = "ERROR:" + value.toString(); |
|
|
|
break; |
|
|
|
|
|
|
|
case FORMULA: |
|
|
|
if(formulasNotResults) { |
|
|
|
thisStr = formula.toString(); |
|
|
|
} else { |
|
|
|
thisStr = value.toString(); |
|
|
|
} |
|
|
|
break; |
|
|
|
|
|
|
|
case INLINE_STRING: |
|
|
|
// TODO: have seen an example of this, so it's untested. |
|
|
|
XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); |
|
|
|
thisStr = rtsi.toString(); |
|
|
|
break; |
|
|
|
|
|
|
|
case SST_STRING: |
|
|
|
String sstIndex = value.toString(); |
|
|
|
try { |
|
|
|
int idx = Integer.parseInt(sstIndex); |
|
|
|
XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx)); |
|
|
|
thisStr = rtss.toString(); |
|
|
|
} |
|
|
|
catch (NumberFormatException ex) { |
|
|
|
System.err.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString()); |
|
|
|
} |
|
|
|
break; |
|
|
|
|
|
|
|
case NUMBER: |
|
|
|
String n = value.toString(); |
|
|
|
if (this.formatString != null) |
|
|
|
thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString); |
|
|
|
else |
|
|
|
thisStr = n; |
|
|
|
break; |
|
|
|
|
|
|
|
default: |
|
|
|
thisStr = "(TODO: Unexpected type: " + nextDataType + ")"; |
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
// Output |
|
|
|
if(!firstCellOfRow) { |
|
|
|
output.append('\t'); |
|
|
|
} |
|
|
|
firstCellOfRow = false; |
|
|
|
|
|
|
|
output.append(thisStr); |
|
|
|
} else if ("f".equals(name)) { |
|
|
|
fIsOpen = false; |
|
|
|
} else if ("row".equals(name)) { |
|
|
|
// Finish the line |
|
|
|
output.append('\n'); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/** |
|
|
|
* Captures characters only if a suitable element is open. |
|
|
|
* Originally was just "v"; extended for inlineStr also. |
|
|
|
*/ |
|
|
|
public void characters(char[] ch, int start, int length) |
|
|
|
throws SAXException { |
|
|
|
if (vIsOpen) { |
|
|
|
value.append(ch, start, length); |
|
|
|
} |
|
|
|
if (fIsOpen) { |
|
|
|
formula.append(ch, start, length); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/** |
|
|
|
* Processes the given sheet |
|
|
|
*/ |
|
|
|
public void processSheet( |
|
|
|
StringBuffer output, |
|
|
|
SheetTextExtractor sheetExtractor, |
|
|
|
StylesTable styles, |
|
|
|
ReadOnlySharedStringsTable strings, |
|
|
|
InputStream sheetInputStream) |
|
|
@@ -313,7 +94,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { |
|
|
|
try { |
|
|
|
SAXParser saxParser = saxFactory.newSAXParser(); |
|
|
|
XMLReader sheetParser = saxParser.getXMLReader(); |
|
|
|
ContentHandler handler = new MyXSSFSheetHandler(styles, strings, output); |
|
|
|
ContentHandler handler = new XSSFSheetXMLHandler(styles, strings, sheetExtractor, formulasNotResults); |
|
|
|
sheetParser.setContentHandler(handler); |
|
|
|
sheetParser.parse(sheetSource); |
|
|
|
} catch(ParserConfigurationException e) { |
|
|
@@ -332,13 +113,15 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { |
|
|
|
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); |
|
|
|
|
|
|
|
StringBuffer text = new StringBuffer(); |
|
|
|
SheetTextExtractor sheetExtractor = new SheetTextExtractor(text); |
|
|
|
|
|
|
|
while (iter.hasNext()) { |
|
|
|
InputStream stream = iter.next(); |
|
|
|
if(includeSheetNames) { |
|
|
|
text.append(iter.getSheetName()); |
|
|
|
text.append('\n'); |
|
|
|
} |
|
|
|
processSheet(text, styles, strings, stream); |
|
|
|
processSheet(sheetExtractor, styles, strings, stream); |
|
|
|
stream.close(); |
|
|
|
} |
|
|
|
|
|
|
@@ -354,4 +137,30 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { |
|
|
|
return null; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
protected class SheetTextExtractor implements SheetContentsHandler { |
|
|
|
private final StringBuffer output; |
|
|
|
private boolean firstCellOfRow = true; |
|
|
|
|
|
|
|
protected SheetTextExtractor(StringBuffer output) { |
|
|
|
this.output = output; |
|
|
|
} |
|
|
|
|
|
|
|
public void startRow(int rowNum) { |
|
|
|
firstCellOfRow = true; |
|
|
|
} |
|
|
|
|
|
|
|
public void endRow() { |
|
|
|
output.append('\n'); |
|
|
|
} |
|
|
|
|
|
|
|
public void cell(String cellRef, String formattedValue) { |
|
|
|
if(firstCellOfRow) { |
|
|
|
firstCellOfRow = false; |
|
|
|
} else { |
|
|
|
output.append('\t'); |
|
|
|
} |
|
|
|
output.append(formattedValue); |
|
|
|
} |
|
|
|
} |
|
|
|
} |