Browse Source

Patch from Shaun Kalley from bug #56023 - Allow XSSF event model to find + return comments, and use this for the event based .xlsx text extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1613266 13f79535-47bb-0310-9956-ffa450edef68
tags/REL_3_11_BETA1
Nick Burch 9 years ago
parent
commit
62bd48af74

+ 153
- 6
src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java View File

==================================================================== */ ==================================================================== */
package org.apache.poi.xssf.eventusermodel; package org.apache.poi.xssf.eventusermodel;


import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger; import org.apache.poi.util.POILogger;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTComment;
import org.xml.sax.Attributes; import org.xml.sax.Attributes;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.DefaultHandler;
*/ */
private StylesTable stylesTable; private StylesTable stylesTable;


/**
* Table with cell comments
*/
private CommentsTable commentsTable;

/**
* Read-only access to the shared strings table, for looking
* up the contents of (most) string cells
*/
private ReadOnlySharedStringsTable sharedStringsTable; private ReadOnlySharedStringsTable sharedStringsTable;


/** /**
private short formatIndex; private short formatIndex;
private String formatString; private String formatString;
private final DataFormatter formatter; private final DataFormatter formatter;
private int rowNum;
private String cellRef; private String cellRef;
private boolean formulasNotResults; private boolean formulasNotResults;


private StringBuffer formula = new StringBuffer(); private StringBuffer formula = new StringBuffer();
private StringBuffer headerFooter = new StringBuffer(); private StringBuffer headerFooter = new StringBuffer();


private Queue<CellReference> commentCellRefs;

/** /**
* Accepts objects needed while parsing. * Accepts objects needed while parsing.
* *
*/ */
public XSSFSheetXMLHandler( public XSSFSheetXMLHandler(
StylesTable styles, StylesTable styles,
CommentsTable comments,
ReadOnlySharedStringsTable strings, ReadOnlySharedStringsTable strings,
SheetContentsHandler sheetContentsHandler, SheetContentsHandler sheetContentsHandler,
DataFormatter dataFormatter, DataFormatter dataFormatter,
boolean formulasNotResults) { boolean formulasNotResults) {
this.stylesTable = styles; this.stylesTable = styles;
this.commentsTable = comments;
this.sharedStringsTable = strings; this.sharedStringsTable = strings;
this.output = sheetContentsHandler; this.output = sheetContentsHandler;
this.formulasNotResults = formulasNotResults; this.formulasNotResults = formulasNotResults;
this.nextDataType = xssfDataType.NUMBER; this.nextDataType = xssfDataType.NUMBER;
this.formatter = dataFormatter; this.formatter = dataFormatter;
init();
}
/**
* Accepts objects needed while parsing.
*
* @param styles Table of styles
* @param strings Table of shared strings
*/
public XSSFSheetXMLHandler(
StylesTable styles,
ReadOnlySharedStringsTable strings,
SheetContentsHandler sheetContentsHandler,
DataFormatter dataFormatter,
boolean formulasNotResults) {
this(styles, null, strings, sheetContentsHandler, dataFormatter, formulasNotResults);
} }
/** /**
* Accepts objects needed while parsing. * Accepts objects needed while parsing.
* *
boolean formulasNotResults) { boolean formulasNotResults) {
this(styles, strings, sheetContentsHandler, new DataFormatter(), formulasNotResults); this(styles, strings, sheetContentsHandler, new DataFormatter(), formulasNotResults);
} }
/**
 * Builds the queue of cell references that carry comments, in the order
 * the comments are listed in the comments table. When no comments table
 * was supplied the queue is left unset (null).
 */
private void init() {
    if (commentsTable == null) {
        return;
    }
    Queue<CellReference> refs = new LinkedList<CellReference>();
    List<CTComment> ctComments = commentsTable.getCTComments().getCommentList().getCommentList();
    for (CTComment ctComment : ctComments) {
        refs.add(new CellReference(ctComment.getRef()));
    }
    commentCellRefs = refs;
}


private boolean isTextTag(String name) { private boolean isTextTag(String name) {
if("v".equals(name)) { if("v".equals(name)) {
headerFooter.setLength(0); headerFooter.setLength(0);
} }
else if("row".equals(name)) { else if("row".equals(name)) {
int rowNum = Integer.parseInt(attributes.getValue("r")) - 1;
rowNum = Integer.parseInt(attributes.getValue("r")) - 1;
output.startRow(rowNum); output.startRow(rowNum);
} }
// c => cell // c => cell
break; break;
} }
// Do we have a comment for this cell?
checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL);
XSSFComment comment = commentsTable != null ? commentsTable.findCellComment(cellRef) : null;
// Output // Output
output.cell(cellRef, thisStr);
output.cell(cellRef, thisStr, comment);
} else if ("f".equals(name)) { } else if ("f".equals(name)) {
fIsOpen = false; fIsOpen = false;
} else if ("is".equals(name)) { } else if ("is".equals(name)) {
isIsOpen = false; isIsOpen = false;
} else if ("row".equals(name)) { } else if ("row".equals(name)) {
output.endRow();
// Handle any "missing" cells which had comments attached
checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_ROW);
// Finish up the row
output.endRow(rowNum);
} else if ("sheetData".equals(name)) {
// Handle any "missing" cells which had comments attached
checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_SHEET_DATA);
} }
else if("oddHeader".equals(name) || "evenHeader".equals(name) || else if("oddHeader".equals(name) || "evenHeader".equals(name) ||
"firstHeader".equals(name)) { "firstHeader".equals(name)) {
headerFooter.append(ch, start, length); headerFooter.append(ch, start, length);
} }
} }
/**
 * Do a check for, and output, comments in otherwise empty cells.
 * <p>
 * Pending comment references are consumed from the head of
 * {@code commentCellRefs}; the method does nothing when that queue is
 * unset or empty.
 *
 * @param type the point in the parse at which the check is being made
 * @throws IllegalStateException if no cell reference has been recorded
 *         and the check is not being made at the end of a row
 */
private void checkForEmptyCellComments(EmptyCellCommentsCheckType type) {
    if (commentCellRefs == null || commentCellRefs.isEmpty()) {
        return;
    }

    // If we've reached the end of the sheet data, output any
    // comments we haven't yet already handled
    if (type == EmptyCellCommentsCheckType.END_OF_SHEET_DATA) {
        while (!commentCellRefs.isEmpty()) {
            outputEmptyCellComment(commentCellRefs.remove());
        }
        return;
    }

    // No cell reference recorded: only valid at the end of a row, in which
    // case the leading run of comments belonging to this row is output
    if (this.cellRef == null) {
        if (type != EmptyCellCommentsCheckType.END_OF_ROW) {
            throw new IllegalStateException("Cell ref should be null only if there are only empty cells in the row; rowNum: " + rowNum);
        }
        while (!commentCellRefs.isEmpty() && commentCellRefs.peek().getRow() == rowNum) {
            outputEmptyCellComment(commentCellRefs.remove());
        }
        return;
    }

    // The recorded cell reference does not change while we drain the queue,
    // so parse it once rather than on every iteration
    final CellReference currentCellRef = new CellReference(this.cellRef);

    while (!commentCellRefs.isEmpty()) {
        CellReference pendingRef = commentCellRefs.peek();

        if (type == EmptyCellCommentsCheckType.CELL && currentCellRef.equals(pendingRef)) {
            // remove the comment cell ref from the list if we're about to handle it alongside the cell content
            commentCellRefs.remove();
            return;
        }

        // fill in any gaps if there are empty cells with comment mixed in with non-empty cells
        int comparison = cellRefComparator.compare(pendingRef, currentCellRef);
        boolean afterCellAtRowEnd = comparison > 0 && type == EmptyCellCommentsCheckType.END_OF_ROW;
        boolean beforeCellAtCell = comparison < 0 && type == EmptyCellCommentsCheckType.CELL;
        if (!(afterCellAtRowEnd || beforeCellAtCell) || pendingRef.getRow() > rowNum) {
            // Head of the queue is not due yet; leave it for a later check
            return;
        }
        outputEmptyCellComment(commentCellRefs.remove());
    }
}


/**
 * Looks up the comment attached to the given cell in the comments table
 * and reports it to the contents handler as an empty-cell comment.
 *
 * @param cellRef reference of the commented cell
 */
private void outputEmptyCellComment(CellReference cellRef) {
    final String address = cellRef.formatAsString();
    output.emptyCellComment(address, commentsTable.findCellComment(address));
}
/**
 * Identifies the point in the parse at which the check for comments on
 * otherwise empty cells is being made.
 */
private enum EmptyCellCommentsCheckType {
/** Check made just before a cell is reported to the contents handler */
CELL,
/** Check made when a row element ends, before the row is finished */
END_OF_ROW,
/** Check made when the sheetData element ends */
END_OF_SHEET_DATA
}
/**
 * Orders cell references by row first, then by column within a row.
 */
private static final Comparator<CellReference> cellRefComparator = new Comparator<CellReference>() {
    @Override
    public int compare(CellReference o1, CellReference o2) {
        int byRow = compare(o1.getRow(), o2.getRow());
        if (byRow != 0) {
            return byRow;
        }
        return compare(o1.getCol(), o2.getCol());
    }

    /** Three-way comparison of two ints: -1, 0 or 1. */
    public int compare(int x, int y) {
        if (x < y) {
            return -1;
        }
        if (x > y) {
            return 1;
        }
        return 0;
    }
};


/** /**
* You need to implement this to handle the results * You need to implement this to handle the results
/** A row with the (zero based) row number has started */ /** A row with the (zero based) row number has started */
public void startRow(int rowNum); public void startRow(int rowNum);
/** A row with the (zero based) row number has ended */ /** A row with the (zero based) row number has ended */
public void endRow();
/** A cell, with the given formatted value, was encountered */
public void cell(String cellReference, String formattedValue);
public void endRow(int rowNum);
/** A cell, with the given formatted value, and possibly a comment, was encountered */
public void cell(String cellReference, String formattedValue, XSSFComment comment);
/** A comment for an otherwise-empty cell was encountered */
public void emptyCellComment(String cellReference, XSSFComment comment);
/** A header or footer has been encountered */ /** A header or footer has been encountered */
public void headerFooter(String text, boolean isHeader, String tagName); public void headerFooter(String text, boolean isHeader, String tagName);
} }

+ 27
- 9
src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java View File

import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFShape; import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape; import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
private Locale locale; private Locale locale;
private boolean includeTextBoxes = true; private boolean includeTextBoxes = true;
private boolean includeSheetNames = true; private boolean includeSheetNames = true;
private boolean includeCellComments = false;
private boolean includeHeadersFooters = true; private boolean includeHeadersFooters = true;
private boolean formulasNotResults = false; private boolean formulasNotResults = false;


} }


/** /**
* Would control the inclusion of cell comments from the document,
* if we supported it
* Should cell comments be included? Default is false
*/ */
public void setIncludeCellComments(boolean includeCellComments) { public void setIncludeCellComments(boolean includeCellComments) {
throw new IllegalStateException("Comment extraction not supported in streaming mode, please use XSSFExcelExtractor");
this.includeCellComments = includeCellComments;
} }


public void setLocale(Locale locale) { public void setLocale(Locale locale) {
public void processSheet( public void processSheet(
SheetContentsHandler sheetContentsExtractor, SheetContentsHandler sheetContentsExtractor,
StylesTable styles, StylesTable styles,
CommentsTable comments,
ReadOnlySharedStringsTable strings, ReadOnlySharedStringsTable strings,
InputStream sheetInputStream) InputStream sheetInputStream)
throws IOException, SAXException { throws IOException, SAXException {
SAXParser saxParser = saxFactory.newSAXParser(); SAXParser saxParser = saxFactory.newSAXParser();
XMLReader sheetParser = saxParser.getXMLReader(); XMLReader sheetParser = saxParser.getXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler( ContentHandler handler = new XSSFSheetXMLHandler(
styles, strings, sheetContentsExtractor, formatter, formulasNotResults);
styles, comments, strings, sheetContentsExtractor, formatter, formulasNotResults);
sheetParser.setContentHandler(handler); sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource); sheetParser.parse(sheetSource);
} catch(ParserConfigurationException e) { } catch(ParserConfigurationException e) {
text.append(iter.getSheetName()); text.append(iter.getSheetName());
text.append('\n'); text.append('\n');
} }
processSheet(sheetExtractor, styles, strings, stream);
CommentsTable comments = includeCellComments ? iter.getSheetComments() : null;
processSheet(sheetExtractor, styles, comments, strings, stream);
if (includeHeadersFooters) { if (includeHeadersFooters) {
sheetExtractor.appendHeaderText(text); sheetExtractor.appendHeaderText(text);
} }
firstCellOfRow = true; firstCellOfRow = true;
} }


public void endRow() {
public void endRow(int rowNum) {
output.append('\n'); output.append('\n');
} }


public void cell(String cellRef, String formattedValue) {
public void cell(String cellRef, String formattedValue, XSSFComment comment) {
if(firstCellOfRow) { if(firstCellOfRow) {
firstCellOfRow = false; firstCellOfRow = false;
} else { } else {
output.append('\t'); output.append('\t');
} }
output.append(formattedValue);
if (formattedValue != null) {
output.append(formattedValue);
}
if (includeCellComments && comment != null) {
String commentText = comment.getString().getString().replace('\n', ' ');
output.append(formattedValue != null ? " Comment by " : "Comment by ");
if (commentText.startsWith(comment.getAuthor() + ": ")) {
output.append(commentText);
} else {
output.append(comment.getAuthor()).append(": ").append(commentText);
}
}
}

public void emptyCellComment(String cellRef, XSSFComment comment) {
cell(cellRef, null, comment);
} }


public void headerFooter(String text, boolean isHeader, String tagName) { public void headerFooter(String text, boolean isHeader, String tagName) {
} }
} }



/** /**
* Append the text for the named header or footer if found. * Append the text for the named header or footer if found.
*/ */

+ 66
- 2
src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java View File

import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;


import junit.framework.TestCase;

import org.apache.poi.POITextExtractor; import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.xssf.XSSFTestDataSamples; import org.apache.poi.xssf.XSSFTestDataSamples;


import junit.framework.TestCase;

/** /**
* Tests for {@link XSSFEventBasedExcelExtractor} * Tests for {@link XSSFEventBasedExcelExtractor}
*/ */
fixture.setIncludeHeadersFooters(false); fixture.setIncludeHeadersFooters(false);
assertEquals(expectedOutputWithoutHeadersAndFooters, fixture.getText()); assertEquals(expectedOutputWithoutHeadersAndFooters, fixture.getText());
} }

/**
 * Checks that XSSFEventBasedExcelExtractor emits cell comments once
 * comment inclusion is switched on, and contrasts its output with that
 * of XSSFExcelExtractor for the same workbook.
 * <p>
 * The expected event-based output differs from the non-event-based
 * output in two ways: (1) comments on empty cells are included, and
 * (2) the author is not repeated.
 * <p>
 * If XSSFExcelExtractor gains the same behaviour, the expectations in
 * this test must be updated to match.
 */
public void testCommentsComparedToNonEventBasedExtractor()
        throws Exception {

    // Both extractors must agree when comments are switched off
    String textWithoutComments =
        "Sheet1\n" +
        "\n" +
        "abc\n" +
        "\n" +
        "123\n" +
        "\n" +
        "\n" +
        "\n";

    // Non-event-based extractor: author repeated, empty-cell comments absent
    String userModelTextWithComments =
        "Sheet1\n" +
        "\n" +
        "abc Comment by Shaun Kalley: Shaun Kalley: Comment A2\n" +
        "\n" +
        "123 Comment by Shaun Kalley: Shaun Kalley: Comment B4\n" +
        "\n" +
        "\n" +
        "\n";

    XSSFExcelExtractor userModelExtractor = new XSSFExcelExtractor(
            XSSFTestDataSamples.openSampleWorkbook("commentTest.xlsx"));
    try {
        assertEquals(textWithoutComments, userModelExtractor.getText());
        userModelExtractor.setIncludeCellComments(true);
        assertEquals(userModelTextWithComments, userModelExtractor.getText());
    } finally {
        userModelExtractor.close();
    }

    // Event-based extractor: author once, empty-cell comments present
    String eventModelTextWithComments =
        "Sheet1\n" +
        "Comment by Shaun Kalley: Comment A1\tComment by Shaun Kalley: Comment B1\n" +
        "abc Comment by Shaun Kalley: Comment A2\tComment by Shaun Kalley: Comment B2\n" +
        "Comment by Shaun Kalley: Comment A3\tComment by Shaun Kalley: Comment B3\n" +
        "Comment by Shaun Kalley: Comment A4\t123 Comment by Shaun Kalley: Comment B4\n" +
        "Comment by Shaun Kalley: Comment A5\tComment by Shaun Kalley: Comment B5\n" +
        "Comment by Shaun Kalley: Comment A7\tComment by Shaun Kalley: Comment B7\n" +
        "Comment by Shaun Kalley: Comment A8\tComment by Shaun Kalley: Comment B8\n";

    XSSFEventBasedExcelExtractor eventModelExtractor =
        new XSSFEventBasedExcelExtractor(
            XSSFTestDataSamples.openSamplePackage("commentTest.xlsx"));
    try {
        assertEquals(textWithoutComments, eventModelExtractor.getText());
        eventModelExtractor.setIncludeCellComments(true);
        assertEquals(eventModelTextWithComments, eventModelExtractor.getText());
    } finally {
        eventModelExtractor.close();
    }
}
} }

BIN
test-data/spreadsheet/commentTest.xlsx View File


Loading…
Cancel
Save