diff options
author | Nick Burch <nick@apache.org> | 2014-07-24 20:13:54 +0000 |
---|---|---|
committer | Nick Burch <nick@apache.org> | 2014-07-24 20:13:54 +0000 |
commit | 62bd48af7491276d453df4f68c617aa17b4fc9a7 (patch) | |
tree | d636b11ce6cb81a1bc092aac5b6a45da5cfdeb4e | |
parent | f3dba528888a0ee4f1fe10f13bdbd025755913f9 (diff) | |
download | poi-62bd48af7491276d453df4f68c617aa17b4fc9a7.tar.gz poi-62bd48af7491276d453df4f68c617aa17b4fc9a7.zip |
Patch from Shaun Kalley for bug #56023 - Allow the XSSF event model to find and return comments, and use this for the event-based .xlsx text extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1613266 13f79535-47bb-0310-9956-ffa450edef68
4 files changed, 246 insertions, 17 deletions
diff --git a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java index 0ebb40858a..06c9f09b9a 100644 --- a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java +++ b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java @@ -16,13 +16,22 @@ ==================================================================== */ package org.apache.poi.xssf.eventusermodel; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; + import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.util.CellReference; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; +import org.apache.poi.xssf.model.CommentsTable; import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.usermodel.XSSFCellStyle; +import org.apache.poi.xssf.usermodel.XSSFComment; import org.apache.poi.xssf.usermodel.XSSFRichTextString; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTComment; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; @@ -54,6 +63,15 @@ public class XSSFSheetXMLHandler extends DefaultHandler { */ private StylesTable stylesTable; + /** + * Table with cell comments + */ + private CommentsTable commentsTable; + + /** + * Read only access to the shared strings table, for looking + * up (most) string cell's contents + */ private ReadOnlySharedStringsTable sharedStringsTable; /** @@ -78,6 +96,7 @@ public class XSSFSheetXMLHandler extends DefaultHandler { private short formatIndex; private String formatString; private final DataFormatter formatter; + private int rowNum; private String cellRef; private boolean formulasNotResults; @@ -86,6 +105,8 @@ public class XSSFSheetXMLHandler extends DefaultHandler { private StringBuffer formula = new 
StringBuffer(); private StringBuffer headerFooter = new StringBuffer(); + private Queue<CellReference> commentCellRefs; + /** * Accepts objects needed while parsing. * @@ -94,17 +115,36 @@ public class XSSFSheetXMLHandler extends DefaultHandler { */ public XSSFSheetXMLHandler( StylesTable styles, + CommentsTable comments, ReadOnlySharedStringsTable strings, SheetContentsHandler sheetContentsHandler, DataFormatter dataFormatter, boolean formulasNotResults) { this.stylesTable = styles; + this.commentsTable = comments; this.sharedStringsTable = strings; this.output = sheetContentsHandler; this.formulasNotResults = formulasNotResults; this.nextDataType = xssfDataType.NUMBER; this.formatter = dataFormatter; + init(); + } + + /** + * Accepts objects needed while parsing. + * + * @param styles Table of styles + * @param strings Table of shared strings + */ + public XSSFSheetXMLHandler( + StylesTable styles, + ReadOnlySharedStringsTable strings, + SheetContentsHandler sheetContentsHandler, + DataFormatter dataFormatter, + boolean formulasNotResults) { + this(styles, null, strings, sheetContentsHandler, dataFormatter, formulasNotResults); } + /** * Accepts objects needed while parsing. 
* @@ -118,6 +158,16 @@ public class XSSFSheetXMLHandler extends DefaultHandler { boolean formulasNotResults) { this(styles, strings, sheetContentsHandler, new DataFormatter(), formulasNotResults); } + + private void init() { + if (commentsTable != null) { + commentCellRefs = new LinkedList<CellReference>(); + List<CTComment> commentList = commentsTable.getCTComments().getCommentList().getCommentList(); + for (CTComment comment : commentList) { + commentCellRefs.add(new CellReference(comment.getRef())); + } + } + } private boolean isTextTag(String name) { if("v".equals(name)) { @@ -190,7 +240,7 @@ public class XSSFSheetXMLHandler extends DefaultHandler { headerFooter.setLength(0); } else if("row".equals(name)) { - int rowNum = Integer.parseInt(attributes.getValue("r")) - 1; + rowNum = Integer.parseInt(attributes.getValue("r")) - 1; output.startRow(rowNum); } // c => cell @@ -304,14 +354,25 @@ public class XSSFSheetXMLHandler extends DefaultHandler { break; } + // Do we have a comment for this cell? + checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL); + XSSFComment comment = commentsTable != null ? 
commentsTable.findCellComment(cellRef) : null; + // Output - output.cell(cellRef, thisStr); + output.cell(cellRef, thisStr, comment); } else if ("f".equals(name)) { fIsOpen = false; } else if ("is".equals(name)) { isIsOpen = false; } else if ("row".equals(name)) { - output.endRow(); + // Handle any "missing" cells which had comments attached + checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_ROW); + + // Finish up the row + output.endRow(rowNum); + } else if ("sheetData".equals(name)) { + // Handle any "missing" cells which had comments attached + checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_SHEET_DATA); } else if("oddHeader".equals(name) || "evenHeader".equals(name) || "firstHeader".equals(name)) { @@ -342,6 +403,90 @@ public class XSSFSheetXMLHandler extends DefaultHandler { headerFooter.append(ch, start, length); } } + + /** + * Do a check for, and output, comments in otherwise empty cells. + */ + private void checkForEmptyCellComments(EmptyCellCommentsCheckType type) { + if (commentCellRefs != null && !commentCellRefs.isEmpty()) { + // If we've reached the end of the sheet data, output any + // comments we haven't yet already handled + if (type == EmptyCellCommentsCheckType.END_OF_SHEET_DATA) { + while (!commentCellRefs.isEmpty()) { + outputEmptyCellComment(commentCellRefs.remove()); + } + return; + } + + // At the end of a row, handle any comments for "missing" rows before us + if (this.cellRef == null) { + if (type == EmptyCellCommentsCheckType.END_OF_ROW) { + while (!commentCellRefs.isEmpty()) { + if (commentCellRefs.peek().getRow() == rowNum) { + outputEmptyCellComment(commentCellRefs.remove()); + } else { + return; + } + } + return; + } else { + throw new IllegalStateException("Cell ref should be null only if there are only empty cells in the row; rowNum: " + rowNum); + } + } + + CellReference nextCommentCellRef; + do { + CellReference cellRef = new CellReference(this.cellRef); + CellReference peekCellRef = commentCellRefs.peek(); 
+ if (type == EmptyCellCommentsCheckType.CELL && cellRef.equals(peekCellRef)) { + // remove the comment cell ref from the list if we're about to handle it alongside the cell content + commentCellRefs.remove(); + return; + } else { + // fill in any gaps if there are empty cells with comment mixed in with non-empty cells + int comparison = cellRefComparator.compare(peekCellRef, cellRef); + if (comparison > 0 && type == EmptyCellCommentsCheckType.END_OF_ROW && peekCellRef.getRow() <= rowNum) { + nextCommentCellRef = commentCellRefs.remove(); + outputEmptyCellComment(nextCommentCellRef); + } else if (comparison < 0 && type == EmptyCellCommentsCheckType.CELL && peekCellRef.getRow() <= rowNum) { + nextCommentCellRef = commentCellRefs.remove(); + outputEmptyCellComment(nextCommentCellRef); + } else { + nextCommentCellRef = null; + } + } + } while (nextCommentCellRef != null && !commentCellRefs.isEmpty()); + } + } + + + /** + * Output an empty-cell comment. + */ + private void outputEmptyCellComment(CellReference cellRef) { + String cellRefString = cellRef.formatAsString(); + XSSFComment comment = commentsTable.findCellComment(cellRefString); + output.emptyCellComment(cellRefString, comment); + } + + private enum EmptyCellCommentsCheckType { + CELL, + END_OF_ROW, + END_OF_SHEET_DATA + } + private static final Comparator<CellReference> cellRefComparator = new Comparator<CellReference>() { + @Override + public int compare(CellReference o1, CellReference o2) { + int result = compare(o1.getRow(), o2.getRow()); + if (result == 0) { + result = compare(o1.getCol(), o2.getCol()); + } + return result; + } + public int compare(int x, int y) { + return (x < y) ? -1 : ((x == y) ? 
0 : 1); + } + }; /** * You need to implement this to handle the results @@ -351,9 +496,11 @@ public class XSSFSheetXMLHandler extends DefaultHandler { /** A row with the (zero based) row number has started */ public void startRow(int rowNum); /** A row with the (zero based) row number has ended */ - public void endRow(); - /** A cell, with the given formatted value, was encountered */ - public void cell(String cellReference, String formattedValue); + public void endRow(int rowNum); + /** A cell, with the given formatted value, and possibly a comment, was encountered */ + public void cell(String cellReference, String formattedValue, XSSFComment comment); + /** A comment for an otherwise-empty cell was encountered */ + public void emptyCellComment(String cellReference, XSSFComment comment); /** A header or footer has been encountered */ public void headerFooter(String text, boolean isHeader, String tagName); } diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java index 6929242f3f..c598ed2e3d 100644 --- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java @@ -39,7 +39,9 @@ import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; +import org.apache.poi.xssf.model.CommentsTable; import org.apache.poi.xssf.model.StylesTable; +import org.apache.poi.xssf.usermodel.XSSFComment; import org.apache.poi.xssf.usermodel.XSSFShape; import org.apache.poi.xssf.usermodel.XSSFSimpleShape; import org.apache.xmlbeans.XmlException; @@ -60,6 +62,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor private Locale locale; private 
boolean includeTextBoxes = true; private boolean includeSheetNames = true; + private boolean includeCellComments = false; private boolean includeHeadersFooters = true; private boolean formulasNotResults = false; @@ -112,11 +115,10 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor } /** - * Would control the inclusion of cell comments from the document, - * if we supported it + * Should cell comments be included? Default is false */ public void setIncludeCellComments(boolean includeCellComments) { - throw new IllegalStateException("Comment extraction not supported in streaming mode, please use XSSFExcelExtractor"); + this.includeCellComments = includeCellComments; } public void setLocale(Locale locale) { @@ -159,6 +161,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor public void processSheet( SheetContentsHandler sheetContentsExtractor, StylesTable styles, + CommentsTable comments, ReadOnlySharedStringsTable strings, InputStream sheetInputStream) throws IOException, SAXException { @@ -176,7 +179,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor SAXParser saxParser = saxFactory.newSAXParser(); XMLReader sheetParser = saxParser.getXMLReader(); ContentHandler handler = new XSSFSheetXMLHandler( - styles, strings, sheetContentsExtractor, formatter, formulasNotResults); + styles, comments, strings, sheetContentsExtractor, formatter, formulasNotResults); sheetParser.setContentHandler(handler); sheetParser.parse(sheetSource); } catch(ParserConfigurationException e) { @@ -203,7 +206,8 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor text.append(iter.getSheetName()); text.append('\n'); } - processSheet(sheetExtractor, styles, strings, stream); + CommentsTable comments = includeCellComments ? 
iter.getSheetComments() : null; + processSheet(sheetExtractor, styles, comments, strings, stream); if (includeHeadersFooters) { sheetExtractor.appendHeaderText(text); } @@ -268,17 +272,32 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor firstCellOfRow = true; } - public void endRow() { + public void endRow(int rowNum) { output.append('\n'); } - public void cell(String cellRef, String formattedValue) { + public void cell(String cellRef, String formattedValue, XSSFComment comment) { if(firstCellOfRow) { firstCellOfRow = false; } else { output.append('\t'); } - output.append(formattedValue); + if (formattedValue != null) { + output.append(formattedValue); + } + if (includeCellComments && comment != null) { + String commentText = comment.getString().getString().replace('\n', ' '); + output.append(formattedValue != null ? " Comment by " : "Comment by "); + if (commentText.startsWith(comment.getAuthor() + ": ")) { + output.append(commentText); + } else { + output.append(comment.getAuthor()).append(": ").append(commentText); + } + } + } + + public void emptyCellComment(String cellRef, XSSFComment comment) { + cell(cellRef, null, comment); } public void headerFooter(String text, boolean isHeader, String tagName) { @@ -287,7 +306,6 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor } } - /** * Append the text for the named header or footer if found. 
*/ diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java index 98aeb627f2..80c1f116b5 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java @@ -20,13 +20,13 @@ package org.apache.poi.xssf.extractor; import java.util.regex.Matcher; import java.util.regex.Pattern; -import junit.framework.TestCase; - import org.apache.poi.POITextExtractor; import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.xssf.XSSFTestDataSamples; +import junit.framework.TestCase; + /** * Tests for {@link XSSFEventBasedExcelExtractor} */ @@ -240,4 +240,68 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase { fixture.setIncludeHeadersFooters(false); assertEquals(expectedOutputWithoutHeadersAndFooters, fixture.getText()); } + + /** + * Test that XSSFEventBasedExcelExtractor outputs comments when specified. + * The output will contain two improvements over the output from + * XSSFExcelExtractor in that (1) comments from empty cells will be + * outputted, and (2) the author will not be outputted twice. + * <p> + * This test will need to be modified if these improvements are ported to + * XSSFExcelExtractor. 
+ */ + public void testCommentsComparedToNonEventBasedExtractor() + throws Exception { + + String expectedOutputWithoutComments = + "Sheet1\n" + + "\n" + + "abc\n" + + "\n" + + "123\n" + + "\n" + + "\n" + + "\n"; + + String nonEventBasedExtractorOutputWithComments = + "Sheet1\n" + + "\n" + + "abc Comment by Shaun Kalley: Shaun Kalley: Comment A2\n" + + "\n" + + "123 Comment by Shaun Kalley: Shaun Kalley: Comment B4\n" + + "\n" + + "\n" + + "\n"; + + String eventBasedExtractorOutputWithComments = + "Sheet1\n" + + "Comment by Shaun Kalley: Comment A1\tComment by Shaun Kalley: Comment B1\n" + + "abc Comment by Shaun Kalley: Comment A2\tComment by Shaun Kalley: Comment B2\n" + + "Comment by Shaun Kalley: Comment A3\tComment by Shaun Kalley: Comment B3\n" + + "Comment by Shaun Kalley: Comment A4\t123 Comment by Shaun Kalley: Comment B4\n" + + "Comment by Shaun Kalley: Comment A5\tComment by Shaun Kalley: Comment B5\n" + + "Comment by Shaun Kalley: Comment A7\tComment by Shaun Kalley: Comment B7\n" + + "Comment by Shaun Kalley: Comment A8\tComment by Shaun Kalley: Comment B8\n"; + + XSSFExcelExtractor extractor = new XSSFExcelExtractor( + XSSFTestDataSamples.openSampleWorkbook("commentTest.xlsx")); + try { + assertEquals(expectedOutputWithoutComments, extractor.getText()); + extractor.setIncludeCellComments(true); + assertEquals(nonEventBasedExtractorOutputWithComments, extractor.getText()); + } finally { + extractor.close(); + } + + XSSFEventBasedExcelExtractor fixture = + new XSSFEventBasedExcelExtractor( + XSSFTestDataSamples.openSamplePackage("commentTest.xlsx")); + try { + assertEquals(expectedOutputWithoutComments, fixture.getText()); + fixture.setIncludeCellComments(true); + assertEquals(eventBasedExtractorOutputWithComments, fixture.getText()); + } finally { + fixture.close(); + } + } } diff --git a/test-data/spreadsheet/commentTest.xlsx b/test-data/spreadsheet/commentTest.xlsx Binary files differnew file mode 100644 index 0000000000..10e7837d64 --- /dev/null +++ 
b/test-data/spreadsheet/commentTest.xlsx |