From: Tim Allison Date: Mon, 16 Jun 2014 18:46:00 +0000 (+0000) Subject: BUG 54771 extract text from SDTs at the cell level within a table row X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=af7b947bb961e2527986ba52e7b115885f677944;p=poi.git BUG 54771 extract text from SDTs at the cell level within a table row git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1602955 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index 089ed5dc5d..afff770c74 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -27,6 +27,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xwpf.model.XWPFCommentsDecorator; import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; import org.apache.poi.xwpf.usermodel.IBodyElement; +import org.apache.poi.xwpf.usermodel.ICell; import org.apache.poi.xwpf.usermodel.IRunElement; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFHyperlink; @@ -34,6 +35,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFRelation; import org.apache.poi.xwpf.usermodel.XWPFSDT; +import org.apache.poi.xwpf.usermodel.XWPFSDTCell; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; @@ -161,14 +163,18 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { } - private void appendTableText(StringBuffer text, XWPFTable table){ + private void appendTableText(StringBuffer text, XWPFTable table) { //this works recursively to pull embedded tables from tables - for (XWPFTableRow row : table.getRows()){ - List cells = row.getTableCells(); - for (int i = 0; i < 
cells.size(); i++){ - XWPFTableCell cell = cells.get(i); - text.append(cell.getTextRecursively()); - if (i < cells.size()-1){ + for (XWPFTableRow row : table.getRows()) { + List cells = row.getTableICells(); + for (int i = 0; i < cells.size(); i++) { + ICell cell = cells.get(i); + if (cell instanceof XWPFTableCell) { + text.append(((XWPFTableCell)cell).getTextRecursively()); + } else if (cell instanceof XWPFSDTCell) { + text.append(((XWPFSDTCell)cell).getContent().getText()); + } + if (i < cells.size()-1) { text.append("\t"); } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/AbstractXWPFSDT.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/AbstractXWPFSDT.java new file mode 100644 index 0000000000..9c668e8453 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/AbstractXWPFSDT.java @@ -0,0 +1,113 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +import java.util.List; + +import org.apache.poi.POIXMLDocumentPart; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString; + +/** + * Experimental abstract class that is a base for XWPFSDT and XWPFSDTCell + * + * WARNING - APIs expected to change rapidly. + * + * These classes have so far been built only for read-only processing. + * + */ +public abstract class AbstractXWPFSDT implements ISDTContents { + private final String title; + private final String tag; + private final IBody part; + + public AbstractXWPFSDT(CTSdtPr pr, IBody part){ + + List aliases = pr.getAliasList(); + if (aliases != null && aliases.size() > 0){ + title = aliases.get(0).getVal(); + } else { + title = ""; + } + List tags = pr.getTagList(); + if (tags != null && tags.size() > 0){ + tag = tags.get(0).getVal(); + } else { + tag = ""; + } + this.part = part; + + } + + /** + * + * @return first SDT Title + */ + public String getTitle(){ + return title; + } + + /** + * + * @return first SDT Tag + */ + public String getTag(){ + return tag; + } + + /** + * + * @return the content object + */ + public abstract ISDTContent getContent(); + + /** + * + * @return null + */ + public IBody getBody() { + return null; + } + + /** + * + * @return document part + */ + public POIXMLDocumentPart getPart() { + return part.getPart(); + } + + /** + * + * @return partType + */ + public BodyType getPartType() { + return BodyType.CONTENTCONTROL; + } + + /** + * + * @return element type + */ + public BodyElementType getElementType() { + return BodyElementType.CONTENTCONTROL; + } + + public XWPFDocument getDocument() { + return part.getXWPFDocument(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/ICell.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/ICell.java new file mode 100644 index 
0000000000..cf32924b8e --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/ICell.java @@ -0,0 +1,27 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +/** + * Interface for anything that can be at a table cell level: + * {@link XWPFTableCell}, {@link XWPFSDTCell} + *

+ * Schematically something like this: + * <tr><tc/><tc/><sdt><tc/></sdt></tr> + */ +public interface ICell { +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java index 7ee12bd2de..40776e218a 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/ISDTContents.java @@ -17,7 +17,7 @@ package org.apache.poi.xwpf.usermodel; /** - * Interface for anything that can be within a STD: + * Interface for anything that can be within an SDT: * {@link XWPFRun}, {@link XWPFTable}, {@link XWPFParagraph}, * {@link XWPFSDT} etc */ diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java index a17d51f8dd..4a51725b19 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDT.java @@ -16,95 +16,32 @@ ==================================================================== */ package org.apache.poi.xwpf.usermodel; -import java.util.List; - -import org.apache.poi.POIXMLDocumentPart; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun; -import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString; /** * Experimental class to offer rudimentary read-only processing of * of StructuredDocumentTags/ContentControl - * - * * * WARNING - APIs expected to change rapidly * */ -public class XWPFSDT implements IBodyElement, IRunBody, ISDTContents, IRunElement { - private final String title; - private final String tag; - private final XWPFSDTContent content; - private final IBody part; +public class XWPFSDT extends AbstractXWPFSDT + implements IBodyElement, IRunBody, ISDTContents, IRunElement { + private final ISDTContent content; 
public XWPFSDT(CTSdtRun sdtRun, IBody part){ - this.part = part; + super(sdtRun.getSdtPr(), part); this.content = new XWPFSDTContent(sdtRun.getSdtContent(), part, this); - CTSdtPr pr = sdtRun.getSdtPr(); - List aliases = pr.getAliasList(); - if (aliases != null && aliases.size() > 0){ - title = aliases.get(0).getVal(); - } else { - title = ""; - } - @SuppressWarnings("deprecation") - CTString[] array = pr.getTagArray(); - if (array != null && array.length > 0){ - tag = array[0].getVal(); - } else { - tag = ""; - } - } + public XWPFSDT(CTSdtBlock block, IBody part){ - this.part = part; + super(block.getSdtPr(), part); this.content = new XWPFSDTContent( block.getSdtContent(), part, this); - CTSdtPr pr = block.getSdtPr(); - List aliases = pr.getAliasList(); - if (aliases != null && aliases.size() > 0){ - title = aliases.get(0).getVal(); - } else { - title = ""; - } - @SuppressWarnings("deprecation") - CTString[] array = pr.getTagArray(); - if (array != null && array.length > 0){ - tag = array[0].getVal(); - } else { - tag = ""; - } - - } - public String getTitle(){ - return title; - } - public String getTag(){ - return tag; - } - public XWPFSDTContent getContent(){ - return content; - } - - public IBody getBody() { - // TODO Auto-generated method stub - return null; - } - - public POIXMLDocumentPart getPart() { - return part.getPart(); } - public BodyType getPartType() { - return BodyType.CONTENTCONTROL; - } - - public BodyElementType getElementType() { - return BodyElementType.CONTENTCONTROL; + public ISDTContent getContent(){ + return content; } - public XWPFDocument getDocument() { - return part.getXWPFDocument(); - } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTCell.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTCell.java new file mode 100644 index 0000000000..21cca3e7b2 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTCell.java @@ -0,0 +1,44 @@ +/* 
==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xwpf.usermodel; + +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell; + +/** + * Experimental class to offer rudimentary read-only processing + * of StructuredDocumentTags/ContentControl that can appear + * in a table row as if a table cell. + *

+ * These can contain one or more cells or other SDTs within them. + * + * WARNING - APIs expected to change rapidly + * + */ +public class XWPFSDTCell extends AbstractXWPFSDT implements ICell { + private final XWPFSDTContentCell cellContent; + + public XWPFSDTCell(CTSdtCell sdtCell, XWPFTableRow xwpfTableRow, IBody part){ + super(sdtCell.getSdtPr(), part); + cellContent = new XWPFSDTContentCell(sdtCell.getSdtContent(), xwpfTableRow, part); + } + + @Override + public ISDTContent getContent(){ + return cellContent; + } + +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java index de12a92164..59a0abba9f 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContent.java @@ -39,7 +39,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl; * WARNING - APIs expected to change rapidly * */ -public class XWPFSDTContent { +public class XWPFSDTContent implements ISDTContent { // private final IBody part; // private final XWPFDocument document; @@ -87,10 +87,10 @@ public class XWPFSDTContent { for (int i = 0; i < bodyElements.size(); i++){ Object o = bodyElements.get(i); if (o instanceof XWPFParagraph){ - text.append(((XWPFParagraph)o).getText()); + appendParagraph((XWPFParagraph)o, text); addNewLine = true; } else if (o instanceof XWPFTable){ - text.append(((XWPFTable)o).getText()); + appendTable((XWPFTable)o, text); addNewLine = true; } else if (o instanceof XWPFSDT){ text.append(((XWPFSDT)o).getContent().getText()); @@ -106,6 +106,31 @@ public class XWPFSDTContent { return text.toString(); } + private void appendTable(XWPFTable table, StringBuilder text) { + //this works recursively to pull embedded tables from within cells + for (XWPFTableRow row : table.getRows()) { + List cells = row.getTableICells(); + for (int i = 0; i < cells.size(); i++) { + ICell cell = cells.get(i); + if 
(cell instanceof XWPFTableCell) { + text.append(((XWPFTableCell)cell).getTextRecursively()); + } else if (cell instanceof XWPFSDTCell) { + text.append(((XWPFSDTCell)cell).getContent().getText()); + } + if (i < cells.size()-1) { + text.append("\t"); + } + } + text.append('\n'); + } + } + + private void appendParagraph(XWPFParagraph paragraph, StringBuilder text) { + for(IRunElement run : paragraph.getRuns()) { + text.append(run.toString()); + } + } + public String toString(){ return getText(); } diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContentCell.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContentCell.java new file mode 100644 index 0000000000..25e258a5cc --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFSDTContentCell.java @@ -0,0 +1,114 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.xwpf.usermodel; + + +import javax.xml.namespace.QName; + +import org.apache.xmlbeans.XmlCursor; +import org.apache.xmlbeans.XmlCursor.TokenType; + +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentCell; + + + +/** + * Experimental class to offer rudimentary read-only processing of + * the content of an {@link XWPFSDTCell}. + + * WARNING - APIs expected to change rapidly + * + */ +public class XWPFSDTContentCell implements ISDTContent { + + //A full implementation would grab the icells + //that a content cell can contain. This would require + //significant changes, including changing the notion that the + //parent of a cell can be not just a row, but an sdt. + //For now we are just grabbing the text out of the text tokentypes. + + //private List cells = new ArrayList(). + + private String text = ""; + public XWPFSDTContentCell(CTSdtContentCell sdtContentCell, + XWPFTableRow xwpfTableRow, IBody part){ + super(); + StringBuilder sb = new StringBuilder(); + XmlCursor cursor = sdtContentCell.newCursor(); + + //keep track of the following, + //and add "\n" only before the start of a body + //element if it is not the first body element.
+ + //index of cell in row + int tcCnt = 0; + //count of body objects + int iBodyCnt = 0; + int depth = 1; + + while (cursor.hasNextToken() && depth > 0) { + TokenType t = cursor.toNextToken(); + if (t.isText()){ + sb.append(cursor.getTextValue()); + } else if (isStartToken(cursor, "tr")) { + tcCnt = 0; + iBodyCnt = 0; + } else if (isStartToken(cursor, "tc")) { + if (tcCnt++ > 0) { + sb.append("\t"); + } + iBodyCnt = 0; + } else if (isStartToken(cursor, "p") || + isStartToken(cursor, "tbl") || + isStartToken(cursor, "sdt")) { + if (iBodyCnt > 0) { + sb.append("\n"); + } + iBodyCnt++; + } + if (cursor.isStart()){ + depth++; + } else if (cursor.isEnd()){ + depth--; + } + } + text = sb.toString(); + } + + + + private boolean isStartToken(XmlCursor cursor, String string) { + if (! cursor.isStart()) { + return false; + } + QName qName = cursor.getName(); + if (qName != null && qName.getLocalPart() != null && + qName.getLocalPart().equals(string)) { + return true; + } + return false; + } + + + public String getText(){ + return text; + } + + public String toString(){ + return getText(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java index c2b1c175f8..554e7bef2c 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTable.java @@ -159,6 +159,13 @@ public class XWPFTable implements IBodyElement, ISDTContents { } /** + * Convenience method to extract text in cells. This + * does not extract text recursively in cells, and it does not + * currently include text in SDT (form) components. + *

+ * To get all text within a table, see XWPFWordExtractor's appendTableText + * as an example. + * * @return text */ public String getText() { diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java index 148bd20727..e1b46fecd2 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableCell.java @@ -42,7 +42,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.STVerticalJc; * Represents a Cell within a {@link XWPFTable}. The * Cell is the thing that holds the actual content (paragraphs etc) */ -public class XWPFTableCell implements IBody { +public class XWPFTableCell implements IBody, ICell { private final CTTc ctTc; protected List paragraphs = null; protected List tables = null; diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java index a16b247a5f..56ea38745f 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFTableRow.java @@ -21,9 +21,12 @@ import java.util.ArrayList; import java.util.List; import org.apache.poi.util.Internal; +import org.apache.xmlbeans.XmlCursor; +import org.apache.xmlbeans.XmlObject; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHeight; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTrPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff; @@ -121,6 +124,29 @@ public class XWPFTableRow { return table; } + /** + * create and return a list of all cell-level elements + * ({@link XWPFTableCell} and {@link XWPFSDTCell}) that belong
to this row + * @return a list of {@link ICell} + */ + public List getTableICells(){ + + List cells = new ArrayList(); + //Can't use ctRow.getTcList because that only gets table cells + //Can't use ctRow.getSdtList because that only gets sdts that are at cell level + XmlCursor cursor = ctRow.newCursor(); + cursor.selectPath("./*"); + while (cursor.toNextSelection()) { + XmlObject o = cursor.getObject(); + if (o instanceof CTTc){ + cells.add(new XWPFTableCell((CTTc)o, this, table.getBody())); + } else if (o instanceof CTSdtCell) { + cells.add(new XWPFSDTCell((CTSdtCell)o, this, table.getBody())); + } + } + return cells; + } + /** * create and return a list of all XWPFTableCell * who belongs to this row diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index 6b9f7125ea..d63bd642de 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -18,6 +18,8 @@ package org.apache.poi.xwpf.extractor; import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import junit.framework.TestCase; @@ -327,12 +329,14 @@ public class TestXWPFWordExtractor extends TestCase { String[] targs = new String[]{ "header_rich_text", "rich_text", - "rich_text_pre_table\nrich_text_cell1\t\t\t\n\nrich_text_post_table", + "rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table", "plain_text_no_newlines", "plain_text_with_newlines1\nplain_text_with_newlines2\n", "watermelon\n", "dirt\n", "4/16/2013\n", + "rich_text_in_cell", + "abc", "rich_text_in_paragraph_in_cell", "footer_rich_text", "footnote_sdt", @@ -352,6 +356,36 @@ public class TestXWPFWordExtractor extends TestCase { } assertEquals("controlled content loading hit count", targs.length, hits); ex.close(); + + + doc =
XWPFTestDataSamples.openSampleDocument("Bug54771a.docx"); + targs = new String[]{ + "bb", + "test subtitle\n", + "test user\n", + }; + ex = new XWPFWordExtractor(doc); + s = ex.getText().toLowerCase(); + + //At one point in development there were three copies of the text. + //This ensures that there is only one copy. + for (String targ : targs){ + Matcher m = Pattern.compile(targ).matcher(s); + int hit = 0; + while (m.find()) { + hit++; + } + assertEquals("controlled content loading-"+targ, 1, hit); + } + //"test\n" appears twice: once as the "title" and once in the text. + //This also happens when you save this document as text from MSWord. + Matcher m = Pattern.compile("test\n").matcher(s); + int hit = 0; + while (m.find()){ + hit++; + } + assertEquals("test", 2, hit); + ex.close(); } /** No Header or Footer in document */ diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java b/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java index f1a585567b..f4114c9620 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/usermodel/TestXWPFSDT.java @@ -18,8 +18,10 @@ package org.apache.poi.xwpf.usermodel; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import junit.framework.TestCase; @@ -35,15 +37,16 @@ public final class TestXWPFSDT extends TestCase { XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); String tag = null; String title= null; - List sdts = extractAllSDTs(doc); - for (XWPFSDT sdt :sdts){ + List sdts = extractAllSDTs(doc); + for (AbstractXWPFSDT sdt :sdts){ if (sdt.getContent().toString().equals("Rich_text")){ tag = "MyTag"; title = "MyTitle"; break; } + } - assertEquals("controls size", 12, sdts.size()); + assertEquals("controls size", 13, sdts.size()); assertEquals("tag", "MyTag", tag); assertEquals("title", "MyTitle", title); @@ -54,12 +57,13 @@ 
public final class TestXWPFSDT extends TestCase { String[] contents = new String[]{ "header_rich_text", "Rich_text", - "Rich_text_pre_table\nRich_text_cell1\t\t\t\n\nRich_text_post_table", + "Rich_text_pre_table\nRich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nRich_text_post_table", "Plain_text_no_newlines", "Plain_text_with_newlines1\nplain_text_with_newlines2", "Watermelon", "Dirt", "4/16/2013", + "Rich_text_in_cell", "rich_text_in_paragraph_in_cell", "Footer_rich_text", "Footnote_sdt", @@ -67,31 +71,40 @@ public final class TestXWPFSDT extends TestCase { }; XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); - List sdts = extractAllSDTs(doc); + List sdts = extractAllSDTs(doc); assertEquals("number of sdts", contents.length, sdts.size()); - for (int i = 0; i < sdts.size(); i++){//contents.length; i++){ - XWPFSDT sdt = sdts.get(i); - + for (int i = 0; i < contents.length; i++){ + AbstractXWPFSDT sdt = sdts.get(i); assertEquals(i+ ": " + contents[i], contents[i], sdt.getContent().toString()); } } + /** + * POI-54771 and TIKA-1317 + */ + public void testSDTAsCell() throws Exception { + //Bug54771a.docx and Bug54771b.docx test slightly + //different recursion patterns. Keep both! 
+ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54771a.docx"); + List sdts = extractAllSDTs(doc); + String text = sdts.get(0).getContent().getText(); + assertEquals(2, sdts.size()); + assertTrue(text.indexOf("Test") > -1); + + text = sdts.get(1).getContent().getText(); + assertTrue(text.indexOf("Test Subtitle") > -1); + assertTrue(text.indexOf("Test User") > -1); + assertTrue(text.indexOf("Test") < text.indexOf("Test Subtitle")); + + doc = XWPFTestDataSamples.openSampleDocument("Bug54771b.docx"); + sdts = extractAllSDTs(doc); + assertEquals(3, sdts.size()); + assertTrue(sdts.get(0).getContent().getText().indexOf("Test") > -1); + + assertTrue(sdts.get(1).getContent().getText().indexOf("Test Subtitle") > -1); + assertTrue(sdts.get(2).getContent().getText().indexOf("Test User") > -1); - public void testFailureToGetSDTAsCell() throws Exception{ - /** - * The current code fails to extract an sdt if it comprises/is the parent - * of a cell in a table. - */ - XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug54849.docx"); - List sdts = extractAllSDTs(doc); - boolean found = false; - for (XWPFSDT sdt : sdts){ - if (sdt.getContent().getText().toLowerCase().indexOf("rich_text_in_cell") > -1){ - found = true; - } - } - assertEquals("SDT as cell known failure", false, found); } /** @@ -99,7 +112,7 @@ public final class TestXWPFSDT extends TestCase { */ public void testNewLinesBetweenRuns() throws Exception{ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("Bug55142.docx"); - List sdts = extractAllSDTs(doc); + List sdts = extractAllSDTs(doc); List targs = new ArrayList(); //these test newlines and tabs in paragraphs/body elements targs.add("Rich-text1 abcdefghi"); @@ -114,14 +127,14 @@ public final class TestXWPFSDT extends TestCase { targs.add("sdt_incell2 abcdefg"); for (int i = 0; i < sdts.size(); i++){ - XWPFSDT sdt = sdts.get(i); + AbstractXWPFSDT sdt = sdts.get(i); assertEquals(targs.get(i), targs.get(i), sdt.getContent().getText()); } 
} - private List extractAllSDTs(XWPFDocument doc){ - - List sdts = new ArrayList(); + private List extractAllSDTs(XWPFDocument doc){ + + List sdts = new ArrayList(); List headers = doc.getHeaderList(); for (XWPFHeader header : headers){ @@ -135,7 +148,6 @@ public final class TestXWPFSDT extends TestCase { } for (XWPFFootnote footnote : doc.getFootnotes()){ - sdts.addAll(extractSDTsFromBodyElements(footnote.getBodyElements())); } for (Map.Entry e : doc.endnotes.entrySet()){ @@ -144,8 +156,8 @@ public final class TestXWPFSDT extends TestCase { return sdts; } - private List extractSDTsFromBodyElements(List elements){ - List sdts = new ArrayList(); + private List extractSDTsFromBodyElements(List elements){ + List sdts = new ArrayList(); for (IBodyElement e : elements){ if (e instanceof XWPFSDT){ XWPFSDT sdt = (XWPFSDT)e; @@ -167,11 +179,16 @@ public final class TestXWPFSDT extends TestCase { return sdts; } - private List extractSDTsFromTable(XWPFTable table){ - List sdts = new ArrayList(); - for (XWPFTableRow r : table.getRows()){ - for (XWPFTableCell c : r.getTableCells()){ - sdts.addAll(extractSDTsFromBodyElements(c.getBodyElements())); + private List extractSDTsFromTable(XWPFTable table) { + + List sdts = new ArrayList(); + for (XWPFTableRow r : table.getRows()) { + for (ICell c : r.getTableICells()) { + if (c instanceof XWPFSDTCell) { + sdts.add((XWPFSDTCell)c); + } else if (c instanceof XWPFTableCell) { + sdts.addAll(extractSDTsFromBodyElements(((XWPFTableCell)c).getBodyElements())); + } } } return sdts; diff --git a/test-data/document/Bug54771a.docx b/test-data/document/Bug54771a.docx new file mode 100644 index 0000000000..19bbbd1a45 Binary files /dev/null and b/test-data/document/Bug54771a.docx differ diff --git a/test-data/document/Bug54771b.docx b/test-data/document/Bug54771b.docx new file mode 100644 index 0000000000..f9850bb556 Binary files /dev/null and b/test-data/document/Bug54771b.docx differ