From: Nick Burch Date: Tue, 25 May 2010 20:31:42 +0000 (+0000) Subject: Fix bug #49189 - Detect w:tab and w:cr entries in XWPF paragraphs, even when the... X-Git-Tag: REL_3_7_BETA1~59 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=216ecd14077cf0b290a2addfa355f7c1cc2b4bc9;p=poi.git Fix bug #49189 - Detect w:tab and w:cr entries in XWPF paragraphs, even when the XSD is silly and maps them to CTEmpty git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@948199 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 826937bee5..be43882899 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 49189 - Detect w:tab and w:cr entries in XWPF paragraphs, even when the XSD is silly and maps them to CTEmpty 49273 - Correct handling for Font Character Sets with indicies greater than 127 49334 - Track the ValueRangeRecords of charts in HSSFChart, to allow the basic axis operations 49242 - Track the LinkDataRecords of charts in HSSFChart diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java index 3bdf4c365e..b09cc1edd2 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java @@ -24,6 +24,7 @@ import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlObject; import org.apache.poi.util.Internal; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdnRef; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc; @@ -108,6 +109,18 @@ public class XWPFParagraph { if (o instanceof CTPTab) { text.append("\t"); } + if (o instanceof CTEmpty) { + // Some inline text elements get returned not as + // themselves, but as CTEmpty, owing to some odd + // definitions around line 5642 of the XSDs + String tagName = o.getDomNode().getNodeName(); + if ("w:tab".equals(tagName)) { + text.append("\t"); + } + if ("w:cr".equals(tagName)) { + text.append("\n"); + } + } //got a reference to a footnote if (o instanceof CTFtnEdnRef) { CTFtnEdnRef ftn = (CTFtnEdnRef) o; diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index ddc0957aa0..f98d5e69ca 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -219,4 +219,22 @@ public class TestXWPFWordExtractor extends TestCase { assertTrue(extractor.getText().contains("2008")); assertTrue(extractor.getText().contains("(120 ")); } + + /** + * Test that we handle things like tabs and + * carriage returns properly in the text that + * we're extracting (bug #49189) + */ + public void testDocTabs() { + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("WithTabs.docx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc); + + // Check bits + assertTrue(extractor.getText().contains("a")); + assertTrue(extractor.getText().contains("\t")); + assertTrue(extractor.getText().contains("b")); + + // Now check the first paragraph in total + assertTrue(extractor.getText().contains("a\tb\n")); + } } diff --git a/test-data/document/WithTabs.docx b/test-data/document/WithTabs.docx new file mode 100644 index 0000000000..e9b3a7a1dc Binary files /dev/null and b/test-data/document/WithTabs.docx differ