Fix bug #49189 - Detect w:tab and w:cr entries in XWPF paragraphs, even when the XSD is silly and maps them to CTEmpty

author Nick Burch <nick@apache.org>

Tue, 25 May 2010 20:31:42 +0000 (20:31 +0000)

committer Nick Burch <nick@apache.org>

Tue, 25 May 2010 20:31:42 +0000 (20:31 +0000)
author Nick Burch <nick@apache.org>
Tue, 25 May 2010 20:31:42 +0000 (20:31 +0000)
committer Nick Burch <nick@apache.org>
Tue, 25 May 2010 20:31:42 +0000 (20:31 +0000)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 826937bee5e7250420506b238fd6bde029c82bbe..be43882899c2de5a2caa97640633d1d50c4f0f77 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
  
      <changes>
          <release version="3.7-SNAPSHOT" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="fix">49189 - Detect w:tab and w:cr entries in XWPF paragraphs, even when the XSD is silly and maps them to CTEmpty</action>
             <action dev="POI-DEVELOPERS" type="fix">49273 - Correct handling for Font Character Sets with indicies greater than 127</action>
             <action dev="POI-DEVELOPERS" type="add">49334 - Track the ValueRangeRecords of charts in HSSFChart, to allow the basic axis operations</action>
             <action dev="POI-DEVELOPERS" type="add">49242 - Track the LinkDataRecords of charts in HSSFChart</action>
diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java

index 3bdf4c365ebe3e82002bf10473684e63c1e614c5..b09cc1edd2ebf9decc511dcdfdae6fd3f0cac80a 100644 (file)
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
@@ -24,6 +24,7 @@ import org.apache.xmlbeans.XmlCursor;
  import org.apache.xmlbeans.XmlObject;
  import org.apache.poi.util.Internal;
  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdnRef;
  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd;
  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc;
@@ -108,6 +109,18 @@ public class XWPFParagraph {
                      if (o instanceof CTPTab) {
                          text.append("\t");
                      }
+                    if (o instanceof CTEmpty) {
+                       // Some inline text elements get returned not as
+                       //  themselves, but as CTEmpty, owing to some odd
+                       //  definitions around line 5642 of the XSDs
+                       String tagName = o.getDomNode().getNodeName();
+                       if ("w:tab".equals(tagName)) {
+                          text.append("\t");
+                       }
+                       if ("w:cr".equals(tagName)) {
+                          text.append("\n");
+                       }
+                    }
                      //got a reference to a footnote
                      if (o instanceof CTFtnEdnRef) {
                          CTFtnEdnRef ftn = (CTFtnEdnRef) o;
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java

index ddc0957aa0c39e624a5591d3539999e13f2c4d16..f98d5e69ca029529a4e05a038e00f56e70b64021 100644 (file)
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -219,4 +219,22 @@ public class TestXWPFWordExtractor extends TestCase {
          assertTrue(extractor.getText().contains("2008"));
          assertTrue(extractor.getText().contains("(120 "));
      }
+    
+    /**
+     * Test that we handle things like tabs and
+     *  carriage returns properly in the text that
+     *  we're extracting (bug #49189)
+     */
+    public void testDocTabs() {
+       XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("WithTabs.docx");
+       XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+ 
+       // Check bits
+       assertTrue(extractor.getText().contains("a"));
+       assertTrue(extractor.getText().contains("\t"));
+       assertTrue(extractor.getText().contains("b"));
+       
+       // Now check the first paragraph in total
+       assertTrue(extractor.getText().contains("a\tb\n"));
+    }
  }
diff --git a/test-data/document/WithTabs.docx b/test-data/document/WithTabs.docx

new file mode 100644 (file)

index 0000000..e9b3a7a

Binary files /dev/null and b/test-data/document/WithTabs.docx differ
author	Nick Burch <nick@apache.org>
	Tue, 25 May 2010 20:31:42 +0000 (20:31 +0000)
committer	Nick Burch <nick@apache.org>
	Tue, 25 May 2010 20:31:42 +0000 (20:31 +0000)
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java		patch \| blob \| history
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java		patch \| blob \| history
test-data/document/WithTabs.docx	[new file with mode: 0644]	patch \| blob