about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author PJ Fanning <fanningpj@apache.org> 2017-07-28 07:42:23 +0000
committer PJ Fanning <fanningpj@apache.org> 2017-07-28 07:42:23 +0000
commit 36d940ce3906ca59af69b39b79d678f8fc701a77 (patch)
tree 77c36a09545372f50cfcb75b5926a1280adc0dd9
parent 3cea90e7c47f193411bf5eda05bbfe3678b6c525 (diff)
download poi-36d940ce3906ca59af69b39b79d678f8fc701a77.tar.gz
poi-36d940ce3906ca59af69b39b79d678f8fc701a77.zip
[Bug-61354] fix issue with extracting text from Word docs. This closes #66
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1803250 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- .gitignore 1
-rw-r--r-- src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java 44
-rw-r--r-- src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java 10
-rw-r--r-- test-data/document/MultipleBodyBug.docx bin 0 -> 102103 bytes
4 files changed, 37 insertions, 18 deletions
diff --git a/.gitignore b/.gitignore
index 2ebc54416b..103731746e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,3 +45,4 @@ sonar/*/target
.ant-targets-build.xml
build
dist
+lib/
diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java
index 081b751cd3..01463c64a5 100644
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java
@@ -156,26 +156,34 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody {
// parse the document with cursor and add
// the XmlObject to its lists
- XmlCursor cursor = ctDocument.getBody().newCursor();
- cursor.selectPath("./*");
- while (cursor.toNextSelection()) {
- XmlObject o = cursor.getObject();
- if (o instanceof CTP) {
- XWPFParagraph p = new XWPFParagraph((CTP) o, this);
- bodyElements.add(p);
- paragraphs.add(p);
- } else if (o instanceof CTTbl) {
- XWPFTable t = new XWPFTable((CTTbl) o, this);
- bodyElements.add(t);
- tables.add(t);
- } else if (o instanceof CTSdtBlock) {
- XWPFSDT c = new XWPFSDT((CTSdtBlock) o, this);
- bodyElements.add(c);
- contentControls.add(c);
+ XmlCursor docCursor = ctDocument.newCursor();
+ docCursor.selectPath("./*");
+ while (docCursor.toNextSelection()) {
+ XmlObject o = docCursor.getObject();
+ if (o instanceof CTBody) {
+ XmlCursor bodyCursor = o.newCursor();
+ bodyCursor.selectPath("./*");
+ while (bodyCursor.toNextSelection()) {
+ XmlObject bodyObj = bodyCursor.getObject();
+ if (bodyObj instanceof CTP) {
+ XWPFParagraph p = new XWPFParagraph((CTP) bodyObj,
+ this);
+ bodyElements.add(p);
+ paragraphs.add(p);
+ } else if (bodyObj instanceof CTTbl) {
+ XWPFTable t = new XWPFTable((CTTbl) bodyObj, this);
+ bodyElements.add(t);
+ tables.add(t);
+ } else if (bodyObj instanceof CTSdtBlock) {
+ XWPFSDT c = new XWPFSDT((CTSdtBlock) bodyObj, this);
+ bodyElements.add(c);
+ contentControls.add(c);
+ }
+ }
+ bodyCursor.dispose();
}
}
- cursor.dispose();
-
+ docCursor.dispose();
// Sort out headers and footers
if (doc.getDocument().getBody().getSectPr() != null)
headerFooterPolicy = new XWPFHeaderFooterPolicy(this);
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
index f8e4e49410..b83b27d737 100644
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -411,4 +411,14 @@ public class TestXWPFWordExtractor extends TestCase {
"In Sequence:\n|X||_||X|\n", extractor.getText());
extractor.close();
}
+
+ public void testMultipleBodyBug() throws IOException {
+ XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("MultipleBodyBug.docx");
+ XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+ assertEquals("START BODY 1 The quick, brown fox jumps over a lazy dog. END BODY 1.\n"
+ + "START BODY 2 The quick, brown fox jumps over a lazy dog. END BODY 2.\n"
+ + "START BODY 3 The quick, brown fox jumps over a lazy dog. END BODY 3.\n",
+ extractor.getText());
+ extractor.close();
+ }
}
diff --git a/test-data/document/MultipleBodyBug.docx b/test-data/document/MultipleBodyBug.docx
new file mode 100644
index 0000000000..84f795b26f
--- /dev/null
+++ b/test-data/document/MultipleBodyBug.docx
Binary files differ