diff options
author | PJ Fanning <fanningpj@apache.org> | 2017-07-28 07:42:23 +0000 |
---|---|---|
committer | PJ Fanning <fanningpj@apache.org> | 2017-07-28 07:42:23 +0000 |
commit | 36d940ce3906ca59af69b39b79d678f8fc701a77 (patch) | |
tree | 77c36a09545372f50cfcb75b5926a1280adc0dd9 | |
parent | 3cea90e7c47f193411bf5eda05bbfe3678b6c525 (diff) | |
download | poi-36d940ce3906ca59af69b39b79d678f8fc701a77.tar.gz poi-36d940ce3906ca59af69b39b79d678f8fc701a77.zip |
[Bug-61354] fix issue with extracting text from Word docs. This closes #66
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1803250 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java | 44 | ||||
-rw-r--r-- | src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java | 10 | ||||
-rw-r--r-- | test-data/document/MultipleBodyBug.docx | bin | 0 -> 102103 bytes |
4 files changed, 37 insertions, 18 deletions
diff --git a/.gitignore b/.gitignore index 2ebc54416b..103731746e 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,4 @@ sonar/*/target .ant-targets-build.xml build dist +lib/ diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java index 081b751cd3..01463c64a5 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java +++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFDocument.java @@ -156,26 +156,34 @@ public class XWPFDocument extends POIXMLDocument implements Document, IBody { // parse the document with cursor and add // the XmlObject to its lists - XmlCursor cursor = ctDocument.getBody().newCursor(); - cursor.selectPath("./*"); - while (cursor.toNextSelection()) { - XmlObject o = cursor.getObject(); - if (o instanceof CTP) { - XWPFParagraph p = new XWPFParagraph((CTP) o, this); - bodyElements.add(p); - paragraphs.add(p); - } else if (o instanceof CTTbl) { - XWPFTable t = new XWPFTable((CTTbl) o, this); - bodyElements.add(t); - tables.add(t); - } else if (o instanceof CTSdtBlock) { - XWPFSDT c = new XWPFSDT((CTSdtBlock) o, this); - bodyElements.add(c); - contentControls.add(c); + XmlCursor docCursor = ctDocument.newCursor(); + docCursor.selectPath("./*"); + while (docCursor.toNextSelection()) { + XmlObject o = docCursor.getObject(); + if (o instanceof CTBody) { + XmlCursor bodyCursor = o.newCursor(); + bodyCursor.selectPath("./*"); + while (bodyCursor.toNextSelection()) { + XmlObject bodyObj = bodyCursor.getObject(); + if (bodyObj instanceof CTP) { + XWPFParagraph p = new XWPFParagraph((CTP) bodyObj, + this); + bodyElements.add(p); + paragraphs.add(p); + } else if (bodyObj instanceof CTTbl) { + XWPFTable t = new XWPFTable((CTTbl) bodyObj, this); + bodyElements.add(t); + tables.add(t); + } else if (bodyObj instanceof CTSdtBlock) { + XWPFSDT c = new XWPFSDT((CTSdtBlock) bodyObj, this); + bodyElements.add(c); + contentControls.add(c); + } + } + 
bodyCursor.dispose(); } } - cursor.dispose(); - + docCursor.dispose(); // Sort out headers and footers if (doc.getDocument().getBody().getSectPr() != null) headerFooterPolicy = new XWPFHeaderFooterPolicy(this); diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index f8e4e49410..b83b27d737 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -411,4 +411,14 @@ public class TestXWPFWordExtractor extends TestCase { "In Sequence:\n|X||_||X|\n", extractor.getText()); extractor.close(); } + + public void testMultipleBodyBug() throws IOException { + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("MultipleBodyBug.docx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc); + assertEquals("START BODY 1 The quick, brown fox jumps over a lazy dog. END BODY 1.\n" + + "START BODY 2 The quick, brown fox jumps over a lazy dog. END BODY 2.\n" + + "START BODY 3 The quick, brown fox jumps over a lazy dog. END BODY 3.\n", + extractor.getText()); + extractor.close(); + } } diff --git a/test-data/document/MultipleBodyBug.docx b/test-data/document/MultipleBodyBug.docx Binary files differ new file mode 100644 index 0000000000..84f795b26f --- /dev/null +++ b/test-data/document/MultipleBodyBug.docx |