about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author Nick Burch <nick@apache.org> 2008-08-09 10:45:19 +0000
committer Nick Burch <nick@apache.org> 2008-08-09 10:45:19 +0000
commit 0153f65092cf1c2df060c846821088fe61a9c5f6 (patch)
tree dab5cc21911d9f4eb0f1ebe6ade6b04185f6fbb9 /src
parent c7cdf484614271681c26503f2c68eef9c4d2c539 (diff)
download poi-0153f65092cf1c2df060c846821088fe61a9c5f6.tar.gz
poi-0153f65092cf1c2df060c846821088fe61a9c5f6.zip
Patch from bug #45592 - improve xwpf text extraction
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684219 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
-rw-r--r-- src/documentation/content/xdocs/changes.xml 1
-rw-r--r-- src/documentation/content/xdocs/status.xml 1
-rw-r--r-- src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java 14
-rw-r--r-- src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java 7
4 files changed, 21 insertions, 2 deletions
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml
index 60cfd0f9f3..ac0604d774 100644
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! -->
<release version="3.5.1-beta2" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
<action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action>
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index bc6acc93c2..89114557f1 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.5.1-beta2" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
<action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
<action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
<action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action>
diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
index 813ae140d4..09fe099e82 100644
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
@@ -89,6 +89,20 @@ public class XWPFParagraph extends XMLParagraph
* including text from pictures in it.
*/
public String getText() {
+ return getParagraphText() + getPictureText();
+ }
+ /**
+ * Returns the text of the paragraph, but not
+ * of any objects in the paragraph
+ */
+ public String getParagraphText() {
return text.toString();
}
+ /**
+ * Returns any text from any suitable
+ * pictures in the paragraph
+ */
+ public String getPictureText() {
+ return pictureText.toString();
+ }
}
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
index e62dd66ce7..f29d09d02f 100644
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -117,9 +117,12 @@ public class TestXWPFWordExtractor extends TestCase {
assertTrue(text.startsWith(
" \n(V) ILLUSTRATIVE CASES\n\n"
));
- assertTrue(text.endsWith(
+ assertTrue(text.contains(
"As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
));
+ assertTrue(text.endsWith(
+ "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n"
+ ));
// Check number of paragraphs
int ps = 0;
@@ -127,7 +130,7 @@ public class TestXWPFWordExtractor extends TestCase {
for (int i = 0; i < t.length; i++) {
if(t[i] == '\n') { ps++; }
}
- assertEquals(79, ps);
+ assertEquals(103, ps);
}
public void testGetWithHyperlinks() throws Exception {