diff options
author | Nick Burch <nick@apache.org> | 2008-08-09 10:45:19 +0000 |
---|---|---|
committer | Nick Burch <nick@apache.org> | 2008-08-09 10:45:19 +0000 |
commit | 0153f65092cf1c2df060c846821088fe61a9c5f6 (patch) | |
tree | dab5cc21911d9f4eb0f1ebe6ade6b04185f6fbb9 /src | |
parent | c7cdf484614271681c26503f2c68eef9c4d2c539 (diff) | |
download | poi-0153f65092cf1c2df060c846821088fe61a9c5f6.tar.gz poi-0153f65092cf1c2df060c846821088fe61a9c5f6.zip |
Patch from bug #45592 - improve xwpf text extraction
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684219 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
4 files changed, 21 insertions, 2 deletions
```diff
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml
index 60cfd0f9f3..ac0604d774 100644
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -37,6 +37,7 @@
 
 <!-- Don't forget to update status.xml too! -->
         <release version="3.5.1-beta2" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
            <action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
            <action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
            <action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action>
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index bc6acc93c2..89114557f1 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
 <!-- Don't forget to update changes.xml too! -->
     <changes>
         <release version="3.5.1-beta2" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
            <action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
            <action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
            <action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action>
diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
index 813ae140d4..09fe099e82 100644
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
@@ -89,6 +89,20 @@ public class XWPFParagraph extends XMLParagraph
      * including text from pictures in it.
      */
     public String getText() {
+        return getParagraphText() + getPictureText();
+    }
+    /**
+     * Returns the text of the paragraph, but not
+     * of any objects in the paragraph
+     */
+    public String getParagraphText() {
         return text.toString();
     }
+    /**
+     * Returns any text from any suitable
+     * pictures in the paragraph
+     */
+    public String getPictureText() {
+        return pictureText.toString();
+    }
 }
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
index e62dd66ce7..f29d09d02f 100644
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -117,9 +117,12 @@ public class TestXWPFWordExtractor extends TestCase {
         assertTrue(text.startsWith(
                 " \n(V) ILLUSTRATIVE CASES\n\n"
         ));
-        assertTrue(text.endsWith(
+        assertTrue(text.contains(
                 "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
         ));
+        assertTrue(text.endsWith(
+                "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n"
+        ));
 
         // Check number of paragraphs
         int ps = 0;
@@ -127,7 +130,7 @@ public class TestXWPFWordExtractor extends TestCase {
         for (int i = 0; i < t.length; i++) {
             if(t[i] == '\n') { ps++; }
         }
-        assertEquals(79, ps);
+        assertEquals(103, ps);
     }
 
     public void testGetWithHyperlinks() throws Exception {
```