source.dussan.org Git - poi.git/commitdiff
Patch from bug #45592 - improve xwpf text extraction
author Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 10:45:19 +0000 (10:45 +0000)
committer Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 10:45:19 +0000 (10:45 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684219 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/changes.xml
src/documentation/content/xdocs/status.xml
src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFParagraph.java
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java

index 60cfd0f9f3f87601ad6bb2ff6936eb98f865defd..ac0604d774912af458b912d48e022cebf70a1cfb 100644 (file)
@@ -37,6 +37,7 @@
 
                <!-- Don't forget to update status.xml too! -->
         <release version="3.5.1-beta2" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
            <action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
            <action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
            <action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action>
index bc6acc93c2e52954f9e37d87cd12d3cac3134258..89114557f1b6184dba7f1ebbcefbef174d5ae979 100644 (file)
@@ -34,6 +34,7 @@
        <!-- Don't forget to update changes.xml too! -->
     <changes>
         <release version="3.5.1-beta2" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
            <action dev="POI-DEVELOPERS" type="add">45545 - Improve XSLF usermodel support, and include XSLF comments in extracted text</action>
            <action dev="POI-DEVELOPERS" type="add">45540 - Fix XSSF header and footer support, and include headers and footers in the output of XSSFExcelExtractor</action>
            <action dev="POI-DEVELOPERS" type="add">45431 - Support for .xlsm files, sufficient for simple files to be loaded by excel without warning</action>
index 813ae140d41fe4d5152232dd59cae982a6c22f10..09fe099e8235c3c7b1241c1113374acef2e998ad 100644 (file)
@@ -89,6 +89,20 @@ public class XWPFParagraph extends XMLParagraph
      *  including text from pictures in it.
      */
     public String getText() {
+        return getParagraphText() + getPictureText();
+    }
+    /**
+     * Returns the text of the paragraph, but not
+     *  of any objects in the paragraph
+     */
+    public String getParagraphText() {
         return text.toString();
     }
+    /**
+     * Returns any text from any suitable
+     *  pictures in the paragraph
+     */
+    public String getPictureText() {
+       return pictureText.toString();
+    }
 }
index e62dd66ce7315166a0b98b202628f45d65fec1df..f29d09d02fc180ce1ae3057a03b4104eb112371b 100644 (file)
@@ -117,9 +117,12 @@ public class TestXWPFWordExtractor extends TestCase {
                assertTrue(text.startsWith(
                                "  \n(V) ILLUSTRATIVE CASES\n\n"
                ));
-               assertTrue(text.endsWith(
+               assertTrue(text.contains(
                                "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
                ));
+               assertTrue(text.endsWith(
+                               "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\t\n\n"
+               ));
                
                // Check number of paragraphs
                int ps = 0;
@@ -127,7 +130,7 @@ public class TestXWPFWordExtractor extends TestCase {
                for (int i = 0; i < t.length; i++) {
                        if(t[i] == '\n') { ps++; }
                }
-               assertEquals(79, ps);
+               assertEquals(103, ps);
        }
        
        public void testGetWithHyperlinks() throws Exception {