source.dussan.org Git - poi.git/commitdiff
Have XWPFWordExtractor extract headers and footers
author Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 15:08:11 +0000 (15:08 +0000)
committer Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 15:08:11 +0000 (15:08 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@684276 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/changes.xml
src/documentation/content/xdocs/status.xml
src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java

index f1dd05950fe9fbe0724a4fb6399d7cd5aae9293a..8b9af3340656566f1774fe2b795782e082524391 100644 (file)
@@ -37,6 +37,7 @@
 
                <!-- Don't forget to update status.xml too! -->
         <release version="3.5.1-beta2" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
            <action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
            <action dev="POI-DEVELOPERS" type="add">Support in XWPF handles headers and footers</action>
            <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
index 60ad6c09218a65c584eca53fbffa9508ec18fc6c..62b1dc4e4fc3305e11d80dff773ae82b58f6f1a4 100644 (file)
@@ -34,6 +34,7 @@
        <!-- Don't forget to update changes.xml too! -->
     <changes>
         <release version="3.5.1-beta2" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
            <action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
            <action dev="POI-DEVELOPERS" type="add">Support in XWPF handles headers and footers</action>
            <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
index 64c8e3f7804fb31c8c86274d8ef98eba043a9e0e..14031ebc81f4b4d2b07dc884ce939073b4d448c2 100644 (file)
@@ -23,6 +23,7 @@ import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.xwpf.XWPFDocument;
 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
+import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
 import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
 import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
@@ -70,21 +71,46 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
        
        public String getText() {
                StringBuffer text = new StringBuffer();
+               XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
                
-                       
+               // Start out with all headers
+               // TODO - put them in where they're needed
+               if(hfPolicy.getFirstPageHeader() != null) {
+                       text.append( hfPolicy.getFirstPageHeader().getText() );
+               }
+               if(hfPolicy.getEvenPageHeader() != null) {
+                       text.append( hfPolicy.getEvenPageHeader().getText() );
+               }
+               if(hfPolicy.getDefaultHeader() != null) {
+                       text.append( hfPolicy.getDefaultHeader().getText() );
+               }
+               
+               // First up, all our paragraph based text
                Iterator<XWPFParagraph> i = document.getParagraphsIterator();
                while(i.hasNext()) {
                        XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
                                        new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
                        text.append(decorator.getText()+"\n");
                }
-                       
+
+               // Then our table based text
                Iterator<XWPFTable> j = document.getTablesIterator();
-               while(j.hasNext())
-               {
+               while(j.hasNext()) {
                        text.append(j.next().getText()+"\n");
                }
                
+               // Finish up with all the footers
+               // TODO - put them in where they're needed
+               if(hfPolicy.getFirstPageFooter() != null) {
+                       text.append( hfPolicy.getFirstPageFooter().getText() );
+               }
+               if(hfPolicy.getEvenPageFooter() != null) {
+                       text.append( hfPolicy.getEvenPageFooter().getText() );
+               }
+               if(hfPolicy.getDefaultFooter() != null) {
+                       text.append( hfPolicy.getDefaultFooter().getText() );
+               }
+               
                return text.toString();
        }
 }
index 36de2291936c97f2db476ae45118d8bf76d2ee86..708944cf7ea9d488c3ecb56be300bfb58804ec71 100644 (file)
@@ -81,15 +81,21 @@ public abstract class XWPFHeaderFooter {
                XWPFParagraph[] paras = getParagraphs();
                for(int i=0; i<paras.length; i++) {
                        if(! paras[i].isEmpty()) {
-                               t.append(paras[i].getText());
-                               t.append('\n');
+                               String text = paras[i].getText();
+                               if(text != null && text.length() > 0) {
+                                       t.append(text);
+                                       t.append('\n');
+                               }
                        }
                }
                
                XWPFTable[] tables = getTables();
                for(int i=0; i<tables.length; i++) {
-                       t.append(tables[i].getText());
-                       t.append('\n');
+                       String text = tables[i].getText();
+                       if(text != null && text.length() > 0) {
+                               t.append(text);
+                               t.append('\n');
+                       }
                }
                
                return t.toString(); 
index 1b26bb58ac3a066e896ed10faf4b9e86ddeb1416..8fc83bc91e1bf4f8eb4fcb512503bb64b9c7e1e5 100644 (file)
@@ -37,12 +37,22 @@ public class TestXWPFWordExtractor extends TestCase {
         */
        private XWPFDocument xmlB;
        private File fileB;
-       
        /**
-        * File with hyperlinks
+        * With a simplish header+footer
         */
        private XWPFDocument xmlC;
        private File fileC;
+       /**
+        * With different header+footer on first/rest
+        */
+       private XWPFDocument xmlD;
+       private File fileD;
+       
+       /**
+        * File with hyperlinks
+        */
+       private XWPFDocument xmlE;
+       private File fileE;
 
        protected void setUp() throws Exception {
                super.setUp();
@@ -56,16 +66,28 @@ public class TestXWPFWordExtractor extends TestCase {
                                File.separator + "IllustrativeCases.docx"
                );
                fileC = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "ThreeColHeadFoot.docx"
+               );
+               fileD = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "DiffFirstPageHeadFoot.docx"
+               );
+               fileE = new File(
                                System.getProperty("HWPF.testdata.path") +
                                File.separator + "TestDocument.docx"
                );
                assertTrue(fileA.exists());
                assertTrue(fileB.exists());
                assertTrue(fileC.exists());
+               assertTrue(fileD.exists());
+               assertTrue(fileE.exists());
                
                xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
                xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
                xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
+               xmlD = new XWPFDocument(POIXMLDocument.openPackage(fileD.toString()));
+               xmlE = new XWPFDocument(POIXMLDocument.openPackage(fileE.toString()));
        }
 
        /**
@@ -135,7 +157,7 @@ public class TestXWPFWordExtractor extends TestCase {
        
        public void testGetWithHyperlinks() throws Exception {
                XWPFWordExtractor extractor = 
-                       new XWPFWordExtractor(xmlC);
+                       new XWPFWordExtractor(xmlE);
                extractor.getText();
                extractor.setFetchHyperlinks(true);
                extractor.getText();
@@ -160,4 +182,47 @@ public class TestXWPFWordExtractor extends TestCase {
                                extractor.getText()
                );
        }
+       
+       public void testHeadersFooters() throws Exception {
+               XWPFWordExtractor extractor = 
+                       new XWPFWordExtractor(xmlC);
+               extractor.getText();
+               
+               assertEquals(
+                               "First header column!\tMid header\tRight header!\n" +
+                               "This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
+                               "\n" +
+                               "HEADING TEXT\n" + 
+                               "\n" +
+                               "More on page one\n" + 
+                               "\n\n" + 
+                               "End of page 1\n\n" +
+                               "This is page two. It also has a three column heading, and a three column footer.\n" +
+                               "Footer Left\tFooter Middle\tFooter Right\n",
+                               extractor.getText()
+               );
+               
+               
+               // Now another file, expect multiple headers
+               //  and multiple footers
+               extractor = 
+                       new XWPFWordExtractor(xmlD);
+               extractor.getText();
+               
+               assertEquals(
+                               "I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
+                               "First header column!\tMid header\tRight header!\n" +
+                               "This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
+                               "\n" +
+                               "HEADING TEXT\n" + 
+                               "\n" +
+                               "More on page one\n" + 
+                               "\n\n" + 
+                               "End of page 1\n\n" +
+                               "This is page two. It also has a three column heading, and a three column footer.\n" +
+                               "The footer of the first page\n" +
+                               "Footer Left\tFooter Middle\tFooter Right\n",
+                               extractor.getText()
+               );
+       }
 }
index b2269c290842323f2fd5a89ac37ada9b3c718fc0..9d0e96a175a7488320b2e5e0ec5bda36cebaee73 100644 (file)
@@ -182,12 +182,12 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
                policy = oddEven.getHeaderFooterPolicy();
                
                assertEquals(
-                       "\n[]ODD Page Header text\n\n",
+                       "[]ODD Page Header text\n\n",
                        policy.getDefaultHeader().getText()
                );
                assertEquals(
-                               "\n[This is an Even Page, with a Header]\n\n", 
-                               policy.getEvenPageHeader().getText()
+                       "[This is an Even Page, with a Header]\n\n", 
+                       policy.getEvenPageHeader().getText()
                );
        }
 }