Have XWPFWordExtractor extract headers and footers

author Nick Burch <nick@apache.org>

Sat, 9 Aug 2008 15:08:11 +0000 (15:08 +0000)

committer Nick Burch <nick@apache.org>

Sat, 9 Aug 2008 15:08:11 +0000 (15:08 +0000)
author Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 15:08:11 +0000 (15:08 +0000)
committer Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 15:08:11 +0000 (15:08 +0000)
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml

index f1dd05950fe9fbe0724a4fb6399d7cd5aae9293a..8b9af3340656566f1774fe2b795782e082524391 100644 (file)
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -37,6 +37,7 @@
  
                 <!-- Don't forget to update status.xml too! -->
          <release version="3.5.1-beta2" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
             <action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
             <action dev="POI-DEVELOPERS" type="add">Support in XWPF handles headers and footers</action>
             <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 60ad6c09218a65c584eca53fbffa9508ec18fc6c..62b1dc4e4fc3305e11d80dff773ae82b58f6f1a4 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
         <!-- Don't forget to update changes.xml too! -->
      <changes>
          <release version="3.5.1-beta2" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
             <action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
             <action dev="POI-DEVELOPERS" type="add">Support in XWPF handles headers and footers</action>
             <action dev="POI-DEVELOPERS" type="add">45592 - Improve XWPF text extraction to include tables always, and picture text where possible</action>
diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java

index 64c8e3f7804fb31c8c86274d8ef98eba043a9e0e..14031ebc81f4b4d2b07dc884ce939073b4d448c2 100644 (file)
--- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
@@ -23,6 +23,7 @@ import org.apache.poi.POIXMLDocument;
  import org.apache.poi.POIXMLTextExtractor;
  import org.apache.poi.xwpf.XWPFDocument;
  import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
+import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
  import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
  import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
  import org.apache.poi.xwpf.usermodel.XWPFParagraph;
@@ -70,21 +71,46 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
         
         public String getText() {
                 StringBuffer text = new StringBuffer();
+               XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
                 
-                       
+               // Start out with all headers
+               // TODO - put them in where they're needed
+               if(hfPolicy.getFirstPageHeader() != null) {
+                       text.append( hfPolicy.getFirstPageHeader().getText() );
+               }
+               if(hfPolicy.getEvenPageHeader() != null) {
+                       text.append( hfPolicy.getEvenPageHeader().getText() );
+               }
+               if(hfPolicy.getDefaultHeader() != null) {
+                       text.append( hfPolicy.getDefaultHeader().getText() );
+               }
+               
+               // First up, all our paragraph based text
                 Iterator<XWPFParagraph> i = document.getParagraphsIterator();
                 while(i.hasNext()) {
                         XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
                                         new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
                         text.append(decorator.getText()+"\n");
                 }
-                       
+
+               // Then our table based text
                 Iterator<XWPFTable> j = document.getTablesIterator();
-               while(j.hasNext())
-               {
+               while(j.hasNext()) {
                         text.append(j.next().getText()+"\n");
                 }
                 
+               // Finish up with all the footers
+               // TODO - put them in where they're needed
+               if(hfPolicy.getFirstPageFooter() != null) {
+                       text.append( hfPolicy.getFirstPageFooter().getText() );
+               }
+               if(hfPolicy.getEvenPageFooter() != null) {
+                       text.append( hfPolicy.getEvenPageFooter().getText() );
+               }
+               if(hfPolicy.getDefaultFooter() != null) {
+                       text.append( hfPolicy.getDefaultFooter().getText() );
+               }
+               
                 return text.toString();
         }
  }
diff --git a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java

index 36de2291936c97f2db476ae45118d8bf76d2ee86..708944cf7ea9d488c3ecb56be300bfb58804ec71 100644 (file)
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java
@@ -81,15 +81,21 @@ public abstract class XWPFHeaderFooter {
                 XWPFParagraph[] paras = getParagraphs();
                 for(int i=0; i<paras.length; i++) {
                         if(! paras[i].isEmpty()) {
-                               t.append(paras[i].getText());
-                               t.append('\n');
+                               String text = paras[i].getText();
+                               if(text != null && text.length() > 0) {
+                                       t.append(text);
+                                       t.append('\n');
+                               }
                         }
                 }
                 
                 XWPFTable[] tables = getTables();
                 for(int i=0; i<tables.length; i++) {
-                       t.append(tables[i].getText());
-                       t.append('\n');
+                       String text = tables[i].getText();
+                       if(text != null && text.length() > 0) {
+                               t.append(text);
+                               t.append('\n');
+                       }
                 }
                 
                 return t.toString(); 
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java

index 1b26bb58ac3a066e896ed10faf4b9e86ddeb1416..8fc83bc91e1bf4f8eb4fcb512503bb64b9c7e1e5 100644 (file)
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -37,12 +37,22 @@ public class TestXWPFWordExtractor extends TestCase {
          */
         private XWPFDocument xmlB;
         private File fileB;
-       
         /**
-        * File with hyperlinks
+        * With a simplish header+footer
          */
         private XWPFDocument xmlC;
         private File fileC;
+       /**
+        * With different header+footer on first/rest
+        */
+       private XWPFDocument xmlD;
+       private File fileD;
+       
+       /**
+        * File with hyperlinks
+        */
+       private XWPFDocument xmlE;
+       private File fileE;
  
         protected void setUp() throws Exception {
                 super.setUp();
@@ -56,16 +66,28 @@ public class TestXWPFWordExtractor extends TestCase {
                                 File.separator + "IllustrativeCases.docx"
                 );
                 fileC = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "ThreeColHeadFoot.docx"
+               );
+               fileD = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "DiffFirstPageHeadFoot.docx"
+               );
+               fileE = new File(
                                 System.getProperty("HWPF.testdata.path") +
                                 File.separator + "TestDocument.docx"
                 );
                 assertTrue(fileA.exists());
                 assertTrue(fileB.exists());
                 assertTrue(fileC.exists());
+               assertTrue(fileD.exists());
+               assertTrue(fileE.exists());
                 
                 xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
                 xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
                 xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
+               xmlD = new XWPFDocument(POIXMLDocument.openPackage(fileD.toString()));
+               xmlE = new XWPFDocument(POIXMLDocument.openPackage(fileE.toString()));
         }
  
         /**
@@ -135,7 +157,7 @@ public class TestXWPFWordExtractor extends TestCase {
         
         public void testGetWithHyperlinks() throws Exception {
                 XWPFWordExtractor extractor = 
-                       new XWPFWordExtractor(xmlC);
+                       new XWPFWordExtractor(xmlE);
                 extractor.getText();
                 extractor.setFetchHyperlinks(true);
                 extractor.getText();
@@ -160,4 +182,47 @@ public class TestXWPFWordExtractor extends TestCase {
                                 extractor.getText()
                 );
         }
+       
+       public void testHeadersFooters() throws Exception {
+               XWPFWordExtractor extractor = 
+                       new XWPFWordExtractor(xmlC);
+               extractor.getText();
+               
+               assertEquals(
+                               "First header column!\tMid header\tRight header!\n" +
+                               "This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
+                               "\n" +
+                               "HEADING TEXT\n" + 
+                               "\n" +
+                               "More on page one\n" + 
+                               "\n\n" + 
+                               "End of page 1\n\n" +
+                               "This is page two. It also has a three column heading, and a three column footer.\n" +
+                               "Footer Left\tFooter Middle\tFooter Right\n",
+                               extractor.getText()
+               );
+               
+               
+               // Now another file, expect multiple headers
+               //  and multiple footers
+               extractor = 
+                       new XWPFWordExtractor(xmlD);
+               extractor.getText();
+               
+               assertEquals(
+                               "I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
+                               "First header column!\tMid header\tRight header!\n" +
+                               "This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
+                               "\n" +
+                               "HEADING TEXT\n" + 
+                               "\n" +
+                               "More on page one\n" + 
+                               "\n\n" + 
+                               "End of page 1\n\n" +
+                               "This is page two. It also has a three column heading, and a three column footer.\n" +
+                               "The footer of the first page\n" +
+                               "Footer Left\tFooter Middle\tFooter Right\n",
+                               extractor.getText()
+               );
+       }
  }
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java b/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java

index b2269c290842323f2fd5a89ac37ada9b3c718fc0..9d0e96a175a7488320b2e5e0ec5bda36cebaee73 100644 (file)
--- a/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java
@@ -182,12 +182,12 @@ public class TestXWPFHeaderFooterPolicy extends TestCase {
                 policy = oddEven.getHeaderFooterPolicy();
                 
                 assertEquals(
-                       "\n[]ODD Page Header text\n\n",
+                       "[]ODD Page Header text\n\n",
                         policy.getDefaultHeader().getText()
                 );
                 assertEquals(
-                               "\n[This is an Even Page, with a Header]\n\n", 
-                               policy.getEvenPageHeader().getText()
+                       "[This is an Even Page, with a Header]\n\n", 
+                       policy.getEvenPageHeader().getText()
                 );
         }
  }
author	Nick Burch <nick@apache.org>
	Sat, 9 Aug 2008 15:08:11 +0000 (15:08 +0000)
committer	Nick Burch <nick@apache.org>
	Sat, 9 Aug 2008 15:08:11 +0000 (15:08 +0000)
src/documentation/content/xdocs/changes.xml		patch | blob | history
src/documentation/content/xdocs/status.xml		patch | blob | history
src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java		patch | blob | history
src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFHeaderFooter.java		patch | blob | history
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java		patch | blob | history
src/ooxml/testcases/org/apache/poi/xwpf/model/TestXWPFHeaderFooterPolicy.java		patch | blob | history