Add header/footer support to HWPF WordExtractor

author Nick Burch <nick@apache.org>

Sat, 9 Aug 2008 22:08:34 +0000 (22:08 +0000)

committer Nick Burch <nick@apache.org>

Sat, 9 Aug 2008 22:08:34 +0000 (22:08 +0000)
author Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 22:08:34 +0000 (22:08 +0000)
committer Nick Burch <nick@apache.org>
Sat, 9 Aug 2008 22:08:34 +0000 (22:08 +0000)
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml

index a18332829ab263ff3f98cd34f438090ce64aa607..9c9a4b702f12d9be6d18e5ac0959384bba857773 100644 (file)
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -37,6 +37,9 @@
  
                 <!-- Don't forget to update status.xml too! -->
          <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
+           <action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
+           <action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
             <action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
             <action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
             <action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index b0bc9bb21e4709f1e7300c5905918b96d6cc1175..6170314752eb9c9da08c54de42b6a38662e988d7 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,9 @@
         <!-- Don't forget to update changes.xml too! -->
      <changes>
          <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
+           <action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
+           <action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
             <action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
             <action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
             <action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

index d73aa337c0752484d70efbadd269d968bae1e35a..63c6a18ca792e41bab1d3ffd5b384c4039a10eff 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -25,6 +25,7 @@ import java.util.Iterator;
  import org.apache.poi.POIOLE2TextExtractor;
  import org.apache.poi.hwpf.HWPFDocument;
  import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
  import org.apache.poi.hwpf.usermodel.Paragraph;
  import org.apache.poi.hwpf.usermodel.Range;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -115,6 +116,65 @@ public class WordExtractor extends POIOLE2TextExtractor {
                 return ret;
         }
         
+       /**
+        * Add the header/footer text, if it's not empty
+        */
+       private void appendHeaderFooter(String text, StringBuffer out) {
+               if(text == null || text.length() == 0)
+                       return;
+
+               text = text.replace('\r', '\n');
+               if(! text.endsWith("\n")) {
+                       out.append(text);
+                       out.append('\n');
+                       return;
+               }
+               if(text.endsWith("\n\n")) {
+                       out.append(text.substring(0, text.length()-1));
+                       return;
+               }
+               out.append(text);
+               return;
+       }
+       /**
+        * Grab the text from the headers
+        */
+       public String getHeaderText() {
+               HeaderStories hs = new HeaderStories(doc);
+               
+               StringBuffer ret = new StringBuffer();
+               if(hs.getFirstHeader() != null) {
+                       appendHeaderFooter(hs.getFirstHeader(), ret);
+               }
+               if(hs.getEvenHeader() != null) {
+                       appendHeaderFooter(hs.getEvenHeader(), ret);
+               }
+               if(hs.getOddHeader() != null) {
+                       appendHeaderFooter(hs.getOddHeader(), ret);
+               }
+               
+               return ret.toString();
+       }
+       /**
+        * Grab the text from the footers
+        */
+       public String getFooterText() {
+               HeaderStories hs = new HeaderStories(doc);
+               
+               StringBuffer ret = new StringBuffer();
+               if(hs.getFirstFooter() != null) {
+                       appendHeaderFooter(hs.getFirstFooter(), ret);
+               }
+               if(hs.getEvenFooter() != null) {
+                       appendHeaderFooter(hs.getEvenFooter(), ret);
+               }
+               if(hs.getOddFooter() != null) {
+                       appendHeaderFooter(hs.getOddFooter(), ret);
+               }
+               
+               return ret.toString();
+       }
+       
         /**
          * Grab the text out of the text pieces. Might also include various
          *  bits of crud, but will work in cases where the text piece -> paragraph
@@ -158,10 +218,16 @@ public class WordExtractor extends POIOLE2TextExtractor {
          */
         public String getText() {
                 StringBuffer ret = new StringBuffer();
+               
+               ret.append(getHeaderText());
+               
                 String[] text = getParagraphText();
                 for(int i=0; i<text.length; i++) {
                         ret.append(text[i]);
                 }
+               
+               ret.append(getFooterText());
+               
                 return ret.toString();
         }
  }
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

index 06dae46dffad4d7ffd04e19b634e45de7b5b977b..704b4d4dd26793f91b11caa9b6030c0a244bb03e 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -55,6 +55,11 @@ public class TestWordExtractor extends TestCase {
         // A word doc embeded in an excel file
         private String filename3;
         
+       // With header and footer
+       private String filename4;
+       // With unicode header and footer
+       private String filename5;
+       
      protected void setUp() throws Exception {
                 String dirname = System.getProperty("HWPF.testdata.path");
                 String pdirname = System.getProperty("POIFS.testdata.path");
@@ -62,6 +67,9 @@ public class TestWordExtractor extends TestCase {
                 String filename = dirname + "/test2.doc";
                 String filename2 = dirname + "/test.doc";
                 filename3 = pdirname + "/excel_with_embeded.xls";
+               filename4 = dirname + "/ThreeColHeadFoot.doc";
+               filename5 = dirname + "/HeaderFooterUnicode.doc";
+               
                 extractor = new WordExtractor(new FileInputStream(filename));
                 extractor2 = new WordExtractor(new FileInputStream(filename2));
                 
@@ -149,4 +157,72 @@ public class TestWordExtractor extends TestCase {
         assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
         assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
      }
+    
+    public void testWithHeader() throws Exception {
+       // Non-unicode
+       HWPFDocument doc = new HWPFDocument(
+                       new FileInputStream(filename4)
+       );
+       extractor = new WordExtractor(doc);
+       
+       assertEquals(
+                       "First header column!\tMid header Right header!\n",
+                       extractor.getHeaderText()
+       );
+       
+       String text = extractor.getText();
+       assertTrue(
+                       text.indexOf("First header column!") > -1
+       );
+       
+       
+       // Unicode
+       doc = new HWPFDocument(
+                       new FileInputStream(filename5)
+       );
+       extractor = new WordExtractor(doc);
+       
+       assertEquals(
+                       "\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n",
+                       extractor.getHeaderText()
+       );
+       text = extractor.getText();
+       assertTrue(
+                       text.indexOf("This is a simple header") > -1
+       );
+    }
+    
+    public void testWithFooter() throws Exception {
+       // Non-unicode
+       HWPFDocument doc = new HWPFDocument(
+                       new FileInputStream(filename4)
+       );
+       extractor = new WordExtractor(doc);
+       
+       assertEquals(
+                       "Footer Left\tFooter Middle Footer Right\n",
+                       extractor.getFooterText()
+       );
+       
+       String text = extractor.getText();
+       assertTrue(
+                       text.indexOf("Footer Left") > -1
+       );
+       
+       
+       // Unicode
+       doc = new HWPFDocument(
+                       new FileInputStream(filename5)
+       );
+       extractor = new WordExtractor(doc);
+       
+       assertEquals(
+                       "\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n",
+                       extractor.getFooterText()
+       );
+       text = extractor.getText();
+       assertTrue(
+                       text.indexOf("The footer, with") > -1
+       );
+    }
  }
author	Nick Burch <nick@apache.org>
	Sat, 9 Aug 2008 22:08:34 +0000 (22:08 +0000)
committer	Nick Burch <nick@apache.org>
	Sat, 9 Aug 2008 22:08:34 +0000 (22:08 +0000)
src/documentation/content/xdocs/changes.xml		patch \| blob \| history
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java		patch \| blob \| history