Add header/footer support to HWPF WordExtractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@684362 13f79535-47bb-0310-9956-ffa450edef68
author: Nick Burch <nick@apache.org> 2008-08-09 22:08:34 +0000
committer: Nick Burch <nick@apache.org> 2008-08-09 22:08:34 +0000
commit: a1f745fee36120699c40d926c45d836024b674ff (patch)
tree: f360b3d5be2a0eb5a45f352b9974e97eac5c7d70 /src
parent: 2eaf0ec59afdc0a3aa91e8327b6b08e6bcbd2c62 (diff)
download: poi-a1f745fee36120699c40d926c45d836024b674ff.tar.gz
poi-a1f745fee36120699c40d926c45d836024b674ff.zip
4 files changed, 148 insertions, 0 deletions
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml
index a18332829a..9c9a4b702f 100644
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@@ -37,6 +37,9 @@
 
 		<!-- Don't forget to update status.xml too! -->
         <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
+           <action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
+           <action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
            <action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
            <action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
            <action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index b0bc9bb21e..6170314752 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,9 @@
 	<!-- Don't forget to update changes.xml too! -->
     <changes>
         <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
+           <action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
+           <action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
            <action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
            <action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
            <action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
index d73aa337c0..63c6a18ca7 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -25,6 +25,7 @@ import java.util.Iterator;
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -116,6 +117,65 @@ public class WordExtractor extends POIOLE2TextExtractor {
 	}
 	
 	/**
+	 * Append one header/footer story to out with \r turned into \n; null or empty text is skipped
+	 */
+	private void appendHeaderFooter(String text, StringBuffer out) {
+		if(text == null || text.length() == 0)
+			return; // nothing to add
+
+		text = text.replace('\r', '\n'); // from here on \n is the only line break in text
+		if(! text.endsWith("\n")) {
+			out.append(text);
+			out.append('\n'); // make sure the appended story ends with a newline
+			return;
+		}
+		if(text.endsWith("\n\n")) {
+			out.append(text.substring(0, text.length()-1)); // drop only the very last newline
+			return;
+		}
+		out.append(text); // ends in a single newline already, append unchanged
+		return;
+	}
+	/**
+	 * Grab the text from the first, even and odd headers (in that order); "" if none are present
+	 */
+	public String getHeaderText() {
+		HeaderStories hs = new HeaderStories(doc); // built afresh on every call
+		
+		StringBuffer ret = new StringBuffer();
+		if(hs.getFirstHeader() != null) {
+			appendHeaderFooter(hs.getFirstHeader(), ret);
+		}
+		if(hs.getEvenHeader() != null) {
+			appendHeaderFooter(hs.getEvenHeader(), ret);
+		}
+		if(hs.getOddHeader() != null) {
+			appendHeaderFooter(hs.getOddHeader(), ret);
+		}
+		
+		return ret.toString();
+	}
+	/**
+	 * Grab the text from the first, even and odd footers (in that order); "" if none are present
+	 */
+	public String getFooterText() {
+		HeaderStories hs = new HeaderStories(doc); // built afresh on every call
+		
+		StringBuffer ret = new StringBuffer();
+		if(hs.getFirstFooter() != null) {
+			appendHeaderFooter(hs.getFirstFooter(), ret);
+		}
+		if(hs.getEvenFooter() != null) {
+			appendHeaderFooter(hs.getEvenFooter(), ret);
+		}
+		if(hs.getOddFooter() != null) {
+			appendHeaderFooter(hs.getOddFooter(), ret);
+		}
+		
+		return ret.toString();
+	}
+	
+	/**
 	 * Grab the text out of the text pieces. Might also include various
 	 *  bits of crud, but will work in cases where the text piece -> paragraph
 	 *  mapping is broken. Fast too.
@@ -158,10 +218,16 @@ public class WordExtractor extends POIOLE2TextExtractor {
 	 */
 	public String getText() {
 		StringBuffer ret = new StringBuffer();
+		
+		ret.append(getHeaderText()); // all header text goes before the paragraph text
+		
 		String[] text = getParagraphText();
 		for(int i=0; i<text.length; i++) {
 			ret.append(text[i]);
 		}
+		
+		ret.append(getFooterText()); // all footer text goes after the paragraph text
+		
 		return ret.toString();
 	}
 }
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
index 06dae46dff..704b4d4dd2 100644
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -55,6 +55,11 @@ public class TestWordExtractor extends TestCase {
 	// A word doc embeded in an excel file
 	private String filename3;
 	
+	// With header and footer
+	private String filename4;
+	// With unicode header and footer
+	private String filename5;
+	
     protected void setUp() throws Exception {
 		String dirname = System.getProperty("HWPF.testdata.path");
 		String pdirname = System.getProperty("POIFS.testdata.path");
@@ -62,6 +67,9 @@ public class TestWordExtractor extends TestCase {
 		String filename = dirname + "/test2.doc";
 		String filename2 = dirname + "/test.doc";
 		filename3 = pdirname + "/excel_with_embeded.xls";
+		filename4 = dirname + "/ThreeColHeadFoot.doc";
+		filename5 = dirname + "/HeaderFooterUnicode.doc";
+		
 		extractor = new WordExtractor(new FileInputStream(filename));
 		extractor2 = new WordExtractor(new FileInputStream(filename2));
 		
@@ -149,4 +157,72 @@ public class TestWordExtractor extends TestCase {
     	assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
     	assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
     }
+    
+    public void testWithHeader() throws Exception { // checks getHeaderText() and that getText() includes it
+    	// Non-unicode document (filename4)
+    	HWPFDocument doc = new HWPFDocument(
+    			new FileInputStream(filename4)
+    	);
+    	extractor = new WordExtractor(doc);
+    	
+    	assertEquals(
+    			"First header column!\tMid header Right header!\n",
+    			extractor.getHeaderText()
+    	);
+    	
+    	String text = extractor.getText();
+    	assertTrue(
+    			text.indexOf("First header column!") > -1 // header is part of the full text
+    	);
+    	
+    	
+    	// Unicode document (filename5)
+    	doc = new HWPFDocument(
+    			new FileInputStream(filename5)
+    	);
+    	extractor = new WordExtractor(doc);
+    	
+    	assertEquals(
+    			"\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n", // \u20ac is the euro sign
+    			extractor.getHeaderText()
+    	);
+    	text = extractor.getText();
+    	assertTrue(
+    			text.indexOf("This is a simple header") > -1 // header is part of the full text
+    	);
+    }
+    
+    public void testWithFooter() throws Exception { // checks getFooterText() and that getText() includes it
+    	// Non-unicode document (filename4)
+    	HWPFDocument doc = new HWPFDocument(
+    			new FileInputStream(filename4)
+    	);
+    	extractor = new WordExtractor(doc);
+    	
+    	assertEquals(
+    			"Footer Left\tFooter Middle Footer Right\n",
+    			extractor.getFooterText()
+    	);
+    	
+    	String text = extractor.getText();
+    	assertTrue(
+    			text.indexOf("Footer Left") > -1 // footer is part of the full text
+    	);
+    	
+    	
+    	// Unicode document (filename5)
+    	doc = new HWPFDocument(
+    			new FileInputStream(filename5)
+    	);
+    	extractor = new WordExtractor(doc);
+    	
+    	assertEquals(
+    			"\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n", // \u00e8 is e with a grave accent
+    			extractor.getFooterText()
+    	);
+    	text = extractor.getText();
+    	assertTrue(
+    			text.indexOf("The footer, with") > -1 // footer is part of the full text
+    	);
+    }
 }
author	Nick Burch <nick@apache.org>	2008-08-09 22:08:34 +0000
committer	Nick Burch <nick@apache.org>	2008-08-09 22:08:34 +0000
commit	a1f745fee36120699c40d926c45d836024b674ff (patch)
tree	f360b3d5be2a0eb5a45f352b9974e97eac5c7d70 /src
parent	2eaf0ec59afdc0a3aa91e8327b6b08e6bcbd2c62 (diff)
download	poi-a1f745fee36120699c40d926c45d836024b674ff.tar.gz poi-a1f745fee36120699c40d926c45d836024b674ff.zip