diff options
author | Nick Burch <nick@apache.org> | 2008-08-09 22:08:34 +0000 |
---|---|---|
committer | Nick Burch <nick@apache.org> | 2008-08-09 22:08:34 +0000 |
commit | a1f745fee36120699c40d926c45d836024b674ff (patch) | |
tree | f360b3d5be2a0eb5a45f352b9974e97eac5c7d70 /src | |
parent | 2eaf0ec59afdc0a3aa91e8327b6b08e6bcbd2c62 (diff) | |
download | poi-a1f745fee36120699c40d926c45d836024b674ff.tar.gz poi-a1f745fee36120699c40d926c45d836024b674ff.zip |
Add header/footer support to HWPF WordExtractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@684362 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
4 files changed, 148 insertions, 0 deletions
diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index a18332829a..9c9a4b702f 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,9 @@ <!-- Don't forget to update status.xml too! --> <release version="3.1.1-alpha1" date="2008-??-??"> + <action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action> + <action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action> + <action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action> <action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action> <action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action> <action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action> diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index b0bc9bb21e..6170314752 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,9 @@ <!-- Don't forget to update changes.xml too! --> <changes> <release version="3.1.1-alpha1" date="2008-??-??"> + <action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action> + <action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action> + <action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. 
Should avoid some odd behaviour when manipulating unicode text</action> <action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action> <action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action> <action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action> diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index d73aa337c0..63c6a18ca7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -25,6 +25,7 @@ import java.util.Iterator; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.usermodel.HeaderStories; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -116,6 +117,65 @@ public class WordExtractor extends POIOLE2TextExtractor { } /** + * Add the header/footer text, if it's not empty + */ + private void appendHeaderFooter(String text, StringBuffer out) { + if(text == null || text.length() == 0) + return; + + text = text.replace('\r', '\n'); + if(! 
text.endsWith("\n")) { + out.append(text); + out.append('\n'); + return; + } + if(text.endsWith("\n\n")) { + out.append(text.substring(0, text.length()-1)); + return; + } + out.append(text); + return; + } + /** + * Grab the text from the headers + */ + public String getHeaderText() { + HeaderStories hs = new HeaderStories(doc); + + StringBuffer ret = new StringBuffer(); + if(hs.getFirstHeader() != null) { + appendHeaderFooter(hs.getFirstHeader(), ret); + } + if(hs.getEvenHeader() != null) { + appendHeaderFooter(hs.getEvenHeader(), ret); + } + if(hs.getOddHeader() != null) { + appendHeaderFooter(hs.getOddHeader(), ret); + } + + return ret.toString(); + } + /** + * Grab the text from the footers + */ + public String getFooterText() { + HeaderStories hs = new HeaderStories(doc); + + StringBuffer ret = new StringBuffer(); + if(hs.getFirstFooter() != null) { + appendHeaderFooter(hs.getFirstFooter(), ret); + } + if(hs.getEvenFooter() != null) { + appendHeaderFooter(hs.getEvenFooter(), ret); + } + if(hs.getOddFooter() != null) { + appendHeaderFooter(hs.getOddFooter(), ret); + } + + return ret.toString(); + } + + /** * Grab the text out of the text pieces. Might also include various * bits of crud, but will work in cases where the text piece -> paragraph * mapping is broken. Fast too. 
@@ -158,10 +218,16 @@ public class WordExtractor extends POIOLE2TextExtractor { */ public String getText() { StringBuffer ret = new StringBuffer(); + + ret.append(getHeaderText()); + String[] text = getParagraphText(); for(int i=0; i<text.length; i++) { ret.append(text[i]); } + + ret.append(getFooterText()); + return ret.toString(); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index 06dae46dff..704b4d4dd2 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -55,6 +55,11 @@ public class TestWordExtractor extends TestCase { // A word doc embeded in an excel file private String filename3; + // With header and footer + private String filename4; + // With unicode header and footer + private String filename5; + protected void setUp() throws Exception { String dirname = System.getProperty("HWPF.testdata.path"); String pdirname = System.getProperty("POIFS.testdata.path"); @@ -62,6 +67,9 @@ public class TestWordExtractor extends TestCase { String filename = dirname + "/test2.doc"; String filename2 = dirname + "/test.doc"; filename3 = pdirname + "/excel_with_embeded.xls"; + filename4 = dirname + "/ThreeColHeadFoot.doc"; + filename5 = dirname + "/HeaderFooterUnicode.doc"; + extractor = new WordExtractor(new FileInputStream(filename)); extractor2 = new WordExtractor(new FileInputStream(filename2)); @@ -149,4 +157,72 @@ public class TestWordExtractor extends TestCase { assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle()); assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject()); } + + public void testWithHeader() throws Exception { + // Non-unicode + HWPFDocument doc = new HWPFDocument( + new FileInputStream(filename4) + ); + extractor = new WordExtractor(doc); + + 
assertEquals( + "First header column!\tMid header Right header!\n", + extractor.getHeaderText() + ); + + String text = extractor.getText(); + assertTrue( + text.indexOf("First header column!") > -1 + ); + + + // Unicode + doc = new HWPFDocument( + new FileInputStream(filename5) + ); + extractor = new WordExtractor(doc); + + assertEquals( + "\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n", + extractor.getHeaderText() + ); + text = extractor.getText(); + assertTrue( + text.indexOf("This is a simple header") > -1 + ); + } + + public void testWithFooter() throws Exception { + // Non-unicode + HWPFDocument doc = new HWPFDocument( + new FileInputStream(filename4) + ); + extractor = new WordExtractor(doc); + + assertEquals( + "Footer Left\tFooter Middle Footer Right\n", + extractor.getFooterText() + ); + + String text = extractor.getText(); + assertTrue( + text.indexOf("Footer Left") > -1 + ); + + + // Unicode + doc = new HWPFDocument( + new FileInputStream(filename5) + ); + extractor = new WordExtractor(doc); + + assertEquals( + "\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n", + extractor.getFooterText() + ); + text = extractor.getText(); + assertTrue( + text.indexOf("The footer, with") > -1 + ); + } } |