<!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
+ <action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
+ <action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
<action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
<action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
<action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="add">Include headers and footers in the extracted text from HWPF's WordExtractor</action>
+ <action dev="POI-DEVELOPERS" type="add">Added support to HWPF for headers and footers</action>
+ <action dev="POI-DEVELOPERS" type="fix">Improve how HWPF deals with unicode internally. Should avoid some odd behaviour when manipulating unicode text</action>
<action dev="POI-DEVELOPERS" type="add">45577 - Added implementations for Excel functions NOW and TODAY</action>
<action dev="POI-DEVELOPERS" type="fix">45582 - Fix for workbook streams with extra bytes trailing the EOFRecord</action>
<action dev="POI-DEVELOPERS" type="add">45537 - Include headers and footers (of slides and notes) in the extracted text from HSLF</action>
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
return ret;
}
+ /**
+  * Appends one header or footer story to the output buffer.
+  * <p>
+  * A null or empty story is ignored. Otherwise every carriage return
+  * is turned into a line feed and the story is appended so that it
+  * ends with a line feed: one is added when the story has none, and
+  * one is dropped when the story ends with two of them.
+  *
+  * @param text the raw header/footer text, may be null or empty
+  * @param out the buffer that receives the normalised text
+  */
+ private void appendHeaderFooter(String text, StringBuffer out) {
+     if (text != null && text.length() > 0) {
+         String normalised = text.replace('\r', '\n');
+
+         if (normalised.endsWith("\n\n")) {
+             // Two trailing line feeds: keep everything but the last one
+             out.append(normalised.substring(0, normalised.length() - 1));
+         } else if (normalised.endsWith("\n")) {
+             // Already terminated by a single line feed
+             out.append(normalised);
+         } else {
+             // Not terminated at all, so add the line feed ourselves
+             out.append(normalised).append('\n');
+         }
+     }
+ }
+ /**
+ * Grab the text from the headers
+ */
+ public String getHeaderText() {
+ HeaderStories hs = new HeaderStories(doc);
+
+ StringBuffer ret = new StringBuffer();
+ if(hs.getFirstHeader() != null) {
+ appendHeaderFooter(hs.getFirstHeader(), ret);
+ }
+ if(hs.getEvenHeader() != null) {
+ appendHeaderFooter(hs.getEvenHeader(), ret);
+ }
+ if(hs.getOddHeader() != null) {
+ appendHeaderFooter(hs.getOddHeader(), ret);
+ }
+
+ return ret.toString();
+ }
+ /**
+ * Grab the text from the footers
+ */
+ public String getFooterText() {
+ HeaderStories hs = new HeaderStories(doc);
+
+ StringBuffer ret = new StringBuffer();
+ if(hs.getFirstFooter() != null) {
+ appendHeaderFooter(hs.getFirstFooter(), ret);
+ }
+ if(hs.getEvenFooter() != null) {
+ appendHeaderFooter(hs.getEvenFooter(), ret);
+ }
+ if(hs.getOddFooter() != null) {
+ appendHeaderFooter(hs.getOddFooter(), ret);
+ }
+
+ return ret.toString();
+ }
+
/**
* Grab the text out of the text pieces. Might also include various
* bits of crud, but will work in cases where the text piece -> paragraph
*/
public String getText() {
StringBuffer ret = new StringBuffer();
+ // Header text goes in front of the paragraph text
+ ret.append(getHeaderText());
+
String[] text = getParagraphText();
for(int i=0; i<text.length; i++) {
ret.append(text[i]);
}
+ // Footer text goes after the paragraph text
+ ret.append(getFooterText());
+
return ret.toString();
}
}
// A word doc embeded in an excel file
private String filename3;
+ // Path to ThreeColHeadFoot.doc, a document with a header and footer (assigned in setUp)
+ private String filename4;
+ // Path to HeaderFooterUnicode.doc, with a unicode header and footer (assigned in setUp)
+ private String filename5;
+
protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path");
String pdirname = System.getProperty("POIFS.testdata.path");
String filename = dirname + "/test2.doc";
String filename2 = dirname + "/test.doc";
filename3 = pdirname + "/excel_with_embeded.xls";
+ filename4 = dirname + "/ThreeColHeadFoot.doc";
+ filename5 = dirname + "/HeaderFooterUnicode.doc";
+
extractor = new WordExtractor(new FileInputStream(filename));
extractor2 = new WordExtractor(new FileInputStream(filename2));
assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
}
+
+ /**
+  * Checks the header text of a plain and of a unicode document, both
+  * on its own and as part of the full extracted text.
+  */
+ public void testWithHeader() throws Exception {
+     // Plain (non-unicode) document
+     extractor = new WordExtractor(
+         new HWPFDocument(new FileInputStream(filename4))
+     );
+
+     String plainHeader = "First header column!\tMid header Right header!\n";
+     assertEquals(plainHeader, extractor.getHeaderText());
+
+     // The header must also show up in the overall text
+     String fullText = extractor.getText();
+     assertTrue(fullText.indexOf("First header column!") >= 0);
+
+     // Document whose header holds a unicode character
+     extractor = new WordExtractor(
+         new HWPFDocument(new FileInputStream(filename5))
+     );
+
+     String unicodeHeader =
+         "\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n";
+     assertEquals(unicodeHeader, extractor.getHeaderText());
+
+     fullText = extractor.getText();
+     assertTrue(fullText.indexOf("This is a simple header") >= 0);
+ }
+
+ /**
+  * Checks the footer text of a plain and of a unicode document, both
+  * on its own and as part of the full extracted text.
+  */
+ public void testWithFooter() throws Exception {
+     // Plain (non-unicode) document
+     extractor = new WordExtractor(
+         new HWPFDocument(new FileInputStream(filename4))
+     );
+
+     String plainFooter = "Footer Left\tFooter Middle Footer Right\n";
+     assertEquals(plainFooter, extractor.getFooterText());
+
+     // The footer must also show up in the overall text
+     String fullText = extractor.getText();
+     assertTrue(fullText.indexOf("Footer Left") >= 0);
+
+     // Document whose footer holds a unicode character
+     extractor = new WordExtractor(
+         new HWPFDocument(new FileInputStream(filename5))
+     );
+
+     String unicodeFooter =
+         "\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n";
+     assertEquals(unicodeFooter, extractor.getFooterText());
+
+     fullText = extractor.getText();
+     assertTrue(fullText.indexOf("The footer, with") >= 0);
+ }
}