<changes>
<release version="3.5-beta7" date="2009-??-??">
+ <action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>
<action dev="POI-DEVELOPERS" type="fix">47415 - Fixed PageSettingsBlock to allow multiple PLS records</action>
<action dev="POI-DEVELOPERS" type="fix">47412 - Fixed concurrency issue with EscherProperties.initProps()</action>
<action dev="POI-DEVELOPERS" type="fix">47143 - Fixed OOM in HSSFWorkbook#getAllPictures when reading .xls files containing metafiles</action>
);
}
+ /**
+ * Returns the range which covers all the endnotes.
+ * Each call builds a fresh Range bounded by the endnote start and
+ * end positions reported by _cpSplit.
+ */
+ public Range getEndnoteRange() {
+ return new Range(
+ _cpSplit.getEndNoteStart(),
+ _cpSplit.getEndNoteEnd(),
+ this
+ );
+ }
+
+ /**
+ * Returns the range which covers all the comments.
+ * Each call builds a fresh Range bounded by the comments start and
+ * end positions reported by _cpSplit.
+ */
+ public Range getCommentsRange() {
+ return new Range(
+ _cpSplit.getCommentsStart(),
+ _cpSplit.getCommentsEnd(),
+ this
+ );
+ }
+
/**
* Returns the range which covers all "Header Stories".
* A header story contains a header, footer, end note
import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
+import java.util.Arrays;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
* Get the text from the word file, as an array with one String
* per paragraph
*/
- public String[] getParagraphText() {
- String[] ret;
-
- // Extract using the model code
- try {
- Range r = doc.getRange();
-
- ret = new String[r.numParagraphs()];
- for(int i=0; i<ret.length; i++) {
- Paragraph p = r.getParagraph(i);
- ret[i] = p.text();
-
- // Fix the line ending
- if(ret[i].endsWith("\r")) {
- ret[i] = ret[i] + "\n";
- }
- }
- } catch(Exception e) {
- // Something's up with turning the text pieces into paragraphs
- // Fall back to ripping out the text pieces
- ret = new String[1];
- ret[0] = getTextFromPieces();
- }
-
- return ret;
- }
-
- /**
+ public String[] getParagraphText() {
+ String[] ret;
+
+ // Preferred path: one string per paragraph of the range returned by doc.getRange()
+ try {
+ Range r = doc.getRange();
+
+ ret = getParagraphText(r);
+ } catch (Exception e) {
+ // Any failure while turning the text pieces into paragraphs lands here;
+ // fall back to ripping out the text pieces as a single array element
+ ret = new String[1];
+ ret[0] = getTextFromPieces();
+ }
+
+ return ret;
+ }
+
+ public String[] getFootnoteText() {
+ Range r = doc.getFootnoteRange();
+ /*
+ * One array entry per paragraph of the footnote range, built by the
+ * same helper that getParagraphText() uses. No fallback is attempted
+ * here, so an exception raised by the helper reaches the caller.
+ */
+ return getParagraphText(r);
+ }
+
+ public String[] getEndnoteText() {
+ Range r = doc.getEndnoteRange();
+ /*
+ * One array entry per paragraph of the endnote range, built by the
+ * same helper that getParagraphText() uses. No fallback is attempted
+ * here, so an exception raised by the helper reaches the caller.
+ */
+ return getParagraphText(r);
+ }
+
+ public String[] getCommentsText() {
+ Range r = doc.getCommentsRange();
+ /*
+ * One array entry per paragraph of the comments range, built by the
+ * same helper that getParagraphText() uses. No fallback is attempted
+ * here, so an exception raised by the helper reaches the caller.
+ */
+ return getParagraphText(r);
+ }
+
+ private String[] getParagraphText(Range r) { // returns the text of each paragraph in r, in order
+ String[] ret;
+ ret = new String[r.numParagraphs()];
+ for (int i = 0; i < ret.length; i++) {
+ Paragraph p = r.getParagraph(i);
+ ret[i] = p.text();
+
+ /*
+ * Fix the line ending: text that ends in a carriage return gets a
+ * line feed appended, so the entry ends in CR LF.
+ */
+ if (ret[i].endsWith("\r")) {
+ ret[i] = ret[i] + "\n";
+ }
+ }
+ return ret;
+ }
+
+ /**
* Add the header/footer text, if it's not empty
*/
private void appendHeaderFooter(String text, StringBuffer out) {
private String filename4;
// With unicode header and footer
private String filename5;
+ // With footnote
+ private String filename6;
protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path");
filename3 = pdirname + "/excel_with_embeded.xls";
filename4 = dirname + "/ThreeColHeadFoot.doc";
filename5 = dirname + "/HeaderFooterUnicode.doc";
+ filename6 = dirname + "/footnote.doc";
extractor = new WordExtractor(new FileInputStream(filename));
extractor2 = new WordExtractor(new FileInputStream(filename2));
text.indexOf("The footer, with") > -1
);
}
+
+ public void testFootnote() throws Exception {
+ HWPFDocument doc = new HWPFDocument(
+ new FileInputStream(filename6) // NOTE(review): this stream is never explicitly closed in the test
+ );
+ extractor = new WordExtractor(doc);
+ // Concatenate every footnote paragraph so the check is independent of paragraph boundaries
+ String[] text = extractor.getFootnoteText();
+ StringBuffer b = new StringBuffer();
+ for (int i=0; i<text.length; i++) {
+ b.append(text[i]);
+ }
+ // The marker text must appear somewhere in the extracted footnote text
+ assertTrue(b.toString().contains("TestFootnote"));
+ }
+
+ public void testEndnote() throws Exception {
+ HWPFDocument doc = new HWPFDocument(
+ new FileInputStream(filename6) // NOTE(review): this stream is never explicitly closed in the test
+ );
+ extractor = new WordExtractor(doc);
+ // Concatenate every endnote paragraph so the check is independent of paragraph boundaries
+ String[] text = extractor.getEndnoteText();
+ StringBuffer b = new StringBuffer();
+ for (int i=0; i<text.length; i++) {
+ b.append(text[i]);
+ }
+ // The marker text must appear somewhere in the extracted endnote text
+ assertTrue(b.toString().contains("TestEndnote"));
+ }
+
+ public void testComments() throws Exception {
+ HWPFDocument doc = new HWPFDocument(
+ new FileInputStream(filename6) // NOTE(review): this stream is never explicitly closed in the test
+ );
+ extractor = new WordExtractor(doc);
+ // Concatenate every comment paragraph so the check is independent of paragraph boundaries
+ String[] text = extractor.getCommentsText();
+ StringBuffer b = new StringBuffer();
+ for (int i=0; i<text.length; i++) {
+ b.append(text[i]);
+ }
+ // The marker text must appear somewhere in the extracted comments text
+ assertTrue(b.toString().contains("TestComment"));
+ }
}