From 8e1165ea7e33f1829965ba3cd0987f79f532de5d Mon Sep 17 00:00:00 2001 From: Yegor Kozlov Date: Sat, 27 Jun 2009 10:39:51 +0000 Subject: [PATCH] Support for text extraction of footnotes, endnotes and comments in HWPF, see Bugzilla 47400 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@788949 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../src/org/apache/poi/hwpf/HWPFDocument.java | 22 +++++ .../poi/hwpf/extractor/WordExtractor.java | 81 ++++++++++++------ .../org/apache/poi/hwpf/data/footnote.doc | Bin 0 -> 9728 bytes .../poi/hwpf/extractor/TestWordExtractor.java | 48 +++++++++++ 5 files changed, 124 insertions(+), 28 deletions(-) create mode 100755 src/scratchpad/testcases/org/apache/poi/hwpf/data/footnote.doc diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 34fe02d10f..35554d1143 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,6 +33,7 @@ + 47400 - Support for text extraction of footnotes, endnotes and comments in HWPF 47415 - Fixed PageSettingsBlock to allow multiple PLS records 47412 - Fixed concurrency issue with EscherProperties.initProps() 47143 - Fixed OOM in HSSFWorkbook#getAllPictures when reading .xls files containing metafiles diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index a43852be8b..8b6d2fdae7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -344,6 +344,28 @@ public final class HWPFDocument extends POIDocument ); } + /** + * Returns the range which covers all the Endnotes. + */ + public Range getEndnoteRange() { + return new Range( + _cpSplit.getEndNoteStart(), + _cpSplit.getEndNoteEnd(), + this + ); + } + + /** + * Returns the range which covers all the Comments. 
+ */ + public Range getCommentsRange() { + return new Range( + _cpSplit.getCommentsStart(), + _cpSplit.getCommentsEnd(), + this + ); + } + /** * Returns the range which covers all "Header Stories". * A header story contains a header, footer, end note diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index e63ad4f5a4..10ac954263 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -22,6 +22,7 @@ import java.io.InputStream; import java.io.FileInputStream; import java.io.UnsupportedEncodingException; import java.util.Iterator; +import java.util.Arrays; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hwpf.HWPFDocument; @@ -95,34 +96,58 @@ public final class WordExtractor extends POIOLE2TextExtractor { * Get the text from the word file, as an array with one String * per paragraph */ - public String[] getParagraphText() { - String[] ret; - - // Extract using the model code - try { - Range r = doc.getRange(); - - ret = new String[r.numParagraphs()]; - for(int i=0; iL@jmDN{7Zh# zh~t=UEt0SG_5FYDIp^Mc?!D*t{%`AUzwvhShq55-kt&(Z2c_CmZjlbTc@jR-S~i={ z=iOqGmGBU0;9h}OokN>R^aGvqFMt3Df)(J4U?r#lwO|#{K6QooAriU5sz(DyCCe`- zVL2!n;wibzjFui6Suq}Rb9SH@1+Tp*)~?jL%J}mdC`D&p{k!(Noc*t7f$sl$uo^Ug zHJ}l!1x;WbXa?&+3)lc20j(emHiAuHGl+mMfh}Mw*anoZDDigiXd&N5{1}LVogfa{ z!7k7Nc7smP1-d~G*aP;0eV`ZY2Yny``oRG(01kpf;Bk-yPkR09z*Arp90yN>XTVp%v*0;!0*ry@!3!V_PJ&Zl983U}`D=xEerEqSDT~pnFz`yf zI_P6mkzbg9Ii&M9boSI#b~<}H7e1by8s9OHJ%4F(B9k*`!qAA84P;N5TY;wA$e8-J zF8R3o=8F!YV*i$NjVRWnXk9%kY6Z8DXE~7^?nO>>P5CJA)0lIt&W_ogcCQ_`2kmZ- zI9;(-5lKg6XGEeAE5WjBm;R^>r>f)#IFM?R$>chF8Vn{aIh?X3le7b4#A)iDPj=fm za38+sVVSWtthz$v0&OD~9_fa0Vhna;Q=3-&{DoI8yq*Q5lTDlk4euDlWJ$j(O&pu>#m@&H-p)p`+ zB&-2wG#8}xa#{){wE|JNKVqmf-G}?qcuB#FB~zB#j-aeX?slUsU5CBsLABT@tFQk< 
z8m`Ysb;QD24yUSFfYmI(7PIJ_1=oTd>kla{wP&^R2%1)X$IZX$q948VP}&dG6=-ku zb!;(I)xtg&deQoh6((JcR`)X#iNg3kW1o8spRZbK69clr?3FnG`)Ju=xLgOl8AER+ zT-KWHu{&VtPB;-@J*b>NHtQlNo6QV&(qDPnV?PN6KXaiy)gEs^j+;gf<;Gn#FD5&hNI+bwN-{!Ta=enyIbT zR1MIPH0b$+8cf!$HxXCukNIdfGt!k}Llo+k z;&^{`X>C-D*k%Uc|+(;ZYVu-b}~GgzHm96nLdBkJYMyEq3v|MJ|KVj?z_L{kz&%C zC-j*=->uev{>n0G&6g4W{u9d@3I|n;`ky=f9kpXv)&6o`rn48)nJ%aF6C(ueMv1!z z05vLO$kW0-h1{ZiewDH9syj{VMt0L~8#h*+aT6-JAPE#Rv`yC{IQ`vgJ5P?oP;a{lbq)vY!#Gs?E(uH9RIW{dUe1Los zdaB=}#HyD{`2T1?*U4w0Z=ZZnRKP2&1B}i?8+>AY3cL8B6~DrGMFS;hz>n8YA4BRv z&ykwp-^Z^81n24w6};>j<7=OwwuY13?RmgFpC;+89(b+u_Eo?9I8ZOW(^KyWmGC*y z0CzkeXaJ3%2{eNi&;#~@V_+0K4fK6}Pd@&e+?IPkmDfKfLVQ+itf@MuLq8R;VyV@0 z4%l-zV~-}sr6VA{Q5lI!XFzrbq$?oZ0i1<$Fp0k&uh1ZI-^()RtW_ti>zq}mt?RIT zE`X;>c13jpn2X_*A2*xdkPt5ib#olldH7l*u~fZyOX^KY6 zb*}Xdz;k6j{Chqra@U*r(3|;_H}gkt<`3RX@hgb8WhL!uKyNh$e2>zYPgRMCbVQ^* zVs%EWv8#Ng0WSc@lgd4ATsu2o4V=E~oxXvXwBGo3TlmI%iH$ch_9n3TTE>om_+h?I zT{~mNE^6O)a@*ok8`F#@EcSlZ2aNC5Z*KjtZGGsqAMk#D^zK^&l=&Bw={F_)PU8v{ z(tbeS$cvRzJr_RB@4w2!!f!}(19=k0<)j}42^L@8)OmQ3FKQ=ouPry!BdyA7gXQ>5 z-H|NDy}-FzZdk$SIiEj9`_kUYEg+5LKl7+=TMP>LMN>oe+bA^34*UPAxs zzSVF4PjJ&X&5TT$k{r(7EG3JFZH%uEb@fG*I{uew`+kAsaWmF9UtO|%4^vK@TTr2T r$*8#u`FB9o^}G20YvkqWqpOVZPcwdQNy$oYU?#rsS4uDM_y7L@y&PDn literal 0 HcmV?d00001 diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index 0eef5fc9c2..261c9e6b66 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -60,6 +60,8 @@ public final class TestWordExtractor extends TestCase { private String filename4; // With unicode header and footer private String filename5; + // With footnote + private String filename6; protected void setUp() throws Exception { String dirname = System.getProperty("HWPF.testdata.path"); @@ -70,6 +72,7 @@ public final class 
TestWordExtractor extends TestCase { filename3 = pdirname + "/excel_with_embeded.xls"; filename4 = dirname + "/ThreeColHeadFoot.doc"; filename5 = dirname + "/HeaderFooterUnicode.doc"; + filename6 = dirname + "/footnote.doc"; extractor = new WordExtractor(new FileInputStream(filename)); extractor2 = new WordExtractor(new FileInputStream(filename2)); @@ -226,4 +229,49 @@ public final class TestWordExtractor extends TestCase { text.indexOf("The footer, with") > -1 ); } + + public void testFootnote() throws Exception { + HWPFDocument doc = new HWPFDocument( + new FileInputStream(filename6) + ); + extractor = new WordExtractor(doc); + + String[] text = extractor.getFootnoteText(); + StringBuffer b = new StringBuffer(); + for (int i=0; i