From 5c8d2a20d11afb30d23a31a95b5c06198302447d Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Mon, 13 Feb 2006 12:59:00 +0000 Subject: [PATCH] Friendly wrapper on HWPF for extracting text from Word Documents git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@377372 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/hwpf/data/test2.doc | Bin 0 -> 19968 bytes .../hwpf/extractor/TestDifferentRoutes.java | 87 +++++++++++++++++ .../poi/hwpf/extractor/TestWordExtractor.java | 88 ++++++++++++++++++ 3 files changed, 175 insertions(+) create mode 100755 src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc create mode 100644 src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java create mode 100644 src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/test2.doc new file mode 100755 index 0000000000000000000000000000000000000000..06921df39545bd0544f062a88c4532632cbea80e GIT binary patch literal 19968 zcmeHPU2GKB6+Sb&>mM)}C>Zlo7!s1w#IhhwBHD+<1k*Su!6`UO-;!PLtarh?vzcAT zfGX9dEk$Y`Ts1#^Knkg-ln7F}4G(>&+9a|PDQN?$M5MmaDt%~$REMA_ksNQo@9tbP zyZ+lv+O)tu@;B$+bMHC#-ZS^k%$d2~3+Go}dg<5e|0-2 zE7Furmr5myw*W{Qx{f?>`Im1<H& z&>U@(PSk#7ouzQ4qI7nqRBD|O&X?8|POHS}NI6d)PE$Hpes#IJI{r4${a&mroQt=h z=mg5ib^Yfs@{f@}73h6lz8ZFH*l7!vP%WG*>L-mNX()cwEOH$AZ$W;6@j zYx#VfcEhCYX|DBj9i8umUK@m;1pcY!tjDWFwVmob{Szi_CoHQat^2i@J#An2L-ShN zn5&Mqr`vxvm`B@RSh@fC?QikIs7TMdH&M=|rS*bh(fxMekL?h0{*iROBTLHpKL`3c zuglA&U;L=;YD?NbY-b~cdYqc;_H|x!tv?^v^{Tm^kDAY?ujPM3zc>#Uqw02aUi(?+ z=i}PGzCMKc1kSQPzhOCbbbhgN?dMuKZ!WZNOUuk0W8RSQK>Z%*OJ@oW{^Cv{lg$sh z&amevout=4ly!5(Wy>BaI_Wrzb3@sL>pNb`$;bWpfFIANp_p?rsC2-~x-2QC-K^t2 z?dF`68!ry|u9NTvlO3oNAI$VS@m$gw$|YTYFq3nWPSHIK-DRo=iA<5bfW5er%%oDT zkJg=(hXb_=nWJvFYGA(L6xgyK80$n8^cQwK^u!;|6kO+N*FRD|Bs@Qp8|YkCzmINc z4cu*>z;j+d-~xw$Ja7~^2AlyVfVTlY4^Ca3x;*v4)WxZHrfz7r{4WQ$W=+~|UMFqp z_`n?v`7|&18^hAwd$&pN-iUk$h$Ahn;nMr=uqyKRHjP%~?`axsC|h63=lsn$uMW@+ z>;XoAQ9z&nvq--Rya${IE&!YhpM{-t-DJX@zNyoEdRVUGyy=_T6*uSLZOgJcWXD+A z+B|-&(;0tj*Ou|7##?|}$44c4 zfUc#q#Bk$*|E34JM7lmK+0rE~Gn21D_K7FHWF;E9Xt7EzyZ2}J-`Ss3LFE@0B3<$f%ky(zy;tc@N+=DY9b#A zz%{GdDGjgBaxK>8f+)Yv3a2@U2iwMA%C;LKv1rVWM1Oo#9Ecg4C{hg})raMh#WF0Q zcpvyM__B-?&}1zO*4~ljd!G z7cLqZv_zfz1Dz#N+fq8=^^5Ws0n#-cM;?G?T>9ZFzBG@@Q%xt@W%2?v(-=KGdFU6n zHv>#t>H@eGDK2Q95}x453i3*+QiXI&L?$^6tyZjh;s-|N3O96=(i_kq@v)@Fw{C^MMPQbeB4Y52ezoAiD{x zC#`NbPH>EigO$V;l@D%i!&ni%Bq2{C)sNdISDU^$@&eKW`2f}(Z=*$M&z6K+H)-Pm z;{oFV&h~jL+yw3b3xCfl*JB?^)M0x)-oS9&FRC3bl!eQ~Y1ZB6 z`PukjXVNP!tiJK3IqV!z z>GyxXhr0;qR}546cWc1;_(!=R;{oFV;{oFV;{oFV;{oFV;{oFV;{oFVlbst@{Okq9{~e$Hit#_cHZTrmOqvE5m*)Y-`#%I2?>__Z9l^7}THqwWub!s? z#_wkV#_#6WGc>A)35+r}|}t+aMbWD+^ctJnb)RvA)tT(mv;|x!#67cypyKoc#!H@EmpZwVU_oIrpwP-NEle msBL?X9N>O7h}1QO=}ukVxl^uw^Edy=wN~HCFtr!+z`p^Qy8uc6 literal 0 HcmV?d00001 diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java new file mode 100644 index 0000000000..23ff64f1a3 --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java @@ -0,0 +1,87 @@ +package org.apache.poi.hwpf.extractor; + +import java.io.FileInputStream; +import java.util.Iterator; + +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.usermodel.Paragraph; +import org.apache.poi.hwpf.usermodel.Range; + +import junit.framework.TestCase; + +/** + * Test the different routes to extracting text + * + * @author Nick Burch (nick at torchbox dot com) + */ +public class TestDifferentRoutes extends TestCase { + private String[] p_text = new String[] { + "This is a simple word document\r", + "\r", + "It has a number of paragraphs in it\r", + "\r", + "Some of them even feature bold, italic and underlined text\r", + "\r", + "\r", + "This bit is in a different font and size\r", + "\r", + "\r", + "This bit features some red text.\r", + "\r", + "\r", + "It is otherwise very very boring.\r" + }; + + private HWPFDocument doc; + + protected void setUp() throws Exception { + String dirname = System.getProperty("HWPF.testdata.path"); + + String filename = dirname + "/test2.doc"; + doc = new HWPFDocument(new FileInputStream(filename)); + } + + /** + * Test model based extraction + */ + public void testExtractFromModel() { + Range r = doc.getRange(); + + String[] text = new String[r.numParagraphs()]; + for(int i=0; i < r.numParagraphs(); i++) { + Paragraph p = r.getParagraph(i); + text[i] = p.text(); + } + + assertEquals(p_text.length, text.length); + for(int i=0; i flat extraction + */ + public void testGetText() { + assertEquals(p_text1_block, extractor.getText()); + + // On second one, should fall back to text piece + assertEquals(extractor2.getTextFromPieces(), extractor2.getText()); + } + + /** + * Test textPieces based extraction + */ + public void testExtractFromTextPieces() throws Exception { + String text = extractor.getTextFromPieces(); + assertEquals(p_text1_block, text); + } +} -- 2.39.5