From 97e8e39eb675fe57f95416c7dfd4b22df9402ae8 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Tue, 12 Aug 2008 19:57:04 +0000 Subject: [PATCH] Add HWPF support for stripping out fields (eg macros), and make this optionally happen always for headers and footers git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685283 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/hwpf/extractor/WordExtractor.java | 8 +++ .../poi/hwpf/usermodel/HeaderStories.java | 31 +++++++++- .../org/apache/poi/hwpf/usermodel/Range.java | 57 ++++++++++++++++++ .../apache/poi/hwpf/data/HeaderWithMacros.doc | Bin 0 -> 22528 bytes .../poi/hwpf/usermodel/TestHeaderStories.java | 15 +++++ .../poi/hwpf/usermodel/TestProblems.java | 1 - .../apache/poi/hwpf/usermodel/TestRange.java | 53 ++++++++++++++++ 7 files changed, 162 insertions(+), 3 deletions(-) create mode 100644 src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc create mode 100644 src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index 63c6a18ca7..deaffc79af 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor { return ret.toString(); } + + /** + * Removes any fields (eg macros, page markers etc) + * from the string. + */ + public static String stripFields(String text) { + return Range.stripFields(text); + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java index 95cee57d1b..8300361720 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java @@ -35,6 +35,8 @@ public class HeaderStories { private Range headerStories; private PlexOfCps plcfHdd; + private boolean stripFields = false; + public HeaderStories(HWPFDocument doc) { this.headerStories = doc.getHeaderStoryRange(); FileInformationBlock fib = doc.getFileInformationBlock(); @@ -157,8 +159,15 @@ public class HeaderStories { return ""; } - // Return the contents - return headerStories.text().substring(prop.getStart(), prop.getEnd()); + // Grab the contents + String text = + headerStories.text().substring(prop.getStart(), prop.getEnd()); + + // Strip off fields and macros if requested + if(stripFields) { + return Range.stripFields(text); + } + return text; } public Range getRange() { @@ -167,4 +176,22 @@ public class HeaderStories { protected PlexOfCps getPlcfHdd() { return plcfHdd; } + + /** + * Are fields currently being stripped from + * the text that this {@link HeaderStories} returns? + * Default is false, but can be changed + */ + public boolean areFieldsStripped() { + return stripFields; + } + /** + * Should fields (eg macros) be stripped from + * the text that this class returns? + * Default is not to strip. + * @param stripFields + */ + public void setAreFieldsStripped(boolean stripFields) { + this.stripFields = stripFields; + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java index 0a145d5fd4..d9b679e42d 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java @@ -299,6 +299,63 @@ public class Range } return sb.toString(); } + + /** + * Removes any fields (eg macros, page markers etc) + * from the string. + * Normally used to make some text suitable for showing + * to humans, and the resultant text should not normally + * be saved back into the document! + */ + public static String stripFields(String text) { + // First up, fields can be nested... + // A field can be 0x13 [contents] 0x15 + // Or it can be 0x13 [contents] 0x14 [real text] 0x15 + + // If there are no fields, all easy + if(text.indexOf('\u0013') == -1) return text; + + // Loop over until they're all gone + // That's when we're out of both 0x13s and 0x15s + while( text.indexOf('\u0013') > -1 && + text.indexOf('\u0015') > -1) { + int first13 = text.indexOf('\u0013'); + int next13 = text.indexOf('\u0013', first13+1); + int first14 = text.indexOf('\u0014', first13+1); + int last15 = text.lastIndexOf('\u0015'); + + // If they're the wrong way around, give up + if(last15 < first13) { + break; + } + + // If no more 13s and 14s, just zap + if(next13 == -1 && first14 == -1) { + text = text.substring(0, first13) + + text.substring(last15+1); + break; + } + + // If a 14 comes before the next 13, then + // zap from the 13 to the 14, and remove + // the 15 + if(first14 != -1 && (first14 < next13 || next13 == -1)) { + text = text.substring(0, first13) + + text.substring(first14+1, last15) + + text.substring(last15+1); + continue; + } + + // Another 13 comes before the next 14. + // This means there's nested stuff, so we + // can just zap the lot + text = text.substring(0, first13) + + text.substring(last15+1); + continue; + } + + return text; + } /** * Used to get the number of sections in a range. If this range is smaller diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc new file mode 100644 index 0000000000000000000000000000000000000000..934970f58b3d0d2de6caa560b182f2879acb4d3c GIT binary patch literal 22528 zcmeHP2V7Ih)}MsXI|3r0To4r-RS{hgP!LclA{Oj{1PF!%LqIGjDp+uRVsEUnh+PC% zv0+0Lupw4ZcNMWKC{|-DA~#B5EX(D7QR;J|sY&-!qNO-p z;D97ztSU_-4Kb9)|A@+pNDAU0U?f_`l&3Dh-wb>$pnGJL&c6cP2#0BXR8|iu_Uwu* z-JI?=f=md_X*^V70zW}!mT#rKO$f<^inmM&*(IwlQ-Nwe1=gyS zTuRJM5K|FCZY&ka6>)8~wUxBBS@f@umxrfU5ak`0%Wt@Af% zA?)!C0Y(60Kx;r7fGNNn&=z0;pa52YegHSXC_o4x6c7gB0>S}P0n-4}0l0r`c;8T4 zE&cbn^4H2Ml?@jgE><`E4GjCA=%R{o( z)(x`=iQwSSOI&vYN8*qc$>oY26V`Ubs@KmX#&tn zD?_J75&oZ8Ac|!~3}7l~!YomRGavj%2n&}5vIel>Pv8s(M1O{Om@$8bmSpsUx-I4X zZ}7K4e`&esh|%$u=)rF&)4XmZ?2ml_&G~=FbS_Lc?6)HT4nD{UNS6UB0kr^_ibM}U z0qg8UGw~!^GS5N}&Z#^JmxP1tIFK%e zmIgvx1c;FQ#*j$h4TKg5AQwXr;gQ!(dx2`TmCCkP(IUZ6BNQZXq4bySH7Bj#!8Gl0 zU4<|-l=#w!GH}?)W)e*hVnZfTZ<-2#5?RaaRa7N%C^7`}#Y%KDC4Gk-HnDJLkXJyp zz)bk2wpURhKA@Wo=RaI%H@uCBF3~PhBTOHIG;0R)^Q#^5Y{^I6Ou}H;f;?SWea$cG zQ)B@28|{FhAE7{>0R#0JeA0(D5kk9Yefoc*&r+gK|C2uX`d{h`B5MhQI5NP(+J>_7 zTBJX;hY$Zys{pJV44M-pcF`iO*;Y4tnA%wev>AqN>VKqwG%1>y<8Q)9TX<7^Y6`MZ{$ ziz>9&^QuevIJaznh3K%&=8Dox-zkaRF6)}y-_t#6NpcVMGk2y`9}GOK)?M6QFSK;+ z^x?ag+{@50S^hfs$?)-!DV)T6+jhSB@Vs+=c^8L3rMaDUy-{C2zPxF`}^;*_RYcxmgEOug#&b+L5va(yQr@d;_%*Rf5_Lqjo>P`#RZnr z@v#%%x;?dfq%x*Lch`RJCsku-ow97~ek*8_?ef@9>m*vEc&JllfQq-KIox z*Jf6vmDw$`XSV{1RW$O|lCxgaZFl{+dr|$}6^|(6bvslFs|L88i@Y1}dVJQLGbveH zJE)CGh?%!(`mMYPM*=Tx>U+V#Y_rdP`;`W|3vL7)3EJJCbLouUZV$Xzf?g|z@a~8Hyv^?3u%l_) zxI3=4E6Y1rf4%Xm9i23FmGNBYU@E{IAdcilbM54Q{n!F_1XqNUL+iYsMj@r=emC*w zDQP!)h2`6Lh5mAGcm6vvAl=lXIOFohTjdY#&Fxis$MD+v!P+6+(p9xnxfg5~9H^bq zZL8U~n?B~JyYcT98$EoIU~*>P;#PNjOJav6w!fB^lAZS(bv>`^HOm*>ej9(tI?8pC z_sGXr>N;0>J?q+0gX!}1#;?lDu37~z#y~h)#!Ze~v!z{dz-Cv61PE_A8y?rKm_mt# zVV&YlRpU}C_N?#jXLd?3za4Mks*1o&_shPCW;gF8F$xc}=NnZ-&u5RBZCtVWLysGM z_-)H_a=n9=ogOuQnR7*Ic0X%_`gi;_=lZ+ zCkoDc{b+O~HKw!Pd$q?!eBPlEW5%UA`1M^;X;O0AbjkLBvcTnUoJCcxDme?6O*;`0 zn&I0jBBek@HEwa*up3-v+tBOpk^)XFnRox#E4{*?0?%7J*KRzpZ*rV^PQ;=$qjUG~ z+bx`V<1pVNCYPU(8WU`vU1yfvb&PVN(!{8C<i@0jRLc@)4d+mE-no} zkpHw&yQFI5quR!<+1KVR>XotJ(*21#*==vWh`6d*vSx41p!cRH%A?xMm^UZVCUfv! zlh>ZzQ}ayj+{=WQ0B6)C)^^^YqKf%8&#MxwZLVw8_9@uh=7;;|4q0B%^qo)^oOe`r zm{uEyO?6M}>vA@@c@^t!XzTX=$@{y7_f*|R_b=TMou2$)xZUx_>Wjq|o7^_ceEsTD z>uU)iMNPjKb~@hBFtw-N<%o`@i|jTwysJL5D5qazW4^aht>0{K?)$VN&&>7r)oNaw z7&?uZ+@VwZ$;XTe)vmlsyoBeB#JKT zC+IC)m}!(gC9{I@TvXa;Z18Bd_vZzj-|%v5LsFVX&GVqA6>+P7pEJ1Mh{v{GZHILAbIfYrXDe(#~yzN(zGC zbH@gBsTKQ}cppvj3CVl7iX;y9zf%ynr^|20&TZMV$S}2C@TDWOd)u`?@ZO`>$;qQr z+Ll>&33E$R)yp)gnLJ$i&I7@`$5U#R{EZKP(CP4x{D2&l1H8uzdT8poI_ckB9k}Bs z4f8AaUz~N=6u2Z>YwXzG^O*0eo)sG{eY@Yd%CBPi4oCI}&-!u6KZJ<$$FknkUjuahjnsH{_ii;85_wBu@!5vwry}z?_chQmD0quqrsYIPE(dMn= zsDJP*o9}V7G+oDca_FG;3Fn_4o~x-ryf#Is@Pt;wBP-oLh}p`tm^nd<=4`xEXVvJX8fVL+ouLk zWnDay_4=J>pI@@l)4O?iN9SA{E0|VN%)4b)xYoGcJ;NimaTn)@sI?w&%)q%iz%P9H z11I5r(dl2Uw-xqG$@`Fg(!Z@wO7NIb1D$)E4>o3YyOh4G;hdlSGq27Q+c);i8Wl2k z;`{O}^26cX9qu{29Dd`*X8q_NKCD|A)NiP#R%WHz;tJ2|u$bdI!^=8l80_vi#H-9M zX33W9=q_ta50 zdun=cM_s(Gm)_GY>p0vKmHtH=KWyNZ*tlU3m#Hx2Wolo{YFnY|Mhdss%9-f@t`ccn$ z1(}wE?jBQp;+ESq^sdtfZhzsn0mpRnj0)zf_-UO~$^3m;bl)`FwD=g-;^l4k6=eB_ zTnW!!zJK(wkO?tc$JDyA`Q2}XG$i_mwZ5?ST6>)_fcb^Xi}7;8KkqlRFQskV z4>@M=lE$kk@^3Qf-b2EXdl}32B7Pbt?G2Ob=1fr&3a=d2TiWrr1!dV+m6GrG)OL2W z9$Hb*Kb>h6=6Ks7a)+r}pC>CPuPcZ?Sp5(8AY=c9H-6ycPMq@( zVXqpah^2#V{5o0(+1NU)5$SXqv^>2mHMl-$x67s_Bko&m+Tu}?U*=O6mzeQ#J!8gc z>!YE1)31>>s{+?G?QL{1T^M0!VLa~RP>8_i>ND*_bq!(R>noma!hrJ>^^T2U8+&urln3@bjfI83(;zrGcrwZm1Uw zVaj-fz+ zAhr+N)hbdfj_G4(7aIm!MRu&MFor9Dyl|l?nk|N;D8h~-Vow3V(R}>6=4=h=4fyVfZmgSuTz;z@Y{wkAM>mhuVNv`r@XQNpRd<>jEjzQHw`P zOsENS37pQ)z_dGQ_Y%$_U{!%1(nOMyl1LriUC|}X+GW7fBWkr!S6UHHFm#Dhtv#f5 z39A-WGl+UE3TPgfL_<=KKpku{md9oq$oLH4qfv$t&06@lLykks^vF4EA8CSLHNsLt z)3_-yB)au$v4SKUC|)B;1hC+|76Ybn6OX{|BHGA+S4Yr0K`j;lOF$SP9FPEz%3*@6 zmVwU!>PJgF0AN7Pcrt+>r@TjWBls!G(%Fw_s|1qO=2-+kHsNO|=^4-GwqafDN0U2~ zHXP28iP(R490|-+CBxweU=lz`rob<9Q8^u?rLZ)Pt8Db-;y5$B+ogdhCR0U8nWfBB zV$JVDC}1biawLj?5e&yK(Qwod3)%iarvMECy?7iYfc%~yhe?!`8BB(%3R78Xacu49 zkc81;WGd7Ng_?XwIoV3IYpSrA7+Tcs0Y@-A$-xI9y`*(?m9c^nll6j0GNw~#6=~Qg zqXQ)6d7umHLJL3Xg08Y8y4b*$>nTH06OXVWWj!OrLuwzEGE`ELX?sRV9wtd?d2QG) zX*((62U5f!2R;bGq;ODMU_xr~gHWVI$pmq6$M~?4fha;n@}%R_jz=caiOeEOXc$S( zXRfq;FgcZoEJk5AIC|JG~nSv z3&Cd;W~kRT=`x6QAR+f*1%ZoCvs!}WgCvx3{Hi=lTz{GcJy_Qy?*j=R7c&GMG9as| z$v~FuBsosRV^#{@D?(TT<0MZ#9{N)7%=093nu2-#`H&S?1Y83+K*OO9eHa=G2dvVd zPP`<~MHrO8H6sk_>%)O_v_zf=Rs@Q81D`KBwiUorgomm4p85-gkFapU>h@19VgRQy zW#g;*fJnWaOVvo%PJ3_Qyvl$K_@*pb#nXUd%{2gg&`SU~Qda>uG>-we7{38XR!*pk z3$P(je81TP#Ubho6yGvqf#Nm*7ZW@v7X!t;LlRJUsgngj^?>3Ug3HGmpr{`w7rxF^N5Y*ZHU~Cv6l~I9t0ELb+*VSao>f>E^?tgt1aaia=gY^j ztUqqs__iFOI9gH`zNZtJ(Gj#jY;z}>DNEJb00fhw=7Eo9~ zVF85&6c$ieKw$xe1r!!gSU_QcmMri`{y+2Dg=^`yW_rt4ga6;HemDC6df3JWMKps;|#0tyTKAFu%W!|30l4~>2{`tj)J;`e&|f8FpqJ${=< z{~djE^waU1Jo^3U%cGx<-|*3&$8X^1k7GUb`|(>h-b91{$coT6(b_^B^DTf<084-s zpgq7E&;fw?jR5r1F-`-bitrz{0A5_dt1T2=eAi&;@u&GQNY{-&V>M{pP3+VM_Glyv25WOGPy(ME~zeBcb;qxS-<+ zRQ_7fx5&XVrl236A=(DtT93X$s0>#J@iG!#I7Hj~!YJT%Bk5nnis32|{yk*N!3OlB z#qrtw*8by}+h_gFd;Wdxago{od-{)OX|#R@{4FY8w~QeHa6u{xbG|DW!0JWo{?`7Z zg72H