From 80f89a3674aaf346d10b5aa1f2bdb7dea75ba831 Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Sun, 28 Aug 2022 12:19:08 +0000 Subject: [PATCH] [bug-63575] support capitalized text in XWPFWordExtractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1903729 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/poi/xwpf/usermodel/XWPFRun.java | 18 ++++++++++++------ .../xwpf/extractor/TestXWPFWordExtractor.java | 9 +++++++++ test-data/document/capitalized.docx | Bin 0 -> 12104 bytes 3 files changed, 21 insertions(+), 6 deletions(-) create mode 100644 test-data/document/capitalized.docx diff --git a/poi-ooxml/src/main/java/org/apache/poi/xwpf/usermodel/XWPFRun.java b/poi-ooxml/src/main/java/org/apache/poi/xwpf/usermodel/XWPFRun.java index 13decacd78..5947125249 100644 --- a/poi-ooxml/src/main/java/org/apache/poi/xwpf/usermodel/XWPFRun.java +++ b/poi-ooxml/src/main/java/org/apache/poi/xwpf/usermodel/XWPFRun.java @@ -27,6 +27,7 @@ import java.math.RoundingMode; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Locale; import javax.xml.namespace.QName; @@ -35,10 +36,7 @@ import org.apache.poi.ooxml.POIXMLException; import org.apache.poi.ooxml.util.DocumentHelper; import org.apache.poi.ooxml.util.POIXMLUnits; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; -import org.apache.poi.util.HexDump; -import org.apache.poi.util.Internal; -import org.apache.poi.util.Removal; -import org.apache.poi.util.Units; +import org.apache.poi.util.*; import org.apache.poi.wp.usermodel.CharacterRun; import org.apache.xmlbeans.*; import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl; @@ -1381,7 +1379,13 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun { // come up as instances of CTText, but we don't want them // in the normal text output if (!("instrText".equals(node.getLocalName()) && XWPFDocument.NS_OOXML_WP_MAIN.equals(node.getNamespaceURI()))) { - text.append(((CTText) o).getStringValue()); + String textValue = ((CTText) o).getStringValue(); + if (textValue != null) { + if (isCapitalized() || isSmallCaps()) { + textValue = textValue.toUpperCase(LocaleUtil.getUserLocale()); + } + text.append(textValue); + } } } @@ -1391,7 +1395,9 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun { if (ctfldChar.getFldCharType() == STFldCharType.BEGIN) { if (ctfldChar.getFfData() != null) { for (CTFFCheckBox checkBox : ctfldChar.getFfData().getCheckBoxList()) { - text.append((checkBox.getDefault() != null && POIXMLUnits.parseOnOff(checkBox.getDefault().xgetVal())) ? "|X|" : "|_|"); + String textValue = checkBox.getDefault() != null && POIXMLUnits.parseOnOff(checkBox.getDefault().xgetVal()) ? + "|X|" : "|_|"; + text.append(textValue); } } } diff --git a/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index 6e5716549b..12c6bc97b6 100644 --- a/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/poi-ooxml/src/test/java/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -478,4 +478,13 @@ class TestXWPFWordExtractor { assertEquals(expected, actual); } } + + @Test + void testCapitalizedFlag() throws IOException { + try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("capitalized.docx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) { + String txt = extractor.getText(); + assertEquals( "The following word is: CAPITALIZED.", txt.trim()); + } + } } diff --git a/test-data/document/capitalized.docx b/test-data/document/capitalized.docx new file mode 100644 index 0000000000000000000000000000000000000000..9658e94b39af4b94e899dd9d928651d645d26a6e GIT binary patch literal 12104 zcmeHtRzgoyLL$Xc~7Xc!ImThv4oW+}&M+y*+d0+?mOo z`~Cy>><{(pT~*K8wX2sryVg>WhJnQfyZ|5o000ueO3s9}78C$L3dX}7Y zb2FO^UM6oS*W#F#h59+Xt=XEI-^g%4F_G1F#tbVX$){we_`TA&1w_)uN2!s!MXn;Nccl1A}2OQy1pLLJQu^luyS`59niOm9D2D0g4IjaT#A~X z^9&-4P|+icNm->DnM-NU)bM!`P)&TI8w-SJG|{Vg0Nx!r|5q}Cd| z8u-MlmJp*YU@}41quaCdDWN$d_LoqW*&BjA%jMRkBbKWG=DkTT-DHVr^HKOKn~e8_ zecKE7Hn%VU!1FTM6?PE5v*xjCkY zeOisvJ;h5tSwrR>#clbOH!KADP!{L>@<7TnVT=3tQlXKe@p`pIvMDZh4knBycE+D< zAaU!rfR&RVDZjyl?R`Qv>=o<)&nasuEfS{jqRWt+tqruvV{5b~Tekpc^-QZ~OGSjS zZ0I^WZ>hiKUohWc^o7?3(knY)i1lVOWyBQ3^;=^I@Glb#cee4+D-Xrzgq_CEpSs^Y zE@(#Qgj&PKzXSN!NM`nv4y8^a3MXu;7=Y<`D&?qXwpAD6^gT_Q1SfUE|u%TK7{_-Ynhs zru2Jnpc>@F5`G!!IBA$;pU8$U7WzHsj!B6sRfMxWJ`a;}W&q(Z!-|Ex0<^qK2CzDu z4Yl9a^Wje7z#ZDW0EE2OVLD424=lp>*>o$5S;G^PsU2M*y15h_C8B( zHVP0(jYek@7-i-Re_d8>z?JgrZ}PDgbTFcGtu7l=9Cf%`IdjqtsDis@ZjEeRjo%`| z%2tdbJPS@0A;Nl^p3@*DM%M236RyTtEZBjX0Q3;H#Z{!yEHM>X_zDt#>#e+G8j|#@ zEYD)N5;oT#Owi8eXw0oFA3vU-XMA9A2NaJqK1x;Svu>00;zhy=#eA2c^;0LH_a#6g zM)7rkf5363I2FZtw^mr8v{!8RNrC)yO2UhwV|>>^p!u86jTpa^-NWh+^2U~67rz;& z2+qa3uXjHVNz>xFJnY*77uA;6W>Xo!cKf(54L|zc>U7W`Qntkq6 zb?fL`0v&(iJ*%2U<`6iMwVTq$uFW~+`C37fMs3Zt+Vr}(hWpfj^dqltgL9quG94s1 zqWMoh>0=0<@-JNVET`t=4@2bIH3?+1H2aM-kLQo2xbywRyRcSvoSHe4l~a_)c=3ll zTuS2>4$Sb?ouvg0!m#U^w9LKrl~FrcZ>Mif)XdY{4^;&)tFOWs(;8yd$b69Xdu7X3UbWKws3B-Cfvbm{#1XTa`BX?x~Kb=cM9`V}s^I z=2;|Nb3Im6`Yttzynx#s5}ue+JHzVPa+xqpGa; zKtaW8g-GAA`H)g8wL(9C)h%~9v_IewQc)mky10E@iAt?x)c=Oh>LlZz!)FPq4O2yn z?%IKp?MQH`Uin_JMpF0@)1U`y;+#^%O}%c#>x5-l2_pm%`lpEPS^^GDysawjM35;x z<9Qr;1?MJd4^kd!2`qFvVC9SI{*Qe(iXOHz1z%%>??G_!nBWEF7Tp*6Au(?4=IauU z6sRAWiuQt1`cScUajO&HS)e^9s9fTMa1)R?wDf+sHw|2GNeOjLVa{Xfj=DQErV!)) zKuJY_hchglnQf}kKIH`Dy|!Pnt)DD?_3#pKFYqO4PpOB)4Zrp}&Kxlg1qb9(Gz{$Q zNxUs*Smj(ea(0~Tz*pd~reEbOI*3SZ4B*VLp4FLZZJyjK(RPIy2s}W(UCtf83XI^) z2p%`7@t!`TCslqQn4;_0!3k(#mi2y9q$E`yE}$qK%h2P}unW~8f3w(z5tzF2I0sPWlw_Tm-JYNJn$rnX zW+End|faeC*FQ*Y~>ay+I>5 zCG>$ebxrRP$->`+B|OJ{>4-oKbLNKqh%m|0Er%(UVj#Qm-bK|U(@utIFxz--zUz9G z0gAs3u8F6`n8qDDq%sS2^@_UA`#z#vC_5|2nB-rneBR4_+rjW5=^Fx5G@>I{%UW@>t#vB`2qnL5< zU{@}2*TVVm6T{T@P`76y!Gc5q(vS{4V)dfytEg9zsdyV-;-kSDqVFi8C_Cy2-n|mx zArH^CeeblRk#i>@L;tDV*g|Zi;-p&Ank*oT495Y@QBUmnT#>57et+sKrQ(}=9|ECJ zBd4}_Wkg%`;It0}AFJ3F>!~X{9{1;4BtJ;!YWnV`R(_0!U32D_n?=X!Y+{)yHco0P zv1dg}<&FL58G*00Rc-gSk@rA*uF7TgXshV!omlfr&XqJ?9oWmGlacSPDFo=$2=JG> zj-E3q&lkfWP5yK*>9kLVjxW;FYEY70?<&C(Z+Mlk#IAEJS1}Kf-6||71B7n2n!;9wJfn3!IPg=Pzh;aeNH`n}?f{Rs~C1y%`d3JFvD2#WQ*abD@Tyq9K* z69K&Hk@S`3yRbFD>EyTP_Ved~AD4X%Q=kP4!LQ5=C(hmDlg|f?(l}>R6KeOB{1`9_ zKLc?Fsk2$MxiO^8e$FJL90gr4w~a@rO&Zx`Uo!#?-)@b?tbM)(_77Mmot#yYu2Un) zjd@3Ole}pDROHJK6@OaB^evAHns(~2ZK&*_32EEws=pYfnymvQKS9A$%jB1ApXv8C zeu~b_v9UFiCU0}JKh6Q~c~oZnD>Bt$?Rzv2E_j4+r^q_KcPd?($hjC-U}!m1vDdo+ z8T2($!Fwy^wPl;G)AcNG^D@H;aImjCa}B0Wa*kcZc)>};22SHOM&B3q(3Tg_oNqclGoYl-)>qXi+)LDhOEy# z+6g~5Bje;SD^JXdA9s}2us-5I-|L)w0>VE-2F}?VYA8rB5kmV5MRzoHa-4K+=9v|q_Rv(_c9LVvowQvI=!36F>SD-MSODB8#SJx6bl9P%1uAzFhn!D%JTwlbZSJ}v$;yNNo|8X zRnO|ELXR{$r$7+|@Qmi{4?ynVcWJ66T55>z?(`x^nO9CUMR&Z!g!y`p$zv{n7MA%n z-L)Agw{5&2lV-O0dAlDkR>!Ho-7BHH*cji+m0g5?l-HR;!yL7@!`_7}FNjmfsAER0 zIyRbpmKpau!Z84&Di%ZA&y45yrorpy13dBo>LpJ+gs#(-FZ#vZ$RByweH6(?8Jamm zO)gTVdGuta1ccz0?`*ExeCk*loQ2>VW;m1@S8BU-b2%MC099G2h7*~^QA~}CZm`9Hm}8!yXudT}e1Tb}z(%SM*|vSQ#ja>FF0j4`sTRIb zJ5Lh4OY2_kr+ACv?f&ZGq!PIWv)4J$R&P$?RwLGm@9cC;nF^qPS|p4agSl<;)d3Pt zIRO>v;aBYCd%JhUvlj=fNEQcW_7lM1PCDs|q~-9v=I38O{hTsv)uJnH?6Pc@cNO{vMlV{gr_zF6G)vB6`lr8t+{eN~6Qf=BT+ zr5+cECkv-5gka^Wq>r-A(DH*0JSdV7v7=9BT^SZcO}G8#2^O(k1^9u_|9qmJl#6;X zf)Me7yq;7vxcRR868kLrgv83(J^b{=!e;>7T+cq#^}|qQkDtdaU5SJ@=?oqXYNxDE zi3Ac&tr}`RRMiZ;xg4&E8o3KcfU0)z$22WEoNsb5T1O&pXdR2|o{zgC{8;$pC#fWY zF-Mb9Wsl*U$ptYoEmO}N>;a_DqJqP zBF{N-ysF25@61K((j>#u&2)(ETCi2H}>`n6u7Nx(UN(G zqh(;x9uSCv62&%qS>PRT_WGT;Bf%bcne9R3Uvs}!fdZ-~G7gp-dG;xhXb_=>AjUMqQwZZmnl`gV z@6op<-Ra46%3gd*!}wjpSXCX|L-UAlK6>`?dQ=$|M7shMG1IJS7yGjOwtKNdD_?QM z;N-@O`UWhv%t7DgT?1NusR=m|V;z*ao`;kzT0Hb0A%_7K-Q+ed&{QO1&0ML%)pa+;EGRf@f@lRo4JiC1l)?Bhxus?l?q%d-f zz|bTkKnZh`lA2?7Be{#`GN?0(=>0{Gz8838VmG>2xbMNb=El|q`;0Fkr0--@%%xu6 zWjQJ<%LLS+^!1{89GOw9)#G7D(9?s5VIVOMC;G)^x8>{fh;7qvT!c|!t|P2!O|i&n z2WaL>rX^F1KOVV^_cOH)yI8ak_F#O=){dA3WkV;_vGcQGd`}7Nso%%2MxNslj*+#-(P`g22T=C zDBO20aOz*1lck+3$_{BO5A)+Q*RpbR`fTJ0T|LXO;*{&p`Qp&Ou1itD{cw~{fS(6C z%VyCXC{p6kz*l;1g^mzf+4m{8VJciI5QNY^7SGlGoGGG|2xk<%R*=i5Y=4z`d-H5G zq9jf#KCDDiN!Md{=JYWC=ah?wveo{dkAkgQGi-KURRdQm@Qpv~Fh$R|xVMovgdQzN z!jV>F-ve88Lq4r>Ja|dZ>P_C*oH>pRiC`Z!i_Pw5qu*0zebibQDbcH4@Pu@iaWvJ#Hx5$z7Ygu6X=^k+K0enlMq-J=6-!!~`T zvtc3s0@5|d3+bxraXcthbIP?zNc7X?h^-nj?az)8M9Z(#skU|&>tqdkt4G*FDs-EFfGW**SADv1u2bBFbv7Pm?-@R?0O>1S*<52&M?x#(;jsNVl~Y{h7J+ zHk>Z4H^sqVQb?8(cT1In0yi1iS-xi-B)sS$P1>JRfj7qEm4+XS;473`snNft8XIoy zc}p3(=3z=ao=+>wjd8#`j9y)`_RXjJr929~nX8!rb0-#zIBU^6poIwkkFj8$l1z*% zZGM5mpQXS3_&~N_AIQKCE5Gcw^m- zZ(C{AEX#pkBc0@KrX)M4HmRk=I4P&0^#gx?&DMD96r+x#O+6Ed1;54^QS8*EI+fi) zX~_LR3$=VUiG5M6+?--!pIyRNE04u9<7*(J{%f7RnnOdTyVyDluA^&FL|w^n6a zH>Kpw^v@L9Lz*PkWi>v#ty7GnUE(v=%nd@EG+5jbw))O2loP?W@b)0^BST|6a;#ZW zWG0knj42H(i5Z9mzt5!&8z7{YM6hZ+2s8uWY93LCN1hGKX4BC@CwhlUPm zjXwZyC>wYjBfhk}*00|gTc}b^n|shk@wo&(@qN|1ILr7gIMXnOsEYOk@g==VNzC~*o_wo|nH7Cv|UXA&VG7xEzwWgvgYxkMZvjJ}wr*HDTWuxy5Z%9O6c3O1B(8f{pf$q83 zrsSxE#`b+$^mIOcw65pv(A0;6_gs5;g2e=*thaKFV&k+8wXE78##Dm2_2m!2@BAKDqdH0szer9d)Hk}Vf3k;n##*x4 zeyEPQvcHeOl6$ngb`#UoF9Z$(4Ckpd4wxj=9~oBe0JKl zjsHk&l!Ro7rU;fR{;mU-gx&AjA;MX5xox=gLDK~u2UP6HKs{90k#pra%(8DTNqpWU zbdGT1#609d`);PsiAnDlq#IjywRN6_-Av%KT4vdUxP0O7GCVEEb+}%}J&)+mzsSzY z>r>(PecPDol$s^>;$SyB?a`^Ju>~8wF);fB*BMr0AjK4roA872<*Q-&-Qu+LF?BIP zsN8%UKb+|(VY)-{07(>^z<8d;0#9f)paen;o8igcKbXRR~b76>ycv-#>98=-iHYy!Exxw$a&wmFI zON&YW0b(bIl|*4$gUG6wLjOTg%O6@r!Y*Y4U5Z`r>?^f0}|QgCx)b~SvYuIccEtf;}dPOr>pYH z4GbeLG|J#3A&2cW)--_J;`1vmEZYZKofAx7pXr2X7HXqP-SCx9Pn+?hCa&1JIWO5< z-Ei=fR-fm-xgmU2&g*9dK2`CmY%T}h-0t6IenqWQRZ_Q1nifpVdu|xOLU9Fpa+Q=> zpAP}!o*sl}c9MRqBp^SlD+|ya$lo|0bt3C|2$ZIG?nJ0NcU(jFB*z!(a|o_(@16hb ze;AI3O|yMY4NFN5DJ&g4%4PTf2J5zN>38=+QuF_TtT#_Zv{FIxB2ke1JodlIx{Ili z@?Xw3;wFZH(9Gz7RdD7By2DI~QZ9X&FinDRP%GHD=Xm+jaIC+qq}sB)N0d2UoKi1K zVNwI;#-t8G;L{zg?;t5?dA5zJv{}T8W||}hYFh`b(GIQlN^od*`=ziIDKop}moV;@ zf^3JM1-oXuo<5R-2JSCpB1-SyOlD^~Y!d}#60veKH;KFIABGM;M3AO7Q4P>|lO>kG)HMnjSfnE&2b zGe|v>ilLFU>2JqIO>yJ4^Gw)*Csap-q$`PBlw+9eDyr%^ONU1LZ`-&~D_<`}%M+ho z^EyM<7+DW&RlT2&S&1kN=!rG7GevXA*R-63b#&D0~rFbIe~e(wJqhCIMKiE(51+ z52P?jxJPH%k{0fFU6)z1FZssd zUM9%xL63%yoF-4ib$MnKG{Ec$J-(msrI;xaAW*hJ-jmU{RG}*}&Qt+wUUCNP@Pi0JR-qj`H<)K!&7XOkv!In$j>_BA zkTvK8!T)&HxQ-5`Zw)cWHHhY5K$4n}Os0Z^-A6}8V>^f6l?GBY^WRJ+Wb49X6y^Px zume_PAJB46#yVmOVKuVMK+jHA+@bMU7b_G3&%2mkyqj{ zjC;@+xi4vTb>DmIadtWJ1f*7p zeFdXpl9DUefEoHSU0Q!`X$OaCh7c{H{43Nxt#Zjr)s@oubT&=9H4XJZZ9&fcrU*Y% zF^e6$itI?#rtdk^3=i>Tu&4LN&pxff8|)Q0!`jp_)66wkCj+t5$`On@rKiV&THCQLzv%Sf#2f{-zvWT&ngO!lH=e1_`fv&p#WJy`tJh%ULg4wEFczQ zV1Fr>{1y0X<=CIlY6#`{M;+O(@V}SO{0RmCju8KV|1ZTfze@VGq~cFmWH0}h0*hZo z{F-9>Q-m6XJN^DHe@(dk3jg)Q`A>Kt&cEURbN>7*_*Vw}CzzG&5AZL1_*V(PQoTPV zgp>av;cvw6SNz|D{-0<7po0