From 2881930651f926afb12ab4aed62b6e7e59dc860d Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Fri, 1 Nov 2013 19:43:46 +0000 Subject: [PATCH] Fix bug #55733 - XWPFWordExtractor needs to handle .docx files with neither headers nor footers git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1538044 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/xwpf/extractor/XWPFWordExtractor.java | 40 +++++++++--------- .../xwpf/extractor/TestXWPFWordExtractor.java | 11 +++++ .../hwpf/extractor/TestWordExtractorBugs.java | 1 - test-data/document/55733.docx | Bin 0 -> 3045 bytes 4 files changed, 32 insertions(+), 20 deletions(-) create mode 100644 test-data/document/55733.docx diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index 0f4a52e664..56ba0e7e8d 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -17,7 +17,6 @@ package org.apache.poi.xwpf.extractor; import java.io.IOException; -import java.util.Iterator; import java.util.List; import org.apache.poi.POIXMLDocument; @@ -34,7 +33,6 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlink; import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFRelation; -import org.apache.poi.xwpf.usermodel.XWPFRun; import org.apache.poi.xwpf.usermodel.XWPFSDT; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; @@ -85,24 +83,24 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { System.out.println(extractor.getText()); } - public String getText() { - StringBuffer text = new StringBuffer(); - XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); + public String getText() { + StringBuffer text = new StringBuffer(); + XWPFHeaderFooterPolicy hfPolicy = 
document.getHeaderFooterPolicy(); - // Start out with all headers - extractHeaders(text, hfPolicy); - - // body elements - for (IBodyElement e : document.getBodyElements()){ - appendBodyElementText(text, e); - text.append('\n'); - } - - // Finish up with all the footers - extractFooters(text, hfPolicy); - - return text.toString(); - } + // Start out with all headers + extractHeaders(text, hfPolicy); + + // Process all body elements + for (IBodyElement e : document.getBodyElements()){ + appendBodyElementText(text, e); + text.append('\n'); + } + + // Finish up with all the footers + extractFooters(text, hfPolicy); + + return text.toString(); + } public void appendBodyElementText(StringBuffer text, IBodyElement e){ if (e instanceof XWPFParagraph){ @@ -178,6 +176,8 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { } private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { + if (hfPolicy == null) return; + if(hfPolicy.getFirstPageFooter() != null) { text.append( hfPolicy.getFirstPageFooter().getText() ); } @@ -190,6 +190,8 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { } private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { + if (hfPolicy == null) return; + if(hfPolicy.getFirstPageHeader() != null) { text.append( hfPolicy.getFirstPageHeader().getText() ); } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index 9ad3cc102b..6b9f7125ea 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -351,5 +351,16 @@ public class TestXWPFWordExtractor extends TestCase { assertEquals("controlled content loading-"+targ, true, hit); } assertEquals("controlled content loading hit count", targs.length, hits); + ex.close(); + } + + /** No Header or Footer in document 
*/ + public void testBug55733() throws Exception { + XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55733.docx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc); + + // Check it gives text without error + extractor.getText(); + extractor.close(); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java index 4af076e5ed..f236b96741 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractorBugs.java @@ -45,5 +45,4 @@ public final class TestWordExtractorBugs extends TestCase { // Check it gives text without error extractor.getText(); } - } diff --git a/test-data/document/55733.docx b/test-data/document/55733.docx new file mode 100644 index 0000000000000000000000000000000000000000..521f50ee8481f053df9e0eb970f22dce1913e6fc GIT binary patch literal 3045 zcmdUxYgm$J7{_0;rp*s?*)sEh*}=-p%4O@IWW_}uv#!uQ0L>s%K~ylpvsv`W#|NDBb|MPqP_Z<;p zJ##Jq0JeZSCT{DfR&_Yk3ILGc`W$cp$b@u^4~DQm1CLE1tvZayx&JgDSax|_F!j+2 zZT7hYom;Zh2j7yF7u*;8_&MoE+x&eFi=Uj{Vz2$}xJX%|bxnjXTFzQg(1MM`A`i^l za<7Xf7?5Y_gp(iMCR(`XfWiokJAkpm^^1d_{&{I(KC>AD&CP~DJ;s_E1|ZoZS%TP7 z4HmT?#u?GlJA1_N#QKYo?08C#4F0-5d-yqB@j~4rrAMzr`t)L{yN9uQ^(%If63@>h z4W#(eRE3pJQLOAB1ud%2BU#fn^mjJ`(H5DpPDDNWt9J`OYtyC*ITF*@yD0zS9(M9b zTo2SIdE{WEtn?|vfDgai+ZNz)q{;YbK@msbTpcSbH0UVz`$L2%HnrB zIyD5|R~NQcf_v0g;?hTN4i*fO2s@!M)Sc&~Yu5`CKkI6zN*)v51bRX2R*Y1icv(f4 zv>P1&z-WvxJC?2VnX(?x@Vzqtzz$dppwqE9qR*=F@27zK#7S5I^MM6{O`5hZ#8#Gn zT1jj8oz?cRyvFjJ7xr^@M@Ukhkd86O@@uNk9KXxeE-!6pVxE5LfqIpGpxcF3BiQT8 zpdE&V(#fu!bC=Bis!OCEI@OO$AjWt*I&0^uwlBDQdOIrZI=x`S<{cLd`9W8Ix4FH! 
zCLsI=qEnVrY3?q64B{Ml?ImAz$#SmF%D6By8GaI8J>cMyXFXQpUW2HyAsdwRVkH}i zS@}Ijb5K&2@Z{CJvh#XgQp}v*;JS+HvkrTm8iSJ{YwXq#;BLj?nKJG2I?0+eMiloX zXCQ;s{*V)xA(Sh29<#pVRHm4Pu=Q-X?I+q5gvu^{X=Cmlq>jT}3<7Oy3N;Z+A|<9I z5T_l~O9`<Ybm5Y8C=gr`t!2tE{OH8M<-Z_`YSpyE3=qmNXSVWU@{X}ymDpO zw(3>pOh*cFNHzyfuU6%U(&Xa1G+9Bd7p>}@%#j;8^C91@maKow8u7EAuo8NM@fuc- zTa>T!qjb#a6~Q&s%KGMxwg6rhLh3s_SmvHvdPx*8td@?A8J^1rF}23j@-Ce=3>vI+ z^RIV4(Wf@fp6YwU!T^jG%z~@n8qaRf?1*$iD$!>@Asss{Fh>h1+?+SqU&Gn^;JgYUY>pXX*Wc#1FbM3z zk#4zIe2wf#p^9q#_;&dnUUofndJ{RT(`6J2B58>$@BT9Dm!M@u;IcQh?d#OkY2n(t zj+e&GG}Kl_Qaa~RCK%nt9n#V%9aV0f!j4eYphxoEhw(w^OK&Mn$;d^8>aR&j!shc` zokALIr%tJRn!TYl0klj{H*|ln6%D%9aC1N##o*0%0PTw1Mo1xn!vT5^B^66VPA|uL zsIL+7D-TJs?EZEEKB}l&j%Vnzi`TtKI_smM>X|5v_h>Zw84@e0lfiPyganm@FG-Rq zTMzMPohUlx1{JljGh~cwI`w|dIh1jkfAOL5L8Dg6T*-(IK`VrD;*B(1luFh3Wd%ag z_YKMkjp)N6q9lSuTODizSFy4-esK1;4bEM^h+CDTyhk-BHDM_dFl6@g|0&rK?a1Do zb3xiwhzoqvJ%i5=lXH*6`i|XgN^&RsQGm*ej$$1>Ko=_@HogZ1!Z2ZVFVDL-_E6AG zMMIPLXJV*qWkuu0hSp~kgK_30_*OH#fsajlApd61l-VyL#9VGUmAqG<2Lbt?Gm2$# z^QQg2Gyq6J@+S6X+QKd6=1s^{z8$n;LjIAKWLe2PR!u7jW&u+})>3XBFQ#(+Cnoa$ k!p2f