From f56ab225218964b8205b2aeb3ca6034e1865ed22 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 20 Aug 2008 20:13:08 +0000 Subject: [PATCH] HPBF text extractor and unit tests git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@687443 13f79535-47bb-0310-9956-ffa450edef68 --- .../extractor/PublisherTextExtractor.java | 78 +++++++++++++ .../poi/hpbf/model/qcbits/QCTextBit.java | 6 +- .../org/apache/poi/hpbf/data/Simple.pub | Bin 0 -> 65536 bytes .../extractor/TextPublisherTextExtractor.java | 105 ++++++++++++++++++ .../poi/hpbf/model/TestEscherParts.java | 50 +++++++++ 5 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java create mode 100755 src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub create mode 100644 src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java create mode 100644 src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java diff --git a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java new file mode 100644 index 0000000000..2257283a0f --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java @@ -0,0 +1,78 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hpbf.extractor; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.poi.POIOLE2TextExtractor; +import org.apache.poi.hpbf.HPBFDocument; +import org.apache.poi.hpbf.model.qcbits.QCBit; +import org.apache.poi.hpbf.model.qcbits.QCTextBit; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +/** + * Extract text from HPBF Publisher files + */ +public class PublisherTextExtractor extends POIOLE2TextExtractor { + private HPBFDocument doc; + + public PublisherTextExtractor(HPBFDocument doc) { + super(doc); + this.doc = doc; + } + public PublisherTextExtractor(POIFSFileSystem fs) throws IOException { + this(new HPBFDocument(fs)); + } + public PublisherTextExtractor(InputStream is) throws IOException { + this(new POIFSFileSystem(is)); + } + + public String getText() { + StringBuffer text = new StringBuffer(); + + // Get the text from the Quill Contents + QCBit[] bits = doc.getQuillContents().getBits(); + for(int i=0; i"); + } + + for(int i=0; iEw|nlr zN7uUN-gPW&Cwnyi-o5+n|L^Yi|GQ_;-lHG=bJe-u|C74^5?R;nA|x)17K^B_+(Mor zub2>FWcdBU=;){_6arauT|*4~5`DGdybz?%^JPCiiy#i97*YZ$g_J?cAr+8H2tq(q zL8>7&kXpzj$YjVANE|X1QU{p^nGTr&nF*N%aUruI3CJACT*y4ge8>XGLdbQH>mfHl z7C~-=EQTzBEQQIw1QY2Oyo0F35wBgOF~>Ly#UwFC+cUhyiL8eoYpU^qdbt&8oJQ^053MmFeruw)PLB zqi3|(Fgf8#@??&nBk0j^528ms2c~hXrJw-u0q+k)ZvKhgu--{W@K6FS===EpiCGgLd zPpjs!>SpGD3nGB)yUvC9vpDW@DCvy9ktE=v@3?;Op1^f1b#I_QxkOLo{hw{k_S$6M z&Gp4IZ~-euMBYK*2;U{+;wS&~lb07yi~aUXj<{jbFTTh97WX6!9pcVn_s$8G4S^l^!3p z9I%($|L30hz0ZEJwKMkJ7vTQ|fBJ9q|5B`*seZZm&c3qWRS@p4Q+}R_JnwN$UF`?m zwS#@_2%HW0{>dLHU(tpmPLzC5!C1m76Z#))n=Nscn`gC~p(&XwvrplkKt|4^430*Q z?cRtHaebN*oh?QL>+nh;v{Ya0ig#sc3T0ne?-az6PbjE`KAFofaD^E7;urmEx5bo; zuH6xQ@jW-3>2q#4Go0x&ps!)#%y6d9fWC%_GsBra1Ns^!&J1Vz4Crf^I5V8-GoY_w z;>>WS&w##$i8I5QJ_Gt1Ce93J`V8o6m^d?>=`*0OVdBhirq6)BhKV!7nLY#h8Ya#R zXZj52YnV7Qoar;5uVLcMaHh|IzJ`f2!>WS&w##$i8I5QJ_Gt1Ce93J`V8o6 zm^d?>=`*0OVdBhirq6)BhKV!7nLY#h8Ya#RXZj52YnV7Qoar;5uVLcMaHh|IzJ`f2 z!>WS&w##$i8I5QJ_Gt1Ce93J`V8o6m^d?>=`*0OVdBhirq6)BhKV!7nLY#h z8Ya#RXZj52YnV7Qoar;5uVLcMaHh|IzJ`f2!hqQEO2M2J9%om2NE6z&aR!&UY?#7%-Q8> z?$^SK;Y{C+hBL#NJ_Gt1Ce93J`V8o6m^hmd&H{F3mjLN^@v74>ai;H*0yrx?nDm>! zCN&D+?3x_cM0UCW&I00WB71iA+Ae^ztGDx4sAq;Vec}yghBJKz^fgSJ8P4<>(AO|= zW;oMlKwrbenc+;I0euY~yLK9}$7`~hgeEixX@o^|(HO&`dk`PwL z_9wH1cn@F8@TacV@L|P2e$DP=y^uQznqiqnWLhLs$EB^};T@>mAXbY#qET!`?ryO~ ztP@X)n^4juxq0t-)uDhqa0h@sLOYONv3U5A|1x#bQ8~2~qMU`&Ig;>{;9r zI9<2foMq}S{jI!1)}AFXhk4enFFnp9*lWt%6~RW#gM|P06QyzR-9(;Ko{N_CPEqU-2dGK5F;4AastMcHh^Wa~~ zgWr}1|8gGu_B{9IZbW+Dk#54EMNUOT!#Ea|j=~#r;%a4H zH>Sl@dkS0c!$O$(%5JUf%LWfim8$h%qgR^gxw3Sp&l1-VWl3Ro`LIkY3fmNbZT4Z@ zFZgY4@nPBRHTke?y{$gXPTN$RH>cU(mdL0_7v_^3(o0pwFEHggU(yla?YDq=HJzlwgZBgwhe0LgW^o0a{$VXZ+r0m{% zkzM7ZJ?V0_Z}X4gD?TiH4EF_K_xrHS_*8p)$HmnAY4fm@7g7u;5AK7vl#O?^b{5be)_s<99F&5=m>T($9;QZ~iC{I8kEGGeF2X-6kNPZU;#%n*%cFbTrSEW#Z-%52WE|V!u9*)q=%-Sp8VK1d{$Gu{w41oZWO6~W+76MEaC%m zvP7okGR0#wN@`>}1!*Wb3u!nx2dU%4qty2(1Evy)MziR28VbEJjqSHi%UYf(HRZp0J)O`Vfb>o!?yjZ7Ppb5X(_ ztzQn|dnh>@NW|;S)_3+;w!7oc_gGOmIGQg`I;c^aEJj)}%#~^lq;l9rx<#~!ew;{j zh(6(pW_&wwPJV?(tgp4vyBd< zJxJ>zkHR8PK)R7%jB^s=YAtY`7lj zJt!HJeeD9W->csS{VrM86_3afxv0~D_Bv302r2GOqGp&gbv;gLwqx9l7-K7BvDk>u zEkLHoF~x_mHxZq%IDqz7izWCQe;iA2$}=c?*M~Y?vcLUkvk&LG>AvySIyd#3(V|Da<>&!Bxr& ze>`Ij=FhW<(lPU=LVBUHKrftqX(wjNi!a|{W{NlC3!f=|UyQ$&RY_k|7w8MKYGtp! zm(v%u(ifAAFDCpB!u34=+`!fWcZ;dU9~1761pHwor-5lZIUQ*@xd3U&Y3_|I7m0H- zlGHt4ZpFNP6sn0(r@a4rB|H}nb#gDT*{4&Fy!Q+#_a#L#b#^`RV*H(AnWj+c7s=g) zAAJ&~mdo9qeTI>Pt=LeiccZYZ@}utpN#*xv^6$P2AXR^VrvCornQ|U+M2GpzxEA}n zLF|(DVYkyK_gbql8gKNc&ZCAHI*f$BR@tZ3zIo*IA?}+Ge@leol>$#Na(}l47Py1z z#crw>`?~?*ie1>3w&1f7CGBX3JIo%5?38;gwR`37bZSpj;(b;eRo|~<+o2@gR^pUO z@0H7RiW7J0oaxRCCxN4^Igq(7DVitKUC7yI$f}U>VS6#f=d+M|<`2Un?EWvnBB$6r ziq8_4d)HDIKEhu7KpA$5GOcjeqomS_IaN-zQ{$Y(H_PQZVxMV5DuybMSnkpDB_ia6 zoz(Z?`#Oc~`<{)v*PWJl$cx_0^H114b-#rt1E$fqEs7wH+uUM{Vwsl6v{a@avF&&i zeR&@_F(l_x{0E4L_=qJ!!__Ek5s$&w`{4ENtU9zAROi(K?EFUob$}j9bR&;_-=t(Tw#=!`}nQU8va(L zcmidXv}d2;IE5T%2$MTJJQrVEF_Sq{`!REy@c9Tnx5JkY;rR^bWFqSRC(Qzfq5VKV%89NA`bjqG zX|@9U=|vb%8|-)ku+OrvQf}1b`LG?|3%s+6ed6;y`c&md9|77o4w|w3J6I~yMD^cL?xEc7& z8N|H^pFte2Fr{0#3pHMoHH5MtYar{ja02)@(O8;;nY6$jsCcx1+p~zPI1kP8^GkH) zIH>A4udJnIg>sn%98=eiiuvhHH&XI51>0Z^svbf9U#a+k0|Lkq&cVGN* z)k#|nZ++v!4{7iFMQtBON8h$mpZ+%1VXxDUlVhHxHzMuEPUkW7h41?na?KPrc08G@ z5SyQ+mP$Iq7w*^5H7T~ylPRct`JP1yIXlW9vK>SOAl zeeLldq6OStc{#$e@qFVv9XfYXh-YN_tW2Mi>9NSkIMXUnID$< zw`5*$Kk3*})~T2IwJI-We#+*sHgT)!u)Y674{Us?6!ypm0G0}8gdlg#HlD}u_TR`q zqP$XsN4OQYMDd8%WqiwA`2APv@g-Guh4tUYXSG*upEM;IkUVa@jP6-ad{`*Ymb4hui7xr4ldD z+t5jQdl~f7-bOhiVGg`aqQaqKq<4fYv|a-*F({RPkdeI^Mi9pQu?N!Gn^#d@aO^=R z)tj3!q(3NAzOv{*5D>CTNA@CnRC16z0vT^l$*43dn+@w{^^cVy*Pxf(V$ZMw5*lN%OAh<7Z%hT_lByR^G%fxp#05reLG#g2oc8nN?fdOY;IO|WR6o5 zZOHP5ba?}U&ns`a=gvm2UFI;qcwa!f8=76O9Jv_Ex%R{bN2kh}t3x3}D=yZyuJhV$ z-rV5ML^+@TWVzQa%U7h!*Kcj|cwnvxGLPX$9(EscR|zzhaqKjuW+8(2rZiO8mD)Iqt?6m$qSl)Q)rN4xBLO zuW@M>^4rZ{jhFi5{rD9?oMuENgzpUR)A;;*@jTM9 z1W$D5p4f-HCEvnjaN6lyD4h$Za}h6xbL4bcQ99>%Ih-qdxsWLFa$!;G@^Yo3+RK%R8ZTEaYQ3B#CV9CEG1<#iiYZ<$CgNVMN=!|o zbzZJo%=B_KVwRVy6|=qEB$4oPlf@h_H$}|#a&fW1%S{yvyn+FvAxjH^y)HPL%6LQF`~U{G9zARj$gbNUmH$Ovb=gLJmVtL0*S^ z4&l0f-tzxlmBTQdx&#IosbSCmkHXjVeJlJ;pO07{S(HmKYgDc}2DrLaCyKFHaj&Fu zx6ALm0m1Du3Yb^xAm7R4yL@zi>i&hSj0`>%j9U)tE==IiGjrd=cRrDvg1iEG1M)WH zJ;*1J3lP31mVhjStcUQPcNoIEv*zvRpK1D|GOI#cN4xto30)U1V zoLlwb)Qs;@;#?J$H%WbJ(Gm{qp|u&g0n}_0lVzO^$We#ycykv019y$6!|eGo_B=pBFMKZuhow!*WlHtf*&R6O3B0UOaOe@|2=ZEbv9 z##q*(gneQ!_#jveUA{j+3qA6Lu2)8agZBzl*#V5K3;)&24LiA@Rh}XCfQMXd--BAN zY>RKNcA?wF&;3P=t1ONNxVM-6i9!Oxy|lAnLP> ztTZFXabR1GaWuixYNYr%2tI0-wfG){dS{n!-12P+GRG50veE_I^`2_CHl;_aSzD;(MAun$h8`ODcV8U%8299 z;g=yj5MF8~(ifb8tVOZBPkK$1Sc#uM5t3!c`8~1i2GmIW6setf8EGidwix;e{%##f z@ToGI;Q3dPOdXjP%d{jhaJwZ+L3DY7rmAFGEz_FnQBhkmiIY@JPTVpNJ04kHXV38V zO6Rew9lu9EkUjdP>``J7d-V8?>=Azxk0kh;_{8;y{pGm?uj?Ted+pc>?!iOMyJ4A+ z$TWJKk^Gx>S>nQI2%$z!vd`{0>$VMcb$72?;kFF)_C4lq=<6L!x((fJ{rysd4D