From a53a2ef1f8ed53f574b7688cf6035dde62cc554d Mon Sep 17 00:00:00 2001 From: Yegor Kozlov Date: Thu, 12 Nov 2009 07:07:41 +0000 Subject: [PATCH] support for text extraction from PPT master slides, see Bugzilla 48161 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@835271 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../hslf/extractor/PowerPointExtractor.java | 69 +++++++++++------- .../poi/hslf/extractor/TestExtractor.java | 14 +++- test-data/slideshow/master_text.ppt | Bin 0 -> 71168 bytes 4 files changed, 53 insertions(+), 31 deletions(-) create mode 100644 test-data/slideshow/master_text.ppt diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 7725c909d2..c26a9fb685 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 48161 - support for text extraction from PPT master slides 47970 - added a method to set arabic mode in HSSFSheet 48134 - release system resources when using Picture.resize() 48087 - avoid NPE in XSSFChartSheet when calling methods of the superclass diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java index 97c7d71f6a..05ade6ab9e 100644 --- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java @@ -45,6 +45,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { private boolean _slidesByDefault = true; private boolean _notesByDefault = false; private boolean _commentsByDefault = false; + private boolean _masterByDefault = false; /** * Basic extractor. Returns all the text, and optionally all the notes @@ -58,6 +59,8 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { boolean notes = false; boolean comments = false; + boolean master = true; + String file; if (args.length > 1) { notes = true; @@ -70,7 +73,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { } PowerPointExtractor ppe = new PowerPointExtractor(file); - System.out.println(ppe.getText(true, notes, comments)); + System.out.println(ppe.getText(true, notes, comments, master)); } /** @@ -137,12 +140,19 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { this._commentsByDefault = commentsByDefault; } + /** + * Should a call to getText() return text from master? Default is no + */ + public void setMasterByDefault(boolean masterByDefault) { + this._masterByDefault = masterByDefault; + } + /** * Fetches all the slide text from the slideshow, but not the notes, unless * you've called setSlidesByDefault() and setNotesByDefault() to change this */ public String getText() { - return getText(_slidesByDefault, _notesByDefault, _commentsByDefault); + return getText(_slidesByDefault, _notesByDefault, _commentsByDefault, _masterByDefault); } /** @@ -178,14 +188,20 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { * @param getNoteText fetch note text */ public String getText(boolean getSlideText, boolean getNoteText) { - return getText(getSlideText, getNoteText, _commentsByDefault); + return getText(getSlideText, getNoteText, _commentsByDefault, _masterByDefault); } - public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText) { + public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) { StringBuffer ret = new StringBuffer(); if (getSlideText) { - for (int i = 0; i < _slides.length; i++) { + if (getMasterText) { + for (SlideMaster master : _show.getSlidesMasters()) { + textRunsToText(ret, master.getTextRuns()); + } + } + + for (int i = 0; i < _slides.length; i++) { Slide slide = _slides[i]; // Slide header, if set @@ -195,19 +211,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { } // Slide text - TextRun[] runs = slide.getTextRuns(); - for (int j = 0; j < runs.length; j++) { - TextRun run = runs[j]; - if (run != null) { - String text = run.getText(); - ret.append(text); - if (!text.endsWith("\n")) { - ret.append("\n"); - } - } - } + textRunsToText(ret, slide.getTextRuns()); - // Slide footer, if set + // Slide footer, if set if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { ret.append(hf.getFooterText() + "\n"); } @@ -249,17 +255,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { } // Notes text - TextRun[] runs = notes.getTextRuns(); - if (runs != null && runs.length > 0) { - for (int j = 0; j < runs.length; j++) { - TextRun run = runs[j]; - String text = run.getText(); - ret.append(text); - if (!text.endsWith("\n")) { - ret.append("\n"); - } - } - } + textRunsToText(ret, notes.getTextRuns()); // Repeat the notes footer, if set if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { @@ -270,4 +266,21 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor { return ret.toString(); } + + private void textRunsToText(StringBuffer ret, TextRun[] runs) { + if (runs==null) { + return; + } + + for (int j = 0; j < runs.length; j++) { + TextRun run = runs[j]; + if (run != null) { + String text = run.getText(); + ret.append(text); + if (!text.endsWith("\n")) { + ret.append("\n"); + } + } + } + } } diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java index 66316be7df..016a7d0f58 100644 --- a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java @@ -48,7 +48,6 @@ public final class TestExtractor extends TestCase { //private String pdirname; protected void setUp() throws Exception { - ppe = new PowerPointExtractor(slTests.openResourceAsStream("basic_test_ppt_file.ppt")); ppe2 = new PowerPointExtractor(slTests.openResourceAsStream("with_textbox.ppt")); } @@ -63,7 +62,7 @@ public final class TestExtractor extends TestCase { // 1 page example with text boxes sheetText = ppe2.getText(); - expectText = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n"; + expectText = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n"; ensureTwoStringsTheSame(expectText, sheetText); } @@ -112,7 +111,7 @@ public final class TestExtractor extends TestCase { */ public void testMissingCoreRecords() throws Exception { ppe = new PowerPointExtractor(slTests.openResourceAsStream("missing_core_records.ppt")); - + String text = ppe.getText(true, false); String nText = ppe.getNotes(); @@ -265,4 +264,13 @@ public final class TestExtractor extends TestCase { private static boolean contains(String text, String searchString) { return text.indexOf(searchString) >=0; } + + public void testMasterText() throws Exception { + ppe = new PowerPointExtractor(slTests.openResourceAsStream("master_text.ppt")); + ppe.setMasterByDefault(true); + + String text = ppe.getText(); + assertTrue(text.contains("Master Header Text")); + } + } diff --git a/test-data/slideshow/master_text.ppt b/test-data/slideshow/master_text.ppt new file mode 100644 index 0000000000000000000000000000000000000000..a748e8b2170172d92202d60ffd74743de0b036b2 GIT binary patch literal 71168 zcmeI2dvF!i9mjur@8$s{?oFVefaF4HK|u`!lqi842to=WsR~tF3(BHYkg5^jQbIs-|b(=M0jmMy{_x} zx4UqHOFjNa2pmb(lYK}Pp#RwCMNly`1R4qrgG!)M=oH9-Fa)FwDu;$cBOvB665%N5 zROmElG&BYp3!M&)gDRjnG#;7&eIA+!eF6F+GzppvodKN*O@XFDXF+E}=RlRvG-x_h z1&4K1Z38)sD2hE4-pasx{&_ZYtR1bX#S`00L8la_6 zBh&BYoN8zI_Ns61L}mXhi-tr3Ec>-hc-YrK{rFUK(|7-LAOI2p^K2yTBMAl z}2X#{OtFsUd8FquDZE%TKS@WZ}yOo6Pyo`+EF;6Ro;@b^ z(6@_a%GATpaD6Z6caLi_{S?|^M4!gA?;R1@g^+WC2YnH3xW+S{=~%~~q^@}n7-dCM zecg(M%j)Y)EIp>VvuH{E(#7-rysc)US}S>SS>tIH9Xf0c(?`dX4SYp3-_lX zemc%-akc=S=`47?O4wDx;ylFF!sDF_PgiYLcF&*rbw>6_Jsf#AdLbsgCGSPq#@_&kB8HWdFZRkhC zAB8a8;c(1!8L!P*RBq;y$Wso-1NTu88G_-PkncLD04MzAk-6AIO^ACQUwXY=0Ss}B zfrUrgb>GJs$F`R%E@0{28b{jU@&0LlZ^e;vurve9+Z$mI4whzMImzfs428kc{2Uc+ zfzs`Y6?ECT(l|2Az7NKZNOCh%Nnx@Qzc!%K7jX?jVlxgsmfpUUHv9GlvNL!qcaXl!8>5Cc1d;i3?wnCE5SY{?4 zM!7DroQ$K3=Z@?xnuA0Y;`%l==YfB)U`IZQavuj2W<4{FI1-(K8{8(g3=%#)O5_jN zYk%}F*L@oO&+OL8r%`_yT!$0?nsEv2&mAeUXX9MND`O|lc0;+spfW1@Rhi6E*>MNW zD(4EBT?x8z@?qC~KA_61VrG@ugt_vjSv}BLvwD>*JlUIK3gVnyJ3zKrhfS0hx_bYcaO`6 zK~%W9rqiUUcxm2qgQ)mv;)76WG4B|dxi?lg1wEg|kVgq4`&yXpL-zkTL}gM5PLoN+ zOOr_@IL*LvGN}Zo$)w_ylS!q28r@|C?V6e}R^CRh(kHlh1(Lb@@@F9Dkn3`fVM!dn z(rcw`)4k@~a&%wIU2HU?UZeL6!>-ED9E;58KRYJn4ij9U&6K0 zvkliu&o*2uJ=<`t^laDlYo(L5lWs9jlIgybtiE)zcFvw{x-Y#ql+x?9GAQ44UwTiC z*w~r=3CcI!m)_H@^m?syB;A)>ZMdGL`_i}V`b@OakrI0-_JNK>k`aV4DNZ6V&zObS zyg3q0Zo`yx#7<7eeM^QUV+j4j>cZ9-ipWrtjvGGCaoT@^usn6&!hJrTe~708zKy*d zITc`MG_mp~^A=R(4%`*lWPUfB-!10%E?A1Zf{bl>1*d$DV8k!Df0WlP<5{pv9+G;d zfQ6~a%Ti-)9rZ@kk+M{ce>sdTGd zR$RBFt}a2i9&sX1IEDBfZt}n;-#l&d%2dw~_zNoM6Nuy>a4>^F0uSRs1Xx7=1TMEp zV0)}T0oISyNT7}<;t}xbz1=y0lM(g>cB37axpu;?w5QwW+p}<0YtO^=d|b`LZ^B-T zd-e8A*q(3Kz+xWG<1!of7uiefrFN;5i)Qjr{_ z-UvY|_jp>08u)XL{+geNAA*nN-0lKFzK5fq^xnfgWL@{WZrNsidRZrYX*p8zy;iZ< zQTG1aDBjM%zlM7=f;U3ZBpje6J?5ntGU{S^+)+G>``l4Hi7=kpQE-M70UdV~2@@Z3 zM}ah$JV<9JpF!xi7OxQlcN7Ohj0u9te}$3A3z0uMZw(7tVZ0iaCV?M^=TA95`q!F! zte}0+ju|IwB;Q{0mi1C>*{MObMmL~_!L^P9lF!<7aKP@ZW#-AMH7X`$#ZCCiF>v>W zikMp8MxzvK4*g?U%@2o|eS@!4Su1*u-ko^v$>VhQ(C+eJPMmT1ujskfsW5(Ta8j8g zsvrlcgCR(9UgbY~d+~a#ta$0@pb_Q!z7sK`j^FcTj;O*MoOYad^bvKeoH(L_Ik7S) zZbXIge1nguMW~OzM$xy>`)z{vtHf!x-+TL|tKs*yK6Y=0-^0%1nEc+~qEBIM^uI7< zJ`JPTE;vN98x%PlHF4h31p6{ZfnW9)7yic;edab37fIfSurlQ{X>XxT^qFi^=%;X` z@qH#gNcoJI7a95A==}WvcLm44AGmw?gQkw(8h#H#k)N6GvMcbPRx9wIT21&0Jq;}r zxp#P*iT~@_?6SjG{K~2vv+y09PD!D{H@&)^8e=o5CvL*@4>r?%2j3%eZ{$c#55Cn& z!kX8^c3a+rj!dB?H!24}(%-1C7nsSwj+lviCH8%Y=jLTI!bGYgXwTkcNANKxbVneK z-w}M-@I;CCngriN%{E{@1}oMj*@EFA;=Ss4-F)(!iyOS>z0Ba&` z5&HQ@Qu+InAv6EnFH!i5O5r!+~Cba=EO9!0LFe@vL_~|c`XG>mv)(%f-BJ~-TXb?k=^;{RtR{rJsF-g}2TcCXsR%sJBpamcTRxI0=8aaeFju$vx$ zI6oeR_&xtgh=Ys&V>2#wuc3K;TYG$Y^Xd)FYdWuOf6!hJC7=Y9fD%vwNNt4KnW-TC7=Y9fD%vwNNt4KnW-TC7=Y9fD%vwNNt4KnW-TC7=Y9fD%vwNNt4KnW-TC7=Y9fD%vwNNt4KnW-TC7=Y9fD%vwNNt4KnW-TC7=Y9fD%vwN?!##hVkJS02i zg)`m_$}Gy85iiD)`6$0#)=8sWDc1y<4|=a088Iv7$$AWGoDo@_I=4{nYH7`g$?tS1 zO846zwtR9Lkt$}s+yLesr~&E6Wtnv1yd!iaMx%Ti)&)@5^2uvzs)QzKldDiNYtf3b z^F7Ev-}dvpl3z{}djCe#VxG07DAeKnD%7bx#l^QC7*0m{g}7Y=g>651-9O&Tt3WO` z<+MvP2yBpg)L<1#??Bnj;L?WkHJ)kUFU!xj{hvclu|N6c_it-{^_g`H<#Vo#