source.dussan.org Git - poi.git/commitdiff
support for text extraction from PPT master slides, see Bugzilla 48161
author: Yegor Kozlov <yegor@apache.org>
Thu, 12 Nov 2009 07:07:41 +0000 (07:07 +0000)
committer: Yegor Kozlov <yegor@apache.org>
Thu, 12 Nov 2009 07:07:41 +0000 (07:07 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@835271 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
test-data/slideshow/master_text.ppt [new file with mode: 0644]

index 7725c909d2c97eab5ae03848b8a217ccd63e92f2..c26a9fb685f8759fca91714dc3e36d05ad985482 100644 (file)
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.6-beta1" date="2009-??-??">
+           <action dev="POI-DEVELOPERS" type="add">48161 - support for text extraction from PPT master slides</action>
            <action dev="POI-DEVELOPERS" type="add">47970 - added a method to set arabic mode in HSSFSheet</action>
            <action dev="POI-DEVELOPERS" type="fix">48134 - release system resources when using Picture.resize()</action>
            <action dev="POI-DEVELOPERS" type="fix">48087 - avoid NPE in XSSFChartSheet  when calling methods of the superclass</action>
index 97c7d71f6ade00d8fdacfb0d5f7d6a85d5af2dc2..05ade6ab9e048f5cc28c281dc16541f9f43faac5 100644 (file)
@@ -45,6 +45,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
        private boolean _slidesByDefault = true;
        private boolean _notesByDefault = false;
        private boolean _commentsByDefault = false;
+    private boolean _masterByDefault = false;
 
        /**
         * Basic extractor. Returns all the text, and optionally all the notes
@@ -58,6 +59,8 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
 
                boolean notes = false;
                boolean comments = false;
+        boolean master = true;
+        
                String file;
                if (args.length > 1) {
                        notes = true;
@@ -70,7 +73,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
                }
 
                PowerPointExtractor ppe = new PowerPointExtractor(file);
-               System.out.println(ppe.getText(true, notes, comments));
+               System.out.println(ppe.getText(true, notes, comments, master));
        }
 
        /**
@@ -137,12 +140,19 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
                this._commentsByDefault = commentsByDefault;
        }
 
+    /**
+     * Should a call to getText() return text from master? Default is no
+     */
+    public void setMasterByDefault(boolean masterByDefault) {
+        this._masterByDefault = masterByDefault;
+    }
+
        /**
         * Fetches all the slide text from the slideshow, but not the notes, unless
         * you've called setSlidesByDefault() and setNotesByDefault() to change this
         */
        public String getText() {
-               return getText(_slidesByDefault, _notesByDefault, _commentsByDefault);
+               return getText(_slidesByDefault, _notesByDefault, _commentsByDefault, _masterByDefault);
        }
 
        /**
@@ -178,14 +188,20 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
         * @param getNoteText fetch note text
         */
        public String getText(boolean getSlideText, boolean getNoteText) {
-               return getText(getSlideText, getNoteText, _commentsByDefault);
+               return getText(getSlideText, getNoteText, _commentsByDefault, _masterByDefault);
        }
 
-       public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText) {
+       public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) {
                StringBuffer ret = new StringBuffer();
 
                if (getSlideText) {
-                       for (int i = 0; i < _slides.length; i++) {
+            if (getMasterText) {
+                for (SlideMaster master : _show.getSlidesMasters()) {
+                    textRunsToText(ret, master.getTextRuns());
+                }
+            }
+
+            for (int i = 0; i < _slides.length; i++) {
                                Slide slide = _slides[i];
 
                                // Slide header, if set
@@ -195,19 +211,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
                                }
 
                                // Slide text
-                               TextRun[] runs = slide.getTextRuns();
-                               for (int j = 0; j < runs.length; j++) {
-                                       TextRun run = runs[j];
-                                       if (run != null) {
-                                               String text = run.getText();
-                                               ret.append(text);
-                                               if (!text.endsWith("\n")) {
-                                                       ret.append("\n");
-                                               }
-                                       }
-                               }
+                textRunsToText(ret, slide.getTextRuns());
 
-                               // Slide footer, if set
+                // Slide footer, if set
                                if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
                                        ret.append(hf.getFooterText() + "\n");
                                }
@@ -249,17 +255,7 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
                                }
 
                                // Notes text
-                               TextRun[] runs = notes.getTextRuns();
-                               if (runs != null && runs.length > 0) {
-                                       for (int j = 0; j < runs.length; j++) {
-                                               TextRun run = runs[j];
-                                               String text = run.getText();
-                                               ret.append(text);
-                                               if (!text.endsWith("\n")) {
-                                                       ret.append("\n");
-                                               }
-                                       }
-                               }
+                textRunsToText(ret, notes.getTextRuns());
 
                                // Repeat the notes footer, if set
                                if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
@@ -270,4 +266,21 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
 
                return ret.toString();
        }
+
+    private void textRunsToText(StringBuffer ret, TextRun[] runs) {
+        if (runs==null) {
+            return;
+        }
+
+        for (int j = 0; j < runs.length; j++) {
+            TextRun run = runs[j];
+            if (run != null) {
+                String text = run.getText();
+                ret.append(text);
+                if (!text.endsWith("\n")) {
+                    ret.append("\n");
+                }
+            }
+        }
+    }
 }
index 66316be7df7b3cee364a4004aa583d566ee7ad17..016a7d0f58412f4223327e768ce8330f699b589a 100644 (file)
@@ -48,7 +48,6 @@ public final class TestExtractor extends TestCase {
     //private String pdirname;
 
     protected void setUp() throws Exception {
-
                ppe = new PowerPointExtractor(slTests.openResourceAsStream("basic_test_ppt_file.ppt"));
                ppe2 = new PowerPointExtractor(slTests.openResourceAsStream("with_textbox.ppt"));
     }
@@ -63,7 +62,7 @@ public final class TestExtractor extends TestCase {
                
                // 1 page example with text boxes
                sheetText = ppe2.getText();
-               expectText = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n"; 
+               expectText = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";
 
                ensureTwoStringsTheSame(expectText, sheetText);
     }
@@ -112,7 +111,7 @@ public final class TestExtractor extends TestCase {
         */
        public void testMissingCoreRecords() throws Exception {
                ppe = new PowerPointExtractor(slTests.openResourceAsStream("missing_core_records.ppt"));
-               
+
                String text = ppe.getText(true, false);
                String nText = ppe.getNotes();
 
@@ -265,4 +264,13 @@ public final class TestExtractor extends TestCase {
        private static boolean contains(String text, String searchString) {
                return text.indexOf(searchString) >=0;
        }
+
+    public void testMasterText() throws Exception {
+               ppe = new PowerPointExtractor(slTests.openResourceAsStream("master_text.ppt"));
+        ppe.setMasterByDefault(true);
+
+               String text = ppe.getText();
+               assertTrue(text.contains("Master Header Text"));
+    }
+
 }
diff --git a/test-data/slideshow/master_text.ppt b/test-data/slideshow/master_text.ppt
new file mode 100644 (file)
index 0000000..a748e8b
Binary files /dev/null and b/test-data/slideshow/master_text.ppt differ