source.dussan.org Git - poi.git/commitdiff
Bug 60003 - Regression: HSLF Powerpoint text extractor from footer of master slide
author: Andreas Beeker <kiwiwings@apache.org>
Sat, 8 Oct 2016 18:08:25 +0000 (18:08 +0000)
committer: Andreas Beeker <kiwiwings@apache.org>
Sat, 8 Oct 2016 18:08:25 +0000 (18:08 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1763927 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
src/scratchpad/src/org/apache/poi/hslf/model/HSLFMetroShape.java
src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java

index 44aa8a2d3e432d7bc2e53e9e6278ee359e223134..8085482eccd6aa74767a9f6b30191a08d7c87778 100644 (file)
 
 package org.apache.poi.hslf.extractor;
 
-import java.io.*;
-import java.util.*;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
 import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.hslf.model.*;
-import org.apache.poi.hslf.usermodel.*;
-import org.apache.poi.poifs.filesystem.*;
+import org.apache.poi.hslf.model.Comment;
+import org.apache.poi.hslf.model.HSLFMetroShape;
+import org.apache.poi.hslf.model.HeadersFooters;
+import org.apache.poi.hslf.model.OLEShape;
+import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
+import org.apache.poi.hslf.usermodel.HSLFNotes;
+import org.apache.poi.hslf.usermodel.HSLFShape;
+import org.apache.poi.hslf.usermodel.HSLFSlide;
+import org.apache.poi.hslf.usermodel.HSLFSlideMaster;
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
+import org.apache.poi.hslf.usermodel.HSLFTable;
+import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
+import org.apache.poi.hslf.usermodel.HSLFTextShape;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 
 /**
  * This class can be used to extract text from a PowerPoint file. Can optionally
  * also get the notes from one.
- *
- * @author Nick Burch
  */
 public final class PowerPointExtractor extends POIOLE2TextExtractor {
+   private static final POILogger LOG = POILogFactory.getLogger(PowerPointExtractor.class);
+    
    private final HSLFSlideShowImpl _hslfshow;
    private final HSLFSlideShow _show;
    private final List<HSLFSlide> _slides;
@@ -207,20 +229,27 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
                 for (HSLFSlideMaster master : _show.getSlideMasters()) {
                     for(HSLFShape sh : master.getShapes()){
                         if(sh instanceof HSLFTextShape){
-                            if(HSLFMasterSheet.isPlaceholder(sh)) {
-                                // don't bother about boiler
-                                // plate text on master
-                                // sheets
+                            HSLFTextShape hsh = (HSLFTextShape)sh;
+                            final String text = hsh.getText();
+                            if (text == null || "".equals(text) || "*".equals(text)) {
                                 continue;
                             }
-                            HSLFTextShape tsh = (HSLFTextShape)sh;
-                            String text = tsh.getText();
-                            if (text != null){
-                                ret.append(text);
-                                if (!text.endsWith("\n")) {
-                                    ret.append("\n");
+                            
+                            if (HSLFMasterSheet.isPlaceholder(sh)) {
+                                // check for metro shape of complex placeholder
+                                boolean isMetro = new HSLFMetroShape<HSLFShape>(sh).hasMetroBlob();
+                                
+                                if (!isMetro) {
+                                    // don't bother about boiler plate text on master sheets
+                                    LOG.log(POILogger.INFO, "Ignoring boiler plate (placeholder) text on slide master:", text);
+                                    continue;
                                 }
                             }
+                            
+                            ret.append(text);
+                            if (!text.endsWith("\n")) {
+                                ret.append("\n");
+                            }
                         }
                     }
                 }
index e5d9f93a9507db50d82819f5e64e0bde7b3242b7..894818eb46d5d06b2dabd8d833296df8709bdf06 100644 (file)
@@ -47,14 +47,20 @@ public class HSLFMetroShape<T extends Shape<?,?>> {
      * @return the bytes of the metro blob, which are bytes of an OPCPackage, i.e. a zip stream \r
      */\r
     public byte[] getMetroBytes() {\r
+        EscherComplexProperty ep = getMetroProp();\r
+        return (ep == null) ? null : ep.getComplexData();\r
+    }\r
+\r
+    /**\r
+     * @return if there's a metro blob to extract\r
+     */\r
+    public boolean hasMetroBlob() {\r
+        return getMetroProp() != null;\r
+    }\r
+    \r
+    private EscherComplexProperty getMetroProp() {\r
         AbstractEscherOptRecord opt = shape.getEscherChild(EscherTertiaryOptRecord.RECORD_ID);\r
-        if (opt != null) {\r
-            EscherComplexProperty ep = (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);\r
-            if (ep != null) {\r
-                return ep.getComplexData();\r
-            }\r
-        }\r
-        return null;\r
+        return (opt == null) ? null : (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);\r
     }\r
     \r
     /**\r
index 5e6b71928d82dcebf86bf4e91bda56a98ae06007..0cf408a8f49898ec64508d73d85ff26783b8bcf8 100644 (file)
@@ -431,5 +431,19 @@ public final class TestExtractor {
         String target = "this\tText\tis\twithin\ta\n"+
                 "table\t1\t2\t3\t4";
         assertTrue(text.contains(target));
-    }    
+    }
+
+    // bug 60003
+    @Test
+    public void testExtractMasterSlideFooterText() throws Exception {
+        HSLFSlideShowImpl hslf = new HSLFSlideShowImpl(slTests.openResourceAsStream("60003.ppt"));
+        ppe.close();
+
+        ppe = new PowerPointExtractor(hslf);
+        ppe.setMasterByDefault(true);
+      
+        String text = ppe.getText();
+        assertContains(text, "Prague");
+        hslf.close();
+    }
 }