]> source.dussan.org Git - poi.git/commitdiff
XSLF text extraction improvements relating to TIKA-712
authorNick Burch <nick@apache.org>
Mon, 26 Sep 2011 14:37:50 +0000 (14:37 +0000)
committerNick Burch <nick@apache.org>
Mon, 26 Sep 2011 14:37:50 +0000 (14:37 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1175887 13f79535-47bb-0310-9956-ffa450edef68

src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java
src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextBody.java
src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFCommonSlideData.java
src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java

index 9563f664a2acf264497ec4608b5868720d1c251f..812c7ad0e638d720a6d7593565ca14f53fc54288 100644 (file)
@@ -23,6 +23,8 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.usermodel.DrawingParagraph;
+import org.apache.poi.xslf.usermodel.DrawingTextBody;
+import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
 import org.apache.poi.xslf.usermodel.XSLFComments;
@@ -30,6 +32,7 @@ import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
 import org.apache.poi.xslf.usermodel.XSLFNotes;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
 import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
 import org.apache.xmlbeans.XmlException;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
@@ -124,6 +127,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
          try {
             XSLFNotes notes = slide.getNotes();
             XSLFComments comments = slide.getComments();
+            XSLFSlideLayout layout = slide.getSlideLayout();
             XSLFSlideMaster master = slide.getMasterSheet();
 
             // TODO Do the slide's name
@@ -131,11 +135,16 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
 
             // Do the slide's text if requested
             if (slideText) {
-               extractText(slide.getCommonSlideData(), text);
+               extractText(slide.getCommonSlideData(), false, text);
                
-               // If there's a master sheet and it's requested, grab text from there
-               if(masterText && master != null) {
-                  extractText(master.getCommonSlideData(), text);
+               // If requested, get text from the master and its layout
+               if(masterText) {
+                  if(layout != null) {
+                     extractText(layout.getCommonSlideData(), true, text);
+                  }
+                  if(master != null) {
+                     extractText(master.getCommonSlideData(), true, text);
+                  }
                }
 
                // If the slide has comments, do those too
@@ -158,7 +167,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
 
             // Do the notes if requested
             if (notesText && notes != null) {
-               extractText(notes.getCommonSlideData(), text);
+               extractText(notes.getCommonSlideData(), false, text);
             }
          } catch (Exception e) {
             throw new RuntimeException(e);
@@ -168,10 +177,20 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
       return text.toString();
    }
        
-       private void extractText(XSLFCommonSlideData data, StringBuffer text) {
-        for (DrawingParagraph p : data.getText()) {
+       private void extractText(XSLFCommonSlideData data, boolean skipPlaceholders, StringBuffer text) {
+          for(DrawingTextBody textBody : data.getDrawingText()) {
+             if(skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
+                DrawingTextPlaceholder ph = (DrawingTextPlaceholder)textBody;
+                if(! ph.isPlaceholderCustom()) {
+                   // Skip non-customised placeholder text
+                   continue;
+                }
+             }
+             
+             for (DrawingParagraph p : textBody.getParagraphs()) {
             text.append(p.getText());
             text.append("\n");
-        }
-    }
+             }
+          }
+       }
 }
index 1f40841d35c9c8b144d6ff6103ada9301ec61397..05ae0f3d1232080169db0a47349b803067462f72 100644 (file)
 
 package org.apache.poi.xslf.usermodel;
 
+import java.util.List;
+
 import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
 
-import java.util.List;
-
 public class DrawingTextBody {
     private final CTTextBody textBody;
 
     public DrawingTextBody(CTTextBody textBody) {
-        this.textBody = textBody;
+       this.textBody = textBody;
     }
 
     public DrawingParagraph[] getParagraphs() {
diff --git a/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java b/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java
new file mode 100644 (file)
index 0000000..ec95bd1
--- /dev/null
@@ -0,0 +1,57 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xslf.usermodel;
+
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTPlaceholder;
+import org.openxmlformats.schemas.presentationml.x2006.main.STPlaceholderType;
+
+/**
+ * A {@link DrawingTextBody} which is a placeholder
+ * @author nick
+ *
+ */
+public class DrawingTextPlaceholder extends DrawingTextBody {
+    private final CTPlaceholder placeholder;
+
+    public DrawingTextPlaceholder(CTTextBody textBody, CTPlaceholder placeholder) {
+       super(textBody);
+       this.placeholder = placeholder;
+    }
+    
+    /**
+     * What kind of placeholder is this, as the string form of its type?
+     */
+    public String getPlaceholderType() {
+       return placeholder.getType().toString();
+    }
+
+    /**
+     * What kind of placeholder is this, as the schema enum value?
+     */
+    public STPlaceholderType.Enum getPlaceholderTypeEnum() {
+       return placeholder.getType();
+    }
+
+    /**
+     * Is the placeholder text customised (i.e. does it have a custom prompt)?
+     */
+    public boolean isPlaceholderCustom() {
+       return placeholder.getHasCustomPrompt();
+    }
+}
index a5612815521f684a26465c39d9256ea903149ab1..b099b64571e114d1735e0f34911f57834fb99352 100644 (file)
@@ -26,6 +26,7 @@ import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTApplicationNonVisualDrawingProps;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
@@ -42,11 +43,11 @@ public class XSLFCommonSlideData {
     public XSLFCommonSlideData(CTCommonSlideData data) {
         this.data = data;
     }
-
-    public List<DrawingParagraph> getText() {
+    
+    public List<DrawingTextBody> getDrawingText() {
         CTGroupShape gs = data.getSpTree();
 
-        List<DrawingParagraph> out = new ArrayList<DrawingParagraph>();
+        List<DrawingTextBody> out = new ArrayList<DrawingTextBody>();
 
         processShape(gs, out);
 
@@ -77,8 +78,7 @@ public class XSLFCommonSlideData {
                     for (DrawingTableRow row : table.getRows()) {
                         for (DrawingTableCell cell : row.getCells()) {
                             DrawingTextBody textBody = cell.getTextBody();
-
-                            out.addAll(Arrays.asList(textBody.getParagraphs()));
+                            out.add(textBody);
                         }
                     }
                 }
@@ -89,19 +89,31 @@ public class XSLFCommonSlideData {
 
         return out;
     }
+    public List<DrawingParagraph> getText() {
+       List<DrawingParagraph> paragraphs = new ArrayList<DrawingParagraph>();
+       for(DrawingTextBody textBody : getDrawingText()) {
+          paragraphs.addAll(Arrays.asList(textBody.getParagraphs()));
+       }
+       return paragraphs;
+    }
 
-    private void processShape(CTGroupShape gs, List<DrawingParagraph> out) {
+    private void processShape(CTGroupShape gs, List<DrawingTextBody> out) {
         List<CTShape> shapes = gs.getSpList();
-        for (int i = 0; i < shapes.size(); i++) {
-            CTTextBody ctTextBody = shapes.get(i).getTxBody();
+        for (CTShape shape : shapes) {
+            CTTextBody ctTextBody = shape.getTxBody();
             if (ctTextBody==null) {
                 continue;
             }
+            
+            DrawingTextBody textBody;
+            CTApplicationNonVisualDrawingProps nvpr = shape.getNvSpPr().getNvPr(); 
+            if(nvpr.isSetPh()) {
+               textBody = new DrawingTextPlaceholder(ctTextBody, nvpr.getPh());
+            } else {
+               textBody = new DrawingTextBody(ctTextBody);
+            }
 
-            DrawingTextBody textBody = new DrawingTextBody(ctTextBody);
-
-            out.addAll(Arrays.asList(textBody.getParagraphs()));
+            out.add(textBody);
         }
     }
-
 }
index e26b6bf0f9a49bd08427b6bead97815dd513d23d..f030f3b1013b985579e0293003156896eeb4f5f4 100644 (file)
@@ -58,9 +58,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
                assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
                assertTrue(text.contains("amet\n\n"));
 
-               // Our master text, for tests
+               // Our placeholder master text
+               // This shouldn't show up in the output
                String masterText =
          "Click to edit Master title style\n" +
+         "Click to edit Master subtitle style\n" +
+         "\n\n\n\n\n\n" +
+         "Click to edit Master title style\n" +
          "Click to edit Master text styles\n" +
          "Second level\n" +
          "Third level\n" +
@@ -111,17 +115,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
             "Lorem ipsum dolor sit amet\n" +
             "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
             "\n" +
-            masterText +
-            "\n\n\n" +
             "Lorem ipsum dolor sit amet\n" +
             "Lorem\n" +
             "ipsum\n" +
             "dolor\n" +
             "sit\n" +
             "amet\n" +
-            "\n" +
-            masterText +
-            "\n\n\n"
+            "\n"
             , text
       );
                
@@ -131,17 +131,14 @@ public class TestXSLFPowerPointExtractor extends TestCase {
             "Lorem ipsum dolor sit amet\n" +
             "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
             "\n" +
-            masterText +
-            "\n\n\n\n\n" +
+            "\n\n" +
             "Lorem ipsum dolor sit amet\n" +
             "Lorem\n" +
             "ipsum\n" +
             "dolor\n" +
             "sit\n" +
             "amet\n" +
-            "\n" +
-            masterText +
-            "\n\n\n\n\n"
+            "\n\n\n"
             , text
       );
                
@@ -176,6 +173,9 @@ public class TestXSLFPowerPointExtractor extends TestCase {
          new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx")));
       XSLFPowerPointExtractor extractor = 
          new XSLFPowerPointExtractor(xml);
+      extractor.setSlidesByDefault(true);
+      extractor.setNotesByDefault(false);
+      extractor.setMasterByDefault(true);
       
       String text = extractor.getText();
       assertTrue(text.length() > 0);
@@ -183,17 +183,28 @@ public class TestXSLFPowerPointExtractor extends TestCase {
       // Check master text is there
       assertTrue("Unable to find expected word in text\n" + text, 
             text.contains("Footer from the master slide"));
+
+      // Theme text shouldn't show up
+      String themeText = 
+         "Theme Master Title\n" +
+         "Theme Master first level\n" +
+         "And the 2nd level\n" +
+         "Our 3rd level goes here\n" +
+         "And onto the 4th, such fun….\n" +
+         "Finally is the Fifth level\n";
       
       // Check the whole text
       assertEquals(
             "First page title\n" +
             "First page subtitle\n" +
-//            "This text comes from the Master Slide\n" + // TODO
-//            "This is the Master Title\n" + // TODO
-            "\n" + // TODO Should be the above
+            "This is the Master Title\n" +
+            "This text comes from the Master Slide\n" +
+            "\n" +
+            // TODO Detect we didn't have a title, and include the master one
             "2nd page subtitle\n" +
-//          "This text comes from the Master Slide\n" + // TODO
-            "Footer from the master slide\n"
+            "Footer from the master slide\n" +
+            "This is the Master Title\n" +
+            "This text comes from the Master Slide\n"
             , text
       );
        }