From bfe1b0cb66c516438b584600df5d9fa8735505d3 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Mon, 26 Sep 2011 14:37:50 +0000 Subject: [PATCH] XSLF text extraction improvements relating to TIKA-712 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1175887 13f79535-47bb-0310-9956-ffa450edef68 --- .../extractor/XSLFPowerPointExtractor.java | 37 +++++++++--- .../poi/xslf/usermodel/DrawingTextBody.java | 6 +- .../usermodel/DrawingTextPlaceholder.java | 57 +++++++++++++++++++ .../xslf/usermodel/XSLFCommonSlideData.java | 36 ++++++++---- .../TestXSLFPowerPointExtractor.java | 43 ++++++++------ 5 files changed, 139 insertions(+), 40 deletions(-) create mode 100644 src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java diff --git a/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java index 9563f664a2..812c7ad0e6 100644 --- a/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java +++ b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java @@ -23,6 +23,8 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xslf.XSLFSlideShow; import org.apache.poi.xslf.usermodel.DrawingParagraph; +import org.apache.poi.xslf.usermodel.DrawingTextBody; +import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder; import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XSLFCommentAuthors; import org.apache.poi.xslf.usermodel.XSLFComments; @@ -30,6 +32,7 @@ import org.apache.poi.xslf.usermodel.XSLFCommonSlideData; import org.apache.poi.xslf.usermodel.XSLFNotes; import org.apache.poi.xslf.usermodel.XSLFRelation; import org.apache.poi.xslf.usermodel.XSLFSlide; +import org.apache.poi.xslf.usermodel.XSLFSlideLayout; import org.apache.poi.xslf.usermodel.XSLFSlideMaster; import org.apache.xmlbeans.XmlException; import 
org.openxmlformats.schemas.presentationml.x2006.main.CTComment; @@ -124,6 +127,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor { try { XSLFNotes notes = slide.getNotes(); XSLFComments comments = slide.getComments(); + XSLFSlideLayout layout = slide.getSlideLayout(); XSLFSlideMaster master = slide.getMasterSheet(); // TODO Do the slide's name @@ -131,11 +135,16 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor { // Do the slide's text if requested if (slideText) { - extractText(slide.getCommonSlideData(), text); + extractText(slide.getCommonSlideData(), false, text); - // If there's a master sheet and it's requested, grab text from there - if(masterText && master != null) { - extractText(master.getCommonSlideData(), text); + // If requested, get text from the master and its layout + if(masterText) { + if(layout != null) { + extractText(layout.getCommonSlideData(), true, text); + } + if(master != null) { + extractText(master.getCommonSlideData(), true, text); + } } // If the slide has comments, do those too @@ -158,7 +167,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor { // Do the notes if requested if (notesText && notes != null) { - extractText(notes.getCommonSlideData(), text); + extractText(notes.getCommonSlideData(), false, text); } } catch (Exception e) { throw new RuntimeException(e); @@ -168,10 +177,20 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor { return text.toString(); } - private void extractText(XSLFCommonSlideData data, StringBuffer text) { - for (DrawingParagraph p : data.getText()) { + private void extractText(XSLFCommonSlideData data, boolean skipPlaceholders, StringBuffer text) { + for(DrawingTextBody textBody : data.getDrawingText()) { + if(skipPlaceholders && textBody instanceof DrawingTextPlaceholder) { + DrawingTextPlaceholder ph = (DrawingTextPlaceholder)textBody; + if(! 
ph.isPlaceholderCustom()) { + // Skip non-customised placeholder text + continue; + } + } + + for (DrawingParagraph p : textBody.getParagraphs()) { text.append(p.getText()); text.append("\n"); - } - } + } + } + } } diff --git a/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextBody.java b/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextBody.java index 1f40841d35..05ae0f3d12 100644 --- a/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextBody.java +++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextBody.java @@ -17,16 +17,16 @@ package org.apache.poi.xslf.usermodel; +import java.util.List; + import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph; -import java.util.List; - public class DrawingTextBody { private final CTTextBody textBody; public DrawingTextBody(CTTextBody textBody) { - this.textBody = textBody; + this.textBody = textBody; } public DrawingParagraph[] getParagraphs() { diff --git a/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java b/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java new file mode 100644 index 0000000000..ec95bd1d55 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java @@ -0,0 +1,57 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.xslf.usermodel; + +import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; +import org.openxmlformats.schemas.presentationml.x2006.main.CTPlaceholder; +import org.openxmlformats.schemas.presentationml.x2006.main.STPlaceholderType; + +/** + * A {@link DrawingTextBody} which is a placeholder + * @author nick + * + */ +public class DrawingTextPlaceholder extends DrawingTextBody { + private final CTPlaceholder placeholder; + + public DrawingTextPlaceholder(CTTextBody textBody, CTPlaceholder placeholder) { + super(textBody); + this.placeholder = placeholder; + } + + /** + * What kind of placeholder is this? + */ + public String getPlaceholderType() { + return placeholder.getType().toString(); + } + + /** + * What kind of placeholder is this? + */ + public STPlaceholderType.Enum getPlaceholderTypeEnum() { + return placeholder.getType(); + } + + /** + * Is the PlaceHolder text customised? 
+ */ + public boolean isPlaceholderCustom() { + return placeholder.getHasCustomPrompt(); + } +} diff --git a/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFCommonSlideData.java b/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFCommonSlideData.java index a561281552..b099b64571 100644 --- a/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFCommonSlideData.java +++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFCommonSlideData.java @@ -26,6 +26,7 @@ import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl; import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData; import org.openxmlformats.schemas.drawingml.x2006.main.CTTable; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; +import org.openxmlformats.schemas.presentationml.x2006.main.CTApplicationNonVisualDrawingProps; import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData; import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame; import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape; @@ -42,11 +43,11 @@ public class XSLFCommonSlideData { public XSLFCommonSlideData(CTCommonSlideData data) { this.data = data; } - - public List getText() { + + public List getDrawingText() { CTGroupShape gs = data.getSpTree(); - List out = new ArrayList(); + List out = new ArrayList(); processShape(gs, out); @@ -77,8 +78,7 @@ public class XSLFCommonSlideData { for (DrawingTableRow row : table.getRows()) { for (DrawingTableCell cell : row.getCells()) { DrawingTextBody textBody = cell.getTextBody(); - - out.addAll(Arrays.asList(textBody.getParagraphs())); + out.add(textBody); } } } @@ -89,19 +89,31 @@ public class XSLFCommonSlideData { return out; } + public List getText() { + List paragraphs = new ArrayList(); + for(DrawingTextBody textBody : getDrawingText()) { + paragraphs.addAll(Arrays.asList(textBody.getParagraphs())); + } + return paragraphs; + } - private void processShape(CTGroupShape gs, List out) { + private void 
processShape(CTGroupShape gs, List out) { List shapes = gs.getSpList(); - for (int i = 0; i < shapes.size(); i++) { - CTTextBody ctTextBody = shapes.get(i).getTxBody(); + for (CTShape shape : shapes) { + CTTextBody ctTextBody = shape.getTxBody(); if (ctTextBody==null) { continue; } + + DrawingTextBody textBody; + CTApplicationNonVisualDrawingProps nvpr = shape.getNvSpPr().getNvPr(); + if(nvpr.isSetPh()) { + textBody = new DrawingTextPlaceholder(ctTextBody, nvpr.getPh()); + } else { + textBody = new DrawingTextBody(ctTextBody); + } - DrawingTextBody textBody = new DrawingTextBody(ctTextBody); - - out.addAll(Arrays.asList(textBody.getParagraphs())); + out.add(textBody); } } - } diff --git a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java index e26b6bf0f9..f030f3b101 100644 --- a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java @@ -58,9 +58,13 @@ public class TestXSLFPowerPointExtractor extends TestCase { assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n")); assertTrue(text.contains("amet\n\n")); - // Our master text, for tests + // Our placeholder master text + // This shouldn't show up in the output String masterText = "Click to edit Master title style\n" + + "Click to edit Master subtitle style\n" + + "\n\n\n\n\n\n" + + "Click to edit Master title style\n" + "Click to edit Master text styles\n" + "Second level\n" + "Third level\n" + @@ -111,17 +115,13 @@ public class TestXSLFPowerPointExtractor extends TestCase { "Lorem ipsum dolor sit amet\n" + "Nunc at risus vel erat tempus posuere. 
Aenean non ante.\n" + "\n" + - masterText + - "\n\n\n" + "Lorem ipsum dolor sit amet\n" + "Lorem\n" + "ipsum\n" + "dolor\n" + "sit\n" + "amet\n" + - "\n" + - masterText + - "\n\n\n" + "\n" , text ); @@ -131,17 +131,14 @@ public class TestXSLFPowerPointExtractor extends TestCase { "Lorem ipsum dolor sit amet\n" + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + "\n" + - masterText + - "\n\n\n\n\n" + + "\n\n" + "Lorem ipsum dolor sit amet\n" + "Lorem\n" + "ipsum\n" + "dolor\n" + "sit\n" + "amet\n" + - "\n" + - masterText + - "\n\n\n\n\n" + "\n\n\n" , text ); @@ -176,6 +173,9 @@ public class TestXSLFPowerPointExtractor extends TestCase { new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx"))); XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); + extractor.setSlidesByDefault(true); + extractor.setNotesByDefault(false); + extractor.setMasterByDefault(true); String text = extractor.getText(); assertTrue(text.length() > 0); @@ -183,17 +183,28 @@ public class TestXSLFPowerPointExtractor extends TestCase { // Check master text is there assertTrue("Unable to find expected word in text\n" + text, text.contains("Footer from the master slide")); + + // Theme text shouldn't show up + String themeText = + "Theme Master Title\n" + + "Theme Master first level\n" + + "And the 2nd level\n" + + "Our 3rd level goes here\n" + + "And onto the 4th, such fun….\n" + + "Finally is the Fifth level\n"; // Check the whole text assertEquals( "First page title\n" + "First page subtitle\n" + -// "This text comes from the Master Slide\n" + // TODO -// "This is the Master Title\n" + // TODO - "\n" + // TODO Should be the above + "This is the Master Title\n" + + "This text comes from the Master Slide\n" + + "\n" + + // TODO Detect we didn't have a title, and include the master one "2nd page subtitle\n" + -// "This text comes from the Master Slide\n" + // TODO - "Footer from the master slide\n" + "Footer from the master slide\n" + + "This is 
the Master Title\n" + + "This text comes from the Master Slide\n" , text ); } -- 2.39.5