]> source.dussan.org Git - poi.git/commitdiff
OOXML pptx text extractor, and test. Also add jar-ooxml ant task
authorNick Burch <nick@apache.org>
Sun, 30 Dec 2007 18:11:55 +0000 (18:11 +0000)
committerNick Burch <nick@apache.org>
Sun, 30 Dec 2007 18:11:55 +0000 (18:11 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607572 13f79535-47bb-0310-9956-ffa450edef68

build.xml
src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java [new file with mode: 0644]
src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java
src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java [new file with mode: 0644]

index a11bb9426238e545a870f3af4a6a1a85b4b737c1..5989ddad3eb64a3ee47e5512b086c9e9b88aea31 100644 (file)
--- a/build.xml
+++ b/build.xml
@@ -1124,6 +1124,21 @@ FORREST_HOME environment variable!</echo>
             </manifest>        
         </jar>
     </target>
+    <!-- Builds the ooxml distribution jar. Runs after compile-ooxml and
+         bundles everything under ${ooxml.output.dir} together with the
+         contents of legal/ into
+         ${dist.dir}/${jar.name}-ooxml-${version.id}-${DSTAMP}.jar -->
+    <target name="jar-ooxml" depends="compile-ooxml" description="Creates the ooxml jar files for distribution">
+        <jar destfile="${dist.dir}/${jar.name}-ooxml-${version.id}-${DSTAMP}.jar">
+                       <fileset dir="${ooxml.output.dir}" />
+                       <fileset dir="legal/" />
+            <!-- Manifest attributes: builder, plus Specification-* and
+                 Implementation-* entries stamped with version and DSTAMP -->
+            <manifest>
+                <attribute name="Built-By" value="${user.name}"/>
+                <attribute name="Specification-Title" value="Apache POI"/>
+                <attribute name="Specification-Version" value="${version.id}-${DSTAMP}"/>
+                <attribute name="Specification-Vendor" value="Apache"/>
+                <attribute name="Implementation-Title" value="Apache POI"/>
+                <attribute name="Implementation-Version" value="${version.id}-${DSTAMP}"/>
+                <attribute name="Implementation-Vendor" value="Apache"/>
+            </manifest>        
+        </jar>
+    </target>
 
   <target name="dist" depends="clean, fail-unless-tools-are-available, compile, site, jar"
     description="Creates the entire distribution into build/dist, from scratch">
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java
new file mode 100644 (file)
index 0000000..b0e7364
--- /dev/null
@@ -0,0 +1,122 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+
+public class HXFPowerPointExtractor extends POIXMLTextExtractor {
+       private HSLFXMLSlideShow slideshow;
+       
+       public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+               this(new HSLFXMLSlideShow(
+                               new HSLFXML(container)
+               ));
+       }
+       public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
+               super(slideshow);
+               this.slideshow = slideshow;
+       }
+
+       public static void main(String[] args) throws Exception {
+               if(args.length < 1) {
+                       System.err.println("Use:");
+                       System.err.println("  HXFPowerPointExtractor <filename.pptx>");
+                       System.exit(1);
+               }
+               POIXMLTextExtractor extractor = 
+                       new HXFPowerPointExtractor(HXFDocument.openPackage(
+                                       new File(args[0])
+                       ));
+               System.out.println(extractor.getText());
+       }
+       
+       /**
+        * Gets the slide and notes text
+        */
+       public String getText() {
+               return getText(true, true);
+       }
+       
+       /**
+        * Gets the requested text from the file
+        * @param slideText Should we retrieve text from slides?
+        * @param notesText Should we retrieve text from notes?
+        */
+       public String getText(boolean slideText, boolean notesText) {
+               StringBuffer text = new StringBuffer();
+               
+               CTSlideIdListEntry[] slideRefs =
+                       slideshow._getHSLFXML().getSlideReferences().getSldIdArray();
+               for (int i = 0; i < slideRefs.length; i++) {
+                       try {
+                               CTSlide slide =
+                                       slideshow._getHSLFXML().getSlide(slideRefs[i]);
+                               CTNotesSlide notes = 
+                                       slideshow._getHSLFXML().getNotes(slideRefs[i]);
+                               
+                               if(slideText) {
+                                       extractText(slide.getCSld().getSpTree(), text);
+                               }
+                               if(notesText && notes != null) {
+                                       extractText(notes.getCSld().getSpTree(), text);
+                               }
+                       } catch(Exception e) {
+                               throw new RuntimeException(e);
+                       }
+               }
+               
+               return text.toString();
+       }
+       
+       private void extractText(CTGroupShape gs, StringBuffer text) {
+               CTShape[] shapes = gs.getSpArray();
+               for (int i = 0; i < shapes.length; i++) {
+                       CTTextBody textBody =
+                               shapes[i].getTxBody();
+                       if(textBody != null) {
+                               CTTextParagraph[] paras = 
+                                       textBody.getPArray();
+                               for (int j = 0; j < paras.length; j++) {
+                                       CTRegularTextRun[] textRuns =
+                                               paras[j].getRArray();
+                                       for (int k = 0; k < textRuns.length; k++) {
+                                               text.append( textRuns[k].getT() );
+                                       }
+                                       // End each paragraph with a new line
+                                       text.append("\n");
+                               }
+                       }
+               }
+       }
+}
index 11e7efd288f8041a30c371437bad7345b9e262b5..9c122da6dc8c2def6c8c2ad751fdb959734ed80d 100644 (file)
@@ -21,7 +21,6 @@ import java.io.File;
 import org.apache.poi.hxf.HXFDocument;
 import org.openxml4j.opc.Package;
 import org.openxml4j.opc.PackagePart;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
 
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java
new file mode 100644 (file)
index 0000000..7c96c29
--- /dev/null
@@ -0,0 +1,101 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFPowerPointExtractor
+ */
+public class TestHXFPowerPointExtractor extends TestCase {
+       /**
+        * A simple file
+        */
+       private HSLFXML xmlA;
+
+       protected void setUp() throws Exception {
+               super.setUp();
+               
+               File fileA = new File(
+                               System.getProperty("HSLF.testdata.path") +
+                               File.separator + "sample.pptx"
+               );
+               
+               xmlA = new HSLFXML(HXFDocument.openPackage(fileA));
+       }
+
+       /**
+        * Get text out of the simple file
+        */
+       public void testGetSimpleText() throws Exception {
+               new HXFPowerPointExtractor(xmlA.getPackage());
+               new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
+               
+               HXFPowerPointExtractor extractor = 
+                       new HXFPowerPointExtractor(xmlA.getPackage());
+               extractor.getText();
+               
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+               
+               // Check Basics
+               assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
+               assertTrue(text.endsWith("amet\n\n\n\n"));
+               
+               // Just slides, no notes
+               text = extractor.getText(true, false);
+               assertEquals(
+                               "Lorem ipsum dolor sit amet\n" +
+                               "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+                               "\n" +
+                               "Lorem ipsum dolor sit amet\n" +
+                               "Lorem\n" +
+                               "ipsum\n" +
+                               "dolor\n" +
+                               "sit\n" +
+                               "amet\n" +
+                               "\n", text
+               );
+               
+               // Just notes, no slides
+               text = extractor.getText(false, true);
+               assertEquals(
+                               "\n\n\n\n", text
+               );
+               
+               // Both
+               text = extractor.getText(true, true);
+               assertEquals(
+                               "Lorem ipsum dolor sit amet\n" +
+                               "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+                               "\n\n\n" +
+                               "Lorem ipsum dolor sit amet\n" +
+                               "Lorem\n" +
+                               "ipsum\n" +
+                               "dolor\n" +
+                               "sit\n" +
+                               "amet\n" +
+                               "\n\n\n", text
+               );
+       }
+}