QuickButCruddyTextExtractor - gets all the text (including stuff you might not want...

author Nick Burch <nick@apache.org>

Thu, 9 Jun 2005 15:09:58 +0000 (15:09 +0000)

committer Nick Burch <nick@apache.org>

Thu, 9 Jun 2005 15:09:58 +0000 (15:09 +0000)
author Nick Burch <nick@apache.org>
Thu, 9 Jun 2005 15:09:58 +0000 (15:09 +0000)
committer Nick Burch <nick@apache.org>
Thu, 9 Jun 2005 15:09:58 +0000 (15:09 +0000)
diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java

index 9690fdddf90f616ec0042caba1ee07474418f68c..c6f8094599a3a4c1d7f7440062a4c74b5cde49d1 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@@ -126,7 +126,9 @@ public class PowerPointExtractor
    }
  
    /**
-   * Fetches text from the slideshow, be it slide text or note text
+   * Fetches text from the slideshow, be it slide text or note text.
+   * Because the final block of text in a TextRun normally have their
+   *  last \n stripped, we add it back
     * @param getSlideText fetch slide text
     * @param getNoteText fetch note text
     */
@@ -139,10 +141,12 @@ public class PowerPointExtractor
                         TextRun[] runs = slide.getTextRuns();
                         for(int j=0; j<runs.length; j++) {
                                 TextRun run = runs[j];
-                               String text = run.getText();
-                               ret.append(text);
-                               if(! text.endsWith("\n")) {
-                                       ret.append("\n");
+                               if(run != null) {
+                                       String text = run.getText();
+                                       ret.append(text);
+                                       if(! text.endsWith("\n")) {
+                                               ret.append("\n");
+                                       }
                                 }
                         }
                 }
diff --git a/src/scratchpad/src/org/apache/poi/hslf/extractor/QuickButCruddyTextExtractor.java b/src/scratchpad/src/org/apache/poi/hslf/extractor/QuickButCruddyTextExtractor.java

new file mode 100644 (file)

index 0000000..292fec8
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/QuickButCruddyTextExtractor.java
@@ -0,0 +1,205 @@
+
+/* ====================================================================
+   Copyright 2002-2004   Apache Software Foundation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+        
+
+
+package org.apache.poi.hslf.extractor;
+
+import java.io.*;
+import java.util.Vector;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSDocument;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.util.LittleEndian;
+
+import org.apache.poi.hslf.record.Record;
+import org.apache.poi.hslf.record.TextHeaderAtom;
+import org.apache.poi.hslf.record.TextBytesAtom;
+import org.apache.poi.hslf.record.TextCharsAtom;
+import org.apache.poi.hslf.model.TextRun;
+
+/**
+ * This class will get all the text from a Powerpoint Document, including
+ *  all the bits you didn't want, and in a somewhat random order, but will
+ *  do it very fast.
+ * The class ignores most of the hslf classes, and doesn't use 
+ *  HSLFSlideShow. Instead, it just does a very basic scan through the
+ *  file, grabbing all the text records as it goes. It then returns the
+ *  text, either as a single string, or as a vector of all the individual
+ *  strings.
+ * Because of how it works, it will return a lot of "crud" text that you 
+ *  probably didn't want! It will return text from master slides. It will
+ *  return duplicate text, and some mangled text (powerpoint files often
+ *  have duplicate copies of slide text in them). You don't get any idea
+ *  what the text was associated with.
+ * Almost everyone will want to use @see PowerPointExtractor instead. There
+ *  are only a very small number of cases (eg some performance sensitive
+ *  lucene indexers) that would ever want to use this!
+ *
+ * @author Nick Burch
+ */
+
+public class QuickButCruddyTextExtractor
+{
+       private POIFSFileSystem fs;
+       private InputStream is;
+       private byte[] pptContents;
+
+       /**
+        * Really basic text extractor, that will also return lots of crud text.
+        * Takes a single argument, the file to extract from
+        */
+       public static void main(String args[]) throws IOException
+       {
+               if(args.length < 1) {
+                       System.err.println("Useage:");
+                       System.err.println("\tQuickButCruddyTextExtractor <file>");
+                       System.exit(1);
+               }
+
+               String file = args[0];
+
+               QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(file);
+               System.out.println(ppe.getTextAsString());
+               ppe.close();
+       }
+
+       /**
+        * Creates an extractor from a given file name
+        * @param fileName
+        */
+       public QuickButCruddyTextExtractor(String fileName) throws IOException {
+               this(new FileInputStream(fileName));
+       }
+
+       /**
+        * Creates an extractor from a given input stream
+        * @param iStream
+        */
+       public QuickButCruddyTextExtractor(InputStream iStream) throws IOException {
+               this(new POIFSFileSystem(iStream));
+               is = iStream;
+       }
+
+       /**
+        * Creates an extractor from a POIFS Filesystem
+        * @param poifs
+        */
+       public QuickButCruddyTextExtractor(POIFSFileSystem poifs) throws IOException {
+               fs = poifs;
+
+               // Find the PowerPoint bit, and get out the bytes
+               DocumentEntry docProps =
+                       (DocumentEntry)fs.getRoot().getEntry("PowerPoint Document");
+               pptContents = new byte[docProps.getSize()];
+               fs.createDocumentInputStream("PowerPoint Document").read(pptContents);
+       }
+
+
+       /**
+        * Shuts down the underlying streams
+        */
+       public void close() throws IOException {
+               if(is != null) { is.close(); }
+               fs = null;
+       }
+
+       /**
+        * Fetches the ALL the text of the powerpoint file, as a single string
+        */
+       public String getTextAsString() {
+               StringBuffer ret = new StringBuffer();
+               Vector textV = getTextAsVector();
+               for(int i=0; i<textV.size(); i++) {
+                       String text = (String)textV.get(i);
+                       ret.append(text);
+                       if(! text.endsWith("\n")) {
+                               ret.append('\n');
+                       }
+               }
+               return ret.toString();
+       }
+
+       /**
+        * Fetches the ALL the text of the powerpoint file, in a vector of
+        *  strings, one per text record
+        */
+       public Vector getTextAsVector() {
+               Vector textV = new Vector();
+
+               // Set to the start of the file
+               int walkPos = 0;
+
+               // Start walking the file, looking for the records
+               while(walkPos != -1) {
+                       int newPos = findTextRecords(walkPos,textV);
+                       walkPos = newPos;
+               }
+
+               // Return what we find
+               return textV;
+       }
+
+       /**
+        * For the given position, look if the record is a text record, and wind
+        *  on after.
+        * If it is a text record, grabs out the text. Whatever happens, returns
+        *  the position of the next record, or -1 if no more.
+        */
+       public int findTextRecords(int startPos, Vector textV) {
+               // Grab the length, and the first option byte
+               // Note that the length doesn't include the 8 byte atom header
+               int len = (int)LittleEndian.getUInt(pptContents,startPos+4);
+               byte opt = pptContents[startPos];
+
+               // If it's a container, step into it and return
+               // (If it's a container, option byte 1 BINARY_AND 0x0f will be 0x0f)
+               int container = (int)opt & 0x0f;
+               if(container == 0x0f) {
+                       return (startPos+8);
+               }
+
+               // Otherwise, check the type to see if it's text
+               long type = LittleEndian.getUShort(pptContents,startPos+2);
+               TextRun trun = null;
+
+               // TextBytesAtom
+               if(type == 4008l) {
+                       TextBytesAtom tba = (TextBytesAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
+                       trun = new TextRun((TextHeaderAtom)null,tba);
+               }
+               // TextCharsAtom
+               if(type == 4000l) {
+                       TextCharsAtom tca = (TextCharsAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
+                       trun = new TextRun((TextHeaderAtom)null,tca);
+               }
+
+               // If we found text, save it in the vector
+               if(trun != null) {
+                       textV.add(trun.getText());
+               }
+               
+               // Wind on by the atom length, and check we're not at the end
+               int newPos = (startPos + 8 + len);
+               if(newPos > (pptContents.length - 8)) { 
+                       newPos = -1;
+               }
+               return newPos;
+       }
+}
author	Nick Burch <nick@apache.org>
	Thu, 9 Jun 2005 15:09:58 +0000 (15:09 +0000)
committer	Nick Burch <nick@apache.org>
	Thu, 9 Jun 2005 15:09:58 +0000 (15:09 +0000)
src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hslf/extractor/QuickButCruddyTextExtractor.java	[new file with mode: 0644]	patch \| blob