source.dussan.org Git - poi.git/commitdiff
HPBF text extractor and unit tests
author Nick Burch <nick@apache.org>
Wed, 20 Aug 2008 20:13:08 +0000 (20:13 +0000)
committer Nick Burch <nick@apache.org>
Wed, 20 Aug 2008 20:13:08 +0000 (20:13 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@687443 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCTextBit.java
src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub [new file with mode: 0755]
src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java [new file with mode: 0644]
src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java [new file with mode: 0644]

diff --git a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java
new file mode 100644 (file)
index 0000000..2257283
--- /dev/null
@@ -0,0 +1,78 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.extractor;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hpbf.HPBFDocument;
+import org.apache.poi.hpbf.model.qcbits.QCBit;
+import org.apache.poi.hpbf.model.qcbits.QCTextBit;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Extracts the text from HPBF (Publisher) files.
+ *
+ * The text is taken from the text bits of the document's
+ *  Quill Contents. Other sources of text are not handled
+ *  yet, see the TODO in {@link #getText()}.
+ */
+public class PublisherTextExtractor extends POIOLE2TextExtractor {
+       private HPBFDocument doc;
+
+       /**
+        * Creates an extractor for an already opened document.
+        *
+        * @param doc the Publisher document to take the text from
+        */
+       public PublisherTextExtractor(HPBFDocument doc) {
+               super(doc);
+               this.doc = doc;
+       }
+
+       /**
+        * Creates an extractor for the Publisher document held
+        *  in the given filesystem.
+        *
+        * @param fs the filesystem to build the HPBFDocument from
+        * @throws IOException if the document can't be read
+        */
+       public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
+               this(new HPBFDocument(fs));
+       }
+
+       /**
+        * Creates an extractor for the Publisher document read
+        *  from the given stream. The stream is not closed here,
+        *  that is left to the caller.
+        *
+        * @param is the stream to build the POIFSFileSystem from
+        * @throws IOException if the document can't be read
+        */
+       public PublisherTextExtractor(InputStream is) throws IOException {
+               this(new POIFSFileSystem(is));
+       }
+
+       /**
+        * Returns the text of every QCTextBit in the Quill
+        *  Contents, in the order the bits are stored, with
+        *  \r line endings turned into \n.
+        *
+        * @return the extracted text, empty if there are no text bits
+        */
+       public String getText() {
+               StringBuffer text = new StringBuffer();
+
+               // Get the text from the Quill Contents
+               QCBit[] bits = doc.getQuillContents().getBits();
+               for(int i=0; i<bits.length; i++) {
+                       // instanceof is false for null, so empty
+                       //  slots are skipped without an extra check
+                       if(bits[i] instanceof QCTextBit) {
+                               QCTextBit t = (QCTextBit)bits[i];
+                               // QCTextBit uses \r for its line endings
+                               text.append( t.getText().replace('\r', '\n') );
+                       }
+               }
+
+               // Get more text
+               // TODO
+
+               return text.toString();
+       }
+
+
+       /**
+        * Prints the text of each file named on the command line
+        *  to standard out, or a usage message to standard error
+        *  if no files were given.
+        *
+        * @param args the Publisher files to extract the text of
+        * @throws Exception if a file can't be opened or read
+        */
+       public static void main(String[] args) throws Exception {
+               if(args.length == 0) {
+                       System.err.println("Use:");
+                       System.err.println("  PublisherTextExtractor <file.pub>");
+                       return;
+               }
+
+               for(int i=0; i<args.length; i++) {
+                       FileInputStream fis = new FileInputStream(args[i]);
+                       try {
+                               PublisherTextExtractor te =
+                                       new PublisherTextExtractor(fis);
+                               System.out.println(te.getText());
+                       } finally {
+                               // Don't leak a file handle per argument,
+                               //  even if the extraction fails
+                               fis.close();
+                       }
+               }
+       }
+}
index 2c0e98b41aed629dec6711e3cfb5c92771685364..e3c8dcb5898b5e1496c84b3ccdb310eb5a52590e 100644 (file)
@@ -25,7 +25,11 @@ public class QCTextBit extends QCBit {
+       /**
+        * Creates a text bit, handing the thing type, bit type
+        *  and raw data straight on to the QCBit constructor.
+        */
        public QCTextBit(String thingType, String bitType, byte[] data) {
                super(thingType, bitType, data);
        }
-       
+
+       /**
+        * Returns the text. Note that line endings
+        *  are \r and not \n
+        */
        public String getText() {
                return StringUtil.getFromUnicodeLE(
                                data, 0, data.length/2
diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub
new file mode 100755 (executable)
index 0000000..2397b9d
Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Simple.pub differ
diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
new file mode 100644 (file)
index 0000000..96396e1
--- /dev/null
@@ -0,0 +1,105 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hpbf.HPBFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for PublisherTextExtractor, run against the
+ *  Sample.pub and Simple.pub test files.
+ */
+public class TextPublisherTextExtractor extends TestCase {
+       /** Test data directory, from the HPBF.testdata.path system property */
+       private String dir;
+
+       protected void setUp() throws Exception {
+               dir = System.getProperty("HPBF.testdata.path");
+       }
+
+       /**
+        * Extracts the text of a test file, going via the
+        *  HPBFDocument constructor of the extractor. The
+        *  file is always closed again afterwards.
+        */
+       private String textViaDocument(String filename) throws Exception {
+               FileInputStream in = new FileInputStream(new File(dir, filename));
+               try {
+                       HPBFDocument doc = new HPBFDocument(in);
+                       PublisherTextExtractor ext =
+                               new PublisherTextExtractor(doc);
+                       return ext.getText();
+               } finally {
+                       in.close();
+               }
+       }
+
+       /**
+        * Extracts the text of a test file, going via the
+        *  InputStream constructor of the extractor. The
+        *  file is always closed again afterwards.
+        */
+       private String textViaStream(String filename) throws Exception {
+               FileInputStream in = new FileInputStream(new File(dir, filename));
+               try {
+                       PublisherTextExtractor ext =
+                               new PublisherTextExtractor(in);
+                       return ext.getText();
+               } finally {
+                       in.close();
+               }
+       }
+
+       /**
+        * Checks that both files can be opened and have their
+        *  text extracted without anything being thrown.
+        */
+       public void testBasics() throws Exception {
+               assertNotNull(textViaDocument("Sample.pub"));
+               assertNotNull(textViaStream("Simple.pub"));
+       }
+
+       /**
+        * Checks the exact text extracted from both files.
+        */
+       public void testContents() throws Exception {
+               String text = textViaDocument("Sample.pub");
+
+               assertEquals(
+"This is some text on the first page\n" +
+"It’s in times new roman, font size 10, all normal\n" +
+"" +
+"This is in bold and italic\n" +
+"It’s Arial, 20 point font\n" +
+"It’s in the second textbox on the first page\n" +
+"" +
+"This is the second page\n\n" +
+"" +
+"It is also times new roman, 10 point\n" +
+"" +
+"Table on page 2\nTop right\n" +
+"P2 table left\nP2 table right\n" +
+"Bottom Left\nBottom Right\n" +
+"" +
+"This text is on page two\n" +
+"#This is a link to Apache POI\n" +
+"More normal text\n" +
+"Link to a file\n" +
+"" +
+"More text, more hyperlinks\n" +
+"email link\n" +
+"Final hyperlink\n" +
+"Within doc to page 1\n"
+                               , text
+               );
+
+               // Now a simpler one
+               text = textViaStream("Simple.pub");
+               assertEquals(
+"0123456789\n" +
+"0123456789abcdef\n" +
+"0123456789abcdef0123456789abcdef\n" +
+"0123456789\n" +
+"0123456789abcdef\n" +
+"0123456789abcdef0123456789abcdef\n" +
+"0123456789abcdef0123456789abcdef0123456789abcdef\n"
+                               , text
+               );
+       }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java b/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java
new file mode 100644 (file)
index 0000000..dbaf46c
--- /dev/null
@@ -0,0 +1,50 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.model;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hpbf.HPBFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that the Escher streams of a Publisher document
+ *  can be fetched and hold the expected number of records.
+ */
+public class TestEscherParts extends TestCase {
+       /** Test data directory, from the HPBF.testdata.path system property */
+       private String dir;
+
+       protected void setUp() throws Exception {
+               dir = System.getProperty("HPBF.testdata.path");
+       }
+
+       /**
+        * Checks that Sample.pub has both Escher streams, with
+        *  13 records in the main one and none in the delay one.
+        */
+       public void testBasics() throws Exception {
+               File f = new File(dir, "Sample.pub");
+               FileInputStream in = new FileInputStream(f);
+               try {
+                       HPBFDocument doc = new HPBFDocument(in);
+
+                       EscherStm es = doc.getEscherStm();
+                       EscherDelayStm eds = doc.getEscherDelayStm();
+
+                       assertNotNull(es);
+                       assertNotNull(eds);
+
+                       assertEquals(13, es.getEscherRecords().length);
+                       assertEquals(0, eds.getEscherRecords().length);
+
+                       // TODO - check the contents
+               } finally {
+                       // Close the file even if an assertion fails
+                       in.close();
+               }
+       }
+}