--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.extractor;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hpbf.HPBFDocument;
+import org.apache.poi.hpbf.model.qcbits.QCBit;
+import org.apache.poi.hpbf.model.qcbits.QCTextBit;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Extract text from HPBF Publisher files.
+ *
+ * The text is taken from the {@link QCTextBit} entries of the
+ *  document's Quill Contents; all other bits are ignored.
+ */
+public class PublisherTextExtractor extends POIOLE2TextExtractor {
+    /** The document the text is extracted from */
+    private HPBFDocument doc;
+
+    /**
+     * Creates an extractor for an already opened document.
+     * @param doc the Publisher document to extract text from
+     */
+    public PublisherTextExtractor(HPBFDocument doc) {
+        super(doc);
+        this.doc = doc;
+    }
+
+    /**
+     * Creates an extractor for the Publisher document held in the
+     *  given filesystem.
+     * @param fs the filesystem to build the {@link HPBFDocument} from
+     * @throws IOException if building the document fails with an I/O error
+     */
+    public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
+        this(new HPBFDocument(fs));
+    }
+
+    /**
+     * Creates an extractor for the Publisher document read from the
+     *  given stream. This constructor does not itself close the
+     *  stream; callers remain responsible for it.
+     * @param is the stream to build the {@link POIFSFileSystem} from
+     * @throws IOException if reading the stream fails
+     */
+    public PublisherTextExtractor(InputStream is) throws IOException {
+        this(new POIFSFileSystem(is));
+    }
+
+    /**
+     * Returns the text of every {@link QCTextBit} in the Quill
+     *  Contents, concatenated in the order the bits are stored,
+     *  with every '\r' replaced by '\n'.
+     * @return the extracted text; empty if there are no text bits
+     */
+    public String getText() {
+        StringBuffer text = new StringBuffer();
+
+        // Get the text from the Quill Contents
+        QCBit[] bits = doc.getQuillContents().getBits();
+        for (int i = 0; i < bits.length; i++) {
+            // instanceof is false for null, so empty slots are skipped too
+            if (bits[i] instanceof QCTextBit) {
+                QCTextBit t = (QCTextBit) bits[i];
+                // The text bit uses \r as its line ending
+                text.append(t.getText().replace('\r', '\n'));
+            }
+        }
+
+        // Get more text
+        // TODO
+
+        return text.toString();
+    }
+
+
+    /**
+     * Command line entry point: prints the text of each file named
+     *  on the command line to standard out, or a usage message to
+     *  standard error if no files were given.
+     * @param args paths of the Publisher files to extract
+     * @throws Exception if a file cannot be opened or processed
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length == 0) {
+            System.err.println("Use:");
+            System.err.println(" PublisherTextExtractor <file.pub>");
+            return;
+        }
+
+        for (int i = 0; i < args.length; i++) {
+            FileInputStream in = new FileInputStream(args[i]);
+            try {
+                PublisherTextExtractor te = new PublisherTextExtractor(in);
+                System.out.println(te.getText());
+            } finally {
+                // Always release the file handle, even if extraction fails
+                in.close();
+            }
+        }
+    }
+}
/**
 * Creates a text bit, passing the thing type, bit type and
 *  raw data straight through to the superclass constructor.
 */
public QCTextBit(String thingType, String bitType, byte[] data) {
super(thingType, bitType, data);
}
-
+
+ /**
+ * Returns the text. Note that line endings
+ * are \r and not \n
+ */
public String getText() {
return StringUtil.getFromUnicodeLE(
data, 0, data.length/2
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hpbf.HPBFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that {@link PublisherTextExtractor} can open the sample
+ *  Publisher files and returns the expected text for them.
+ *
+ * NOTE(review): the class name starts with "Text" rather than
+ *  "Test" - presumably a typo; confirm the test runner's include
+ *  pattern actually picks this class up.
+ */
+public class TextPublisherTextExtractor extends TestCase {
+    /** Directory holding the sample files, from HPBF.testdata.path */
+    private String dir;
+
+    protected void setUp() throws Exception {
+        dir = System.getProperty("HPBF.testdata.path");
+    }
+
+    /**
+     * Extracts the text of the named sample file, going via
+     *  an explicitly created {@link HPBFDocument}.
+     */
+    private String extractViaDocument(String filename) throws Exception {
+        FileInputStream in = new FileInputStream(new File(dir, filename));
+        try {
+            HPBFDocument doc = new HPBFDocument(in);
+            PublisherTextExtractor ext = new PublisherTextExtractor(doc);
+            return ext.getText();
+        } finally {
+            in.close();
+        }
+    }
+
+    /**
+     * Extracts the text of the named sample file, going via
+     *  the extractor's InputStream constructor.
+     */
+    private String extractViaStream(String filename) throws Exception {
+        FileInputStream in = new FileInputStream(new File(dir, filename));
+        try {
+            PublisherTextExtractor ext = new PublisherTextExtractor(in);
+            return ext.getText();
+        } finally {
+            in.close();
+        }
+    }
+
+    /**
+     * Smoke test: both ways of creating the extractor must
+     *  be able to produce text without throwing.
+     */
+    public void testBasics() throws Exception {
+        extractViaDocument("Sample.pub");
+        extractViaStream("Simple.pub");
+    }
+
+    /**
+     * Checks the exact text extracted from both sample files.
+     * Non-ASCII characters in the expected text are written as
+     *  Unicode escapes, so the test does not depend on the
+     *  encoding this source file is compiled with.
+     */
+    public void testContents() throws Exception {
+        String text = extractViaDocument("Sample.pub");
+
+        assertEquals(
+            "This is some text on the first page\n" +
+            "It\u2019s in times new roman, font size 10, all normal\n" +
+            "" +
+            "This is in bold and italic\n" +
+            "It\u2019s Arial, 20 point font\n" +
+            "It\u2019s in the second textbox on the first page\n" +
+            "" +
+            "This is the second page\n\n" +
+            "" +
+            "It is also times new roman, 10 point\n" +
+            "" +
+            "Table on page 2\nTop right\n" +
+            "P2 table left\nP2 table right\n" +
+            "Bottom Left\nBottom Right\n" +
+            "" +
+            "This text is on page two\n" +
+            "#This is a link to Apache POI\n" +
+            "More normal text\n" +
+            "Link to a file\n" +
+            "" +
+            "More text, more hyperlinks\n" +
+            "email link\n" +
+            "Final hyperlink\n" +
+            "Within doc to page 1\n"
+            , text
+        );
+
+        // Now a simpler one
+        text = extractViaStream("Simple.pub");
+        assertEquals(
+            "0123456789\n" +
+            "0123456789abcdef\n" +
+            "0123456789abcdef0123456789abcdef\n" +
+            "0123456789\n" +
+            "0123456789abcdef\n" +
+            "0123456789abcdef0123456789abcdef\n" +
+            "0123456789abcdef0123456789abcdef0123456789abcdef\n"
+            , text
+        );
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.model;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hpbf.HPBFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that the Escher parts of a Publisher document
+ *  can be fetched, and hold the expected number of records.
+ */
+public class TestEscherParts extends TestCase {
+    /** Directory holding the sample files, from HPBF.testdata.path */
+    private String dir;
+
+    protected void setUp() throws Exception {
+        dir = System.getProperty("HPBF.testdata.path");
+    }
+
+    /**
+     * Opens Sample.pub and checks that both Escher streams are
+     *  present, with 13 and 0 records respectively.
+     */
+    public void testBasics() throws Exception {
+        FileInputStream in = new FileInputStream(new File(dir, "Sample.pub"));
+        try {
+            HPBFDocument doc = new HPBFDocument(in);
+
+            EscherStm es = doc.getEscherStm();
+            EscherDelayStm eds = doc.getEscherDelayStm();
+
+            assertNotNull(es);
+            assertNotNull(eds);
+
+            assertEquals(13, es.getEscherRecords().length);
+            assertEquals(0, eds.getEscherRecords().length);
+
+            // TODO - check the contents
+        } finally {
+            // Release the file handle whether or not the checks passed
+            in.close();
+        }
+    }
+}