<!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
+ <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
+ <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
+ <action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
+ <action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
<action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
<action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
<action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>
(the text will then start)
</source>
+ <p>We think that the first 4 bytes of text describe
+ the function of the data at the offset. The first short is
+ then the count of that type, eg the 2nd will have 1. We
+ think that the second 4 bytes of text describe the format
+ of the data block at the offset. The format of the text block
+ is easy, but we're still trying to figure out the others.</p>
</section>
</body>
</document>
<title>Overview</title>
<p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
- <p>Currently, HPBF is in the experimental stage, while we try
- to figure out the file format. Our initial aim is to provide
- a text extractor for the format, with low level code following
- after that if demand and developer interest warrant it.</p>
- <p>At this time, there is no <em>usermodel</em> api or similar.</p>
+ <p>Currently, HPBF is in an early stage, whilst we try to
+ figure out the file format. So far, we have basic text
+ extraction support, and are able to read some parts within
+ the file. Writing is not yet supported, as we are unable
+ to make sense of the Contents stream, which we think has
+ lots of offsets to other parts of the file.</p>
+ <p>Our initial aim is to provide a text extractor for the format
+ (now done), and be able to extract hyperlinks from within
+ the document (not yet supported). Additional low level
+ code to process the file format may follow, if demand
+ and developer interest warrant it.</p>
+ <p>At this time, there is no <em>usermodel</em> api or similar.
+ There is only low level support for certain parts of
+ the file, but by no means all of it.</p>
<p>Our current understanding of the file format is documented
<link href="file-format.html">here</link>.</p>
<note>
</section>
<section><title>HPBF for Publisher Documents</title>
<p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure
- Java. At the moment, we are still figuring out the file format, but we hope
- to have simple text extraction shortly. Please see <link
+ Java. It currently only supports reading at a low level for around
+ half of the file parts, and simple text extraction. Please see <link
href="./hpbf/index.html">the HPBF project page for more
information</link>.</p>
</section>
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
+ <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
+ <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
+ <action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
+ <action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
<action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
<action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
<action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>
field_10_unused2 = data[pos + 34];
field_11_unused3 = data[pos + 35];
bytesRemaining -= 36;
+
int bytesRead = 0;
- if (bytesRemaining > 0)
- {
+ if (bytesRemaining > 0) {
+ // Some older escher formats skip this last record
field_12_blipRecord = (EscherBlipRecord) recordFactory.createRecord( data, pos + 36 );
bytesRead = field_12_blipRecord.fillFields( data, pos + 36, recordFactory );
}
*/
public int getRecordSize()
{
- return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 + 1 + 1 + field_12_blipRecord.getRecordSize() + (remainingData == null ? 0 : remainingData.length);
+ int field_12_size = 0;
+ if(field_12_blipRecord != null) {
+ field_12_size = field_12_blipRecord.getRecordSize();
+ }
+ int remaining_size = 0;
+ if(remainingData != null) {
+ remaining_size = remainingData.length;
+ }
+ return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 +
+ 1 + 1 + field_12_size + remaining_size;
}
/**
public static String stripFields(String text) {
int pos;
+ // Check we really got something to work on
+ if(text == null || text.length() == 0) {
+ return text;
+ }
+
// Firstly, do the easy ones which are static
for(int i=0; i<Field.ALL_FIELDS.size(); i++) {
String seq = ((Field)Field.ALL_FIELDS.get(i)).sequence;
}
// Now do the tricky, dynamic ones
+ // These are things like font sizes and font names
text = text.replaceAll("\\&\\d+", "");
text = text.replaceAll("\\&\".*?,.*?\"", "");
public static final Field TIME_FIELD = new Field("&T");
public static final Field NUM_PAGES_FIELD = new Field("&N");
- public static final Field PICTURE_FIELD = new Field("&P");
+ public static final Field PICTURE_FIELD = new Field("&G");
- public static final PairField BOLD_FIELD = new PairField("&B"); // PAID
+ public static final PairField BOLD_FIELD = new PairField("&B");
public static final PairField ITALIC_FIELD = new PairField("&I");
public static final PairField STRIKETHROUGH_FIELD = new PairField("&S");
public static final PairField SUBSCRIPT_FIELD = new PairField("&Y");
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.dev;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.ddf.DefaultEscherRecordFactory;
+import org.apache.poi.ddf.EscherRecord;
+import org.apache.poi.hpbf.HPBFDocument;
+import org.apache.poi.hpbf.model.QuillContents;
+import org.apache.poi.hpbf.model.qcbits.QCBit;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.HexDump;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
+
+/**
+ * Developer tool that dumps the "PLC " bits found in the
+ *  Quill Contents of a HPBF (Publisher) file, printing a
+ *  short summary and a hex dump of each one, while we try
+ *  to figure out what the format of them is.
+ */
+public class PLCDumper {
+    private HPBFDocument doc;
+    private QuillContents qc;
+
+    /**
+     * Creates a dumper for an already opened document.
+     * @param doc the document whose Quill Contents will be dumped
+     */
+    public PLCDumper(HPBFDocument doc) {
+        this.doc = doc;
+        this.qc = doc.getQuillContents();
+    }
+    /**
+     * Creates a dumper for the document held in the given filesystem.
+     * @throws IOException if the document can't be read
+     */
+    public PLCDumper(POIFSFileSystem fs) throws IOException {
+        this(new HPBFDocument(fs));
+    }
+    /**
+     * Creates a dumper for the document read from the given stream.
+     *  The stream is not closed by this class, that is left to
+     *  the caller.
+     * @throws IOException if the document can't be read
+     */
+    public PLCDumper(InputStream inp) throws IOException {
+        this(new POIFSFileSystem(inp));
+    }
+
+    /**
+     * Command line entry point. Expects the name of the file to
+     *  dump as the first argument, and exits with status 1 if
+     *  it is missing.
+     */
+    public static void main(String[] args) throws Exception {
+        if(args.length < 1) {
+            System.err.println("Use:");
+            System.err.println("  PLCDumper <filename>");
+            System.exit(1);
+        }
+
+        // Make sure the file gets closed again, even if the
+        //  document turns out to be unreadable
+        InputStream inp = new FileInputStream(args[0]);
+        try {
+            PLCDumper dump = new PLCDumper(inp);
+
+            System.out.println("Dumping " + args[0]);
+            dump.dumpPLC();
+        } finally {
+            inp.close();
+        }
+    }
+
+    /**
+     * Walks over all the bits of the Quill Contents, and dumps
+     *  every one whose bit type is "PLC ". Empty (null) slots
+     *  are skipped.
+     */
+    private void dumpPLC() {
+        // Use the Quill Contents we grabbed in the constructor
+        QCBit[] bits = qc.getBits();
+
+        for(int i=0; i<bits.length; i++) {
+            if(bits[i] == null) continue;
+            if(bits[i].getBitType().equals("PLC ")) {
+                dumpBit(bits[i], i);
+            }
+        }
+    }
+
+    /**
+     * Prints the types, OptA value, data offset and length of
+     *  the given bit (the latter two in both decimal and hex),
+     *  followed by a hex dump of its data.
+     * @param bit the bit to dump
+     * @param index the position of the bit within the Quill Contents
+     */
+    private void dumpBit(QCBit bit, int index) {
+        System.out.println("");
+        System.out.println("Dumping " + bit.getBitType() + " bit at " + index);
+        System.out.println("  Is a " + bit.getThingType() + ", number is " + bit.getOptA());
+        System.out.println("  Starts at " + bit.getDataOffset() + " (" + Integer.toHexString(bit.getDataOffset()) + ")");
+        System.out.println("  Runs for  " + bit.getLength() + " (" + Integer.toHexString(bit.getLength()) + ")");
+
+        System.out.println(HexDump.dump(bit.getData(), 0, 0));
+    }
+}
bits[i].setOptA(optA);
bits[i].setOptB(optB);
bits[i].setOptC(optC);
+ bits[i].setDataOffset(from);
} else {
// Doesn't have data
}
protected int optB;
protected int optC;
+ protected int dataOffset;
+
public QCBit(String thingType, String bitType, byte[] data) {
this.thingType = thingType;
this.bitType = bitType;
public void setOptC(int optC) {
this.optC = optC;
}
+
+ public int getDataOffset() {
+ return dataOffset;
+ }
+ public void setDataOffset(int offset) {
+ this.dataOffset = offset;
+ }
+
+ public int getLength() {
+ return data.length;
+ }
}
if(stripFields) {
return Range.stripFields(text);
}
+ // If you create a header/footer, then remove it again, word
+ // will leave \r\r. Turn these back into an empty string,
+ // which is more what you'd expect
+ if(text.equals("\r\r")) {
+ return "";
+ }
+
return text;
}
assertEquals(
"This is some text on the first page\n" +
-"It’s in times new roman, font size 10, all normal\n" +
+"It\u2019s in times new roman, font size 10, all normal\n" +
"" +
"This is in bold and italic\n" +
-"It’s Arial, 20 point font\n" +
-"It’s in the second textbox on the first page\n" +
+"It\u2019s Arial, 20 point font\n" +
+"It\u2019s in the second textbox on the first page\n" +
"" +
"This is the second page\n\n" +
"" +
, text
);
}
+
+    /**
+     * We have the same file saved for Publisher 98, Publisher
+     *  2000 and Publisher 2007. Check that the text extracted
+     *  from all three of them agrees.
+     */
+    public void testMultipleVersions() throws Exception {
+        String s2007 = extractTextFrom("Sample.pub");
+        String s2000 = extractTextFrom("Sample2000.pub");
+        String s98 = extractTextFrom("Sample98.pub");
+
+        // Check they all agree
+        assertEquals(s2007, s2000);
+        assertEquals(s2007, s98);
+    }
+
+    /**
+     * Opens the named file from <code>dir</code>, runs the
+     *  text extractor over it, and returns the text. The
+     *  underlying stream is always closed again afterwards.
+     */
+    private String extractTextFrom(String filename) throws Exception {
+        FileInputStream inp = new FileInputStream(new File(dir, filename));
+        try {
+            HPBFDocument doc = new HPBFDocument(inp);
+            return (new PublisherTextExtractor(doc)).getText();
+        } finally {
+            inp.close();
+        }
+    }
}
// TODO - check the contents
}
+
+    /**
+     * Checks that both escher streams can be read from a
+     *  couple of more complex files, and that they hold the
+     *  expected number of escher records.
+     */
+    public void testComplex() throws Exception {
+        assertEscherRecordCounts("SampleBrochure.pub", 30, 19);
+
+        // TODO - check contents
+
+        // Now do another complex file
+        assertEscherRecordCounts("SampleNewsletter.pub", 51, 92);
+    }
+
+    /**
+     * Opens the named file from <code>dir</code>, and checks
+     *  that both the escher stream and the escher delay stream
+     *  are present and hold the given number of records. The
+     *  underlying stream is always closed again afterwards.
+     */
+    private void assertEscherRecordCounts(String filename, int escherCount, int escherDelayCount) throws Exception {
+        FileInputStream inp = new FileInputStream(new File(dir, filename));
+        try {
+            HPBFDocument doc = new HPBFDocument(inp);
+
+            EscherStm es = doc.getEscherStm();
+            EscherDelayStm eds = doc.getEscherDelayStm();
+
+            assertNotNull(es);
+            assertNotNull(eds);
+
+            assertEquals(escherCount, es.getEscherRecords().length);
+            assertEquals(escherDelayCount, eds.getEscherRecords().length);
+        } finally {
+            inp.close();
+        }
+    }
}
extractor = new WordExtractor(doc);
assertEquals(
- "\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n",
+ "This is a simple header, with a \u20ac euro symbol in it.\n\n",
extractor.getHeaderText()
);
text = extractor.getText();
extractor = new WordExtractor(doc);
assertEquals(
- "\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n",
+ "The footer, with Moli\u00e8re, has Unicode in it.\n",
extractor.getFooterText()
);
text = extractor.getText();
assertEquals("", hs.getFirstHeader());
assertEquals("", hs.getEvenHeader());
- assertEquals("\r\r", hs.getOddHeader());
+ assertEquals("", hs.getOddHeader()); // Was \r\r but gets emptied
assertEquals("", hs.getFirstFooter());
public void testUnicode() throws Exception {
HeaderStories hs = new HeaderStories(unicode);
- assertEquals("\r\r", hs.getFirstHeader());
- assertEquals("\r\r", hs.getEvenHeader());
+ assertEquals("", hs.getFirstHeader());
+ assertEquals("", hs.getEvenHeader());
assertEquals("This is a simple header, with a \u20ac euro symbol in it.\r\r\r", hs.getOddHeader());
- assertEquals("\r\r", hs.getFirstFooter());
- assertEquals("\r\r", hs.getEvenFooter());
+ assertEquals("", hs.getFirstFooter());
+ assertEquals("", hs.getEvenFooter());
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
}
assertTrue(head.areFieldsStripped());
// Now even more complex
- head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G");
- assertEquals("HEADER TEXT &G", head.getCenter());
+ head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G&X END");
+ assertEquals("HEADER TEXT END", head.getCenter());
}
/**