--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ====================================================================
+-->
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
+
+<document>
+ <header>
+ <title>POI-HPBF - A Guide to the Publisher File Format</title>
+ <subtitle>Overview</subtitle>
+ <authors>
+ <person name="Nick Burch" email="nick at torchbox dot com"/>
+ </authors>
+ </header>
+
+ <body>
+ <section><title>Document Streams</title>
+ <p>
+ The file is made up of a number of POIFS streams. A typical
+ file will be made up as follows:
+ </p>
+<source>
+Root Entry -
+ Objects -
+ (no children)
+ SummaryInformation <(0x05)SummaryInformation>
+ DocumentSummaryInformation <(0x05)DocumentSummaryInformation>
+ Escher -
+ EscherStm
+ EscherDelayStm
+ Quill -
+ QuillSub -
+ CONTENTS
+ CompObj <(0x01)CompObj>
+ Envelope
+ Contents
+ Internal <(0x03)Internal>
+ CompObj <(0x01)CompObj>
+ VBA -
+ (no children)
+</source>
+ </section>
+ <section><title>Changing Text</title>
+ <p>If you make a change to the text of a file, but do not change
+ how much text there is, then the <em>CONTENTS</em> stream
+ will undergo a small change, and the <em>Contents</em> stream
+ will undergo a large change.</p>
+ <p>If you make a change to the text of a file, and change the
+ amount of text there is, then both the <em>Contents</em> and
+ the <em>CONTENTS</em> streams change.</p>
+ </section>
+ <section><title>Changing Shapes</title>
+ <p>If you alter the size of a textbox, but make no text changes,
+ then both <em>Contents</em> and <em>CONTENTS</em> streams
+ change. There are no changes to the Escher streams.</p>
+ <p>If you set the background colour of a textbox, but make
+ no changes to the text, then ... (TODO: record which streams change).</p>
+ </section>
+ </body>
+</document>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ====================================================================
+-->
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
+
+<document>
+ <header>
+ <title>POI-HPBF - Java API To Access Microsoft Publisher Format Files</title>
+ <subtitle>Overview</subtitle>
+ <authors>
+ <person name="Nick Burch" email="nick at apache dot org"/>
+ </authors>
+ </header>
+
+ <body>
+ <section>
+ <title>Overview</title>
+
+ <p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
+ <p>Currently, HPBF is in the experimental stage, while we try
+ to figure out the file format. Our initial aim is to provide
+ a text extractor for the format, with low level code following
+ after that if demand and developer interest warrant it.</p>
+ <p>At this time, there is no <em>usermodel</em> api or similar.</p>
+ <p>Our current understanding of the file format is documented
+ <link href="file-format.html">here</link>.</p>
+ <note>
+ This code currently lives in the
+ <link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link>
+ of the POI SVN repository.
+ Ensure that you have the scratchpad jar or the scratchpad
+ build area in your
+ classpath before experimenting with this code.
+ </note>
+ </section>
+ </body>
+</document>
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
/**
* For dumping out the contents of HPBF (Publisher)
return d;
}
+ /**
+ * Dumps out the given number of bytes as hex,
+ * two chars per byte (lower case, zero padded),
+ * with a single space after every byte,
+ * including the last one.
+ * For example, the bytes 0x0a 0xff come back
+ * as "0a ff ".
+ *
+ * @param data the array to read the bytes from
+ * @param offset index into data of the first byte to dump
+ * @param len how many bytes to dump
+ * @return the hex text, or an empty string if len is not positive
+ */
+ private String dumpBytes(byte[] data, int offset, int len) {
+ StringBuffer ret = new StringBuffer();
+ for(int i=0; i<len; i++) {
+ int j = i + offset;
+ int b = data[j];
+ if(b < 0) { b += 256; } // bytes are signed, map -128..-1 onto 128..255
+
+ String bs = Integer.toHexString(b);
+ if(bs.length() == 1) // pad so that every byte takes two chars
+ ret.append('0');
+ ret.append(bs);
+ ret.append(' ');
+ }
+ return ret.toString();
+ }
+
public static void main(String[] args) throws Exception {
if(args.length < 1) {
System.err.println("Use:");
System.out.println("");
System.out.println("CONTENTS - " + data.length + " bytes long:");
- // Dump out up to 0x200
+ // Between the start and 0x200 we have
+ // CHNKINK(space) + 24 bytes + 0x1800
+ // TEXT + 6 bytes
+ // TEXT + 8 bytes + 0x1800
+ // STSH + 6 bytes
+ // STSH + 8 bytes + 0x1800
+ // STSH + 6 bytes
+ // STSH + 8 bytes + 0x1800
+ // but towards 0x200 the pattern may
+ // break down a little bit
+
+ // After the second of a given type,
+ // it seems to be 4 bytes giving the start,
+ // then 4 bytes giving the length, then
+ // 18 00
+ System.out.println(
+ new String(data, 0, 8) +
+ dumpBytes(data, 8, 0x22-8)
+ );
+
+ int pos = 0x22;
+ boolean sixNotTen = true;
+ while(pos < 0x200) {
+ String text = new String(data, pos, 4);
+ int blen = 10;
+ if(sixNotTen)
+ blen = 6;
+ System.out.println(
+ text + " " + dumpBytes(data, pos+4, blen)
+ );
+
+ pos += 4 + blen;
+ sixNotTen = ! sixNotTen;
+ }
+
+ // Text from 0x200 onwards until we get
+ // to \r(00)\n(00)(00)(00)
+ int textStop = -1;
+ for(int i=0x200; i<data.length-2 && textStop == -1; i++) {
+ if(data[i] == 0 && data[i+1] == 0 && data[i+2] == 0) {
+ textStop = i;
+ }
+ }
+ if(textStop > 0) {
+ int len = (textStop - 0x200) / 2;
+ System.out.println("");
+ System.out.println(
+ StringUtil.getFromUnicodeLE(data, 0x200, len)
+ );
+ }
+
+ // The font list comes slightly later
- // Text from 0x200 onwards for a bit
+ // The hyperlinks may come before the fonts,
+ // or slightly in front
}
protected void dump001CompObj(DirectoryNode dir) {