git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@686640 13f79535-47bb-0310-9956-ffa450edef68tags/REL_3_2_FINAL
@@ -41,6 +41,7 @@ | |||
<menu-item label="HSLF" href="hslf/index.html"/> | |||
<menu-item label="HSMF" href="hsmf/index.html"/> | |||
<menu-item label="HDGF" href="hdgf/index.html"/> | |||
<menu-item label="HPBF" href="hpbf/index.html"/> | |||
<menu-item label="POI-Ruby" href="poi-ruby.html"/> | |||
<menu-item label="POI-Utils" href="utils/index.html"/> | |||
<menu-item label="Text Extraction" href="text-extraction.html"/> |
@@ -38,19 +38,19 @@ | |||
Root Entry - | |||
Objects - | |||
(no children) | |||
SummaryInformation <(0x05)SummaryInformation> | |||
DocumentSummaryInformation <(0x05)DocumentSummaryInformation> | |||
SummaryInformation <(0x05)SummaryInformation> | |||
DocumentSummaryInformation <(0x05)DocumentSummaryInformation> | |||
Escher - | |||
EscherStm | |||
EscherDelayStm | |||
Quill - | |||
QuillSub - | |||
CONTENTS | |||
CompObj <(0x01)CompObj> | |||
CompObj <(0x01)CompObj> | |||
Envelope | |||
Contents | |||
Internal <(0x03)Internal> | |||
CompObj <(0x01)CompObj> | |||
Internal <(0x03)Internal> | |||
CompObj <(0x01)CompObj> | |||
VBA - | |||
(no children) | |||
</source> | |||
@@ -69,7 +69,7 @@ Root Entry - | |||
then both <em>Contents</em> and <em>CONTENTS</em> streams | |||
change. There are no changes to the Escher streams.</p> | |||
<p>If you set the background colour of a textbox, but make | |||
no changes to the text, | |||
no changes to the text, (to finish off)</p> | |||
</section> | |||
<section><title>Structure of CONTENTS</title> | |||
<p>First we have "CHNKINK ", followed by 24 bytes.</p> | |||
@@ -162,6 +162,8 @@ PL 62 1a 00 00 48 00 00 00 // PL from: 1a62 (6754), len: 48 (72) | |||
00 00 00 00 00 00 | |||
00 00 00 00 00 00 00 00 | |||
00 00 00 00 00 00 00 00 | |||
(the text will then start) | |||
</source> | |||
</section> | |||
</body> |
@@ -39,7 +39,7 @@ | |||
after that if demand and developer interest warrant it.</p> | |||
<p>At this time, there is no <em>usermodel</em> api or similar.</p> | |||
<p>Our current understanding of the file format is documented | |||
<link href="file-format.html">here</a>.</p> | |||
<link href="file-format.html">here</link>.</p> | |||
<note> | |||
This code currently lives in the | |||
<link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link> |
@@ -146,26 +146,30 @@ | |||
href="./hslf/index.html">the HSLF project page for more | |||
information</link>.</p> | |||
</section> | |||
<section><title>HDGF for Visio Documents</title> | |||
<p>HDGF is our port of the Microsoft Visio 97(-2003) file format to pure | |||
Java. It currently only supports reading at a very low level, and | |||
simple text extraction. Please see <link | |||
href="./hdgf/index.html">the HDGF project page for more | |||
information</link>.</p> | |||
</section> | |||
<section><title>HPSF for Document Properties</title> | |||
<p>HPSF is our port of the OLE 2 property set format to pure | |||
Java. Property sets are mostly used to store a document's properties | |||
(title, author, date of last modification etc.), but they can be used | |||
for application-specific purposes as well.</p> | |||
<p>HPSF supports reading and writing of properties. However, you will | |||
need to be using version 3.0 of POI to utilise the write support.</p> | |||
<p>HPSF supports both reading and writing of properties.</p> | |||
<p>Please see <link href="./hpsf/index.html">the HPSF project | |||
page</link> for more information.</p> | |||
</section> | |||
<section><title>HDGF for Visio Documents</title> | |||
<p>HDGF is our port of the Microsoft Visio 97(-2003) file format to pure | |||
Java. It currently only supports reading at a very low level, and | |||
simple text extraction. Please see <link | |||
href="./hdgf/index.html">the HDGF project page for more | |||
information</link>.</p> | |||
</section> | |||
<section><title>HPBF for Publisher Documents</title> | |||
<p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure | |||
Java. At the moment, we are still figuring out the file format, but we hope | |||
to have simple text extraction shortly. Please see <link | |||
href="./hpbf/index.html">the HPBF project page for more | |||
information</link>.</p> | |||
</section> | |||
</section> | |||
<section><title>Contributing </title> |
@@ -310,8 +310,30 @@ public class HPBFDumper { | |||
); | |||
} | |||
} | |||
// Text | |||
System.out.println(""); | |||
System.out.println("TEXT:"); | |||
System.out.println(text); | |||
System.out.println(""); | |||
// All the others | |||
for(int i=0; i<20; i++) { | |||
if(startType[i] == null) { | |||
continue; | |||
} | |||
int start = from[i]; | |||
System.out.println( | |||
startType[i] + " -> " + endType[i] + | |||
" @ " + Integer.toHexString(start) + | |||
" (" + start + ")" | |||
); | |||
System.out.println("\t" + dumpBytes(data, start, 4)); | |||
System.out.println("\t" + dumpBytes(data, start+4, 4)); | |||
System.out.println("\t" + dumpBytes(data, start+8, 4)); | |||
System.out.println("\t(etc)"); | |||
} | |||
} | |||
protected void dump001CompObj(DirectoryNode dir) { |
@@ -0,0 +1,34 @@ | |||
This is some text on the first page | |||
It’s in times new roman, font size 10, all normal | |||
We’ve added some more text in here, to push all the offsets about a bit. | |||
This is in bold and italic | |||
It’s Arial, 20 point font | |||
It’s in the second textbox on the first page | |||
Ditto with more text in here. | |||
This is the second page | |||
It is also times new roman, 10 point | |||
Table on page 2 Top right | |||
P2 table left P2 table right | |||
Bottom Left Bottom Right | |||
This text is on page two | |||
This is a link to Apache POI | |||
More normal text | |||
Link to a file | |||
More text, more hyperlinks | |||
email link | |||
Final hyperlink | |||
Within doc to page 1 |
@@ -0,0 +1,29 @@ | |||
This is some text on the first page | |||
It’s in times new roman, font size 10, all normal | |||
This is in bold and italic | |||
It’s Arial, 20 point font | |||
It’s in the second textbox on the first page | |||
This is the second page12345678 | |||
It is also times new roman, 10 point | |||
Table on page 2 Top right | |||
P2 table left P2 table right | |||
Bottom Left Bottom Right | |||
This text is on page two | |||
This is a link to Apache POI | |||
More normal text | |||
Link to a file | |||
More text, more hyperlinks | |||
email link | |||
Final hyperlink | |||
Within doc to page 1 |
@@ -0,0 +1,29 @@ | |||
This is some text on the first page | |||
It’s in times new roman, font size 10, all normal | |||
This is in bold and italic | |||
It’s Arial, 20 point font | |||
It’s in the second textbox on the first page | |||
This is the second page | |||
It is also times new roman, 10 point | |||
Table on page 2 Top right | |||
P2 table left P2 table right | |||
Bottom Left Bottom Right | |||
This text is on page two | |||
This is a link to Apache POI | |||
More normal text | |||
Link to a file | |||
More text, more hyperlinks | |||
email link | |||
Final hyperlink | |||
Within doc to page 1 |