From e12de2c3bfc897abf17dbd0bdcbc7e3dc8a3e257 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sat, 30 Aug 2008 14:47:33 +0000 Subject: [PATCH] Various bug fixes, and hpbf updates git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690517 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/changes.xml | 5 ++ .../content/xdocs/hpbf/file-format.xml | 6 ++ .../content/xdocs/hpbf/index.xml | 19 ++-- src/documentation/content/xdocs/index.xml | 4 +- src/documentation/content/xdocs/status.xml | 5 ++ .../org/apache/poi/ddf/EscherBSERecord.java | 16 +++- .../poi/hssf/usermodel/HeaderFooter.java | 10 ++- .../org/apache/poi/hpbf/dev/PLCDumper.java | 90 +++++++++++++++++++ .../apache/poi/hpbf/model/QuillContents.java | 1 + .../apache/poi/hpbf/model/qcbits/QCBit.java | 13 +++ .../poi/hwpf/usermodel/HeaderStories.java | 7 ++ .../extractor/TextPublisherTextExtractor.java | 38 +++++++- .../poi/hpbf/model/TestEscherParts.java | 34 +++++++ .../poi/hwpf/extractor/TestWordExtractor.java | 4 +- .../poi/hwpf/usermodel/TestHeaderStories.java | 10 +-- .../hssf/usermodel/TestHSSFHeaderFooter.java | 4 +- 16 files changed, 242 insertions(+), 24 deletions(-) create mode 100644 src/scratchpad/src/org/apache/poi/hpbf/dev/PLCDumper.java diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index 0b538787d5..c165468ee7 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,11 @@ + Improve empty header or footer handling in HWPF HeaderStories + Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out + Avoid NPE in EscherBSERecord on older escher records + Basic text extraction support in HPBF + Initial, low level support for Publisher files, in the form of HPBF 45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records 45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records 45682 - Fix for cloning of CFRecordsAggregate diff 
--git a/src/documentation/content/xdocs/hpbf/file-format.xml b/src/documentation/content/xdocs/hpbf/file-format.xml index 97d5a33d7c..e08ebbac04 100644 --- a/src/documentation/content/xdocs/hpbf/file-format.xml +++ b/src/documentation/content/xdocs/hpbf/file-format.xml @@ -165,6 +165,12 @@ PL 62 1a 00 00 48 00 00 00 // PL from: 1a62 (6754), len: 48 (72) (the text will then start) +

We think that the first 4 bytes of text describe the + function of the data at the offset. The first short is + then the count of that type, eg the 2nd will have 1. We + think that the second 4 bytes of text describe the format + of the data block at the offset. The format of the text block + is easy, but we're still trying to figure out the others.

diff --git a/src/documentation/content/xdocs/hpbf/index.xml b/src/documentation/content/xdocs/hpbf/index.xml index c74dc23621..01f49f061f 100755 --- a/src/documentation/content/xdocs/hpbf/index.xml +++ b/src/documentation/content/xdocs/hpbf/index.xml @@ -33,11 +33,20 @@ Overview

HPBF is the POI Project's pure Java implementation of the Publisher file format.

-

Currently, HPBF is in the experimental stage, while we try - to figure out the file format. Our initial aim is to provide - a text extractor for the format, with low level code following - after that if demand and developer interest warrant it.

-

At this time, there is no usermodel api or similar.

+

Currently, HPBF is in an early stage, whilst we try to + figure out the file format. So far, we have basic text + extraction support, and are able to read some parts within + the file. Writing is not yet supported, as we are unable + to make sense of the Contents stream, which we think has + lots of offsets to other parts of the file.

+

Our initial aim is to provide a text extractor for the format + (now done), and be able to extract hyperlinks from within + the document (not yet supported). Additional low level + code to process the file format may follow, if + demand and developer interest warrant it.

+

At this time, there is no usermodel api or similar. + There is only low level support for certain parts of + the file, but by no means all of it.

Our current understanding of the file format is documented here.

diff --git a/src/documentation/content/xdocs/index.xml b/src/documentation/content/xdocs/index.xml index 17e4336d02..d5369ba9f7 100644 --- a/src/documentation/content/xdocs/index.xml +++ b/src/documentation/content/xdocs/index.xml @@ -165,8 +165,8 @@
HPBF for Publisher Documents

HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure - Java. At the moment, we are still figuring out the file format, but we hope - to have simple text extraction shortly. Please see the HPBF project page for more information.

diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 8b8f25b809..afbd1c899c 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,11 @@ + Impove empty header or footer handling in HWPF HeaderStories + Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out + Avoid NPE in EscherBSERecord on older escher records + Basic text extractraction support in HPBF + Initial, low level support for Publisher files, in the form of HPBF 45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records 45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records 45682 - Fix for cloning of CFRecordsAggregate diff --git a/src/java/org/apache/poi/ddf/EscherBSERecord.java b/src/java/org/apache/poi/ddf/EscherBSERecord.java index be503d73e7..a1c52b4d48 100644 --- a/src/java/org/apache/poi/ddf/EscherBSERecord.java +++ b/src/java/org/apache/poi/ddf/EscherBSERecord.java @@ -87,9 +87,10 @@ public class EscherBSERecord field_10_unused2 = data[pos + 34]; field_11_unused3 = data[pos + 35]; bytesRemaining -= 36; + int bytesRead = 0; - if (bytesRemaining > 0) - { + if (bytesRemaining > 0) { + // Some older escher formats skip this last record field_12_blipRecord = (EscherBlipRecord) recordFactory.createRecord( data, pos + 36 ); bytesRead = field_12_blipRecord.fillFields( data, pos + 36, recordFactory ); } @@ -168,7 +169,16 @@ public class EscherBSERecord */ public int getRecordSize() { - return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 + 1 + 1 + field_12_blipRecord.getRecordSize() + (remainingData == null ? 
0 : remainingData.length); + int field_12_size = 0; + if(field_12_blipRecord != null) { + field_12_size = field_12_blipRecord.getRecordSize(); + } + int remaining_size = 0; + if(remainingData != null) { + remaining_size = remainingData.length; + } + return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 + + 1 + 1 + field_12_size + remaining_size; } /** diff --git a/src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java b/src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java index 2a2771e442..0e73689701 100644 --- a/src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java +++ b/src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java @@ -247,6 +247,11 @@ public abstract class HeaderFooter { public static String stripFields(String text) { int pos; + // Check we really got something to work on + if(text == null || text.length() == 0) { + return text; + } + // Firstly, do the easy ones which are static for(int i=0; i"); + System.exit(1); + } + PLCDumper dump = new PLCDumper( + new FileInputStream(args[0]) + ); + + System.out.println("Dumping " + args[0]); + dump.dumpPLC(); + } + + private void dumpPLC() { + QuillContents qc = doc.getQuillContents(); + QCBit[] bits = qc.getBits(); + + for(int i=0; i