]> source.dussan.org Git - poi.git/commitdiff
Various bug fixes, and hpbf updates
authorNick Burch <nick@apache.org>
Sat, 30 Aug 2008 14:47:33 +0000 (14:47 +0000)
committerNick Burch <nick@apache.org>
Sat, 30 Aug 2008 14:47:33 +0000 (14:47 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690517 13f79535-47bb-0310-9956-ffa450edef68

16 files changed:
src/documentation/content/xdocs/changes.xml
src/documentation/content/xdocs/hpbf/file-format.xml
src/documentation/content/xdocs/hpbf/index.xml
src/documentation/content/xdocs/index.xml
src/documentation/content/xdocs/status.xml
src/java/org/apache/poi/ddf/EscherBSERecord.java
src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java
src/scratchpad/src/org/apache/poi/hpbf/dev/PLCDumper.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hpbf/model/QuillContents.java
src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCBit.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
src/testcases/org/apache/poi/hssf/usermodel/TestHSSFHeaderFooter.java

index 0b538787d5461a54d06f09cd829bd9b660de7d5b..c165468ee7916d17e5c21ae42dc175880cf54492 100644 (file)
 
                <!-- Don't forget to update status.xml too! -->
         <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
+           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
+           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
+           <action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
+           <action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
            <action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
            <action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
            <action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>
index 97d5a33d7c9c2cbf3c8f3cd0417a6bde508860fc..e08ebbac04db850bca3eb1117179b79643253b21 100644 (file)
@@ -165,6 +165,12 @@ PL   62 1a 00 00 48 00 00 00 // PL   from: 1a62 (6754), len: 48 (72)
 
 (the text will then start)
 </source>
+               <p>We think that the first 4 bytes of text describe
+                the function of the data at the offset. The first short is
+                then the count of that type, e.g. the 2nd will have 1. We
+                think that the second 4 bytes of text describe the format
+                of the data block at the offset. The format of the text block
+                is easy, but we're still trying to figure out the others.</p>
                </section>
        </body>
 </document>
index c74dc23621a9c469fba35855b3b4cea078997207..01f49f061fc7bff3c4847c8eafa44f633fd8b7a3 100755 (executable)
             <title>Overview</title>
 
             <p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
-            <p>Currently, HPBF is in the experimental stage, while we try
-              to figure out the file format. Our initial aim is to provide
-              a text extractor for the format, with low level code following
-              after that if demand and developer interest warrant it.</p>
-                       <p>At this time, there is no <em>usermodel</em> api or similar.</p>
+            <p>Currently, HPBF is in an early stage, whilst we try to
+              figure out the file format. So far, we have basic text
+              extraction support, and are able to read some parts within
+              the file. Writing is not yet supported, as we are unable
+              to make sense of the Contents stream, which we think has
+              lots of offsets to other parts of the file.</p>
+            <p>Our initial aim is to provide a text extractor for the format
+              (now done), and be able to extract hyperlinks from within
+              the document (not yet supported). Additional low level
+              code to process the file format may follow, if demand
+              and developer interest warrant it.</p>
+                       <p>At this time, there is no <em>usermodel</em> api or similar.
+              There is only low level support for certain parts of
+              the file, but by no means all of it.</p>
             <p>Our current understanding of the file format is documented
               <link href="file-format.html">here</link>.</p>
             <note> 
index 17e4336d0231d6328f12713ea047c86521e25f22..d5369ba9f7e8fc05cc1b38a8b9f5d42d90694b25 100644 (file)
         </section>
         <section><title>HPBF for Publisher Documents</title>
        <p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure
-         Java. At the moment, we are still figuring out the file format, but we hope
-      to have simple text extraction shortly. Please see <link
+         Java. It currently only supports reading at a low level for around
+      half of the file parts, and simple text extraction.  Please see <link
            href="./hpbf/index.html">the HPBF project page for more
            information</link>.</p>
         </section>
index 8b8f25b809741ab612c7b29f2573dc84cdf1e81f..afbd1c899cc298e17738e707cfe4bc146f3bd942 100644 (file)
        <!-- Don't forget to update changes.xml too! -->
     <changes>
         <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
+           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
+           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
+           <action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
+           <action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
            <action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
            <action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
            <action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>
index be503d73e7b9a7c2a3f8511e3cbd643cad962bad..a1c52b4d48d2152a28dfb8adbcc4bbd01e22e820 100644 (file)
@@ -87,9 +87,10 @@ public class EscherBSERecord
         field_10_unused2 = data[pos + 34];
         field_11_unused3 = data[pos + 35];
         bytesRemaining -= 36;
+        
         int bytesRead = 0;
-        if (bytesRemaining > 0)
-        {
+        if (bytesRemaining > 0) {
+               // Some older escher formats skip this last record
             field_12_blipRecord = (EscherBlipRecord) recordFactory.createRecord( data, pos + 36 );
             bytesRead = field_12_blipRecord.fillFields( data, pos + 36, recordFactory );
         }
@@ -168,7 +169,16 @@ public class EscherBSERecord
      */
     public int getRecordSize()
     {
-        return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 + 1 + 1 + field_12_blipRecord.getRecordSize() + (remainingData == null ? 0 : remainingData.length);
+       int field_12_size = 0;
+       if(field_12_blipRecord != null) {
+               field_12_size = field_12_blipRecord.getRecordSize(); 
+       }
+       int remaining_size = 0;
+       if(remainingData != null) {
+               remaining_size = remainingData.length;
+       }
+        return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 +
+            1 + 1 + field_12_size + remaining_size;
     }
 
     /**
index 2a2771e4426a80abc5893d4e67c24287bdf0afd9..0e736897013316a7e9c96381c303c76cb4e1107d 100644 (file)
@@ -247,6 +247,11 @@ public abstract class HeaderFooter {
     public static String stripFields(String text) {
        int pos;
        
+       // Check we really got something to work on
+       if(text == null || text.length() == 0) {
+               return text;
+       }
+       
        // Firstly, do the easy ones which are static
        for(int i=0; i<Field.ALL_FIELDS.size(); i++) {
                String seq = ((Field)Field.ALL_FIELDS.get(i)).sequence;
@@ -257,6 +262,7 @@ public abstract class HeaderFooter {
        }
        
        // Now do the tricky, dynamic ones
+       // These are things like font sizes and font names
        text = text.replaceAll("\\&\\d+", "");
        text = text.replaceAll("\\&\".*?,.*?\"", "");
        
@@ -292,9 +298,9 @@ public abstract class HeaderFooter {
     public static final Field TIME_FIELD = new Field("&T");
     public static final Field NUM_PAGES_FIELD = new Field("&N");
     
-    public static final Field PICTURE_FIELD = new Field("&P");
+    public static final Field PICTURE_FIELD = new Field("&G");
     
-    public static final PairField BOLD_FIELD = new PairField("&B"); // PAID
+    public static final PairField BOLD_FIELD = new PairField("&B");
     public static final PairField ITALIC_FIELD = new PairField("&I");
     public static final PairField STRIKETHROUGH_FIELD = new PairField("&S");
     public static final PairField SUBSCRIPT_FIELD = new PairField("&Y");
diff --git a/src/scratchpad/src/org/apache/poi/hpbf/dev/PLCDumper.java b/src/scratchpad/src/org/apache/poi/hpbf/dev/PLCDumper.java
new file mode 100644 (file)
index 0000000..368755e
--- /dev/null
@@ -0,0 +1,90 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.dev;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.ddf.DefaultEscherRecordFactory;
+import org.apache.poi.ddf.EscherRecord;
+import org.apache.poi.hpbf.HPBFDocument;
+import org.apache.poi.hpbf.model.QuillContents;
+import org.apache.poi.hpbf.model.qcbits.QCBit;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.HexDump;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
+
+/**
+ * Development tool that dumps the "PLC " QC Bits of an HPBF
+ *  (Publisher) file as hex, while we try to figure out what
+ *  their format is. Run it via main() with a filename.
+ */
+public class PLCDumper {
+       private HPBFDocument doc; // the document whose QC Bits get dumped
+       private QuillContents qc; // set in the constructor; dumpPLC() re-fetches it into a local of the same name
+       
+       public PLCDumper(HPBFDocument doc) { // wraps an already-opened document
+               this.doc = doc;
+               qc = doc.getQuillContents();
+       }
+       public PLCDumper(POIFSFileSystem fs) throws IOException { // builds the HPBFDocument from the filesystem, then delegates
+               this(new HPBFDocument(fs));
+       }
+       public PLCDumper(InputStream inp) throws IOException { // builds the POIFSFileSystem from the stream, then delegates
+               this(new POIFSFileSystem(inp));
+       }
+       
+       public static void main(String[] args) throws Exception { // entry point: args[0] is the file to dump
+               if(args.length < 1) { // no filename given: print usage to stderr and exit with status 1
+                       System.err.println("Use:");
+                       System.err.println("  PLCDumper <filename>");
+                       System.exit(1);
+               }
+               PLCDumper dump = new PLCDumper(
+                               new FileInputStream(args[0]) // NOTE(review): this stream is never closed in this method
+               );
+               
+               System.out.println("Dumping " + args[0]);
+               dump.dumpPLC();
+       }
+       
+       private void dumpPLC() { // dumps every non-null bit whose bit type is "PLC "
+               QuillContents qc = doc.getQuillContents(); // local shadows the qc field
+               QCBit[] bits = qc.getBits();
+               
+               for(int i=0; i<bits.length; i++) {
+                       if(bits[i] == null) continue; // skip null entries
+                       if(bits[i].getBitType().equals("PLC ")) { // note the trailing space in the type name
+                               dumpBit(bits[i], i);
+                       }
+               }
+       }
+       
+       private void dumpBit(QCBit bit, int index) { // prints a summary of one bit, then a hex dump of its data
+               System.out.println("");
+               System.out.println("Dumping " + bit.getBitType() + " bit at " + index);
+               System.out.println("  Is a " + bit.getThingType() + ", number is " + bit.getOptA());
+               System.out.println("  Starts at " + bit.getDataOffset() + " (" + Integer.toHexString(bit.getDataOffset()) + ")"); // offset in decimal, then hex
+               System.out.println("  Runs for  " + bit.getLength() + " (" + Integer.toHexString(bit.getLength()) + ")"); // length in decimal, then hex
+               
+               System.out.println(HexDump.dump(bit.getData(), 0, 0));
+       }
+}
index ae626fcae39a1534639a4ef72c3c5cd5479a2de4..b8d4ad298ad7541ad4b1fda003905e2145f96689 100644 (file)
@@ -70,6 +70,7 @@ public final class QuillContents extends HPBFPart {
                                bits[i].setOptA(optA);
                                bits[i].setOptB(optB);
                                bits[i].setOptC(optC);
+                               bits[i].setDataOffset(from);
                        } else {
                                // Doesn't have data
                        }
index 61c7955f66910a9288e9f3f8299c0b5fb682a82d..d6a5608bb4f41637b85f1b5173da0f47aa7e7e96 100644 (file)
@@ -28,6 +28,8 @@ public abstract class QCBit {
        protected int optB;
        protected int optC;
        
+       protected int dataOffset;
+       
        public QCBit(String thingType, String bitType, byte[] data) {
                this.thingType = thingType;
                this.bitType = bitType;
@@ -66,4 +68,15 @@ public abstract class QCBit {
        public void setOptC(int optC) {
                this.optC = optC;
        }
+
+       public int getDataOffset() {
+               return dataOffset;
+       }
+       public void setDataOffset(int offset) {
+               this.dataOffset = offset;
+       }
+       
+       public int getLength() {
+               return data.length;
+       }
 }
index 83003617201be1cd774efd773f5334bee5a0bb49..574c065ec3693ede2e9bbc72bee729dd72e92000 100644 (file)
@@ -167,6 +167,13 @@ public class HeaderStories {
                if(stripFields) {
                        return Range.stripFields(text);
                }
+               // If you create a header/footer, then remove it again, word
+               //  will leave \r\r. Turn these back into an empty string,
+               //  which is more what you'd expect
+               if(text.equals("\r\r")) {
+                       return "";
+               }
+               
                return text;
        }
        
index 96396e10734df7b1cec12ec952fbeec48c0e928d..d5b4712227b2a8f1b6948b476ec69d3b818f830d 100644 (file)
@@ -59,11 +59,11 @@ public class TextPublisherTextExtractor extends TestCase {
                
                assertEquals(
 "This is some text on the first page\n" +
-"Its in times new roman, font size 10, all normal\n" +
+"It\u2019s in times new roman, font size 10, all normal\n" +
 "" +
 "This is in bold and italic\n" +
-"Its Arial, 20 point font\n" +
-"Its in the second textbox on the first page\n" +
+"It\u2019s Arial, 20 point font\n" +
+"It\u2019s in the second textbox on the first page\n" +
 "" +
 "This is the second page\n\n" +
 "" +
@@ -102,4 +102,36 @@ public class TextPublisherTextExtractor extends TestCase {
                                , text
                );
        }
+       
+       /**
+        * We have the same file saved for Publisher 98, Publisher
+        *  2000 and Publisher 2007. Check they all agree.
+        * @throws Exception
+        */
+       public void testMultipleVersions() throws Exception {
+               File f;
+               HPBFDocument doc;
+               
+               f = new File(dir, "Sample.pub");
+               doc = new HPBFDocument(
+                               new FileInputStream(f)
+               );
+               String s2007 = (new PublisherTextExtractor(doc)).getText();
+               
+               f = new File(dir, "Sample2000.pub");
+               doc = new HPBFDocument(
+                               new FileInputStream(f)
+               );
+               String s2000 = (new PublisherTextExtractor(doc)).getText();
+               
+               f = new File(dir, "Sample98.pub");
+               doc = new HPBFDocument(
+                               new FileInputStream(f)
+               );
+               String s98 = (new PublisherTextExtractor(doc)).getText();
+               
+               // Check they all agree
+               assertEquals(s2007, s2000);
+               assertEquals(s2007, s98);
+       }
 }
index dbaf46c649666045c1fe25483ab19395989ee55b..631095007d6ac8f0f7ef23b3d66dc7a405781857 100644 (file)
@@ -47,4 +47,38 @@ public class TestEscherParts extends TestCase {
                
                // TODO - check the contents
        }
+       
+       public void testComplex() throws Exception {
+               File f = new File(dir, "SampleBrochure.pub");
+               HPBFDocument doc = new HPBFDocument(
+                               new FileInputStream(f)
+               );
+
+               EscherStm es = doc.getEscherStm();
+               EscherDelayStm eds = doc.getEscherDelayStm();
+               
+               assertNotNull(es);
+               assertNotNull(eds);
+               
+               assertEquals(30, es.getEscherRecords().length);
+               assertEquals(19, eds.getEscherRecords().length);
+               
+               // TODO - check contents
+               
+               
+               // Now do another complex file
+               f = new File(dir, "SampleNewsletter.pub");
+               doc = new HPBFDocument(
+                               new FileInputStream(f)
+               );
+
+               es = doc.getEscherStm();
+               eds = doc.getEscherDelayStm();
+               
+               assertNotNull(es);
+               assertNotNull(eds);
+               
+               assertEquals(51, es.getEscherRecords().length);
+               assertEquals(92, eds.getEscherRecords().length);
+       }
 }
index 704b4d4dd26793f91b11caa9b6030c0a244bb03e..a1b78752f818d4807b5049d5c616f4bc306afb84 100644 (file)
@@ -183,7 +183,7 @@ public class TestWordExtractor extends TestCase {
        extractor = new WordExtractor(doc);
        
        assertEquals(
-                       "\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n",
+                       "This is a simple header, with a \u20ac euro symbol in it.\n\n",
                        extractor.getHeaderText()
        );
        text = extractor.getText();
@@ -217,7 +217,7 @@ public class TestWordExtractor extends TestCase {
        extractor = new WordExtractor(doc);
        
        assertEquals(
-                       "\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n",
+                       "The footer, with Moli\u00e8re, has Unicode in it.\n",
                        extractor.getFooterText()
        );
        text = extractor.getText();
index 404f6e47a402b007b4db5344d2d3a6be2bbaac86..e68352b508676ef70c4ae5ea95b71c901d4719b4 100644 (file)
@@ -123,7 +123,7 @@ public class TestHeaderStories extends TestCase {
        
                assertEquals("", hs.getFirstHeader());
                assertEquals("", hs.getEvenHeader());
-               assertEquals("\r\r", hs.getOddHeader());
+               assertEquals("", hs.getOddHeader()); // Was \r\r but gets emptied
 
                
                assertEquals("", hs.getFirstFooter());
@@ -181,13 +181,13 @@ public class TestHeaderStories extends TestCase {
     public void testUnicode() throws Exception {
        HeaderStories hs = new HeaderStories(unicode);
        
-               assertEquals("\r\r", hs.getFirstHeader());
-               assertEquals("\r\r", hs.getEvenHeader());
+               assertEquals("", hs.getFirstHeader());
+               assertEquals("", hs.getEvenHeader());
                assertEquals("This is a simple header, with a \u20ac euro symbol in it.\r\r\r", hs.getOddHeader());
 
                
-               assertEquals("\r\r", hs.getFirstFooter());
-               assertEquals("\r\r", hs.getEvenFooter());
+               assertEquals("", hs.getFirstFooter());
+               assertEquals("", hs.getEvenFooter());
                assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
     }
     
index caa75633b0668ccb0e1c9c3233ecf4e89a980660..3ae1efcaaba00066b25632c40b1facaac13d6859 100644 (file)
@@ -87,8 +87,8 @@ public final class TestHSSFHeaderFooter extends TestCase {
        assertTrue(head.areFieldsStripped());
        
        // Now even more complex
-       head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G");
-       assertEquals("HEADER TEXT &G", head.getCenter());
+       head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G&X END");
+       assertEquals("HEADER TEXT  END", head.getCenter());
        }
 
        /**