More Word 6 / Word 95 Support

author Nick Burch <nick@apache.org>

Fri, 2 Jul 2010 20:59:30 +0000 (20:59 +0000)

committer Nick Burch <nick@apache.org>

Fri, 2 Jul 2010 20:59:30 +0000 (20:59 +0000)
author Nick Burch <nick@apache.org>
Fri, 2 Jul 2010 20:59:30 +0000 (20:59 +0000)
committer Nick Burch <nick@apache.org>
Fri, 2 Jul 2010 20:59:30 +0000 (20:59 +0000)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index f55a8537106a96c8410eb017fca99b5b5c394298..acea33a3af608f03aa6769eae9b5df2696197d65 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
  
      <changes>
          <release version="3.7-beta2" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">Paragraph level as well as whole-file text extraction for Word 6/95 files through HWPF</action>
             <action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
             <action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
             <action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java

index 00d1162e9077b8455d109585cf4fdd15e0c11508..877075158a5fd84cc3e9521df97f40b44a648280 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@@ -31,7 +31,6 @@ import org.apache.poi.hwpf.model.ComplexFileTable;
  import org.apache.poi.hwpf.model.DocumentProperties;
  import org.apache.poi.hwpf.model.EscherRecordHolder;
  import org.apache.poi.hwpf.model.FSPATable;
-import org.apache.poi.hwpf.model.FileInformationBlock;
  import org.apache.poi.hwpf.model.FontTable;
  import org.apache.poi.hwpf.model.GenericPropertyNode;
  import org.apache.poi.hwpf.model.ListTables;
@@ -83,24 +82,6 @@ public final class HWPFDocument extends HWPFDocumentCore
  
    protected TextPieceTable _tpt;
  
-  /** Contains formatting properties for text*/
-  protected CHPBinTable _cbt;
-
-  /** Contains formatting properties for paragraphs*/
-  protected PAPBinTable _pbt;
-
-  /** Contains formatting properties for sections.*/
-  protected SectionTable _st;
-
-  /** Holds styles for this document.*/
-  protected StyleSheet _ss;
-
-  /** Holds fonts for this document.*/
-  protected FontTable _ft;
-
-  /** Hold list tables */
-  protected ListTables _lt;
-
    /** Holds the save history for this document. */
    protected SavedByTable _sbt;
    
@@ -277,15 +258,11 @@ public final class HWPFDocument extends HWPFDocumentCore
      }
    }
  
-  public StyleSheet getStyleSheet()
+  public TextPieceTable getTextTable()
    {
-    return _ss;
+    return _cft.getTextPieceTable();
    }
  
-  public FileInformationBlock getFileInformationBlock()
-  {
-    return _fib;
-  }
    public CPSplitCalculator getCPSplitCalculator()
    {
         return _cpSplit;
@@ -390,11 +367,6 @@ public final class HWPFDocument extends HWPFDocumentCore
      return length;
    }
  
-  public ListTables getListTables()
-  {
-    return _lt;
-  }
-
    /**
     * Gets a reference to the saved -by table, which holds the save history for the document.
     *
@@ -591,26 +563,6 @@ public final class HWPFDocument extends HWPFDocumentCore
      pfs.writeFilesystem(out);
    }
  
-  public CHPBinTable getCharacterTable()
-  {
-    return _cbt;
-  }
-
-  public PAPBinTable getParagraphTable()
-  {
-    return _pbt;
-  }
-
-  public SectionTable getSectionTable()
-  {
-    return _st;
-  }
-
-  public TextPieceTable getTextTable()
-  {
-    return _cft.getTextPieceTable();
-  }
-
    public byte[] getDataStream()
    {
      return _dataStream;
@@ -629,11 +581,6 @@ public final class HWPFDocument extends HWPFDocumentCore
      return _lt.addList(list.getListData(), list.getOverride());
    }
  
-  public FontTable getFontTable()
-  {
-    return _ft;
-  }
-
    public void delete(int start, int length)
    {
      Range r = new Range(start, start + length, this);
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java

index af17cc2ed257c4316085903133d77d74cb3c4208..f9ebc378013e6c8310ddc67bd28b0f03427b9d6e 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
@@ -23,7 +23,15 @@ import java.io.PushbackInputStream;
  
  import org.apache.poi.EncryptedDocumentException;
  import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.CHPBinTable;
  import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.hwpf.model.FontTable;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.model.PAPBinTable;
+import org.apache.poi.hwpf.model.SectionTable;
+import org.apache.poi.hwpf.model.StyleSheet;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.usermodel.Range;
  import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.DocumentEntry;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -40,6 +48,24 @@ public abstract class HWPFDocumentCore extends POIDocument
    /** The FIB */
    protected FileInformationBlock _fib;
  
+  /** Holds styles for this document.*/
+  protected StyleSheet _ss;
+
+  /** Contains formatting properties for text*/
+  protected CHPBinTable _cbt;
+
+  /** Contains formatting properties for paragraphs*/
+  protected PAPBinTable _pbt;
+
+  /** Contains formatting properties for sections.*/
+  protected SectionTable _st;
+
+  /** Holds fonts for this document.*/
+  protected FontTable _ft;
+
+  /** Hold list tables */
+  protected ListTables _lt;
+
    /** main document stream buffer*/
    protected byte[] _mainStream;
  
@@ -123,6 +149,44 @@ public abstract class HWPFDocumentCore extends POIDocument
      }
    }
  
+  /**
+   * Returns the range which covers the whole of the
+   *  document, but excludes any headers and footers.
+   */
+  public abstract Range getRange();
+  
+  public abstract TextPieceTable getTextTable();
+  
+  public CHPBinTable getCharacterTable()
+  {
+    return _cbt;
+  }
+
+  public PAPBinTable getParagraphTable()
+  {
+    return _pbt;
+  }
+
+  public SectionTable getSectionTable()
+  {
+    return _st;
+  }
+
+  public StyleSheet getStyleSheet()
+  {
+    return _ss;
+  }
+
+  public ListTables getListTables()
+  {
+    return _lt;
+  }
+
+  public FontTable getFontTable()
+  {
+    return _ft;
+  }
+
    public FileInformationBlock getFileInformationBlock()
    {
      return _fib;
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java

index 42cff2ace805d1e8291817724e566de5872aecb6..211dc9a6b7aeb2e613491a67ae0f8ed4d1f427c8 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@@ -18,15 +18,15 @@ package org.apache.poi.hwpf;
  
  import java.io.IOException;
  import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.List;
  
-import org.apache.poi.hwpf.model.CHPX;
  import org.apache.poi.hwpf.model.ComplexFileTable;
  import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.OldPAPBinTable;
+import org.apache.poi.hwpf.model.OldSectionTable;
  import org.apache.poi.hwpf.model.PieceDescriptor;
  import org.apache.poi.hwpf.model.TextPiece;
  import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.usermodel.Range;
  import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  import org.apache.poi.util.LittleEndian;
@@ -34,11 +34,9 @@ import org.apache.poi.util.LittleEndian;
  /**
   * Provides very simple support for old (Word 6 / Word 95)
   *  files.
- * TODO Provide a way to get at the properties associated
- *  with each block of text
   */
  public class HWPFOldDocument extends HWPFDocumentCore {
-    private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>(); 
+    private TextPieceTable tpt;
      
      public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
          this(fs.getRoot(), fs);
@@ -49,14 +47,19 @@ public class HWPFOldDocument extends HWPFDocumentCore {
          super(directory, fs);
          
          // Where are things?
+        int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
+        int sedTableSize   = LittleEndian.getInt(_mainStream, 0x8c);
          int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
-        int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
+        int chpTableSize   = LittleEndian.getInt(_mainStream, 0xbc);
+        int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
+        int papTableSize   = LittleEndian.getInt(_mainStream, 0xc4);
+        //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
+        //int shfTableSize   = LittleEndian.getInt(_mainStream, 0x64);
          int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
          
          // We need to get hold of the text that makes up the
          //  document, which might be regular or fast-saved
          StringBuffer text = new StringBuffer();
-        TextPieceTable tpt;
          if(_fib.isFComplex()) {
              ComplexFileTable cft = new ComplexFileTable(
                      _mainStream, _mainStream,
@@ -68,11 +71,15 @@ public class HWPFOldDocument extends HWPFDocumentCore {
                  text.append( tp.getStringBuffer() );
              }
          } else {
+            // TODO Discover if these older documents can ever hold Unicode Strings?
+            //  (We think not, because they seem to lack a Piece table)
              // TODO Build the Piece Descriptor properly
-            // TODO Can these old documents ever contain Unicode strings?
+            //  (We have to fake it, as they don't seem to have a proper Piece table)
              PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
              pd.setFilePosition(_fib.getFcMin());
  
+            // Generate a single Text Piece Table, with a single Text Piece
+            //  which covers all the (8 bit only) text in the file
              tpt = new TextPieceTable();
              byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
              System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
@@ -85,51 +92,34 @@ public class HWPFOldDocument extends HWPFDocumentCore {
          }
          
          // Now we can fetch the character and paragraph properties
-        OldCHPBinTable chpTable = new OldCHPBinTable(
+        _cbt = new OldCHPBinTable(
                  _mainStream, chpTableOffset, chpTableSize,
                  _fib.getFcMin(), tpt
          );
-        
-        // Finally build up runs
-        for(CHPX chpx : chpTable.getTextRuns()) {
-            String str = text.substring(chpx.getStart(), chpx.getEnd());
-            contents.add(new TextAndCHPX(str,chpx));
-        }
+        _pbt = new OldPAPBinTable(
+                _mainStream, papTableOffset, papTableSize,
+                _fib.getFcMin(), tpt
+        );
+        _st = new OldSectionTable(
+                _mainStream, sedTableOffset, sedTableSize,
+                _fib.getFcMin(), tpt
+        );
+    }
+    
+    public Range getRange() {
+        // Life is easy when we have no footers, headers or unicode!
+        return new Range(
+                0, _fib.getFcMac() - _fib.getFcMin(), this
+        );
+    }
+
+    public TextPieceTable getTextTable()
+    {
+      return tpt;
      }
  
      @Override
      public void write(OutputStream out) throws IOException {
          throw new IllegalStateException("Writing is not available for the older file formats");
      }
-    
-    /**
-     * Retrieves all our text, in order, along with the
-     *  CHPX information on each bit.
-     * Every entry has the same formatting, but as yet 
-     *  we've no way to tell what the formatting is...
-     * Warnings - this will change as soon as we support
-     *  text formatting!
-     */
-    public List<TextAndCHPX> getContents() {
-        return contents;
-    }
-    
-    /**
-     * Warnings - this will change as soon as we support
-     *  text formatting!
-     */
-    public static class TextAndCHPX {
-        private String text;
-        private CHPX chpx;
-        private TextAndCHPX(String text, CHPX chpx) {
-            this.text = text;
-            this.chpx = chpx;
-        }
-        public String getText() {
-            return text;
-        }
-        public CHPX getChpx() {
-            return chpx;
-        }
-    }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java

index 3ea6d42d467da109f63761de840e86ef50d507bb..b40aa396b59750e50af34349796cbeff6b895f8b 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@@ -22,7 +22,6 @@ import java.io.InputStream;
  
  import org.apache.poi.POIOLE2TextExtractor;
  import org.apache.poi.hwpf.HWPFOldDocument;
-import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
  import org.apache.poi.hwpf.usermodel.Range;
  import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -68,12 +67,41 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
                 this.doc = doc;
         }
  
-    @Override
+    /**
+     * Get the text from the word file, as an array with one String
+     *  per paragraph (or one per text piece if paragraph parsing fails)
+     */
+       public String[] getParagraphText() {
+           String[] ret;
+
+           // Extract using the model code
+           try {
+               Range r = doc.getRange();
+
+               ret = WordExtractor.getParagraphText(r);
+           } catch (Exception e) {
+            // Something's up with turning the text pieces into paragraphs
+            // Fall back to ripping out the text pieces
+               ret = new String[doc.getTextTable().getTextPieces().size()];
+               for(int i=0; i<ret.length; i++) {
+                   ret[i] = doc.getTextTable().getTextPieces().get(i).getStringBuffer().toString();
+                   
+                   // Fix the line endings (replaceAll returns a new String, so keep the result)
+                   ret[i] = ret[i].replaceAll("\r", "\ufffe");
+                   ret[i] = ret[i].replaceAll("\ufffe","\r\n");
+               }
+           }
+
+           return ret;
+       }
+
      public String getText() {
          StringBuffer text = new StringBuffer();
-        for(TextAndCHPX tchpx : doc.getContents()) {
-            text.append( Range.stripFields(tchpx.getText()) );
+        
+        for(String t : getParagraphText()) {
+            text.append(t);
          }
+
          return text.toString();
      }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

index fe57f7c474f9da141ac9305b3b89dfa21a3603b4..eaf70c116a1dc930be2d6c895db34281171cb4bd 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -17,13 +17,12 @@
  
  package org.apache.poi.hwpf.extractor;
  
+import java.io.FileInputStream;
  import java.io.IOException;
  import java.io.InputStream;
-import java.io.FileInputStream;
  import java.io.UnsupportedEncodingException;
-import java.util.Iterator;
-import java.util.Arrays;
  import java.util.ArrayList;
+import java.util.Arrays;
  
  import org.apache.poi.POIOLE2TextExtractor;
  import org.apache.poi.hwpf.HWPFDocument;
@@ -133,7 +132,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
                  return getParagraphText(r);
          }
  
-        private String[] getParagraphText(Range r) {
+        protected static String[] getParagraphText(Range r) {
                  String[] ret;
                  ret = new String[r.numParagraphs()];
                  for (int i = 0; i < ret.length; i++) {
@@ -215,10 +214,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
         public String getTextFromPieces() {
         StringBuffer textBuf = new StringBuffer();
  
-       Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
-       while (textPieces.hasNext()) {
-               TextPiece piece = (TextPiece) textPieces.next();
-
+       for(TextPiece piece : doc.getTextTable().getTextPieces()) {
                 String encoding = "Cp1252";
                 if (piece.isUnicode()) {
                         encoding = "UTF-16LE";
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java

index 840f7c249d8a4ccba2da3a75afc4944971ffc59f..4dd20d69dd4e08a9cbc2e36384ef85bb0045381d 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
@@ -32,7 +32,7 @@ import org.apache.poi.hwpf.sprm.SprmBuffer;
   *
   * @author Ryan Ackley
   */
-public final class CHPBinTable
+public class CHPBinTable
  {
    /** List of character properties.*/
    protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java

index 3c97652e55474e11a950315e403b3c311e88047a..1431cff0321cb04326e08d00042caac10a90e383 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
@@ -17,9 +17,6 @@
  
  package org.apache.poi.hwpf.model;
  
-import java.util.ArrayList;
-import java.util.List;
-
  import org.apache.poi.poifs.common.POIFSConstants;
  import org.apache.poi.util.LittleEndian;
  
@@ -31,11 +28,8 @@ import org.apache.poi.util.LittleEndian;
   * In common with the rest of the old support, it 
   *  is read only
   */
-public final class OldCHPBinTable
+public final class OldCHPBinTable extends CHPBinTable
  {
-  /** List of character properties.*/
-  protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
-
    /**
     * Constructor used to read an old-style binTable
     *  in from a Word document.
@@ -69,9 +63,4 @@ public final class OldCHPBinTable
        }
      }
    }
-
-  public List<CHPX> getTextRuns()
-  {
-    return _textRuns;
-  }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java

new file mode 100644 (file)

index 0000000..37ec575
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java
@@ -0,0 +1,59 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the paragraph formatting 
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it 
+ *  is read only
+ */
+public final class OldPAPBinTable extends PAPBinTable
+{
+  public OldPAPBinTable(byte[] documentStream, int offset,
+                     int size, int fcMin, TextPieceTable tpt)
+  {
+    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+
+    int length = binTable.length();
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = binTable.getProperty(x);
+
+      int pageNum = LittleEndian.getShort(node.getBytes());
+      int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+
+      PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
+        documentStream, pageOffset, fcMin, tpt);
+
+      int fkpSize = pfkp.size();
+
+      for (int y = 0; y < fkpSize; y++)
+      {
+       PAPX papx = pfkp.getPAPX(y);
+        _paragraphs.add(papx);
+      }
+    }
+  }
+}
+
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java

new file mode 100644 (file)

index 0000000..dc992c6
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
@@ -0,0 +1,65 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the section formatting 
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it 
+ *  is read only
+ */
+public final class OldSectionTable extends SectionTable
+{
+  public OldSectionTable(byte[] documentStream, int offset,
+                      int size, int fcMin,
+                      TextPieceTable tpt)
+  {
+    PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
+
+    int length = sedPlex.length();
+
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = sedPlex.getProperty(x);
+      SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);
+
+      int fileOffset = sed.getFc();
+      int startAt = node.getStart();
+      int endAt = node.getEnd();
+
+      // check for the optimization
+      if (fileOffset == 0xffffffff)
+      {
+        _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
+      }
+      else
+      {
+        // The first short at the offset is the size of the grpprl.
+        int sepxSize = LittleEndian.getShort(documentStream, fileOffset);
+        byte[] buf = new byte[sepxSize];
+        fileOffset += LittleEndian.SHORT_SIZE;
+        System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
+        _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
+      }
+    }
+  }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java

index 06b6d3f6af4ca1e8714c761b486f7f34ec645ad6..894210f4b3a1231a9de3be8633000654456ed50a 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@@ -34,7 +34,7 @@ import org.apache.poi.util.LittleEndian;
   *
   * @author Ryan Ackley
   */
-public final class PAPBinTable
+public class PAPBinTable
  {
    protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
    byte[] _dataStream;
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java

index 8d77ba5fb20f0db7e7b7d7b065a12cb43681747d..22777a80e7702538554cd48f741fae18263ddd29 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java
@@ -112,7 +112,11 @@ public final class PAPX extends BytePropertyNode {
      {
        return 0;
      }
-      return LittleEndian.getShort(buf);
+    if (buf.length == 1)
+    {
+      return (short)LittleEndian.getUnsignedByte(buf, 0);
+    }
+    return LittleEndian.getShort(buf);
    }
  
    public SprmBuffer getSprmBuf()
@@ -122,6 +126,11 @@ public final class PAPX extends BytePropertyNode {
  
    public ParagraphProperties getParagraphProperties(StyleSheet ss)
    {
+    if(ss == null) {
+        // TODO Fix up for Word 6/95
+        return new ParagraphProperties();
+    }
+      
      short istd = getIstd();
      ParagraphProperties baseStyle = ss.getParagraphStyle(istd);
      ParagraphProperties props = ParagraphSprmUncompressor.uncompressPAP(baseStyle, getGrpprl(), 2);
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java

index 3b47c1e91cc7bf53199463817b4481d8e95a30a9..987e49ac89effa3732006f089b038524648600db 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java
@@ -27,12 +27,12 @@ import org.apache.poi.hwpf.model.io.*;
  /**
   * @author Ryan Ackley
   */
-public final class SectionTable
+public class SectionTable
  {
    private static final int SED_SIZE = 12;
  
-  protected ArrayList _sections = new ArrayList();
-  protected List _text;
+  protected ArrayList<SEPX> _sections = new ArrayList<SEPX>();
+  protected List<TextPiece> _text;
  
    /** So we can know if things are unicode or not */
    private TextPieceTable tpt;
@@ -84,7 +84,7 @@ public final class SectionTable
      boolean matchAt = false;
      boolean matchHalf = false;
      for(int i=0; i<_sections.size(); i++) {
-       SEPX s = (SEPX)_sections.get(i);
+       SEPX s = _sections.get(i);
         if(s.getEnd() == mainEndsAt) {
                 matchAt = true;
         } else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) {
@@ -94,7 +94,7 @@ public final class SectionTable
      if(! matchAt && matchHalf) {
         System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
          for(int i=0; i<_sections.size(); i++) {
-               SEPX s = (SEPX)_sections.get(i);
+               SEPX s = _sections.get(i);
              GenericPropertyNode node = sedPlex.getProperty(i);
  
                 s.setStart( CPtoFC(node.getStart()) );
@@ -106,12 +106,12 @@ public final class SectionTable
    public void adjustForInsert(int listIndex, int length)
    {
      int size = _sections.size();
-    SEPX sepx = (SEPX)_sections.get(listIndex);
+    SEPX sepx = _sections.get(listIndex);
      sepx.setEnd(sepx.getEnd() + length);
  
      for (int x = listIndex + 1; x < size; x++)
      {
-      sepx = (SEPX)_sections.get(x);
+      sepx = _sections.get(x);
        sepx.setStart(sepx.getStart() + length);
        sepx.setEnd(sepx.getEnd() + length);
      }
@@ -129,7 +129,7 @@ public final class SectionTable
  
        for(int i=_text.size()-1; i>-1; i--)
        {
-        TP = (TextPiece)_text.get(i);
+        TP = _text.get(i);
  
          if(CP >= TP.getCP()) break;
        }
@@ -142,7 +142,7 @@ public final class SectionTable
        return FC;
      }
  
-  public ArrayList getSections()
+  public ArrayList<SEPX> getSections()
    {
      return _sections;
    }
@@ -159,7 +159,7 @@ public final class SectionTable
  
      for (int x = 0; x < len; x++)
      {
-      SEPX sepx = (SEPX)_sections.get(x);
+      SEPX sepx = _sections.get(x);
        byte[] grpprl = sepx.getGrpprl();
  
        // write the sepx to the document stream. starts with a 2 byte size
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java

index 1cac46d098a6b965713514dd16f8b75e7ae53c97..86f8bb54fa63e21b6a33aa95882e9c17472d0550 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@@ -20,6 +20,7 @@ package org.apache.poi.hwpf.usermodel;
  import org.apache.poi.util.LittleEndian;
  
  import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFDocumentCore;
  
  import org.apache.poi.hwpf.usermodel.CharacterRun;
  import org.apache.poi.hwpf.usermodel.Paragraph;
@@ -77,7 +78,7 @@ public class Range { // TODO -instantiable superclass
         protected int _end;
  
         /** The document this range blongs to. */
-       protected HWPFDocument _doc;
+       protected HWPFDocumentCore _doc;
  
         /** Have we loaded the section indexes yet */
         boolean _sectionRangeFound;
@@ -144,7 +145,7 @@ public class Range { // TODO -instantiable superclass
          * @param doc
          *            The HWPFDocument the range is based on.
          */
-       public Range(int start, int end, HWPFDocument doc) {
+       public Range(int start, int end, HWPFDocumentCore doc) {
                 _start = start;
                 _end = end;
                 _doc = doc;
@@ -1004,6 +1005,8 @@ public class Range { // TODO -instantiable superclass
          *            The (signed) value that should be added to the FIB CCP fields
          */
         protected void adjustFIB(int adjustment) {
+           assert (_doc instanceof HWPFDocument);
+           
                 // update the FIB.CCPText field (this should happen once per adjustment,
                 // so we don't want it in
                 // adjustForInsert() or it would get updated multiple times if the range
@@ -1011,7 +1014,7 @@ public class Range { // TODO -instantiable superclass
                 // without this, OpenOffice.org (v. 2.2.x) does not see all the text in
                 // the document
  
-               CPSplitCalculator cpS = _doc.getCPSplitCalculator();
+               CPSplitCalculator cpS = ((HWPFDocument)_doc).getCPSplitCalculator();
                 FileInformationBlock fib = _doc.getFileInformationBlock();
  
                 // Do for each affected part
@@ -1066,7 +1069,7 @@ public class Range { // TODO -instantiable superclass
                 return _end;
         }
  
-       protected HWPFDocument getDocument() {
+       protected HWPFDocumentCore getDocument() {
  
                 return _doc;
         }
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

index 9634ab9d40cca809efcbf36fe17851776408918e..eabb16b7c83ca075cb56c0f5eabbaa3bea385f13 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -256,6 +256,16 @@ public final class TestWordExtractor extends TestCase {
          assertTrue(text.contains("Paragraph 2"));
          assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
          assertTrue(text.contains("Last (4th) paragraph"));
+        
+        String[] tp = w6e.getParagraphText();
+        assertEquals(7, tp.length);
+        assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
+        assertEquals("\r\n", tp[1]);
+        assertEquals("Paragraph 2\r\n", tp[2]);
+        assertEquals("\r\n", tp[3]);
+        assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]);
+        assertEquals("\r\n", tp[5]);
+        assertEquals("Last (4th) paragraph.\r\n", tp[6]);
         }
         
         public void testWord6() throws Exception {
@@ -273,5 +283,9 @@ public final class TestWordExtractor extends TestCase {
          String text = w6e.getText();
          
          assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
+        
+        String[] tp = w6e.getParagraphText();
+        assertEquals(1, tp.length);
+        assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
         }
  }
author	Nick Burch <nick@apache.org>
	Fri, 2 Jul 2010 20:59:30 +0000 (20:59 +0000)
committer	Nick Burch <nick@apache.org>
	Fri, 2 Jul 2010 20:59:30 +0000 (20:59 +0000)
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java		patch \| blob \| history