Basic text extraction support for old Word 6 and Word 95 documents via some HWPF...

author Nick Burch <nick@apache.org>

Wed, 30 Jun 2010 15:13:10 +0000 (15:13 +0000)

committer Nick Burch <nick@apache.org>

Wed, 30 Jun 2010 15:13:10 +0000 (15:13 +0000)
author Nick Burch <nick@apache.org>
Wed, 30 Jun 2010 15:13:10 +0000 (15:13 +0000)
committer Nick Burch <nick@apache.org>
Wed, 30 Jun 2010 15:13:10 +0000 (15:13 +0000)
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java

index f0d8b1d8eebb5ae7b095568271ef20013c061e24..bd31f6253dfcce15d00708dababf22ac9db2fc0b 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@@ -17,26 +17,43 @@
  
  package org.apache.poi.hwpf;
  
-import java.io.InputStream;
+import java.io.ByteArrayInputStream;
  import java.io.FileInputStream;
  import java.io.FileNotFoundException;
-import java.io.PushbackInputStream;
  import java.io.IOException;
+import java.io.InputStream;
  import java.io.OutputStream;
-import java.io.ByteArrayInputStream;
-
  import java.util.Iterator;
  
-import org.apache.poi.EncryptedDocumentException;
-import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.CHPBinTable;
+import org.apache.poi.hwpf.model.CPSplitCalculator;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.DocumentProperties;
+import org.apache.poi.hwpf.model.EscherRecordHolder;
+import org.apache.poi.hwpf.model.FSPATable;
+import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.hwpf.model.FontTable;
+import org.apache.poi.hwpf.model.GenericPropertyNode;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.model.PAPBinTable;
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.PlexOfCps;
+import org.apache.poi.hwpf.model.PropertyNode;
+import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
+import org.apache.poi.hwpf.model.SavedByTable;
+import org.apache.poi.hwpf.model.SectionTable;
+import org.apache.poi.hwpf.model.ShapesTable;
+import org.apache.poi.hwpf.model.StyleSheet;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.model.io.HWPFFileSystem;
+import org.apache.poi.hwpf.model.io.HWPFOutputStream;
+import org.apache.poi.hwpf.usermodel.HWPFList;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.common.POIFSConstants;
  import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.common.POIFSConstants;
-
-import org.apache.poi.hwpf.model.*;
-import org.apache.poi.hwpf.model.io.*;
-import org.apache.poi.hwpf.usermodel.*;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  
  /**
@@ -46,17 +63,11 @@ import org.apache.poi.hwpf.usermodel.*;
   *
   * @author Ryan Ackley
   */
-public final class HWPFDocument extends POIDocument
-//  implements Cloneable
+public final class HWPFDocument extends HWPFDocumentCore
  {
-  /** The FIB */
-  protected FileInformationBlock _fib;
    /** And for making sense of CP lengths in the FIB */
    protected CPSplitCalculator _cpSplit;
  
-  /** main document stream buffer*/
-  protected byte[] _mainStream;
-
    /** table stream buffer*/
    protected byte[] _tableStream;
  
@@ -110,29 +121,7 @@ public final class HWPFDocument extends POIDocument
  
    protected HWPFDocument()
    {
-     super(null, null);
-  }
-
-  /**
-   * Takens an InputStream, verifies that it's not RTF, builds a
-   *  POIFSFileSystem from it, and returns that.
-   */
-  public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
-       // Open a PushbackInputStream, so we can peek at the first few bytes
-       PushbackInputStream pis = new PushbackInputStream(istream,6);
-       byte[] first6 = new byte[6];
-       pis.read(first6);
-
-       // Does it start with {\rtf ? If so, it's really RTF
-       if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
-               && first6[3] == 't' && first6[4] == 'f') {
-               throw new IllegalArgumentException("The document is really a RTF file");
-       }
-
-       // OK, so it's not RTF
-       // Open a POIFSFileSystem on the (pushed back) stream
-       pis.unread(first6);
-       return new POIFSFileSystem(pis);
+     super();
    }
  
    /**
@@ -171,21 +160,16 @@ public final class HWPFDocument extends POIDocument
     */
    public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
    {
-    // Sort out the hpsf properties
+    // Load the main stream and FIB
+    // Also handles HPSF bits
         super(directory, pfilesystem);
  
-    // read in the main stream.
-    DocumentEntry documentProps = (DocumentEntry)
-       directory.getEntry("WordDocument");
-    _mainStream = new byte[documentProps.getSize()];
-
-    directory.createDocumentInputStream("WordDocument").read(_mainStream);
-
-    // Create our FIB, and check for the doc being encrypted
-    _fib = new FileInformationBlock(_mainStream);
+    // Do the CP Split
      _cpSplit = new CPSplitCalculator(_fib);
-    if(_fib.isFEncrypted()) {
-       throw new EncryptedDocumentException("Cannot process encrypted word files!");
+    
+    // Is this document too old for us?
+    if(_fib.getNFib() < 106) {
+        throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
      }
  
      // use the fib to determine the name of the table stream.
@@ -691,17 +675,4 @@ public final class HWPFDocument extends POIDocument
        t.printStackTrace();
      }
    }
-
-//  public Object clone()
-//    throws CloneNotSupportedException
-//  {
-//    _tpt;
-//
-//    _cbt;
-//
-//    _pbt;
-//
-//    _st;
-//
-//  }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java

new file mode 100644 (file)

index 0000000..af17cc2
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
@@ -0,0 +1,130 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.POIDocument;
+import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+
+/**
+ * This class holds much of the core of a Word document, but
+ *  without some of the table structure information.
+ * You generally want to work with one of
+ *  {@link HWPFDocument} or {@link HWPFOldDocument} 
+ */
+public abstract class HWPFDocumentCore extends POIDocument
+{
+  /** The FIB */
+  protected FileInformationBlock _fib;
+
+  /** main document stream buffer*/
+  protected byte[] _mainStream;
+
+  protected HWPFDocumentCore()
+  {
+     super(null, null);
+  }
+
+  /**
+   * Takes an InputStream, verifies that it's not RTF, builds a
+   *  POIFSFileSystem from it, and returns that.
+   */
+  public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
+       // Open a PushbackInputStream, so we can peek at the first few bytes
+       PushbackInputStream pis = new PushbackInputStream(istream,6);
+       byte[] first6 = new byte[6];
+       pis.read(first6);
+
+       // Does it start with {\rtf ? If so, it's really RTF
+       if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
+               && first6[3] == 't' && first6[4] == 'f') {
+               throw new IllegalArgumentException("The document is really a RTF file");
+       }
+
+       // OK, so it's not RTF
+       // Open a POIFSFileSystem on the (pushed back) stream
+       pis.unread(first6);
+       return new POIFSFileSystem(pis);
+  }
+
+  /**
+   * This constructor loads a Word document from an InputStream.
+   *
+   * @param istream The InputStream that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in InputStream.
+   */
+  public HWPFDocumentCore(InputStream istream) throws IOException
+  {
+    //do Ole stuff
+    this( verifyAndBuildPOIFS(istream) );
+  }
+
+  /**
+   * This constructor loads a Word document from a POIFSFileSystem
+   *
+   * @param pfilesystem The POIFSFileSystem that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in POIFSFileSystem.
+   */
+  public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
+  {
+       this(pfilesystem.getRoot(), pfilesystem);
+  }
+
+  /**
+   * This constructor loads a Word document from a specific point
+   *  in a POIFSFileSystem, probably not the default.
+   * Used typically to open embedded documents.
+   *
+   * @param pfilesystem The POIFSFileSystem that contains the Word document.
+   * @throws IOException If there is an unexpected IOException from the passed
+   *         in POIFSFileSystem.
+   */
+  public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
+  {
+    // Sort out the hpsf properties
+       super(directory, pfilesystem);
+
+    // read in the main stream.
+    DocumentEntry documentProps = (DocumentEntry)
+       directory.getEntry("WordDocument");
+    _mainStream = new byte[documentProps.getSize()];
+
+    directory.createDocumentInputStream("WordDocument").read(_mainStream);
+
+    // Create our FIB, and check for the doc being encrypted
+    _fib = new FileInformationBlock(_mainStream);
+    if(_fib.isFEncrypted()) {
+       throw new EncryptedDocumentException("Cannot process encrypted word files!");
+    }
+  }
+
+  public FileInformationBlock getFileInformationBlock()
+  {
+    return _fib;
+  }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java

new file mode 100644 (file)

index 0000000..42cff2a
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@@ -0,0 +1,135 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.OldCHPBinTable;
+import org.apache.poi.hwpf.model.PieceDescriptor;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Provides very simple support for old (Word 6 / Word 95)
+ *  files.
+ * TODO Provide a way to get at the properties associated
+ *  with each block of text
+ */
+public class HWPFOldDocument extends HWPFDocumentCore {
+    private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>(); 
+    
+    public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
+        this(fs.getRoot(), fs);
+    }
+
+    public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
+            throws IOException {
+        super(directory, fs);
+        
+        // Where are things?
+        int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
+        int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
+        int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
+        
+        // We need to get hold of the text that makes up the
+        //  document, which might be regular or fast-saved
+        StringBuffer text = new StringBuffer();
+        TextPieceTable tpt;
+        if(_fib.isFComplex()) {
+            ComplexFileTable cft = new ComplexFileTable(
+                    _mainStream, _mainStream,
+                    complexTableOffset, _fib.getFcMin()
+            );
+            tpt = cft.getTextPieceTable();
+            
+            for(TextPiece tp : tpt.getTextPieces()) {
+                text.append( tp.getStringBuffer() );
+            }
+        } else {
+            // TODO Build the Piece Descriptor properly
+            // TODO Can these old documents ever contain Unicode strings?
+            PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
+            pd.setFilePosition(_fib.getFcMin());
+
+            tpt = new TextPieceTable();
+            byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
+            System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
+            TextPiece tp = new TextPiece(
+                    0, textData.length, textData, pd, 0
+            );
+            tpt.getTextPieces().add(tp);
+            
+            text.append(tp.getStringBuffer());
+        }
+        
+        // Now we can fetch the character and paragraph properties
+        OldCHPBinTable chpTable = new OldCHPBinTable(
+                _mainStream, chpTableOffset, chpTableSize,
+                _fib.getFcMin(), tpt
+        );
+        
+        // Finally build up runs
+        for(CHPX chpx : chpTable.getTextRuns()) {
+            String str = text.substring(chpx.getStart(), chpx.getEnd());
+            contents.add(new TextAndCHPX(str,chpx));
+        }
+    }
+
+    @Override
+    public void write(OutputStream out) throws IOException {
+        throw new IllegalStateException("Writing is not available for the older file formats");
+    }
+    
+    /**
+     * Retrieves all our text, in order, along with the
+     *  CHPX information on each bit.
+     * Every entry has the same formatting, but as yet 
+     *  we've no way to tell what the formatting is...
+     * Warning - this will change as soon as we support
+     *  text formatting!
+     */
+    public List<TextAndCHPX> getContents() {
+        return contents;
+    }
+    
+    /**
+     * Warning - this will change as soon as we support
+     *  text formatting!
+     */
+    public static class TextAndCHPX {
+        private String text;
+        private CHPX chpx;
+        private TextAndCHPX(String text, CHPX chpx) {
+            this.text = text;
+            this.chpx = chpx;
+        }
+        public String getText() {
+            return text;
+        }
+        public CHPX getChpx() {
+            return chpx;
+        }
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java b/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java

new file mode 100644 (file)

index 0000000..cfa97bc
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java
@@ -0,0 +1,25 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import org.apache.poi.OldFileFormatException;
+
+public class OldWordFileFormatException extends OldFileFormatException {
+    public OldWordFileFormatException(String s) {
+        super(s);
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java

new file mode 100644 (file)

index 0000000..3ea6d42
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@@ -0,0 +1,79 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to extract the text from old (Word 6 / Word 95) Word Documents.
+ *
+ * This should only be used on the older files; for most uses you
+ *  should call {@link WordExtractor} which deals properly 
+ *  with HWPF.
+ *
+ * @author Nick Burch
+ */
+public final class Word6Extractor extends POIOLE2TextExtractor {
+       private POIFSFileSystem fs;
+       private HWPFOldDocument doc;
+
+       /**
+        * Create a new Word Extractor
+        * @param is InputStream containing the word file
+        */
+       public Word6Extractor(InputStream is) throws IOException {
+               this( new POIFSFileSystem(is) );
+       }
+
+       /**
+        * Create a new Word Extractor
+        * @param fs POIFSFileSystem containing the word file
+        */
+       public Word6Extractor(POIFSFileSystem fs) throws IOException {
+               this(fs.getRoot(), fs);
+       }
+       public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+           this(new HWPFOldDocument(dir,fs));
+       }
+
+       /**
+        * Create a new Word Extractor
+        * @param doc The HWPFOldDocument to extract from
+        */
+       public Word6Extractor(HWPFOldDocument doc) {
+               super(doc);
+               this.doc = doc;
+       }
+
+    @Override
+    public String getText() {
+        StringBuffer text = new StringBuffer();
+        for(TextAndCHPX tchpx : doc.getContents()) {
+            text.append( Range.stripFields(tchpx.getText()) );
+        }
+        return text.toString();
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

index 1be78ab1a660e2db1d4bb70b79519ec65bda31db..fe57f7c474f9da141ac9305b3b89dfa21a3603b4 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -40,7 +40,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
   * You should use either getParagraphText() or getText() unless
   *  you have a strong reason otherwise.
   *
- * @author Nick Burch (nick at torchbox dot com)
+ * @author Nick Burch
   */
  public final class WordExtractor extends POIOLE2TextExtractor {
         private POIFSFileSystem fs;
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java

index b78cdffc57fb4995b562a3f3c6ac37b4db20a8d6..f56621afad5f425e7ef0f876cf7891e25cce3a07 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java
@@ -61,4 +61,9 @@ public final class CHPX extends BytePropertyNode
      CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0);
      return props;
    }
+  
+  public String toString() {
+      return "CHPX from " + getStart() + " to " + getEnd() + 
+         " (in bytes " + getStartBytes() + " to " + getEndBytes() + ")";
+  }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java

new file mode 100644 (file)

index 0000000..3c97652
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java
@@ -0,0 +1,77 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.model;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class holds all of the character formatting 
+ *  properties from Old (Word 6 / Word 95) documents.
+ * Unlike with Word 97+, it all gets held in the
+ *  same stream.
+ * In common with the rest of the old support, it 
+ *  is read only
+ */
+public final class OldCHPBinTable
+{
+  /** List of character properties.*/
+  protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
+
+  /**
+   * Constructor used to read an old-style binTable
+   *  in from a Word document.
+   *
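+   * @param documentStream the document stream bytes holding both the bin table and the pages it points to
+   * @param offset offset of the bin table within documentStream
+   * @param size size of the bin table
+   * @param fcMin the fcMin value from the FIB, handed on to each formatted disk page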
+   */
+  public OldCHPBinTable(byte[] documentStream, int offset,
+                     int size, int fcMin, TextPieceTable tpt)
+  {
+    PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
+
+    int length = binTable.length();
+    for (int x = 0; x < length; x++)
+    {
+      GenericPropertyNode node = binTable.getProperty(x);
+
+      int pageNum = LittleEndian.getShort(node.getBytes());
+      int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+
+      CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
+        pageOffset, fcMin, tpt);
+
+      int fkpSize = cfkp.size();
+
+      for (int y = 0; y < fkpSize; y++)
+      {
+        _textRuns.add(cfkp.getCHPX(y));
+      }
+    }
+  }
+
+  public List<CHPX> getTextRuns()
+  {
+    return _textRuns;
+  }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

index 47e0b431f6c01119861b96c2f2d46fc901b91c81..9634ab9d40cca809efcbf36fe17851776408918e 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -19,13 +19,12 @@ package org.apache.poi.hwpf.extractor;
  
  import junit.framework.TestCase;
  
+import org.apache.poi.POIDataSamples;
  import org.apache.poi.hwpf.HWPFDocument;
  import org.apache.poi.hwpf.HWPFTestDataSamples;
+import org.apache.poi.hwpf.OldWordFileFormatException;
  import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.POIDataSamples;
-
-import java.io.FileInputStream;
  
  /**
   * Test the different routes to extracting text
@@ -237,4 +236,42 @@ public final class TestWordExtractor extends TestCase {
  
                 assertTrue(b.toString().contains("TestComment"));
         }
+       
+       public void testWord95() throws Exception {
+           // Too old for the default
+           try {
+               extractor = new WordExtractor(
+                               POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
+               );
+               fail();
+           } catch(OldWordFileFormatException e) {}
+               
+               // Can work with the special one
+           Word6Extractor w6e = new Word6Extractor(
+                POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
+        );
+               String text = w6e.getText();
+               
+               assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
+        assertTrue(text.contains("Paragraph 2"));
+        assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
+        assertTrue(text.contains("Last (4th) paragraph"));
+       }
+       
+       public void testWord6() throws Exception {
+        // Too old for the default
+        try {
+               extractor = new WordExtractor(
+                               POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
+               );
+            fail();
+        } catch(OldWordFileFormatException e) {}
+        
+        Word6Extractor w6e = new Word6Extractor(
+                POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
+        );
+        String text = w6e.getText();
+        
+        assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
+       }
  }
diff --git a/test-data/document/Word6.doc b/test-data/document/Word6.doc

new file mode 100644 (file)

index 0000000..a614a07

Binary files /dev/null and b/test-data/document/Word6.doc differ
diff --git a/test-data/document/Word95.doc b/test-data/document/Word95.doc

new file mode 100644 (file)

index 0000000..a214f29

Binary files /dev/null and b/test-data/document/Word95.doc differ
author	Nick Burch <nick@apache.org>
	Wed, 30 Jun 2010 15:13:10 +0000 (15:13 +0000)
committer	Nick Burch <nick@apache.org>
	Wed, 30 Jun 2010 15:13:10 +0000 (15:13 +0000)
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hwpf/OldWordFileFormatException.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/CHPX.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java		patch \| blob \| history
test-data/document/Word6.doc	[new file with mode: 0644]	patch \| blob
test-data/document/Word95.doc	[new file with mode: 0644]	patch \| blob