Fix support for sections in old Word 6 / Word 95 files

author Nick Burch <nick@apache.org>

Fri, 17 Sep 2010 13:46:11 +0000 (13:46 +0000)

committer Nick Burch <nick@apache.org>

Fri, 17 Sep 2010 13:46:11 +0000 (13:46 +0000)
author Nick Burch <nick@apache.org>
Fri, 17 Sep 2010 13:46:11 +0000 (13:46 +0000)
committer Nick Burch <nick@apache.org>
Fri, 17 Sep 2010 13:46:11 +0000 (13:46 +0000)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 1332578fff9267140f5088e192ceddb1e3182ae2..0da406cdc32092c35d2a27c9a7d28924f6ab92af 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
  
      <changes>
          <release version="3.7-beta3" date="2010-??-??">
+           <action dev="poi-developers" type="fix">49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument)</action>
             <action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action>
             <action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action>
             <action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be access from where they live in the text stream</action>
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

index eaf70c116a1dc930be2d6c895db34281171cb4bd..1c6ed2b1d208a8d993c7ca440af1153909608dc8 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -245,22 +245,22 @@ public final class WordExtractor extends POIOLE2TextExtractor {
          *  but slightly slower than getTextFromPieces().
          */
         public String getText() {
-               StringBuffer ret = new StringBuffer();
+          StringBuffer ret = new StringBuffer();
  
-               ret.append(getHeaderText());
+          ret.append(getHeaderText());
  
-                ArrayList<String> text = new ArrayList<String>();
-                text.addAll(Arrays.asList(getParagraphText()));
-                text.addAll(Arrays.asList(getFootnoteText()));
-                text.addAll(Arrays.asList(getEndnoteText()));
+          ArrayList<String> text = new ArrayList<String>();
+          text.addAll(Arrays.asList(getParagraphText()));
+          text.addAll(Arrays.asList(getFootnoteText()));
+          text.addAll(Arrays.asList(getEndnoteText()));
  
-               for(String p : text) {
-                       ret.append(p);
-               }
+          for(String p : text) {
+             ret.append(p);
+          }
  
-               ret.append(getFooterText());
+          ret.append(getFooterText());
  
-               return ret.toString();
+          return ret.toString();
         }
  
         /**
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java

index dc992c69b653f224ab1b038ac7c41a0c8b7ec57b..d16edb16bed7192cbc1c43d555aec560d9ec111d 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java
@@ -34,6 +34,7 @@ public final class OldSectionTable extends SectionTable
                        TextPieceTable tpt)
    {
      PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
+    CharIsBytes charConv = new CharIsBytes(tpt);
  
      int length = sedPlex.length();
  
@@ -49,7 +50,7 @@ public final class OldSectionTable extends SectionTable
        // check for the optimization
        if (fileOffset == 0xffffffff)
        {
-        _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
+        _sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0]));
        }
        else
        {
@@ -58,8 +59,32 @@ public final class OldSectionTable extends SectionTable
          byte[] buf = new byte[sepxSize];
          fileOffset += LittleEndian.SHORT_SIZE;
          System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
-        _sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
+        _sections.add(new SEPX(sed, startAt, endAt, charConv, buf));
        }
      }
    }
+  
+  private static class CharIsBytes implements CharIndexTranslator {
+     private TextPieceTable tpt;
+     private CharIsBytes(TextPieceTable tpt) {
+        this.tpt = tpt;
+     }
+
+     public int getCharIndex(int bytePos, int startCP) {
+        return bytePos;
+     }
+     public int getCharIndex(int bytePos) {
+        return bytePos;
+     }
+
+     public boolean isIndexInTable(int bytePos) {
+        return tpt.isIndexInTable(bytePos);
+     }
+     public int lookIndexBackward(int bytePos) {
+        return tpt.lookIndexBackward(bytePos);
+     }
+     public int lookIndexForward(int bytePos) {
+        return tpt.lookIndexForward(bytePos);
+     }
+  }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java

index ccd7cc10f3936aee65af6070e61e2cd4e7b8043e..43bfcf8120afc26c6c61b2b12a955eac8f414315 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@@ -216,6 +216,8 @@ public class TextPieceTable implements CharIndexTranslator {
  
              if (bytePos< pieceStart || bytePos > pieceEnd) {
                  toAdd = bytesLength;
+            } else if (bytePos > pieceStart && bytePos < pieceEnd) {
+               toAdd = (bytePos - pieceStart);
              } else {
                  toAdd = bytesLength - (pieceEnd - bytePos);
              }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java b/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java

index 1a6f23fdfb56a83f352182bf00a524ea469532b1..9bfff7fb6aab0bcbe8df0f8f591a73dd0405e5f5 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java
@@ -37,7 +37,8 @@ public final class SprmIterator
  
    public boolean hasNext()
    {
-    return _offset < _grpprl.length;
+    // A Sprm is at least 2 bytes long
+    return _offset < (_grpprl.length-1);
    }
  
    public SprmOperation next()
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java b/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java

index d739136e76e03551b1ee3daf44326672a1f31229..6b4dd4514abffe745f70f19b99ba91c0f2197d23 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java
@@ -20,22 +20,68 @@ package org.apache.poi.hwpf;
  import junit.framework.Test;
  import junit.framework.TestSuite;
  
-import org.apache.poi.hwpf.model.*;
+import org.apache.poi.hwpf.extractor.TestWordExtractor;
+import org.apache.poi.hwpf.extractor.TestWordExtractorBugs;
+import org.apache.poi.hwpf.model.TestCHPBinTable;
+import org.apache.poi.hwpf.model.TestDocumentProperties;
+import org.apache.poi.hwpf.model.TestFileInformationBlock;
+import org.apache.poi.hwpf.model.TestFontTable;
+import org.apache.poi.hwpf.model.TestListTables;
+import org.apache.poi.hwpf.model.TestPAPBinTable;
+import org.apache.poi.hwpf.model.TestPlexOfCps;
+import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable;
+import org.apache.poi.hwpf.model.TestSavedByTable;
+import org.apache.poi.hwpf.model.TestSectionTable;
+import org.apache.poi.hwpf.model.TestStyleSheet;
+import org.apache.poi.hwpf.model.TestTextPieceTable;
+import org.apache.poi.hwpf.usermodel.TestBug46610;
+import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument;
+import org.apache.poi.hwpf.usermodel.TestHeaderStories;
+import org.apache.poi.hwpf.usermodel.TestPictures;
+import org.apache.poi.hwpf.usermodel.TestProblems;
+import org.apache.poi.hwpf.usermodel.TestRange;
+import org.apache.poi.hwpf.usermodel.TestRangeDelete;
+import org.apache.poi.hwpf.usermodel.TestRangeInsertion;
+import org.apache.poi.hwpf.usermodel.TestRangeProperties;
+import org.apache.poi.hwpf.usermodel.TestRangeReplacement;
+import org.apache.poi.hwpf.usermodel.TestShapes;
  
  public final class AllHWPFTests {
  
         public static Test suite() {
                 TestSuite suite = new TestSuite(AllHWPFTests.class.getName());
+
+               suite.addTestSuite(TestHWPFPictures.class);
+               suite.addTestSuite(TestHWPFRangeParts.class);
+
+               suite.addTestSuite(TestWordExtractor.class);
+               suite.addTestSuite(TestWordExtractorBugs.class);
+
                 suite.addTestSuite(TestCHPBinTable.class);
                 suite.addTestSuite(TestDocumentProperties.class);
                 suite.addTestSuite(TestFileInformationBlock.class);
                 suite.addTestSuite(TestFontTable.class);
+               suite.addTestSuite(TestListTables.class);
                 suite.addTestSuite(TestPAPBinTable.class);
                 suite.addTestSuite(TestPlexOfCps.class);
+               suite.addTestSuite(TestRevisionMarkAuthorTable.class);
+               suite.addTestSuite(TestSavedByTable.class);
                 suite.addTestSuite(TestSectionTable.class);
                 suite.addTestSuite(TestStyleSheet.class);
                 suite.addTestSuite(TestTextPieceTable.class);
-               suite.addTestSuite(TestListTables.class);
+
+               suite.addTestSuite(TestBug46610.class);
+               suite.addTestSuite(TestHeaderStories.class);
+               suite.addTestSuite(TestHWPFOldDocument.class);
+               suite.addTestSuite(TestPictures.class);
+               suite.addTestSuite(TestProblems.class);
+               suite.addTestSuite(TestRange.class);
+               suite.addTestSuite(TestRangeDelete.class);
+               suite.addTestSuite(TestRangeInsertion.class);
+               suite.addTestSuite(TestRangeProperties.class);
+               suite.addTestSuite(TestRangeReplacement.class);
+               suite.addTestSuite(TestShapes.class);
+
                 return suite;
         }
  }
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java b/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java

index 706fe55fa187c7ca822702185df168068362b5dd..2c5253d9cfa1ed3ba992c4379d6285193ef97728 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java
@@ -17,6 +17,7 @@
  package org.apache.poi.hwpf;\r
  \r
  import org.apache.poi.POIDataSamples;\r
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;\r
  \r
  import java.io.*;\r
  \r
@@ -30,6 +31,14 @@ public class HWPFTestDataSamples {
              throw new RuntimeException(e);\r
          }\r
      }\r
+    public static HWPFOldDocument openOldSampleFile(String sampleFileName) {\r
+       try {\r
+           InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName);\r
+           return new HWPFOldDocument(new POIFSFileSystem(is));\r
+       } catch (IOException e) {\r
+           throw new RuntimeException(e);\r
+       }\r
+   }\r
      /**\r
       * Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back\r
       * from a <tt>ByteArrayInputStream</tt>.<p/>\r
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

index ea69824daad467a495a83be197c57d747d7dbd98..1ef21db3a5a445270eb93b98b7d170da965ed6d8 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -52,7 +52,7 @@ public final class TestWordExtractor extends TestCase {
  
         // Well behaved document
         private WordExtractor extractor;
-       // Corrupted document - can't do paragraph based stuff
+       // Slightly iffy document
         private WordExtractor extractor2;
         // A word doc embeded in an excel file
         private String filename3;
@@ -93,8 +93,11 @@ public final class TestWordExtractor extends TestCase {
                         assertEquals(p_text1[i], text[i]);
                 }
  
-               // On second one, should fall back
-               assertEquals(1, extractor2.getParagraphText().length);
+               // Lots of paragraphs with only a few lines in them
+               assertEquals(24, extractor2.getParagraphText().length);
+               assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
+      assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
+      assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
         }
  
         /**
@@ -103,8 +106,11 @@ public final class TestWordExtractor extends TestCase {
         public void testGetText() {
                 assertEquals(p_text1_block, extractor.getText());
  
-               // On second one, should fall back to text piece
-               assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
+               // For the 2nd, should give similar answers for
+               //  the two methods, differing only in line endings
+               assertEquals(
+                     extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), 
+                     extractor2.getText().replaceAll("[\\r\\n]", ""));
         }
  
         /**
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java

new file mode 100644 (file)

index 0000000..fc20c71
--- /dev/null
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
@@ -0,0 +1,122 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.OldFileFormatException;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.HWPFTestCase;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+/**
+ * Tests for Word 6 and Word 95 support
+ */
+public final class TestHWPFOldDocument extends HWPFTestCase {
+   /**
+    * Test a simple Word 6 document
+    */
+   public void testWord6() throws Exception {
+      // Can't open as HWPFDocument
+      try {
+         HWPFTestDataSamples.openSampleFile("Word6.doc");
+         fail("Shouldn't be openable");
+      } catch(OldFileFormatException e) {}
+      
+      // Open
+      HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");
+      
+      // Check
+      assertEquals(1, doc.getRange().numSections());
+      assertEquals(1, doc.getRange().numParagraphs());
+      assertEquals(1, doc.getRange().numCharacterRuns());
+      
+      assertEquals(
+            "The quick brown fox jumps over the lazy dog\r",
+            doc.getRange().getParagraph(0).text()
+      );
+   }
+   
+   /**
+    * Test a simple Word 95 document
+    */
+   public void testWord95() throws Exception {
+      // Can't open as HWPFDocument
+      try {
+         HWPFTestDataSamples.openSampleFile("Word95.doc");
+         fail("Shouldn't be openable");
+      } catch(OldFileFormatException e) {}
+      
+      // Open
+      HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc");
+      
+      // Check
+      assertEquals(1, doc.getRange().numSections());
+      assertEquals(7, doc.getRange().numParagraphs());
+      
+      assertEquals(
+            "The quick brown fox jumps over the lazy dog\r",
+            doc.getRange().getParagraph(0).text()
+      );
+      assertEquals("\r", doc.getRange().getParagraph(1).text());
+      assertEquals(
+            "Paragraph 2\r",
+            doc.getRange().getParagraph(2).text()
+      );
+      assertEquals("\r", doc.getRange().getParagraph(3).text());
+      assertEquals(
+            "Paragraph 3. Has some RED text and some " +
+            "BLUE BOLD text in it.\r",
+            doc.getRange().getParagraph(4).text()
+      );
+      assertEquals("\r", doc.getRange().getParagraph(5).text());
+      assertEquals(
+            "Last (4th) paragraph.\r",
+            doc.getRange().getParagraph(6).text()
+      );
+      
+      assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns());
+      assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns());
+      assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns());
+      assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns());
+      // Normal, red, normal, blue+bold, normal
+      assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns());
+      assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
+      // Normal, superscript for 4th, normal
+      assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
+   }
+   
+   /**
+    * Test a word document that has sections,
+    *  as well as the usual paragraph stuff.
+    */
+   public void testWord6Sections() throws Exception {
+      HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");
+      
+      assertEquals(3, doc.getRange().numSections());
+      assertEquals(6, doc.getRange().numParagraphs());
+      
+      assertEquals(
+            "This is a test.\r",
+            doc.getRange().getParagraph(0).text()
+      );
+      assertEquals("\r", doc.getRange().getParagraph(1).text());
+      assertEquals("\u000c", doc.getRange().getParagraph(2).text()); // Section line?
+      assertEquals("This is a new section.\r", doc.getRange().getParagraph(3).text());
+      assertEquals("\u000c", doc.getRange().getParagraph(4).text()); // Section line?
+      assertEquals("\r", doc.getRange().getParagraph(5).text());
+   }
+}
diff --git a/test-data/document/Word6_sections.doc b/test-data/document/Word6_sections.doc

new file mode 100644 (file)

index 0000000..e7ac989

Binary files /dev/null and b/test-data/document/Word6_sections.doc differ
author	Nick Burch <nick@apache.org>
	Fri, 17 Sep 2010 13:46:11 +0000 (13:46 +0000)
committer	Nick Burch <nick@apache.org>
	Fri, 17 Sep 2010 13:46:11 +0000 (13:46 +0000)
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/sprm/SprmIterator.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/AllHWPFTests.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/HWPFTestDataSamples.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java	[new file with mode: 0644]	patch \| blob
test-data/document/Word6_sections.doc	[new file with mode: 0644]	patch \| blob