fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003;

author Sergey Vladimirov <sergey@apache.org>

Wed, 7 Sep 2011 12:12:17 +0000 (12:12 +0000)

committer Sergey Vladimirov <sergey@apache.org>

Wed, 7 Sep 2011 12:12:17 +0000 (12:12 +0000)
author Sergey Vladimirov <sergey@apache.org>
Wed, 7 Sep 2011 12:12:17 +0000 (12:12 +0000)
committer Sergey Vladimirov <sergey@apache.org>
Wed, 7 Sep 2011 12:12:17 +0000 (12:12 +0000)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index ee7f07317f5bb8f89aee52feb3f8b5e4c8a8f2f1..b1682fecdf4d1a38ce4ddaf0b557cc3f1d3bd1d3 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
  
      <changes>
          <release version="3.8-beta5" date="2011-??-??">
+           <action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
             <action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
             <action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
             <action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java

index d2f949813a7d8d5a8c8ea625ce73e1edac67983c..3a984d6a8f6efe1687ec7d50af011da5573b3fa0 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
@@ -101,18 +101,21 @@ public class CHPBinTable
        CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
          pageOffset, translator);
  
-      int fkpSize = cfkp.size();
-
-      for (int y = 0; y < fkpSize; y++)
-      {
-        final CHPX chpx = cfkp.getCHPX(y);
-        if (chpx != null)
-            _textRuns.add(chpx);
-      }
+            for ( CHPX chpx : cfkp.getCHPXs() )
+            {
+                if ( chpx != null )
+                    _textRuns.add( chpx );
+            }
      }
          logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
                  Long.valueOf( System.currentTimeMillis() - start ), " ms (",
                  Integer.valueOf( _textRuns.size() ), " elements)" );
+
+        if ( _textRuns.isEmpty() )
+        {
+            logger.log( POILogger.WARN, "CHPX FKPs are empty" );
+            _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
+        }
      }
  
      public void rebuild( ComplexFileTable complexFileTable )
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java

index 2f1215c4e6d96dea8dea2cf711310cabf73f5ab5..28cd870f4b2db6d5278ea5a4e74e8d77adc2164b 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
@@ -18,6 +18,7 @@
  package org.apache.poi.hwpf.model;
  
  import java.util.ArrayList;
+import java.util.Collections;
  import java.util.List;
  
  import org.apache.poi.hwpf.sprm.SprmBuffer;
@@ -82,15 +83,17 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
              int bytesStartAt = getStart( x );
              int bytesEndAt = getEnd( x );
  
-            int charStartAt = translator.getCharIndex( bytesStartAt );
-            int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+            // int charStartAt = translator.getCharIndex( bytesStartAt );
+            // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+            // );
  
-            // TODO: CHECK!
-            // CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
-            // ) );
-            CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
-                    getGrpprl( x ), 0 ) );
-            _chpxList.add( chpx );
+            for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+                    bytesEndAt ) )
+            {
+                CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
+                        getGrpprl( x ), 0 ) );
+                _chpxList.add( chpx );
+            }
          }
      }
  
@@ -99,6 +102,11 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
        return _chpxList.get(index);
      }
  
+    public List<CHPX> getCHPXs()
+    {
+        return Collections.unmodifiableList( _chpxList );
+    }
+
      public void fill(List<CHPX> filler)
      {
        _chpxList.addAll(filler);
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java

index 6272b1afe7967ce078c15398c8c833332e913e90..b6e48e21d5f4399920bfbd0480269c675b03ee73 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
@@ -31,12 +31,16 @@ public interface CharIndexTranslator {
      int getByteIndex( int charPos );\r
  \r
      /**\r
-     * Calculates the char index of the given byte index.\r
-     * Look forward if index is not in table\r
-     *\r
-     * @param bytePos The character offset to check \r
+     * Calculates the char index of the given byte index. Looks forward if the\r
+     * index is not in the table.\r
+     * \r
+     * @param bytePos\r
+     *            The byte offset to check\r
       * @return the char index\r
+     * @deprecated This API was based on the incorrect assumption that a\r
+     *             single byte offset corresponds to a single char offset\r
       */\r
+    @Deprecated\r
      int getCharIndex(int bytePos);\r
  \r
      /**\r
@@ -46,16 +50,29 @@ public interface CharIndexTranslator {
       * @param bytePos The character offset to check\r
       * @param startCP look from this characted position \r
       * @return the char index\r
+     * @deprecated This API was based on the incorrect assumption that a\r
+     *             single byte offset corresponds to a single char offset\r
       */\r
+    @Deprecated\r
      int getCharIndex(int bytePos, int startCP);\r
-    \r
+\r
+    /**\r
+     * Finds the character ranges that include the specified byte range.\r
+     * \r
+     * @param startBytePosInclusive\r
+     *            start of the byte range (inclusive)\r
+     * @param endBytePosExclusive\r
+     *            end of the byte range (exclusive)\r
+     */\r
+    int[][] getCharIndexRanges( int startBytePosInclusive,\r
+            int endBytePosExclusive );\r
+\r
      /**\r
       * Check if index is in table\r
-     *\r
+     * \r
       * @param bytePos\r
       * @return true if index in table, false if not\r
       */\r
-\r
      boolean isIndexInTable(int bytePos);\r
  \r
      /**\r
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java

index 69c0c061969c2e554140201d87f77519ef774c44..7ab0576d6a1f6077127163f7d918306b4a83c312 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@@ -92,12 +92,8 @@ public class PAPBinTable
                          documentStream, dataStream, pageOffset,
                          charIndexTranslator );
  
-                int fkpSize = pfkp.size();
-
-                for ( int y = 0; y < fkpSize; y++ )
+                for ( PAPX papx : pfkp.getPAPXs() )
                  {
-                    PAPX papx = pfkp.getPAPX( y );
-
                      if ( papx != null )
                          _paragraphs.add( papx );
                  }
@@ -107,6 +103,12 @@ public class PAPBinTable
          logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
                  Long.valueOf( System.currentTimeMillis() - start ), " ms (",
                  Integer.valueOf( _paragraphs.size() ), " elements)" );
+
+        if ( _paragraphs.isEmpty() )
+        {
+            logger.log( POILogger.WARN, "PAPX FKPs are empty" );
+            _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
+        }
      }
  
      public void rebuild( final StringBuilder docText,
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java

index 29c838c6ac5bc4dc1f573269b8d94f02f105fa9f..6c78f420b1a0777143f7a3e32798b6d223bef80a 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
@@ -23,6 +23,8 @@ import java.util.Arrays;
  import java.util.Collections;
  import java.util.List;
  
+import org.apache.poi.hwpf.sprm.SprmBuffer;
+
  import org.apache.poi.hwpf.model.io.HWPFOutputStream;
  import org.apache.poi.util.Internal;
  import org.apache.poi.util.LittleEndian;
@@ -88,12 +90,20 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
              int bytesStartAt = getStart( x );
              int bytesEndAt = getEnd( x );
  
-            int charStartAt = translator.getCharIndex( bytesStartAt );
-            int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+            // int charStartAt = translator.getCharIndex( bytesStartAt );
+            // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+            // );
+            // PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
+            // getParagraphHeight( x ), dataStream );
+            // _papxList.add( papx );
  
-            PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
-                    getParagraphHeight( x ), dataStream );
-            _papxList.add( papx );
+            for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+                    bytesEndAt ) )
+            {
+                PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
+                        getParagraphHeight( x ), dataStream );
+                _papxList.add( papx );
+            }
          }
          _fkp = null;
      }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java

index 82fb71eae0864d103529e91647fd2e722931be82..facb4e472cd00080d7ea4febc3c56a62e9df90e2 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@@ -20,6 +20,7 @@ import java.io.IOException;
  import java.util.ArrayList;
  import java.util.Collections;
  import java.util.Comparator;
+import java.util.LinkedList;
  import java.util.List;
  
  import org.apache.poi.hwpf.model.io.HWPFOutputStream;
@@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator
              System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
  
              // And now build the piece
-            _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
-                    pieces[x] ) );
+            final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
+                    pieces[x] );
+
+            _textPieces.add( newTextPiece );
          }
  
          // In the interest of our sanity, now sort the text pieces
@@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator
          return byteCount;
      }
  
+    @Deprecated
      public int getCharIndex( int bytePos )
      {
          return getCharIndex( bytePos, 0 );
      }
  
+    @Deprecated
      public int getCharIndex( int startBytePos, int startCP )
      {
          int charCount = 0;
@@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator
          return charCount;
      }
  
+    public int[][] getCharIndexRanges( int startBytePosInclusive,
+            int endBytePosExclusive )
+    {
+        List<int[]> result = new LinkedList<int[]>();
+        for ( TextPiece textPiece : _textPiecesFCOrder )
+        {
+            final int tpStart = textPiece.getPieceDescriptor()
+                    .getFilePosition();
+            final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
+                    + textPiece.bytesLength();
+            if ( startBytePosInclusive > tpEnd )
+                continue;
+            if ( endBytePosExclusive < tpStart )
+                break;
+
+            final int rangeStartBytes = Math.max( tpStart,
+                    startBytePosInclusive );
+            final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive );
+            final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
+
+            if ( rangeStartBytes > rangeEndBytes )
+                continue;
+
+            final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
+
+            final int rangeStartCp = textPiece.getStart()
+                    + ( rangeStartBytes - tpStart ) / encodingMultiplier;
+            final int rangeEndCp = rangeStartCp + rangeLengthBytes
+                    / encodingMultiplier;
+
+            result.add( new int[] { rangeStartCp, rangeEndCp } );
+        }
+
+        return result.toArray( new int[result.size()][] );
+    }
+
      public int getCpMin()
      {
          return _cpMin;
@@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator
  
      public int lookIndexForward( final int startBytePos )
      {
-        int bytePos = startBytePos;
-        for ( TextPiece tp : _textPiecesFCOrder )
-        {
-            int pieceStart = tp.getPieceDescriptor().getFilePosition();
+        if ( _textPiecesFCOrder.isEmpty() )
+            throw new IllegalStateException( "Text pieces table is empty" );
  
-            if ( bytePos >= pieceStart + tp.bytesLength() )
-            {
-                continue;
-            }
+        if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
+            return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
  
-            if ( pieceStart > bytePos )
-            {
-                bytePos = pieceStart;
-            }
+        if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
+                .getPieceDescriptor().getFilePosition() <= startBytePos )
+            return startBytePos;
  
-            break;
+        int low = 0;
+        int high = _textPiecesFCOrder.size() - 1;
+
+        while ( low <= high )
+        {
+            int mid = ( low + high ) >>> 1;
+            final TextPiece textPiece = _textPiecesFCOrder.get( mid );
+            int midVal = textPiece.getPieceDescriptor().getFilePosition();
+
+            if ( midVal < startBytePos )
+                low = mid + 1;
+            else if ( midVal > startBytePos )
+                high = mid - 1;
+            else
+                // found piece with exact start
+                return textPiece.getPieceDescriptor().getFilePosition();
          }
-        return bytePos;
+        assert low == high;
+        assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
+                .getFilePosition() < startBytePos;
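+        // last piece can't be the current one, can it? NOTE(review): the loop exits with low == high + 1, so "low == high" cannot hold - re-check these indexes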
+        assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
+                .getFilePosition() > startBytePos;
+
+        // shifting to next piece start
+        return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
      }
  
      public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java

index 5566caf468b85515bb22963ee40fe566394425dc..a12baa47d02045abbc0b4801a4cda1aba66b98be 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
@@ -226,6 +226,36 @@ public class TestBugs extends TestCase
          assertEquals( extractor1.getText(), extractor2.getText() );
      }
  
+    /**
+     * Bug 44431 - HWPFDocument.write destroys fields
+     */
+    public void test44431_2()
+    {
+        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
+        WordExtractor extractor1 = new WordExtractor( doc1 );
+
+        assertEquals( "File name=FieldsTest.doc\n" + 
+                       "\n" + 
+                       "\n" + 
+                       "STYLEREF test\n" + 
+                       "\n" + 
+                       "\n" + 
+                       "\n" + 
+                       "TEST TABLE OF CONTENTS\n" + 
+                       "\n" + 
+                       "Heading paragraph in next page\t2\n" + 
+                       "Another heading paragraph in further page\t3\n" + 
+                       "Another heading paragraph in further page\t3\n" + 
+                       "\n" + 
+                       "\n" + 
+                       "Heading paragraph in next page\n" + 
+                       "Another heading paragraph in further page\n" + 
+                       "\n" + 
+                       "\n" + 
+                       "\n" + 
+                       "Page 3 of 3", extractor1.getText() );
+    }
+
      /**
       * Bug 45473 - HWPF cannot read file after save
       */
@@ -640,19 +670,20 @@ public class TestBugs extends TestCase
          hwpfDocument.write( new ByteArrayOutputStream() );
      }
  
-
-
      /**
-     * Bug 51678 - Extracting text from Bug51524.zip is slow
-     * Bug 51524 - PapBinTable constructor is slow
+     * Bug 51678 - Extracting text from Bug51524.zip is slow;
+     * Bug 51524 - PapBinTable constructor is slow
       */
      public void test51678And51524()
      {
-        // YK: the test will run only if the poi.test.remote system property is set.
+        // YK: the test will run only if the poi.test.remote system property is
+        // set.
          // TODO: refactor into something nicer!
-        if(System.getProperty("poi.test.remote") != null) {
+        if ( System.getProperty( "poi.test.remote" ) != null )
+        {
              String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
-            HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
+            HWPFDocument hwpfDocument = HWPFTestDataSamples
+                    .openRemoteFile( href );
  
              WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
              wordExtractor.getText();
author	Sergey Vladimirov <sergey@apache.org>
	Wed, 7 Sep 2011 12:12:17 +0000 (12:12 +0000)
committer	Sergey Vladimirov <sergey@apache.org>
	Wed, 7 Sep 2011 12:12:17 +0000 (12:12 +0000)
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java		patch \| blob \| history