From a45d61c740bf1bfd91ccbb288a09d49dd923dbc3 Mon Sep 17 00:00:00 2001
From: Sergey Vladimirov <sergey@apache.org>
Date: Wed, 7 Sep 2011 12:12:17 +0000
Subject: [PATCH] fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 -
 2003; Replace byte->char translation with byte range -> char range_S_
 translation for PAPX / CHPX tables

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1166144 13f79535-47bb-0310-9956-ffa450edef68
---
 src/documentation/content/xdocs/status.xml    |  1 +
 .../apache/poi/hwpf/model/CHPBinTable.java    | 19 ++--
 .../poi/hwpf/model/CHPFormattedDiskPage.java  | 24 +++--
 .../poi/hwpf/model/CharIndexTranslator.java   | 31 +++++--
 .../apache/poi/hwpf/model/PAPBinTable.java    | 12 ++-
 .../poi/hwpf/model/PAPFormattedDiskPage.java  | 20 +++-
 .../apache/poi/hwpf/model/TextPieceTable.java | 91 +++++++++++++++----
 .../apache/poi/hwpf/usermodel/TestBugs.java   | 45 +++++++--
 8 files changed, 187 insertions(+), 56 deletions(-)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index ee7f07317f..b1682fecdf 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.8-beta5" date="2011-??-??">
+           <action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
            <action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
            <action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
            <action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
index d2f949813a..3a984d6a8f 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java
@@ -101,18 +101,21 @@ public class CHPBinTable
       CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
         pageOffset, translator);
 
-      int fkpSize = cfkp.size();
-
-      for (int y = 0; y < fkpSize; y++)
-      {
-        final CHPX chpx = cfkp.getCHPX(y);
-        if (chpx != null)
-            _textRuns.add(chpx);
-      }
+            for ( CHPX chpx : cfkp.getCHPXs() )
+            {
+                if ( chpx != null )
+                    _textRuns.add( chpx );
+            }
     }
         logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
                 Long.valueOf( System.currentTimeMillis() - start ), " ms (",
                 Integer.valueOf( _textRuns.size() ), " elements)" );
+
+        if ( _textRuns.isEmpty() )
+        {
+            logger.log( POILogger.WARN, "CHPX FKPs are empty" );
+            _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
+        }
     }
 
     public void rebuild( ComplexFileTable complexFileTable )
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
index 2f1215c4e6..28cd870f4b 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java
@@ -18,6 +18,7 @@
 package org.apache.poi.hwpf.model;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.poi.hwpf.sprm.SprmBuffer;
@@ -82,15 +83,17 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
             int bytesStartAt = getStart( x );
             int bytesEndAt = getEnd( x );
 
-            int charStartAt = translator.getCharIndex( bytesStartAt );
-            int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+            // int charStartAt = translator.getCharIndex( bytesStartAt );
+            // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+            // );
 
-            // TODO: CHECK!
-            // CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
-            // ) );
-            CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
-                    getGrpprl( x ), 0 ) );
-            _chpxList.add( chpx );
+            for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+                    bytesEndAt ) )
+            {
+                CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
+                        getGrpprl( x ), 0 ) );
+                _chpxList.add( chpx );
+            }
         }
     }
 
@@ -99,6 +102,11 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage
       return _chpxList.get(index);
     }
 
+    public List<CHPX> getCHPXs()
+    {
+        return Collections.unmodifiableList( _chpxList );
+    }
+
     public void fill(List<CHPX> filler)
     {
       _chpxList.addAll(filler);
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
index 6272b1afe7..b6e48e21d5 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java
@@ -31,12 +31,16 @@ public interface CharIndexTranslator {
     int getByteIndex( int charPos );
 
     /**
-     * Calculates the char index of the given byte index.
-     * Look forward if index is not in table
-     *
-     * @param bytePos The character offset to check 
+     * Calculates the char index of the given byte index. Look forward if index
+     * is not in table
+     * 
+     * @param bytePos
+     *            The character offset to check
      * @return the char index
+     * @deprecated This API were based on incorrect assumption that single byte
+     *             offset corresponds to single char offset
      */
+    @Deprecated
     int getCharIndex(int bytePos);
 
     /**
@@ -46,16 +50,29 @@ public interface CharIndexTranslator {
      * @param bytePos The character offset to check
      * @param startCP look from this characted position 
      * @return the char index
+     * @deprecated This API were based on incorrect assumption that single byte
+     *             offset corresponds to single char offset
      */
+    @Deprecated
     int getCharIndex(int bytePos, int startCP);
-    
+
+    /**
+     * Finds character ranges that includes specified byte range.
+     * 
+     * @param startBytePosInclusive
+     *            start byte range
+     * @param endBytePosExclusive
+     *            end byte range
+     */
+    int[][] getCharIndexRanges( int startBytePosInclusive,
+            int endBytePosExclusive );
+
     /**
      * Check if index is in table
-     *
+     * 
      * @param bytePos
      * @return true if index in table, false if not
      */
-
     boolean isIndexInTable(int bytePos);
 
     /**
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
index 69c0c06196..7ab0576d6a 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@@ -92,12 +92,8 @@ public class PAPBinTable
                         documentStream, dataStream, pageOffset,
                         charIndexTranslator );
 
-                int fkpSize = pfkp.size();
-
-                for ( int y = 0; y < fkpSize; y++ )
+                for ( PAPX papx : pfkp.getPAPXs() )
                 {
-                    PAPX papx = pfkp.getPAPX( y );
-
                     if ( papx != null )
                         _paragraphs.add( papx );
                 }
@@ -107,6 +103,12 @@ public class PAPBinTable
         logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
                 Long.valueOf( System.currentTimeMillis() - start ), " ms (",
                 Integer.valueOf( _paragraphs.size() ), " elements)" );
+
+        if ( _paragraphs.isEmpty() )
+        {
+            logger.log( POILogger.WARN, "PAPX FKPs are empty" );
+            _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
+        }
     }
 
     public void rebuild( final StringBuilder docText,
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
index 29c838c6ac..6c78f420b1 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java
@@ -23,6 +23,8 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 
+import org.apache.poi.hwpf.sprm.SprmBuffer;
+
 import org.apache.poi.hwpf.model.io.HWPFOutputStream;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
@@ -88,12 +90,20 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage {
             int bytesStartAt = getStart( x );
             int bytesEndAt = getEnd( x );
 
-            int charStartAt = translator.getCharIndex( bytesStartAt );
-            int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+            // int charStartAt = translator.getCharIndex( bytesStartAt );
+            // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+            // );
+            // PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
+            // getParagraphHeight( x ), dataStream );
+            // _papxList.add( papx );
 
-            PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
-                    getParagraphHeight( x ), dataStream );
-            _papxList.add( papx );
+            for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+                    bytesEndAt ) )
+            {
+                PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
+                        getParagraphHeight( x ), dataStream );
+                _papxList.add( papx );
+            }
         }
         _fkp = null;
     }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
index 82fb71eae0..facb4e472c 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.LinkedList;
 import java.util.List;
 
 import org.apache.poi.hwpf.model.io.HWPFOutputStream;
@@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator
             System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
 
             // And now build the piece
-            _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
-                    pieces[x] ) );
+            final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
+                    pieces[x] );
+
+            _textPieces.add( newTextPiece );
         }
 
         // In the interest of our sanity, now sort the text pieces
@@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator
         return byteCount;
     }
 
+    @Deprecated
     public int getCharIndex( int bytePos )
     {
         return getCharIndex( bytePos, 0 );
     }
 
+    @Deprecated
     public int getCharIndex( int startBytePos, int startCP )
     {
         int charCount = 0;
@@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator
         return charCount;
     }
 
+    public int[][] getCharIndexRanges( int startBytePosInclusive,
+            int endBytePosExclusive )
+    {
+        List<int[]> result = new LinkedList<int[]>();
+        for ( TextPiece textPiece : _textPiecesFCOrder )
+        {
+            final int tpStart = textPiece.getPieceDescriptor()
+                    .getFilePosition();
+            final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
+                    + textPiece.bytesLength();
+            if ( startBytePosInclusive > tpEnd )
+                continue;
+            if ( endBytePosExclusive < tpStart )
+                break;
+
+            final int rangeStartBytes = Math.max( tpStart,
+                    startBytePosInclusive );
+            final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive );
+            final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
+
+            if ( rangeStartBytes > rangeEndBytes )
+                continue;
+
+            final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
+
+            final int rangeStartCp = textPiece.getStart()
+                    + ( rangeStartBytes - tpStart ) / encodingMultiplier;
+            final int rangeEndCp = rangeStartCp + rangeLengthBytes
+                    / encodingMultiplier;
+
+            result.add( new int[] { rangeStartCp, rangeEndCp } );
+        }
+
+        return result.toArray( new int[result.size()][] );
+    }
+
     public int getCpMin()
     {
         return _cpMin;
@@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator
 
     public int lookIndexForward( final int startBytePos )
     {
-        int bytePos = startBytePos;
-        for ( TextPiece tp : _textPiecesFCOrder )
-        {
-            int pieceStart = tp.getPieceDescriptor().getFilePosition();
+        if ( _textPiecesFCOrder.isEmpty() )
+            throw new IllegalStateException( "Text pieces table is empty" );
 
-            if ( bytePos >= pieceStart + tp.bytesLength() )
-            {
-                continue;
-            }
+        if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
+            return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
 
-            if ( pieceStart > bytePos )
-            {
-                bytePos = pieceStart;
-            }
+        if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
+                .getPieceDescriptor().getFilePosition() <= startBytePos )
+            return startBytePos;
 
-            break;
+        int low = 0;
+        int high = _textPiecesFCOrder.size() - 1;
+
+        while ( low <= high )
+        {
+            int mid = ( low + high ) >>> 1;
+            final TextPiece textPiece = _textPiecesFCOrder.get( mid );
+            int midVal = textPiece.getPieceDescriptor().getFilePosition();
+
+            if ( midVal < startBytePos )
+                low = mid + 1;
+            else if ( midVal > startBytePos )
+                high = mid - 1;
+            else
+                // found piece with exact start
+                return textPiece.getPieceDescriptor().getFilePosition();
         }
-        return bytePos;
+        assert low == high;
+        assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
+                .getFilePosition() < startBytePos;
+        // last line can't be current, can it?
+        assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
+                .getFilePosition() > startBytePos;
+
+        // shifting to next piece start
+        return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
     }
 
     public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
index 5566caf468..a12baa47d0 100644
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
@@ -226,6 +226,36 @@ public class TestBugs extends TestCase
         assertEquals( extractor1.getText(), extractor2.getText() );
     }
 
+    /**
+     * Bug 44331 - HWPFDocument.write destroys fields
+     */
+    public void test44431_2()
+    {
+        HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
+        WordExtractor extractor1 = new WordExtractor( doc1 );
+
+        assertEquals( "File name=FieldsTest.doc\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"STYLEREF test\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"TEST TABLE OF CONTENTS\n" + 
+        		"\n" + 
+        		"Heading paragraph in next page\t2\n" + 
+        		"Another heading paragraph in further page\t3\n" + 
+        		"Another heading paragraph in further page\t3\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"Heading paragraph in next page\n" + 
+        		"Another heading paragraph in further page\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"\n" + 
+        		"Page 3 of 3", extractor1.getText() );
+    }
+
     /**
      * Bug 45473 - HWPF cannot read file after save
      */
@@ -640,19 +670,20 @@ public class TestBugs extends TestCase
         hwpfDocument.write( new ByteArrayOutputStream() );
     }
 
-
-
     /**
-     * Bug 51678 - Extracting text from Bug51524.zip is slow
-     * Bug 51524 - PapBinTable constructor is slow
+     * Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 -
+     * PapBinTable constructor is slow
      */
     public void test51678And51524()
     {
-        // YK: the test will run only if the poi.test.remote system property is set.
+        // YK: the test will run only if the poi.test.remote system property is
+        // set.
         // TODO: refactor into something nicer!
-        if(System.getProperty("poi.test.remote") != null) {
+        if ( System.getProperty( "poi.test.remote" ) != null )
+        {
             String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
-            HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
+            HWPFDocument hwpfDocument = HWPFTestDataSamples
+                    .openRemoteFile( href );
 
             WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
             wordExtractor.getText();
-- 
2.39.5