From a45d61c740bf1bfd91ccbb288a09d49dd923dbc3 Mon Sep 17 00:00:00 2001 From: Sergey Vladimirov Date: Wed, 7 Sep 2011 12:12:17 +0000 Subject: [PATCH] fix Bug 51772 - IllegalArgumentException Parsing MS Word 97 - 2003; Replace byte->char translation with byte range -> char range_S_ translation for PAPX / CHPX tables git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1166144 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../apache/poi/hwpf/model/CHPBinTable.java | 19 ++-- .../poi/hwpf/model/CHPFormattedDiskPage.java | 24 +++-- .../poi/hwpf/model/CharIndexTranslator.java | 31 +++++-- .../apache/poi/hwpf/model/PAPBinTable.java | 12 ++- .../poi/hwpf/model/PAPFormattedDiskPage.java | 20 +++- .../apache/poi/hwpf/model/TextPieceTable.java | 91 +++++++++++++++---- .../apache/poi/hwpf/usermodel/TestBugs.java | 45 +++++++-- 8 files changed, 187 insertions(+), 56 deletions(-) diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index ee7f07317f..b1682fecdf 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 51772 - IllegalArgumentException Parsing MS Word 97 - 2003 XSLFPowerPointExtractor support for including comment authors with comment text Converted XSLFPowerPointExtractor to use UserModel for all text extraction XSLF initial UserModel support for Notes and Comments for Slides diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java index d2f949813a..3a984d6a8f 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java @@ -101,18 +101,21 @@ public class CHPBinTable CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream, pageOffset, translator); - int fkpSize = cfkp.size(); - - for (int y = 0; y < fkpSize; y++) - { - final CHPX chpx = cfkp.getCHPX(y); - if (chpx != null) - _textRuns.add(chpx); - } + for ( CHPX chpx : cfkp.getCHPXs() ) + { + if ( chpx != null ) + _textRuns.add( chpx ); + } } logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ", Long.valueOf( System.currentTimeMillis() - start ), " ms (", Integer.valueOf( _textRuns.size() ), " elements)" ); + + if ( _textRuns.isEmpty() ) + { + logger.log( POILogger.WARN, "CHPX FKPs are empty" ); + _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) ); + } } public void rebuild( ComplexFileTable complexFileTable ) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java index 2f1215c4e6..28cd870f4b 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPFormattedDiskPage.java @@ -18,6 +18,7 @@ package org.apache.poi.hwpf.model; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.apache.poi.hwpf.sprm.SprmBuffer; @@ -82,15 +83,17 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage int bytesStartAt = getStart( x ); int bytesEndAt = getEnd( x ); - int charStartAt = translator.getCharIndex( bytesStartAt ); - int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt ); + // int charStartAt = translator.getCharIndex( bytesStartAt ); + // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt + // ); - // TODO: CHECK! - // CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x - // ) ); - CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer( - getGrpprl( x ), 0 ) ); - _chpxList.add( chpx ); + for ( int[] range : translator.getCharIndexRanges( bytesStartAt, + bytesEndAt ) ) + { + CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer( + getGrpprl( x ), 0 ) ); + _chpxList.add( chpx ); + } } } @@ -99,6 +102,11 @@ public final class CHPFormattedDiskPage extends FormattedDiskPage return _chpxList.get(index); } + public List getCHPXs() + { + return Collections.unmodifiableList( _chpxList ); + } + public void fill(List filler) { _chpxList.addAll(filler); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java index 6272b1afe7..b6e48e21d5 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CharIndexTranslator.java @@ -31,12 +31,16 @@ public interface CharIndexTranslator { int getByteIndex( int charPos ); /** - * Calculates the char index of the given byte index. - * Look forward if index is not in table - * - * @param bytePos The character offset to check + * Calculates the char index of the given byte index. Look forward if index + * is not in table + * + * @param bytePos + * The character offset to check * @return the char index + * @deprecated This API were based on incorrect assumption that single byte + * offset corresponds to single char offset */ + @Deprecated int getCharIndex(int bytePos); /** @@ -46,16 +50,29 @@ public interface CharIndexTranslator { * @param bytePos The character offset to check * @param startCP look from this characted position * @return the char index + * @deprecated This API were based on incorrect assumption that single byte + * offset corresponds to single char offset */ + @Deprecated int getCharIndex(int bytePos, int startCP); - + + /** + * Finds character ranges that includes specified byte range. + * + * @param startBytePosInclusive + * start byte range + * @param endBytePosExclusive + * end byte range + */ + int[][] getCharIndexRanges( int startBytePosInclusive, + int endBytePosExclusive ); + /** * Check if index is in table - * + * * @param bytePos * @return true if index in table, false if not */ - boolean isIndexInTable(int bytePos); /** diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java index 69c0c06196..7ab0576d6a 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java @@ -92,12 +92,8 @@ public class PAPBinTable documentStream, dataStream, pageOffset, charIndexTranslator ); - int fkpSize = pfkp.size(); - - for ( int y = 0; y < fkpSize; y++ ) + for ( PAPX papx : pfkp.getPAPXs() ) { - PAPX papx = pfkp.getPAPX( y ); - if ( papx != null ) _paragraphs.add( papx ); } @@ -107,6 +103,12 @@ public class PAPBinTable logger.log( POILogger.DEBUG, "PAPX tables loaded in ", Long.valueOf( System.currentTimeMillis() - start ), " ms (", Integer.valueOf( _paragraphs.size() ), " elements)" ); + + if ( _paragraphs.isEmpty() ) + { + logger.log( POILogger.WARN, "PAPX FKPs are empty" ); + _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) ); + } } public void rebuild( final StringBuilder docText, diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java index 29c838c6ac..6c78f420b1 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPFormattedDiskPage.java @@ -23,6 +23,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import org.apache.poi.hwpf.sprm.SprmBuffer; + import org.apache.poi.hwpf.model.io.HWPFOutputStream; import org.apache.poi.util.Internal; import org.apache.poi.util.LittleEndian; @@ -88,12 +90,20 @@ public final class PAPFormattedDiskPage extends FormattedDiskPage { int bytesStartAt = getStart( x ); int bytesEndAt = getEnd( x ); - int charStartAt = translator.getCharIndex( bytesStartAt ); - int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt ); + // int charStartAt = translator.getCharIndex( bytesStartAt ); + // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt + // ); + // PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ), + // getParagraphHeight( x ), dataStream ); + // _papxList.add( papx ); - PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ), - getParagraphHeight( x ), dataStream ); - _papxList.add( papx ); + for ( int[] range : translator.getCharIndexRanges( bytesStartAt, + bytesEndAt ) ) + { + PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ), + getParagraphHeight( x ), dataStream ); + _papxList.add( papx ); + } } _fkp = null; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java index 82fb71eae0..facb4e472c 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedList; import java.util.List; import org.apache.poi.hwpf.model.io.HWPFOutputStream; @@ -107,8 +108,10 @@ public class TextPieceTable implements CharIndexTranslator System.arraycopy( documentStream, start, buf, 0, textSizeBytes ); // And now build the piece - _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf, - pieces[x] ) ); + final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf, + pieces[x] ); + + _textPieces.add( newTextPiece ); } // In the interest of our sanity, now sort the text pieces @@ -201,11 +204,13 @@ public class TextPieceTable implements CharIndexTranslator return byteCount; } + @Deprecated public int getCharIndex( int bytePos ) { return getCharIndex( bytePos, 0 ); } + @Deprecated public int getCharIndex( int startBytePos, int startCP ) { int charCount = 0; @@ -253,6 +258,42 @@ public class TextPieceTable implements CharIndexTranslator return charCount; } + public int[][] getCharIndexRanges( int startBytePosInclusive, + int endBytePosExclusive ) + { + List result = new LinkedList(); + for ( TextPiece textPiece : _textPiecesFCOrder ) + { + final int tpStart = textPiece.getPieceDescriptor() + .getFilePosition(); + final int tpEnd = textPiece.getPieceDescriptor().getFilePosition() + + textPiece.bytesLength(); + if ( startBytePosInclusive > tpEnd ) + continue; + if ( endBytePosExclusive < tpStart ) + break; + + final int rangeStartBytes = Math.max( tpStart, + startBytePosInclusive ); + final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive ); + final int rangeLengthBytes = rangeEndBytes - rangeStartBytes; + + if ( rangeStartBytes > rangeEndBytes ) + continue; + + final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1; + + final int rangeStartCp = textPiece.getStart() + + ( rangeStartBytes - tpStart ) / encodingMultiplier; + final int rangeEndCp = rangeStartCp + rangeLengthBytes + / encodingMultiplier; + + result.add( new int[] { rangeStartCp, rangeEndCp } ); + } + + return result.toArray( new int[result.size()][] ); + } + public int getCpMin() { return _cpMin; @@ -377,24 +418,42 @@ public class TextPieceTable implements CharIndexTranslator public int lookIndexForward( final int startBytePos ) { - int bytePos = startBytePos; - for ( TextPiece tp : _textPiecesFCOrder ) - { - int pieceStart = tp.getPieceDescriptor().getFilePosition(); + if ( _textPiecesFCOrder.isEmpty() ) + throw new IllegalStateException( "Text pieces table is empty" ); - if ( bytePos >= pieceStart + tp.bytesLength() ) - { - continue; - } + if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos ) + return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition(); - if ( pieceStart > bytePos ) - { - bytePos = pieceStart; - } + if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 ) + .getPieceDescriptor().getFilePosition() <= startBytePos ) + return startBytePos; - break; + int low = 0; + int high = _textPiecesFCOrder.size() - 1; + + while ( low <= high ) + { + int mid = ( low + high ) >>> 1; + final TextPiece textPiece = _textPiecesFCOrder.get( mid ); + int midVal = textPiece.getPieceDescriptor().getFilePosition(); + + if ( midVal < startBytePos ) + low = mid + 1; + else if ( midVal > startBytePos ) + high = mid - 1; + else + // found piece with exact start + return textPiece.getPieceDescriptor().getFilePosition(); } - return bytePos; + assert low == high; + assert _textPiecesFCOrder.get( low ).getPieceDescriptor() + .getFilePosition() < startBytePos; + // last line can't be current, can it? + assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor() + .getFilePosition() > startBytePos; + + // shifting to next piece start + return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition(); } public byte[] writeTo( HWPFOutputStream docStream ) throws IOException diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java index 5566caf468..a12baa47d0 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java @@ -226,6 +226,36 @@ public class TestBugs extends TestCase assertEquals( extractor1.getText(), extractor2.getText() ); } + /** + * Bug 44331 - HWPFDocument.write destroys fields + */ + public void test44431_2() + { + HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" ); + WordExtractor extractor1 = new WordExtractor( doc1 ); + + assertEquals( "File name=FieldsTest.doc\n" + + "\n" + + "\n" + + "STYLEREF test\n" + + "\n" + + "\n" + + "\n" + + "TEST TABLE OF CONTENTS\n" + + "\n" + + "Heading paragraph in next page\t2\n" + + "Another heading paragraph in further page\t3\n" + + "Another heading paragraph in further page\t3\n" + + "\n" + + "\n" + + "Heading paragraph in next page\n" + + "Another heading paragraph in further page\n" + + "\n" + + "\n" + + "\n" + + "Page 3 of 3", extractor1.getText() ); + } + /** * Bug 45473 - HWPF cannot read file after save */ @@ -640,19 +670,20 @@ public class TestBugs extends TestCase hwpfDocument.write( new ByteArrayOutputStream() ); } - - /** - * Bug 51678 - Extracting text from Bug51524.zip is slow - * Bug 51524 - PapBinTable constructor is slow + * Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 - + * PapBinTable constructor is slow */ public void test51678And51524() { - // YK: the test will run only if the poi.test.remote system property is set. + // YK: the test will run only if the poi.test.remote system property is + // set. // TODO: refactor into something nicer! - if(System.getProperty("poi.test.remote") != null) { + if ( System.getProperty( "poi.test.remote" ) != null ) + { String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc"; - HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href ); + HWPFDocument hwpfDocument = HWPFTestDataSamples + .openRemoteFile( href ); WordExtractor wordExtractor = new WordExtractor( hwpfDocument ); wordExtractor.getText(); -- 2.39.5