<changes>
<release version="3.8-beta5" date="2011-??-??">
+ <action dev="poi-developers" type="fix">51772 - IllegalArgumentException Parsing MS Word 97 - 2003</action>
<action dev="poi-developers" type="add">XSLFPowerPointExtractor support for including comment authors with comment text</action>
<action dev="poi-developers" type="fix">Converted XSLFPowerPointExtractor to use UserModel for all text extraction</action>
<action dev="poi-developers" type="add">XSLF initial UserModel support for Notes and Comments for Slides</action>
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
pageOffset, translator);
- int fkpSize = cfkp.size();
-
- for (int y = 0; y < fkpSize; y++)
- {
- final CHPX chpx = cfkp.getCHPX(y);
- if (chpx != null)
- _textRuns.add(chpx);
- }
+ for ( CHPX chpx : cfkp.getCHPXs() )
+ {
+ if ( chpx != null )
+ _textRuns.add( chpx );
+ }
}
logger.log( POILogger.DEBUG, "CHPX FKPs loaded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _textRuns.size() ), " elements)" );
+
+ if ( _textRuns.isEmpty() )
+ {
+ logger.log( POILogger.WARN, "CHPX FKPs are empty" );
+ _textRuns.add( new CHPX( 0, 0, new SprmBuffer( 0 ) ) );
+ }
}
public void rebuild( ComplexFileTable complexFileTable )
package org.apache.poi.hwpf.model;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import org.apache.poi.hwpf.sprm.SprmBuffer;
int bytesStartAt = getStart( x );
int bytesEndAt = getEnd( x );
- int charStartAt = translator.getCharIndex( bytesStartAt );
- int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+ // int charStartAt = translator.getCharIndex( bytesStartAt );
+ // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+ // );
- // TODO: CHECK!
- // CHPX chpx = new CHPX( bytesStartAt, bytesEndAt, tpt, getGrpprl( x
- // ) );
- CHPX chpx = new CHPX( charStartAt, charEndAt, new SprmBuffer(
- getGrpprl( x ), 0 ) );
- _chpxList.add( chpx );
+ for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+ bytesEndAt ) )
+ {
+ CHPX chpx = new CHPX( range[0], range[1], new SprmBuffer(
+ getGrpprl( x ), 0 ) );
+ _chpxList.add( chpx );
+ }
}
}
return _chpxList.get(index);
}
+ public List<CHPX> getCHPXs()
+ {
+ return Collections.unmodifiableList( _chpxList );
+ }
+
public void fill(List<CHPX> filler)
{
_chpxList.addAll(filler);
int getByteIndex( int charPos );\r
\r
/**\r
- * Calculates the char index of the given byte index.\r
- * Look forward if index is not in table\r
- *\r
- * @param bytePos The character offset to check \r
+ * Calculates the char index of the given byte index. Looks forward if the\r
+ * index is not in the table.\r
+ * \r
+ * @param bytePos\r
+ * The byte offset to check\r
* @return the char index\r
+ * @deprecated This API was based on the incorrect assumption that a single\r
+ * byte offset corresponds to a single char offset\r
*/\r
+ @Deprecated\r
int getCharIndex(int bytePos);\r
\r
/**\r
* @param bytePos The character offset to check\r
* @param startCP look from this characted position \r
* @return the char index\r
+ * @deprecated This API was based on the incorrect assumption that a single\r
+ * byte offset corresponds to a single char offset\r
*/\r
+ @Deprecated\r
int getCharIndex(int bytePos, int startCP);\r
- \r
+\r
+ /**\r
+ * Finds the character ranges that include the specified byte range.\r
+ * \r
+ * @param startBytePosInclusive\r
+ * start of the byte range (inclusive)\r
+ * @param endBytePosExclusive\r
+ * end of the byte range (exclusive)\r
+ */\r
+ int[][] getCharIndexRanges( int startBytePosInclusive,\r
+ int endBytePosExclusive );\r
+\r
/**\r
* Check if index is in table\r
- *\r
+ * \r
* @param bytePos\r
* @return true if index in table, false if not\r
*/\r
-\r
boolean isIndexInTable(int bytePos);\r
\r
/**\r
documentStream, dataStream, pageOffset,
charIndexTranslator );
- int fkpSize = pfkp.size();
-
- for ( int y = 0; y < fkpSize; y++ )
+ for ( PAPX papx : pfkp.getPAPXs() )
{
- PAPX papx = pfkp.getPAPX( y );
-
if ( papx != null )
_paragraphs.add( papx );
}
logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
Long.valueOf( System.currentTimeMillis() - start ), " ms (",
Integer.valueOf( _paragraphs.size() ), " elements)" );
+
+ if ( _paragraphs.isEmpty() )
+ {
+ logger.log( POILogger.WARN, "PAPX FKPs are empty" );
+ _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
+ }
}
public void rebuild( final StringBuilder docText,
import java.util.Collections;
import java.util.List;
+import org.apache.poi.hwpf.sprm.SprmBuffer;
+
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
int bytesStartAt = getStart( x );
int bytesEndAt = getEnd( x );
- int charStartAt = translator.getCharIndex( bytesStartAt );
- int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt );
+ // int charStartAt = translator.getCharIndex( bytesStartAt );
+ // int charEndAt = translator.getCharIndex( bytesEndAt, charStartAt
+ // );
+ // PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
+ // getParagraphHeight( x ), dataStream );
+ // _papxList.add( papx );
- PAPX papx = new PAPX( charStartAt, charEndAt, getGrpprl( x ),
- getParagraphHeight( x ), dataStream );
- _papxList.add( papx );
+ for ( int[] range : translator.getCharIndexRanges( bytesStartAt,
+ bytesEndAt ) )
+ {
+ PAPX papx = new PAPX( range[0], range[1], getGrpprl( x ),
+ getParagraphHeight( x ), dataStream );
+ _papxList.add( papx );
+ }
}
_fkp = null;
}
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
+import java.util.LinkedList;
import java.util.List;
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
System.arraycopy( documentStream, start, buf, 0, textSizeBytes );
// And now build the piece
- _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
- pieces[x] ) );
+ final TextPiece newTextPiece = new TextPiece( nodeStartChars, nodeEndChars, buf,
+ pieces[x] );
+
+ _textPieces.add( newTextPiece );
}
// In the interest of our sanity, now sort the text pieces
return byteCount;
}
+ @Deprecated
public int getCharIndex( int bytePos )
{
return getCharIndex( bytePos, 0 );
}
+ @Deprecated
public int getCharIndex( int startBytePos, int startCP )
{
int charCount = 0;
return charCount;
}
+ public int[][] getCharIndexRanges( int startBytePosInclusive,
+ int endBytePosExclusive )
+ {
+ List<int[]> result = new LinkedList<int[]>();
+ for ( TextPiece textPiece : _textPiecesFCOrder )
+ {
+ final int tpStart = textPiece.getPieceDescriptor()
+ .getFilePosition();
+ final int tpEnd = textPiece.getPieceDescriptor().getFilePosition()
+ + textPiece.bytesLength();
+ if ( startBytePosInclusive > tpEnd )
+ continue;
+ if ( endBytePosExclusive < tpStart )
+ break;
+
+ final int rangeStartBytes = Math.max( tpStart,
+ startBytePosInclusive );
+ final int rangeEndBytes = Math.min( tpEnd, endBytePosExclusive );
+ final int rangeLengthBytes = rangeEndBytes - rangeStartBytes;
+
+ if ( rangeStartBytes > rangeEndBytes )
+ continue;
+
+ final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
+
+ final int rangeStartCp = textPiece.getStart()
+ + ( rangeStartBytes - tpStart ) / encodingMultiplier;
+ final int rangeEndCp = rangeStartCp + rangeLengthBytes
+ / encodingMultiplier;
+
+ result.add( new int[] { rangeStartCp, rangeEndCp } );
+ }
+
+ return result.toArray( new int[result.size()][] );
+ }
+
public int getCpMin()
{
return _cpMin;
public int lookIndexForward( final int startBytePos )
{
- int bytePos = startBytePos;
- for ( TextPiece tp : _textPiecesFCOrder )
- {
- int pieceStart = tp.getPieceDescriptor().getFilePosition();
+ if ( _textPiecesFCOrder.isEmpty() )
+ throw new IllegalStateException( "Text pieces table is empty" );
- if ( bytePos >= pieceStart + tp.bytesLength() )
- {
- continue;
- }
+ if ( _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition() > startBytePos )
+ return _textPiecesFCOrder.get( 0 ).getPieceDescriptor().getFilePosition();
- if ( pieceStart > bytePos )
- {
- bytePos = pieceStart;
- }
+ if ( _textPiecesFCOrder.get( _textPiecesFCOrder.size() - 1 )
+ .getPieceDescriptor().getFilePosition() <= startBytePos )
+ return startBytePos;
- break;
+ int low = 0;
+ int high = _textPiecesFCOrder.size() - 1;
+
+ while ( low <= high )
+ {
+ int mid = ( low + high ) >>> 1;
+ final TextPiece textPiece = _textPiecesFCOrder.get( mid );
+ int midVal = textPiece.getPieceDescriptor().getFilePosition();
+
+ if ( midVal < startBytePos )
+ low = mid + 1;
+ else if ( midVal > startBytePos )
+ high = mid - 1;
+ else
+ // found piece with exact start
+ return textPiece.getPieceDescriptor().getFilePosition();
}
- return bytePos;
+ assert low == high;
+ assert _textPiecesFCOrder.get( low ).getPieceDescriptor()
+ .getFilePosition() < startBytePos;
+ // last line can't be current, can it?
+ assert _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor()
+ .getFilePosition() > startBytePos;
+
+ // shifting to next piece start
+ return _textPiecesFCOrder.get( low + 1 ).getPieceDescriptor().getFilePosition();
}
public byte[] writeTo( HWPFOutputStream docStream ) throws IOException
assertEquals( extractor1.getText(), extractor2.getText() );
}
+ /**
+ * Bug 44431 - HWPFDocument.write destroys fields
+ */
+ public void test44431_2()
+ {
+ HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile( "Bug44431.doc" );
+ WordExtractor extractor1 = new WordExtractor( doc1 );
+
+ assertEquals( "File name=FieldsTest.doc\n" +
+ "\n" +
+ "\n" +
+ "STYLEREF test\n" +
+ "\n" +
+ "\n" +
+ "\n" +
+ "TEST TABLE OF CONTENTS\n" +
+ "\n" +
+ "Heading paragraph in next page\t2\n" +
+ "Another heading paragraph in further page\t3\n" +
+ "Another heading paragraph in further page\t3\n" +
+ "\n" +
+ "\n" +
+ "Heading paragraph in next page\n" +
+ "Another heading paragraph in further page\n" +
+ "\n" +
+ "\n" +
+ "\n" +
+ "Page 3 of 3", extractor1.getText() );
+ }
+
/**
* Bug 45473 - HWPF cannot read file after save
*/
hwpfDocument.write( new ByteArrayOutputStream() );
}
-
-
/**
- * Bug 51678 - Extracting text from Bug51524.zip is slow
- * Bug 51524 - PapBinTable constructor is slow
+ * Bug 51678 - Extracting text from Bug51524.zip is slow; Bug 51524 -
+ * PapBinTable constructor is slow
*/
public void test51678And51524()
{
- // YK: the test will run only if the poi.test.remote system property is set.
+ // YK: the test will run only if the poi.test.remote system property is
+ // set.
// TODO: refactor into something nicer!
- if(System.getProperty("poi.test.remote") != null) {
+ if ( System.getProperty( "poi.test.remote" ) != null )
+ {
String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
- HWPFDocument hwpfDocument = HWPFTestDataSamples.openRemoteFile( href );
+ HWPFDocument hwpfDocument = HWPFTestDataSamples
+ .openRemoteFile( href );
WordExtractor wordExtractor = new WordExtractor( hwpfDocument );
wordExtractor.getText();