import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Comparator;
+import java.util.IdentityHashMap;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
import org.apache.poi.hwpf.model.io.HWPFFileSystem;
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
}
public PAPBinTable( byte[] documentStream, byte[] tableStream,
- byte[] dataStream, int offset, int size, ComplexFileTable complexFileTable,
- TextPieceTable tpt, boolean reconstructPapxTable )
+ byte[] dataStream, int offset, int size,
+ ComplexFileTable complexFileTable, TextPieceTable tpt,
+ boolean reconstructPapxTable )
{
- PlexOfCps binTable = new PlexOfCps(tableStream, offset, size, 4);
- this.tpt = tpt;
+ long start = System.currentTimeMillis();
- int length = binTable.length();
- for (int x = 0; x < length; x++)
- {
- GenericPropertyNode node = binTable.getProperty(x);
+ {
+ PlexOfCps binTable = new PlexOfCps( tableStream, offset, size, 4 );
+ this.tpt = tpt;
- int pageNum = LittleEndian.getInt(node.getBytes());
- int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
+ int length = binTable.length();
+ for ( int x = 0; x < length; x++ )
+ {
+ GenericPropertyNode node = binTable.getProperty( x );
- PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
- dataStream, pageOffset, tpt, reconstructPapxTable);
+ int pageNum = LittleEndian.getInt( node.getBytes() );
+ int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
+ * pageNum;
- int fkpSize = pfkp.size();
+ PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
+ documentStream, dataStream, pageOffset, tpt,
+ reconstructPapxTable );
- for (int y = 0; y < fkpSize; y++)
- {
- PAPX papx = pfkp.getPAPX(y);
+ int fkpSize = pfkp.size();
- if (papx != null)
- _paragraphs.add(papx);
- }
- }
+ for ( int y = 0; y < fkpSize; y++ )
+ {
+ PAPX papx = pfkp.getPAPX( y );
+
+ if ( papx != null )
+ _paragraphs.add( papx );
+ }
+ }
+ }
+
+ logger.log( POILogger.DEBUG, "PAPX tables loaded in ",
+ Long.valueOf( System.currentTimeMillis() - start ), " ms (",
+ Integer.valueOf( _paragraphs.size() ), " elements)" );
+ start = System.currentTimeMillis();
if ( !reconstructPapxTable )
{
Collections.sort( _paragraphs );
+
+ logger.log( POILogger.DEBUG, "PAPX sorted in ",
+ Long.valueOf( System.currentTimeMillis() - start ), " ms" );
return;
}
{
SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();
- // adding CHPX from fast-saved SPRMs
+ // adding PAPX from fast-saved SPRMs
for ( TextPiece textPiece : tpt.getTextPieces() )
{
PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
if ( hasPap )
{
- SprmBuffer newSprmBuffer = new SprmBuffer(2);
+ SprmBuffer newSprmBuffer = new SprmBuffer( 2 );
newSprmBuffer.append( sprmBuffer.toByteArray() );
PAPX papx = new PAPX( textPiece.getStart(),
_paragraphs.add( papx );
}
}
+
+ logger.log( POILogger.DEBUG,
+ "Merged (?) with PAPX from complex file table in ",
+ Long.valueOf( System.currentTimeMillis() - start ),
+ " ms (", Integer.valueOf( _paragraphs.size() ),
+ " elements in total)" );
+ start = System.currentTimeMillis();
}
// rebuild document paragraphs structure
docText.replace( textPiece.getStart(), textPiece.getStart()
+ toAppendLength, toAppend );
}
+ logger.log( POILogger.DEBUG, "Document text rebuilded in ",
+ Long.valueOf( System.currentTimeMillis() - start ), " ms (",
+ Integer.valueOf( docText.length() ), " chars)" );
+ start = System.currentTimeMillis();
+
+ List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( _paragraphs );
+ Collections.sort( oldPapxSortedByEndPos,
+ PropertyNode.EndComparator.instance );
+
+ logger.log( POILogger.DEBUG, "PAPX sorted by end position in ",
+ Long.valueOf( System.currentTimeMillis() - start ), " ms" );
+ start = System.currentTimeMillis();
+
+ final Map<PAPX, Integer> papxToFileOrder = new IdentityHashMap<PAPX, Integer>();
+ {
+ int counter = 0;
+ for ( PAPX papx : _paragraphs )
+ {
+ papxToFileOrder.put( papx, Integer.valueOf( counter++ ) );
+ }
+ }
+
+ logger.log( POILogger.DEBUG, "PAPX's order map created in ",
+ Long.valueOf( System.currentTimeMillis() - start ), " ms" );
+ start = System.currentTimeMillis();
List<PAPX> newPapxs = new LinkedList<PAPX>();
int lastParStart = 0;
+ int lastPapxIndex = 0;
for ( int charIndex = 0; charIndex < docText.length(); charIndex++ )
{
final char c = docText.charAt( charIndex );
final int endExclusive = charIndex + 1;
List<PAPX> papxs = new LinkedList<PAPX>();
- for ( PAPX papx : _paragraphs )
+ for ( int papxIndex = lastPapxIndex; papxIndex < oldPapxSortedByEndPos
+ .size(); papxIndex++ )
{
- // TODO: Tests, check, etc
- for ( int f = papx.getEnd() - 1; f <= charIndex; f++ )
+ PAPX papx = oldPapxSortedByEndPos.get( papxIndex );
+
+ assert papx.getEnd() > startInclusive;
+ if ( papx.getEnd() - 1 > charIndex )
{
- if ( f == charIndex )
- {
- papxs.add( papx );
- break;
- }
- final char fChar = docText.charAt( charIndex );
- if ( fChar == 13 || fChar == 7 || fChar == 12 )
- break;
+ lastPapxIndex = papxIndex;
+ break;
}
+
+ papxs.add( papx );
}
if ( papxs.size() == 0 )
}
}
+ // restore file order of PAPX
+ Collections.sort( papxs, new Comparator<PAPX>()
+ {
+            /**
+             * Orders two PAPX by the Integer index stored for each of them in
+             * papxToFileOrder.
+             */
+            public int compare( PAPX o1, PAPX o2 )
+            {
+                return papxToFileOrder.get( o1 ).compareTo(
+                        papxToFileOrder.get( o2 ) );
+            }
+ } );
+
SprmBuffer sprmBuffer = null;
for ( PAPX papx : papxs )
{
==================================================================== */\r
package org.apache.poi.hwpf;\r
\r
+import java.io.ByteArrayInputStream;\r
+import java.io.ByteArrayOutputStream;\r
+import java.io.IOException;\r
+import java.io.InputStream;\r
+import java.util.zip.ZipInputStream;\r
+\r
import org.apache.poi.POIDataSamples;\r
import org.apache.poi.poifs.filesystem.POIFSFileSystem;\r
-\r
-import java.io.*;\r
+import org.apache.poi.util.IOUtils;\r
+import org.apache.poi.util.POILogFactory;\r
+import org.apache.poi.util.POILogger;\r
\r
public class HWPFTestDataSamples {\r
\r
+ private static final POILogger logger = POILogFactory\r
+ .getLogger( HWPFTestDataSamples.class );\r
+\r
public static HWPFDocument openSampleFile(String sampleFileName) {\r
try {\r
InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName);\r
throw new RuntimeException(e);\r
}\r
}\r
+\r
+ public static HWPFDocument openSampleFileFromArchive( String sampleFileName )\r
+ {\r
+ final long start = System.currentTimeMillis();\r
+ try\r
+ {\r
+ ZipInputStream is = new ZipInputStream( POIDataSamples\r
+ .getDocumentInstance()\r
+ .openResourceAsStream( sampleFileName ) );\r
+ try\r
+ {\r
+ is.getNextEntry();\r
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();\r
+ try\r
+ {\r
+ IOUtils.copy( is, baos );\r
+ }\r
+ finally\r
+ {\r
+ baos.close();\r
+ }\r
+\r
+ final long endUnzip = System.currentTimeMillis();\r
+ byte[] byteArray = baos.toByteArray();\r
+\r
+ logger.log( POILogger.DEBUG, "Unzipped in ",\r
+ Long.valueOf( endUnzip - start ), " ms -- ",\r
+ Long.valueOf( byteArray.length ), " byte(s)" );\r
+\r
+ ByteArrayInputStream bais = new ByteArrayInputStream( byteArray );\r
+ HWPFDocument doc = new HWPFDocument( bais );\r
+ final long endParse = System.currentTimeMillis();\r
+\r
+ logger.log( POILogger.DEBUG, "Parsed in ",\r
+ Long.valueOf( endParse - start ), " ms" );\r
+\r
+ return doc;\r
+ }\r
+ finally\r
+ {\r
+ is.close();\r
+ }\r
+ }\r
+ catch ( IOException e )\r
+ {\r
+ throw new RuntimeException( e );\r
+ }\r
+ }\r
+\r
public static HWPFOldDocument openOldSampleFile(String sampleFileName) {\r
try {\r
InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName);\r