From: Sergey Vladimirov Date: Mon, 25 Jul 2011 12:58:09 +0000 (+0000) Subject: replace ComplexFileTable with single-element-one right after load; replace text piece... X-Git-Tag: REL_3_8_BETA4~67 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=4c724bf71ccda1ab7e6b854aace8d7da3c581629;p=poi.git replace ComplexFileTable with single-element-one right after load; replace text piece table as well git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1150675 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index 1ce89377b7..ffa1196f5c 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -23,8 +23,6 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.util.Iterator; -import java.util.List; import org.apache.poi.hwpf.model.BookmarksTables; import org.apache.poi.hwpf.model.CHPBinTable; @@ -40,6 +38,7 @@ import org.apache.poi.hwpf.model.NoteType; import org.apache.poi.hwpf.model.NotesTables; import org.apache.poi.hwpf.model.PAPBinTable; import org.apache.poi.hwpf.model.PicturesTable; +import org.apache.poi.hwpf.model.PieceDescriptor; import org.apache.poi.hwpf.model.RevisionMarkAuthorTable; import org.apache.poi.hwpf.model.SavedByTable; import org.apache.poi.hwpf.model.SectionTable; @@ -92,7 +91,7 @@ public final class HWPFDocument extends HWPFDocumentCore * structure*/ protected ComplexFileTable _cft; - protected TextPieceTable _tpt; + protected final StringBuilder _text; /** Holds the save history for this document. 
*/ protected SavedByTable _sbt; @@ -139,6 +138,7 @@ public final class HWPFDocument extends HWPFDocumentCore protected HWPFDocument() { super(); + this._text = new StringBuilder("\r"); } /** @@ -246,15 +246,35 @@ public final class HWPFDocument extends HWPFDocumentCore // Start to load up our standard structures. _dop = new DocumentProperties(_tableStream, _fib.getFcDop()); _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin); - _tpt = _cft.getTextPieceTable(); + TextPieceTable _tpt = _cft.getTextPieceTable(); // Now load the rest of the properties, which need to be adjusted // for where text really begin _cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), _tpt); _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt); + _text = _tpt.getText(); _cbt.rebuild( _cft ); - _pbt.rebuild( _dataStream, _cft ); + _pbt.rebuild( _text, _dataStream, _cft ); + + boolean preserve = false; + try + { + preserve = Boolean.parseBoolean( System + .getProperty( "org.apache.poi.hwpf.preserveTextTable" ) ); + } + catch ( Exception exc ) + { + // ignore; + } + if ( !preserve ) + { + _cft = new ComplexFileTable(); + _tpt = _cft.getTextPieceTable(); + _tpt.add( new TextPiece( 0, _text.length(), _text.toString() + .getBytes( "UTF-16LE" ), new PieceDescriptor( new byte[8], + 0 ) ) ); + } // Read FSPA and Escher information _fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces()); @@ -314,6 +334,12 @@ public final class HWPFDocument extends HWPFDocumentCore return _cft.getTextPieceTable(); } + @Override + public StringBuilder getText() + { + return _text; + } + @Deprecated public CPSplitCalculator getCPSplitCalculator() { @@ -326,10 +352,7 @@ public final class HWPFDocument extends HWPFDocumentCore } public Range getOverallRange() { - // hack to get the ending cp of the document, Have to revisit 
this. - TextPiece p = _tpt.getTextPieces().get(_tpt.getTextPieces().size() - 1); - - return new Range(0, p.getEnd(), this); + return new Range(0, _text.length(), this); } /** @@ -445,16 +468,7 @@ public final class HWPFDocument extends HWPFDocumentCore */ public int characterLength() { - List textPieces = _tpt.getTextPieces(); - Iterator textIt = textPieces.iterator(); - - int length = 0; - while(textIt.hasNext()) - { - TextPiece tp = textIt.next(); - length += tp.characterLength(); - } - return length; + return _text.length(); } /** @@ -643,7 +657,7 @@ public final class HWPFDocument extends HWPFDocumentCore // write out the PAPBinTable. _fib.setFcPlcfbtePapx(tableOffset); - _pbt.writeTo(docSys, fcMin); + _pbt.writeTo(docSys, fcMin, _cft.getTextPieceTable()); _fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java index e3bd68d8ea..50171c37e7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java @@ -35,6 +35,7 @@ import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.Internal; /** @@ -161,8 +162,20 @@ public abstract class HWPFDocumentCore extends POIDocument */ public abstract Range getOverallRange(); - public abstract TextPieceTable getTextTable(); - + /** + * Returns document text, i.e. 
text information from all text pieces, + * including OLE descriptions and field codes + */ + public String getDocumentText() { + return getText().toString(); + } + + /** + * Internal method to access document text + */ + @Internal + public abstract StringBuilder getText(); + public CHPBinTable getCharacterTable() { return _cbt; @@ -197,4 +210,6 @@ public abstract class HWPFDocumentCore extends POIDocument { return _fib; } + + public abstract TextPieceTable getTextTable(); } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 3bc32a13f6..08c60959cf 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -38,6 +38,8 @@ import org.apache.poi.util.LittleEndian; public class HWPFOldDocument extends HWPFDocumentCore { private TextPieceTable tpt; + private StringBuilder _text; + public HWPFOldDocument(POIFSFileSystem fs) throws IOException { this(fs.getRoot()); } @@ -88,13 +90,15 @@ public class HWPFOldDocument extends HWPFDocumentCore { byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()]; System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length); TextPiece tp = new TextPiece( - 0, textData.length, textData, pd, 0 + 0, textData.length, textData, pd ); tpt.add(tp); text.append(tp.getStringBuffer()); } + _text = tpt.getText(); + // Now we can fetch the character and paragraph properties _cbt = new OldCHPBinTable( _mainStream, chpTableOffset, chpTableSize, @@ -126,6 +130,12 @@ public class HWPFOldDocument extends HWPFDocumentCore { return tpt; } + @Override + public StringBuilder getText() + { + return _text; + } + @Override public void write(OutputStream out) throws IOException { throw new IllegalStateException("Writing is not available for the older file formats"); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java 
b/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java index 70b8b38404..1591797856 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java @@ -23,9 +23,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -37,10 +35,7 @@ import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.hwpf.model.CHPX; import org.apache.poi.hwpf.model.FieldsDocumentPart; import org.apache.poi.hwpf.model.FileInformationBlock; -import org.apache.poi.hwpf.model.GenericPropertyNode; -import org.apache.poi.hwpf.model.PAPFormattedDiskPage; import org.apache.poi.hwpf.model.PAPX; -import org.apache.poi.hwpf.model.PlexOfCps; import org.apache.poi.hwpf.model.StyleSheet; import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.sprm.SprmIterator; @@ -51,10 +46,8 @@ import org.apache.poi.hwpf.usermodel.Field; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Range; -import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.IOUtils; -import org.apache.poi.util.LittleEndian; /** * Used by developers to list out key information on a HWPF file. 
End users will @@ -241,13 +234,10 @@ public final class HWPFLister private LinkedHashMap paragraphs; - private String text; - public HWPFLister( HWPFDocumentCore doc ) { _doc = doc; - buildText(); buildParagraphs(); } @@ -256,6 +246,7 @@ public final class HWPFLister paragraphs = new LinkedHashMap(); StringBuilder part = new StringBuilder(); + String text = _doc.getDocumentText(); for ( int charIndex = 0; charIndex < text.length(); charIndex++ ) { char c = text.charAt( charIndex ); @@ -268,24 +259,6 @@ public final class HWPFLister } } - private void buildText() - { - StringBuilder builder = new StringBuilder(); - for ( TextPiece textPiece : _doc.getTextTable().getTextPieces() ) - { - String toAppend = textPiece.getStringBuffer().toString(); - - if ( toAppend.length() != ( textPiece.getEnd() - textPiece - .getStart() ) ) - { - throw new AssertionError(); - } - - builder.replace( textPiece.getStart(), textPiece.getEnd(), toAppend ); - } - this.text = builder.toString(); - } - private void dumpBookmarks() { if ( !( _doc instanceof HWPFDocument ) ) @@ -379,69 +352,69 @@ public final class HWPFLister public void dumpPapx( boolean withProperties ) throws Exception { - if ( _doc instanceof HWPFDocument ) - { - System.out.println( "binary PAP pages " ); - - HWPFDocument doc = (HWPFDocument) _doc; - - java.lang.reflect.Field fMainStream = HWPFDocumentCore.class - .getDeclaredField( "_mainStream" ); - fMainStream.setAccessible( true ); - byte[] mainStream = (byte[]) fMainStream.get( _doc ); - - PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc - .getFileInformationBlock().getFcPlcfbtePapx(), doc - .getFileInformationBlock().getLcbPlcfbtePapx(), 4 ); - - List papxs = new ArrayList(); - - int length = binTable.length(); - for ( int x = 0; x < length; x++ ) - { - GenericPropertyNode node = binTable.getProperty( x ); - - int pageNum = LittleEndian.getInt( node.getBytes() ); - int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE - * pageNum; - - 
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage( - mainStream, doc.getDataStream(), pageOffset, - doc.getTextTable() ); - - System.out.println( "* PFKP: " + pfkp ); - - for ( PAPX papx : pfkp.getPAPXs() ) - { - System.out.println( "** " + papx ); - papxs.add( papx ); - if ( papx != null && true ) - { - SprmIterator sprmIt = new SprmIterator( - papx.getGrpprl(), 2 ); - while ( sprmIt.hasNext() ) - { - SprmOperation sprm = sprmIt.next(); - System.out.println( "*** " + sprm.toString() ); - } - } - - } - } - - Collections.sort( papxs ); - System.out.println( "* Sorted by END" ); - for ( PAPX papx : papxs ) - { - System.out.println( "** " + papx ); - SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 ); - while ( sprmIt.hasNext() ) - { - SprmOperation sprm = sprmIt.next(); - System.out.println( "*** " + sprm.toString() ); - } - } - } +// if ( _doc instanceof HWPFDocument ) +// { +// System.out.println( "binary PAP pages " ); +// +// HWPFDocument doc = (HWPFDocument) _doc; +// +// java.lang.reflect.Field fMainStream = HWPFDocumentCore.class +// .getDeclaredField( "_mainStream" ); +// fMainStream.setAccessible( true ); +// byte[] mainStream = (byte[]) fMainStream.get( _doc ); +// +// PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc +// .getFileInformationBlock().getFcPlcfbtePapx(), doc +// .getFileInformationBlock().getLcbPlcfbtePapx(), 4 ); +// +// List papxs = new ArrayList(); +// +// int length = binTable.length(); +// for ( int x = 0; x < length; x++ ) +// { +// GenericPropertyNode node = binTable.getProperty( x ); +// +// int pageNum = LittleEndian.getInt( node.getBytes() ); +// int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE +// * pageNum; +// +// PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage( +// mainStream, doc.getDataStream(), pageOffset, +// doc.getTextTable() ); +// +// System.out.println( "* PFKP: " + pfkp ); +// +// for ( PAPX papx : pfkp.getPAPXs() ) +// { +// System.out.println( "** " + papx ); +// papxs.add( papx ); 
+// if ( papx != null && true ) +// { +// SprmIterator sprmIt = new SprmIterator( +// papx.getGrpprl(), 2 ); +// while ( sprmIt.hasNext() ) +// { +// SprmOperation sprm = sprmIt.next(); +// System.out.println( "*** " + sprm.toString() ); +// } +// } +// +// } +// } +// +// Collections.sort( papxs ); +// System.out.println( "* Sorted by END" ); +// for ( PAPX papx : papxs ) +// { +// System.out.println( "** " + papx ); +// SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 ); +// while ( sprmIt.hasNext() ) +// { +// SprmOperation sprm = sprmIt.next(); +// System.out.println( "*** " + sprm.toString() ); +// } +// } +// } // for ( PAPX papx : _doc.getParagraphTable().getParagraphs() ) // { diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index b5dcc78a80..464b11a406 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -20,13 +20,11 @@ package org.apache.poi.hwpf.extractor; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Arrays; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hwpf.HWPFDocument; -import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.usermodel.HeaderStories; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; @@ -218,22 +216,7 @@ public final class WordExtractor extends POIOLE2TextExtractor { * mapping is broken. Fast too. 
*/ public String getTextFromPieces() { - StringBuffer textBuf = new StringBuffer(); - - for(TextPiece piece : doc.getTextTable().getTextPieces()) { - String encoding = "Cp1252"; - if (piece.isUnicode()) { - encoding = "UTF-16LE"; - } - try { - String text = new String(piece.getRawBytes(), encoding); - textBuf.append(text); - } catch(UnsupportedEncodingException e) { - throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken"); - } - } - - String text = textBuf.toString(); + String text = doc.getDocumentText(); // Fix line endings (Note - won't get all of them text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java index 976c4a7055..551e90b750 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/CHPBinTable.java @@ -179,34 +179,6 @@ public class CHPBinTable start = System.currentTimeMillis(); } - // rebuild document paragraphs structure - StringBuilder docText = new StringBuilder(); - for ( TextPiece textPiece : tpt.getTextPieces() ) - { - String toAppend = textPiece.getStringBuffer().toString(); - int toAppendLength = toAppend.length(); - - if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() ) - { - logger.log( - POILogger.WARN, - "Text piece has boundaries [", - Integer.valueOf( textPiece.getStart() ), - "; ", - Integer.valueOf( textPiece.getEnd() ), - ") but length ", - Integer.valueOf( textPiece.getEnd() - - textPiece.getStart() ) ); - } - - docText.replace( textPiece.getStart(), textPiece.getStart() - + toAppendLength, toAppend ); - } - logger.log( POILogger.DEBUG, "Document text rebuilded in ", - Long.valueOf( System.currentTimeMillis() - start ), " ms (", - Integer.valueOf( docText.length() ), " chars)" ); - start = System.currentTimeMillis(); - List oldChpxSortedByStartPos = new ArrayList( _textRuns ); 
Collections.sort( oldChpxSortedByStartPos, PropertyNode.StartComparator.instance ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java index 4bb50e023e..e93fb07741 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java @@ -54,9 +54,6 @@ public class PAPBinTable protected ArrayList _paragraphs = new ArrayList(); byte[] _dataStream; - /** So we can know if things are unicode or not */ - private TextPieceTable tpt; - public PAPBinTable() { } @@ -81,7 +78,6 @@ public class PAPBinTable { PlexOfCps binTable = new PlexOfCps( tableStream, offset, size, 4 ); - this.tpt = tpt; int length = binTable.length(); for ( int x = 0; x < length; x++ ) @@ -112,7 +108,8 @@ public class PAPBinTable Integer.valueOf( _paragraphs.size() ), " elements)" ); } - public void rebuild( byte[] dataStream, ComplexFileTable complexFileTable ) + public void rebuild( final StringBuilder docText, byte[] dataStream, + ComplexFileTable complexFileTable ) { long start = System.currentTimeMillis(); @@ -121,7 +118,8 @@ public class PAPBinTable SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls(); // adding PAPX from fast-saved SPRMs - for ( TextPiece textPiece : tpt.getTextPieces() ) + for ( TextPiece textPiece : complexFileTable.getTextPieceTable() + .getTextPieces() ) { PropertyModifier prm = textPiece.getPieceDescriptor().getPrm(); if ( !prm.isComplex() ) @@ -167,34 +165,6 @@ public class PAPBinTable start = System.currentTimeMillis(); } - // rebuild document paragraphs structure - StringBuilder docText = new StringBuilder(); - for ( TextPiece textPiece : tpt.getTextPieces() ) - { - String toAppend = textPiece.getStringBuffer().toString(); - int toAppendLength = toAppend.length(); - - if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() ) - { - logger.log( - POILogger.WARN, - "Text piece has boundaries [", - 
Integer.valueOf( textPiece.getStart() ), - "; ", - Integer.valueOf( textPiece.getEnd() ), - ") but length ", - Integer.valueOf( textPiece.getEnd() - - textPiece.getStart() ) ); - } - - docText.replace( textPiece.getStart(), textPiece.getStart() - + toAppendLength, toAppend ); - } - logger.log( POILogger.DEBUG, "Document text rebuilded in ", - Long.valueOf( System.currentTimeMillis() - start ), " ms (", - Integer.valueOf( docText.length() ), " chars)" ); - start = System.currentTimeMillis(); - List oldPapxSortedByEndPos = new ArrayList( _paragraphs ); Collections.sort( oldPapxSortedByEndPos, PropertyNode.EndComparator.instance ); @@ -274,7 +244,8 @@ public class PAPBinTable { // can we reuse existing? PAPX existing = papxs.get( 0 ); - if ( existing.getStart() == startInclusive && existing.getEnd() == endExclusive ) + if ( existing.getStart() == startInclusive + && existing.getEnd() == endExclusive ) { newPapxs.add( existing ); lastParStart = endExclusive; @@ -311,7 +282,8 @@ public class PAPBinTable this._paragraphs = new ArrayList( newPapxs ); logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ", - Long.valueOf( System.currentTimeMillis() - start ), " ms" ); + Long.valueOf( System.currentTimeMillis() - start ), " ms (", + Integer.valueOf( _paragraphs.size() ), " elements)" ); start = System.currentTimeMillis(); _dataStream = dataStream; @@ -320,7 +292,7 @@ public class PAPBinTable public void insert(int listIndex, int cpStart, SprmBuffer buf) { - PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream); + PAPX forInsert = new PAPX(0, 0, buf, _dataStream); // Ensure character offsets are really characters forInsert.setStart(cpStart); @@ -350,7 +322,7 @@ public class PAPBinTable // Original, until insert at point // New one // Clone of original, on to the old end - PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream); + PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream); // Again ensure contains character based offsets no matter what 
clone.setStart(cpStart); clone.setEnd(currentPap.getEnd()); @@ -427,9 +399,8 @@ public class PAPBinTable return _paragraphs; } - public void writeTo(HWPFFileSystem sys, int fcMin) - throws IOException - { + public void writeTo( HWPFFileSystem sys, int fcMin, CharIndexTranslator translator ) throws IOException + { HWPFOutputStream docStream = sys.getStream("WordDocument"); OutputStream tableStream = sys.getStream("1Table"); @@ -463,7 +434,7 @@ public class PAPBinTable PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(_dataStream); pfkp.fill(overflow); - byte[] bufFkp = pfkp.toByteArray(tpt, fcMin); + byte[] bufFkp = pfkp.toByteArray(translator, fcMin); docStream.write(bufFkp); overflow = pfkp.getOverflow(); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index ce448911ab..a2255e7538 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -19,6 +19,9 @@ package org.apache.poi.hwpf.model; import java.io.UnsupportedEncodingException; + +import org.apache.poi.util.Internal; + /** * Lightweight representation of a text piece. * Works in the character domain, not the byte domain, so you @@ -27,19 +30,39 @@ import java.io.UnsupportedEncodingException; * * @author Ryan Ackley */ - +@Internal public final class TextPiece extends PropertyNode { private boolean _usesUnicode; private PieceDescriptor _pd; - /** - * @param start Beginning offset in main document stream, in characters. - * @param end Ending offset in main document stream, in characters. - * @param text The raw bytes of our text - */ - public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) { + /** + * @param start + * Beginning offset in main document stream, in characters. + * @param end + * Ending offset in main document stream, in characters. 
+ * @param text + * The raw bytes of our text + * @deprecated Use {@link #TextPiece(int,int,byte[],PieceDescriptor)} + * instead + */ + public TextPiece( int start, int end, byte[] text, PieceDescriptor pd, + int cpStart ) + { + this( start, end, text, pd ); + } + + /** + * @param start + * Beginning offset in main document stream, in characters. + * @param end + * Ending offset in main document stream, in characters. + * @param text + * The raw bytes of our text + */ + public TextPiece( int start, int end, byte[] text, PieceDescriptor pd ) + { super(start, end, buildInitSB(text, pd)); _usesUnicode = pd.isUnicode(); _pd = pd; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java index 6c6ca188c4..453d0285a1 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java @@ -24,6 +24,8 @@ import java.util.List; import org.apache.poi.hwpf.model.io.HWPFOutputStream; import org.apache.poi.poifs.common.POIFSConstants; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * The piece table for matching up character positions to bits of text. 
This @@ -34,6 +36,9 @@ import org.apache.poi.poifs.common.POIFSConstants; */ public class TextPieceTable implements CharIndexTranslator { + private static final POILogger logger = POILogFactory + .getLogger( TextPieceTable.class ); + // int _multiple; int _cpMin; protected ArrayList _textPieces = new ArrayList(); @@ -101,7 +106,7 @@ public class TextPieceTable implements CharIndexTranslator // And now build the piece _textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf, - pieces[x], node.getStart() ) ); + pieces[x] ) ); } // In the interest of our sanity, now sort the text pieces @@ -251,6 +256,41 @@ public class TextPieceTable implements CharIndexTranslator return _cpMin; } + public StringBuilder getText() + { + final long start = System.currentTimeMillis(); + + // rebuild document paragraphs structure + StringBuilder docText = new StringBuilder(); + for ( TextPiece textPiece : _textPieces ) + { + String toAppend = textPiece.getStringBuffer().toString(); + int toAppendLength = toAppend.length(); + + if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() ) + { + logger.log( + POILogger.WARN, + "Text piece has boundaries [", + Integer.valueOf( textPiece.getStart() ), + "; ", + Integer.valueOf( textPiece.getEnd() ), + ") but length ", + Integer.valueOf( textPiece.getEnd() + - textPiece.getStart() ) ); + } + + docText.replace( textPiece.getStart(), textPiece.getStart() + + toAppendLength, toAppend ); + } + + logger.log( POILogger.DEBUG, "Document text were rebuilded in ", + Long.valueOf( System.currentTimeMillis() - start ), " ms (", + Integer.valueOf( docText.length() ), " chars)" ); + + return docText; + } + public List getTextPieces() { return _textPieces; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java index 51287ef7b6..7bb89b869e 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java +++ 
b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java @@ -31,7 +31,7 @@ import org.apache.poi.hwpf.model.PropertyNode; import org.apache.poi.hwpf.model.SEPX; import org.apache.poi.hwpf.model.StyleSheet; import org.apache.poi.hwpf.model.SubdocumentType; -import org.apache.poi.hwpf.model.TextPiece; +import org.apache.poi.hwpf.model.TextPieceTable; import org.apache.poi.hwpf.sprm.CharacterSprmCompressor; import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor; import org.apache.poi.hwpf.sprm.SprmBuffer; @@ -108,18 +108,8 @@ public class Range { // TODO -instantiable superclass /** The end index in the characterRuns list for this Range. */ protected int _charEnd; - /** Have we loaded the Text indexes yet */ - protected boolean _textRangeFound; - - /** All text pieces that belong to the document this Range belongs to. */ - protected List _text; - - /** The start index in the text list for this Range. */ - protected int _textStart; - - /** The end index in the text list for this Range. */ - protected int _textEnd; - + protected StringBuilder _text; + // protected Range() // { // @@ -144,7 +134,7 @@ public class Range { // TODO -instantiable superclass _sections = _doc.getSectionTable().getSections(); _paragraphs = _doc.getParagraphTable().getParagraphs(); _characters = _doc.getCharacterTable().getTextRuns(); - _text = _doc.getTextTable().getTextPieces(); + _text = _doc.getText(); _parent = new WeakReference(null); sanityCheckStartEnd(); @@ -171,6 +161,7 @@ public class Range { // TODO -instantiable superclass _parent = new WeakReference(parent); sanityCheckStartEnd(); + assert sanityCheck(); } /** @@ -212,23 +203,17 @@ public class Range { // TODO -instantiable superclass } } - /** - * Does any TextPiece in this Range use unicode? 
- * - * @return true if it does and false if it doesn't - */ - public boolean usesUnicode() { - - initText(); - - for (int i = _textStart; i < _textEnd; i++) { - TextPiece piece = _text.get(i); - if (piece.isUnicode()) - return true; - } - - return false; - } + /** + * @return always return true + * @deprecated Range is not linked to any text piece anymore, so to check if + * unicode is used please access {@link TextPieceTable} during + * document load time + */ + @Deprecated + public boolean usesUnicode() + { + return true; + } /** * Gets the text that this Range contains. @@ -236,29 +221,7 @@ public class Range { // TODO -instantiable superclass * @return The text for this range. */ public String text() { - initText(); - - StringBuffer sb = new StringBuffer(); - - for (int x = _textStart; x < _textEnd; x++) { - TextPiece piece = _text.get(x); - - // Figure out where in this piece the text - // we're after lives - int rStart = 0; - int rEnd = piece.characterLength(); - if (_start > piece.getStart()) { - rStart = _start - piece.getStart(); - } - if (_end < piece.getEnd()) { - rEnd -= (piece.getEnd() - _end); - } - - // Luckily TextPieces work in characters, so we don't - // need to worry about unicode here - sb.append(piece.substring(rStart, rEnd)); - } - return sb.toString(); + return _text.substring( _start, _end ); } /** @@ -346,67 +309,52 @@ public class Range { // TODO -instantiable superclass return _charEnd - _charStart; } - /** - * Inserts text into the front of this range. - * - * @param text - * The text to insert - * @return The character run that text was inserted into. 
- */ - public CharacterRun insertBefore(String text) - // throws UnsupportedEncodingException - { - initAll(); - - TextPiece tp = _text.get(_textStart); - StringBuffer sb = tp.getStringBuffer(); - - // Since this is the first item in our list, it is safe to assume that - // _start >= tp.getStart() - int insertIndex = _start - tp.getStart(); - sb.insert(insertIndex, text); - - int adjustedLength = _doc.getTextTable().adjustForInsert(_textStart, text.length()); - _doc.getCharacterTable().adjustForInsert(_charStart, adjustedLength); - _doc.getParagraphTable().adjustForInsert(_parStart, adjustedLength); - _doc.getSectionTable().adjustForInsert(_sectionStart, adjustedLength); - adjustForInsert(adjustedLength); + /** + * Inserts text into the front of this range. + * + * @param text + * The text to insert + * @return The character run that text was inserted into. + */ + public CharacterRun insertBefore( String text ) + { + initAll(); - // update the FIB.CCPText + friends fields - adjustFIB(text.length()); + _text.insert( _start, text ); + _doc.getCharacterTable().adjustForInsert( _charStart, text.length() ); + _doc.getParagraphTable().adjustForInsert( _parStart, text.length() ); + _doc.getSectionTable().adjustForInsert( _sectionStart, text.length() ); + adjustForInsert( text.length() ); - return getCharacterRun(0); - } + // update the FIB.CCPText + friends fields + adjustFIB( text.length() ); - /** - * Inserts text onto the end of this range - * - * @param text - * The text to insert - * @return The character run the text was inserted into. - */ - public CharacterRun insertAfter(String text) { - initAll(); + assert sanityCheck(); - int listIndex = _textEnd - 1; - TextPiece tp = _text.get(listIndex); - StringBuffer sb = tp.getStringBuffer(); + return getCharacterRun( 0 ); + } - int insertIndex = _end - tp.getStart(); + /** + * Inserts text onto the end of this range + * + * @param text + * The text to insert + * @return The character run the text was inserted into. 
+ */ + public CharacterRun insertAfter( String text ) + { + initAll(); - if (tp.getStringBuffer().charAt(_end - 1) == '\r' && text.charAt(0) != '\u0007') { - insertIndex--; - } - sb.insert(insertIndex, text); - int adjustedLength = _doc.getTextTable().adjustForInsert(listIndex, text.length()); - _doc.getCharacterTable().adjustForInsert(_charEnd - 1, adjustedLength); - _doc.getParagraphTable().adjustForInsert(_parEnd - 1, adjustedLength); - _doc.getSectionTable().adjustForInsert(_sectionEnd - 1, adjustedLength); - adjustForInsert(text.length()); + _text.insert( _end, text ); - return getCharacterRun(numCharacterRuns() - 1); + _doc.getCharacterTable().adjustForInsert( _charEnd - 1, text.length() ); + _doc.getParagraphTable().adjustForInsert( _parEnd - 1, text.length() ); + _doc.getSectionTable().adjustForInsert( _sectionEnd - 1, text.length() ); + adjustForInsert( text.length() ); - } + assert sanityCheck(); + return getCharacterRun( numCharacterRuns() - 1 ); + } /** * Inserts text into the front of this range and it gives that text the @@ -580,7 +528,6 @@ public class Range { // TODO -instantiable superclass int numSections = _sections.size(); int numRuns = _characters.size(); int numParagraphs = _paragraphs.size(); - int numTextPieces = _text.size(); for (int x = _charStart; x < numRuns; x++) { CHPX chpx = _characters.get(x); @@ -605,10 +552,12 @@ public class Range { // TODO -instantiable superclass // + " -> " + sepx.getEnd()); } - for (int x = _textStart; x < numTextPieces; x++) { - TextPiece piece = _text.get(x); - piece.adjustForDelete(_start, _end - _start); - } + _text.delete( _start, _end ); + Range parent = _parent.get(); + if ( parent != null ) + { + parent.adjustForInsert( -( _end - _start ) ); + } // update the FIB.CCPText + friends field adjustFIB(-(_end - _start)); @@ -623,7 +572,7 @@ public class Range { // TODO -instantiable superclass * @param rows * The number of rows. * @return The empty Table that is now part of the document. 
- * @deprecated Use code shall not work with {@link ParagraphProperties} + * @deprecated User code shall not work with {@link TableProperties} */ @Deprecated public Table insertBefore(TableProperties props, int rows) { @@ -631,19 +580,28 @@ public class Range { // TODO -instantiable superclass parProps.setFInTable(true); parProps.setItap( 1 ); + final int oldEnd = this._end; + int columns = props.getItcMac(); - for (int x = 0; x < rows; x++) { - Paragraph cell = this.insertBefore(parProps, StyleSheet.NIL_STYLE); - cell.insertAfter(String.valueOf('\u0007')); - for (int y = 1; y < columns; y++) { - cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE); - cell.insertAfter(String.valueOf('\u0007')); - } - cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE, String.valueOf('\u0007')); - cell.setTableRowEnd(props); - } - return new Table(_start, _start + (rows * (columns + 1)) * 2, this, 1); - } + for ( int x = 0; x < rows; x++ ) + { + Paragraph cell = this.insertBefore( parProps, StyleSheet.NIL_STYLE ); + cell.insertAfter( String.valueOf( '\u0007' ) ); + for ( int y = 1; y < columns; y++ ) + { + cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE ); + cell.insertAfter( String.valueOf( '\u0007' ) ); + } + cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE, + String.valueOf( '\u0007' ) ); + cell.setTableRowEnd( props ); + } + + final int newEnd = this._end; + final int diff = newEnd - oldEnd; + + return new Table( _start, _start + diff, this, 1 ); + } /** * Inserts a list into the beginning of this range. 
@@ -715,23 +673,14 @@ public class Range { // TODO -instantiable superclass */ public void replaceText(String pPlaceHolder, String pValue, int pOffset) { int absPlaceHolderIndex = getStartOffset() + pOffset; - Range subRange = new Range(absPlaceHolderIndex, (absPlaceHolderIndex + pPlaceHolder - .length()), getDocument()); - - // this Range isn't a proper parent of the subRange() so we'll have to - // keep - // track of an updated endOffset on our own - int previousEndOffset = subRange.getEndOffset(); + Range subRange = new Range(absPlaceHolderIndex, (absPlaceHolderIndex + pPlaceHolder + .length()), this); subRange.insertBefore(pValue); - if (subRange.getEndOffset() != previousEndOffset) { - adjustForInsert(subRange.getEndOffset() - previousEndOffset); - } - // re-create the sub-range so we can delete it subRange = new Range((absPlaceHolderIndex + pValue.length()), (absPlaceHolderIndex - + pPlaceHolder.length() + pValue.length()), getDocument()); + + pPlaceHolder.length() + pValue.length()), this); // deletes are automagically propagated subRange.delete(); @@ -921,7 +870,6 @@ public class Range { // TODO -instantiable superclass * loads all of the list indexes. */ protected void initAll() { - initText(); initCharacterRuns(); initParagraphs(); initSections(); @@ -951,18 +899,6 @@ public class Range { // TODO -instantiable superclass } } - /** - * inits the text piece list indexes. - */ - private void initText() { - if (!_textRangeFound) { - int[] point = findRange(_text, _textStart, _start, _end); - _textStart = point[0]; - _textEnd = point[1]; - _textRangeFound = true; - } - } - /** * inits the section list indexes. */ @@ -1038,7 +974,6 @@ public class Range { // TODO -instantiable superclass * resets the list indexes. */ protected void reset() { - _textRangeFound = false; _charRangeFound = false; _parRangeFound = false; _sectionRangeFound = false; @@ -1153,8 +1088,19 @@ public class Range { // TODO -instantiable superclass * Method for debug purposes. 
Checks that all resolved elements are inside * of current range. */ - public void sanityCheck() + public boolean sanityCheck() { + if ( _start < 0 ) + throw new AssertionError(); + if ( _start >= _text.length() ) + throw new AssertionError(); + if ( _end < 0 ) + throw new AssertionError(); + if ( _end > _text.length() ) + throw new AssertionError(); + if ( _start > _end ) + throw new AssertionError(); + if ( _charRangeFound ) { for ( int c = _charStart; c < _charEnd; c++ ) @@ -1181,5 +1127,7 @@ public class Range { // TODO -instantiable superclass throw new AssertionError(); } } + + return true; } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java index bda0866bb0..2ecd482892 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestDifferentRoutes.java @@ -17,16 +17,13 @@ package org.apache.poi.hwpf.extractor; -import java.util.Iterator; +import junit.framework.TestCase; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFTestDataSamples; -import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; -import junit.framework.TestCase; - /** * Test the different routes to extracting text * @@ -78,24 +75,10 @@ public final class TestDifferentRoutes extends TestCase { * Test textPieces based extraction */ public void testExtractFromTextPieces() throws Exception { - StringBuffer textBuf = new StringBuffer(); - - Iterator textPieces = doc.getTextTable().getTextPieces().iterator(); - while (textPieces.hasNext()) { - TextPiece piece = (TextPiece) textPieces.next(); - - String encoding = "Cp1252"; - if (piece.isUnicode()) { - encoding = "UTF-16LE"; - } - String text = new String(piece.getRawBytes(), encoding); - textBuf.append(text); - } - StringBuffer exp = 
new StringBuffer(); for (int i = 0; i < p_text.length; i++) { exp.append(p_text[i]); } - assertEquals(exp.toString(), textBuf.toString()); + assertEquals(exp.toString(), doc.getDocumentText()); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java index c743d52d86..e21cee3573 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestPAPBinTable.java @@ -53,7 +53,7 @@ public final class TestPAPBinTable extends TestCase HWPFFileSystem fileSys = new HWPFFileSystem(); - _pAPBinTable.writeTo( fileSys, 0 ); + _pAPBinTable.writeTo( fileSys, 0, fakeTPT ); ByteArrayOutputStream tableOut = fileSys.getStream( "1Table" ); ByteArrayOutputStream mainOut = fileSys.getStream( "WordDocument" ); diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java index 7c73022ef5..bef1fc32e8 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/model/TestTextPieceTable.java @@ -169,6 +169,7 @@ public final class TestTextPieceTable extends TestCase { throws Exception { super.setUp(); + System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.TRUE.toString() ); _hWPFDocFixture = new HWPFDocFixture(this, HWPFDocFixture.DEFAULT_TEST_FILE); _hWPFDocFixture.setUp(); @@ -178,8 +179,9 @@ public final class TestTextPieceTable extends TestCase { throws Exception { _hWPFDocFixture.tearDown(); - _hWPFDocFixture = null; + + System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.FALSE.toString() ); super.tearDown(); } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java index 
fae45758ac..6fc32c456a 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java @@ -103,10 +103,6 @@ public final class TestProblems extends HWPFTestCase { assertEquals("One paragraph is ok\7", r.getParagraph(3).text()); assertEquals("\7", r.getParagraph(4).text()); assertEquals("\r", r.getParagraph(5).text()); - for(int i=0; i<=5; i++) { - assertFalse(r.getParagraph(i).usesUnicode()); - } - // Get the table Table t = r.getTable(p); @@ -304,9 +300,6 @@ public final class TestProblems extends HWPFTestCase { assertEquals("Row 3/Cell 3\u0007", r.getParagraph(10).text()); assertEquals("\u0007", r.getParagraph(11).text()); assertEquals("\r", r.getParagraph(12).text()); - for(int i=0; i<=12; i++) { - assertFalse(r.getParagraph(i).usesUnicode()); - } Paragraph p; @@ -791,7 +784,9 @@ public final class TestProblems extends HWPFTestCase { Paragraph actParagraph = actual.getParagraph( p ); assertEquals( expParagraph.text(), actParagraph.text() ); - assertEquals( expParagraph.isInTable(), actParagraph.isInTable() ); + assertEquals( "Diffent isInTable flags for paragraphs #" + p + + " -- " + expParagraph + " -- " + actParagraph + ".", + expParagraph.isInTable(), actParagraph.isInTable() ); assertEquals( expParagraph.isTableRowEnd(), actParagraph.isTableRowEnd() ); diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRangeDelete.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRangeDelete.java index f123a2018c..2e55486695 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRangeDelete.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRangeDelete.java @@ -150,6 +150,8 @@ public final class TestRangeDelete extends TestCase { assertEquals(searchText, subRange.text()); subRange.delete(); + daDoc.getOverallRange().sanityCheck(); + daDoc.getRange().sanityCheck(); // we need to let the model re-calculate 
the Range before we evaluate it range = daDoc.getRange(); @@ -166,6 +168,7 @@ public final class TestRangeDelete extends TestCase { // this can lead to a StringBufferOutOfBoundsException, so we will add it // even though we don't have an assertion for it Range daRange = daDoc.getRange(); + daRange.sanityCheck(); daRange.text(); }