import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
-import java.util.Iterator;
-import java.util.List;
import org.apache.poi.hwpf.model.BookmarksTables;
import org.apache.poi.hwpf.model.CHPBinTable;
import org.apache.poi.hwpf.model.NotesTables;
import org.apache.poi.hwpf.model.PAPBinTable;
import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.PieceDescriptor;
import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
import org.apache.poi.hwpf.model.SavedByTable;
import org.apache.poi.hwpf.model.SectionTable;
* structure*/
protected ComplexFileTable _cft;
- protected TextPieceTable _tpt;
+ protected final StringBuilder _text;
/** Holds the save history for this document. */
protected SavedByTable _sbt;
protected HWPFDocument()
{
super();
+ this._text = new StringBuilder("\r");
}
/**
// Start to load up our standard structures.
_dop = new DocumentProperties(_tableStream, _fib.getFcDop());
_cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
- _tpt = _cft.getTextPieceTable();
+ TextPieceTable _tpt = _cft.getTextPieceTable();
// Now load the rest of the properties, which need to be adjusted
// for where text really begin
_cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), _tpt);
_pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt);
+ _text = _tpt.getText();
_cbt.rebuild( _cft );
- _pbt.rebuild( _dataStream, _cft );
+ _pbt.rebuild( _text, _dataStream, _cft );
+
+ boolean preserve = false;
+ try
+ {
+ preserve = Boolean.parseBoolean( System
+ .getProperty( "org.apache.poi.hwpf.preserveTextTable" ) );
+ }
+ catch ( Exception exc )
+ {
+ // ignore: "preserve" keeps its default value of false
+ }
+ if ( !preserve )
+ {
+ _cft = new ComplexFileTable();
+ _tpt = _cft.getTextPieceTable();
+ _tpt.add( new TextPiece( 0, _text.length(), _text.toString()
+ .getBytes( "UTF-16LE" ), new PieceDescriptor( new byte[8],
+ 0 ) ) );
+ }
// Read FSPA and Escher information
_fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces());
return _cft.getTextPieceTable();
}
+ @Override
+ public StringBuilder getText()
+ {
+ return _text;
+ }
+
@Deprecated
public CPSplitCalculator getCPSplitCalculator()
{
}
public Range getOverallRange() {
- // hack to get the ending cp of the document, Have to revisit this.
- TextPiece p = _tpt.getTextPieces().get(_tpt.getTextPieces().size() - 1);
-
- return new Range(0, p.getEnd(), this);
+ return new Range(0, _text.length(), this);
}
/**
*/
public int characterLength()
{
- List<TextPiece> textPieces = _tpt.getTextPieces();
- Iterator<TextPiece> textIt = textPieces.iterator();
-
- int length = 0;
- while(textIt.hasNext())
- {
- TextPiece tp = textIt.next();
- length += tp.characterLength();
- }
- return length;
+ return _text.length();
}
/**
// write out the PAPBinTable.
_fib.setFcPlcfbtePapx(tableOffset);
- _pbt.writeTo(docSys, fcMin);
+ _pbt.writeTo(docSys, fcMin, _cft.getTextPieceTable());
_fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.Internal;
/**
*/
public abstract Range getOverallRange();
- public abstract TextPieceTable getTextTable();
-
+ /**
+ * Returns document text, i.e. text information from all text pieces,
+ * including OLE descriptions and field codes
+ */
+ public String getDocumentText() {
+ return getText().toString();
+ }
+
+ /**
+ * Internal method to access document text
+ */
+ @Internal
+ public abstract StringBuilder getText();
+
public CHPBinTable getCharacterTable()
{
return _cbt;
{
return _fib;
}
+
+ public abstract TextPieceTable getTextTable();
}
public class HWPFOldDocument extends HWPFDocumentCore {
private TextPieceTable tpt;
+ private StringBuilder _text;
+
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
}
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
TextPiece tp = new TextPiece(
- 0, textData.length, textData, pd, 0
+ 0, textData.length, textData, pd
);
tpt.add(tp);
text.append(tp.getStringBuffer());
}
+ _text = tpt.getText();
+
// Now we can fetch the character and paragraph properties
_cbt = new OldCHPBinTable(
_mainStream, chpTableOffset, chpTableSize,
return tpt;
}
+ @Override
+ public StringBuilder getText()
+ {
+ return _text;
+ }
+
@Override
public void write(OutputStream out) throws IOException {
throw new IllegalStateException("Writing is not available for the older file formats");
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.FileInformationBlock;
-import org.apache.poi.hwpf.model.GenericPropertyNode;
-import org.apache.poi.hwpf.model.PAPFormattedDiskPage;
import org.apache.poi.hwpf.model.PAPX;
-import org.apache.poi.hwpf.model.PlexOfCps;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.sprm.SprmIterator;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
-import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.LittleEndian;
/**
* Used by developers to list out key information on a HWPF file. End users will
private LinkedHashMap<Integer, String> paragraphs;
- private String text;
-
public HWPFLister( HWPFDocumentCore doc )
{
_doc = doc;
- buildText();
buildParagraphs();
}
paragraphs = new LinkedHashMap<Integer, String>();
StringBuilder part = new StringBuilder();
+ String text = _doc.getDocumentText();
for ( int charIndex = 0; charIndex < text.length(); charIndex++ )
{
char c = text.charAt( charIndex );
}
}
- private void buildText()
- {
- StringBuilder builder = new StringBuilder();
- for ( TextPiece textPiece : _doc.getTextTable().getTextPieces() )
- {
- String toAppend = textPiece.getStringBuffer().toString();
-
- if ( toAppend.length() != ( textPiece.getEnd() - textPiece
- .getStart() ) )
- {
- throw new AssertionError();
- }
-
- builder.replace( textPiece.getStart(), textPiece.getEnd(), toAppend );
- }
- this.text = builder.toString();
- }
-
private void dumpBookmarks()
{
if ( !( _doc instanceof HWPFDocument ) )
public void dumpPapx( boolean withProperties ) throws Exception
{
- if ( _doc instanceof HWPFDocument )
- {
- System.out.println( "binary PAP pages " );
-
- HWPFDocument doc = (HWPFDocument) _doc;
-
- java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
- .getDeclaredField( "_mainStream" );
- fMainStream.setAccessible( true );
- byte[] mainStream = (byte[]) fMainStream.get( _doc );
-
- PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
- .getFileInformationBlock().getFcPlcfbtePapx(), doc
- .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
-
- List<PAPX> papxs = new ArrayList<PAPX>();
-
- int length = binTable.length();
- for ( int x = 0; x < length; x++ )
- {
- GenericPropertyNode node = binTable.getProperty( x );
-
- int pageNum = LittleEndian.getInt( node.getBytes() );
- int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
- * pageNum;
-
- PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
- mainStream, doc.getDataStream(), pageOffset,
- doc.getTextTable() );
-
- System.out.println( "* PFKP: " + pfkp );
-
- for ( PAPX papx : pfkp.getPAPXs() )
- {
- System.out.println( "** " + papx );
- papxs.add( papx );
- if ( papx != null && true )
- {
- SprmIterator sprmIt = new SprmIterator(
- papx.getGrpprl(), 2 );
- while ( sprmIt.hasNext() )
- {
- SprmOperation sprm = sprmIt.next();
- System.out.println( "*** " + sprm.toString() );
- }
- }
-
- }
- }
-
- Collections.sort( papxs );
- System.out.println( "* Sorted by END" );
- for ( PAPX papx : papxs )
- {
- System.out.println( "** " + papx );
- SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
- while ( sprmIt.hasNext() )
- {
- SprmOperation sprm = sprmIt.next();
- System.out.println( "*** " + sprm.toString() );
- }
- }
- }
+// if ( _doc instanceof HWPFDocument )
+// {
+// System.out.println( "binary PAP pages " );
+//
+// HWPFDocument doc = (HWPFDocument) _doc;
+//
+// java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
+// .getDeclaredField( "_mainStream" );
+// fMainStream.setAccessible( true );
+// byte[] mainStream = (byte[]) fMainStream.get( _doc );
+//
+// PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
+// .getFileInformationBlock().getFcPlcfbtePapx(), doc
+// .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
+//
+// List<PAPX> papxs = new ArrayList<PAPX>();
+//
+// int length = binTable.length();
+// for ( int x = 0; x < length; x++ )
+// {
+// GenericPropertyNode node = binTable.getProperty( x );
+//
+// int pageNum = LittleEndian.getInt( node.getBytes() );
+// int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
+// * pageNum;
+//
+// PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
+// mainStream, doc.getDataStream(), pageOffset,
+// doc.getTextTable() );
+//
+// System.out.println( "* PFKP: " + pfkp );
+//
+// for ( PAPX papx : pfkp.getPAPXs() )
+// {
+// System.out.println( "** " + papx );
+// papxs.add( papx );
+// if ( papx != null && true )
+// {
+// SprmIterator sprmIt = new SprmIterator(
+// papx.getGrpprl(), 2 );
+// while ( sprmIt.hasNext() )
+// {
+// SprmOperation sprm = sprmIt.next();
+// System.out.println( "*** " + sprm.toString() );
+// }
+// }
+//
+// }
+// }
+//
+// Collections.sort( papxs );
+// System.out.println( "* Sorted by END" );
+// for ( PAPX papx : papxs )
+// {
+// System.out.println( "** " + papx );
+// SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
+// while ( sprmIt.hasNext() )
+// {
+// SprmOperation sprm = sprmIt.next();
+// System.out.println( "*** " + sprm.toString() );
+// }
+// }
+// }
// for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
// {
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
-import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
* mapping is broken. Fast too.
*/
public String getTextFromPieces() {
- StringBuffer textBuf = new StringBuffer();
-
- for(TextPiece piece : doc.getTextTable().getTextPieces()) {
- String encoding = "Cp1252";
- if (piece.isUnicode()) {
- encoding = "UTF-16LE";
- }
- try {
- String text = new String(piece.getRawBytes(), encoding);
- textBuf.append(text);
- } catch(UnsupportedEncodingException e) {
- throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
- }
- }
-
- String text = textBuf.toString();
+ String text = doc.getDocumentText();
// Fix line endings (Note - won't get all of them
text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
start = System.currentTimeMillis();
}
- // rebuild document paragraphs structure
- StringBuilder docText = new StringBuilder();
- for ( TextPiece textPiece : tpt.getTextPieces() )
- {
- String toAppend = textPiece.getStringBuffer().toString();
- int toAppendLength = toAppend.length();
-
- if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
- {
- logger.log(
- POILogger.WARN,
- "Text piece has boundaries [",
- Integer.valueOf( textPiece.getStart() ),
- "; ",
- Integer.valueOf( textPiece.getEnd() ),
- ") but length ",
- Integer.valueOf( textPiece.getEnd()
- - textPiece.getStart() ) );
- }
-
- docText.replace( textPiece.getStart(), textPiece.getStart()
- + toAppendLength, toAppend );
- }
- logger.log( POILogger.DEBUG, "Document text rebuilded in ",
- Long.valueOf( System.currentTimeMillis() - start ), " ms (",
- Integer.valueOf( docText.length() ), " chars)" );
- start = System.currentTimeMillis();
-
List<CHPX> oldChpxSortedByStartPos = new ArrayList<CHPX>( _textRuns );
Collections.sort( oldChpxSortedByStartPos,
PropertyNode.StartComparator.instance );
protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
byte[] _dataStream;
- /** So we can know if things are unicode or not */
- private TextPieceTable tpt;
-
public PAPBinTable()
{
}
{
PlexOfCps binTable = new PlexOfCps( tableStream, offset, size, 4 );
- this.tpt = tpt;
int length = binTable.length();
for ( int x = 0; x < length; x++ )
Integer.valueOf( _paragraphs.size() ), " elements)" );
}
- public void rebuild( byte[] dataStream, ComplexFileTable complexFileTable )
+ public void rebuild( final StringBuilder docText, byte[] dataStream,
+ ComplexFileTable complexFileTable )
{
long start = System.currentTimeMillis();
SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();
// adding PAPX from fast-saved SPRMs
- for ( TextPiece textPiece : tpt.getTextPieces() )
+ for ( TextPiece textPiece : complexFileTable.getTextPieceTable()
+ .getTextPieces() )
{
PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
if ( !prm.isComplex() )
start = System.currentTimeMillis();
}
- // rebuild document paragraphs structure
- StringBuilder docText = new StringBuilder();
- for ( TextPiece textPiece : tpt.getTextPieces() )
- {
- String toAppend = textPiece.getStringBuffer().toString();
- int toAppendLength = toAppend.length();
-
- if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
- {
- logger.log(
- POILogger.WARN,
- "Text piece has boundaries [",
- Integer.valueOf( textPiece.getStart() ),
- "; ",
- Integer.valueOf( textPiece.getEnd() ),
- ") but length ",
- Integer.valueOf( textPiece.getEnd()
- - textPiece.getStart() ) );
- }
-
- docText.replace( textPiece.getStart(), textPiece.getStart()
- + toAppendLength, toAppend );
- }
- logger.log( POILogger.DEBUG, "Document text rebuilded in ",
- Long.valueOf( System.currentTimeMillis() - start ), " ms (",
- Integer.valueOf( docText.length() ), " chars)" );
- start = System.currentTimeMillis();
-
List<PAPX> oldPapxSortedByEndPos = new ArrayList<PAPX>( _paragraphs );
Collections.sort( oldPapxSortedByEndPos,
PropertyNode.EndComparator.instance );
{
// can we reuse existing?
PAPX existing = papxs.get( 0 );
- if ( existing.getStart() == startInclusive && existing.getEnd() == endExclusive )
+ if ( existing.getStart() == startInclusive
+ && existing.getEnd() == endExclusive )
{
newPapxs.add( existing );
lastParStart = endExclusive;
this._paragraphs = new ArrayList<PAPX>( newPapxs );
logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ",
- Long.valueOf( System.currentTimeMillis() - start ), " ms" );
+ Long.valueOf( System.currentTimeMillis() - start ), " ms (",
+ Integer.valueOf( _paragraphs.size() ), " elements)" );
start = System.currentTimeMillis();
_dataStream = dataStream;
public void insert(int listIndex, int cpStart, SprmBuffer buf)
{
- PAPX forInsert = new PAPX(0, 0, tpt, buf, _dataStream);
+ PAPX forInsert = new PAPX(0, 0, buf, _dataStream);
// Ensure character offsets are really characters
forInsert.setStart(cpStart);
// Original, until insert at point
// New one
// Clone of original, on to the old end
- PAPX clone = new PAPX(0, 0, tpt, clonedBuf, _dataStream);
+ PAPX clone = new PAPX(0, 0, clonedBuf, _dataStream);
// Again ensure contains character based offsets no matter what
clone.setStart(cpStart);
clone.setEnd(currentPap.getEnd());
return _paragraphs;
}
- public void writeTo(HWPFFileSystem sys, int fcMin)
- throws IOException
- {
+ public void writeTo( HWPFFileSystem sys, int fcMin, CharIndexTranslator translator ) throws IOException
+ {
HWPFOutputStream docStream = sys.getStream("WordDocument");
OutputStream tableStream = sys.getStream("1Table");
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(_dataStream);
pfkp.fill(overflow);
- byte[] bufFkp = pfkp.toByteArray(tpt, fcMin);
+ byte[] bufFkp = pfkp.toByteArray(translator, fcMin);
docStream.write(bufFkp);
overflow = pfkp.getOverflow();
import java.io.UnsupportedEncodingException;
+
+import org.apache.poi.util.Internal;
+
/**
* Lightweight representation of a text piece.
* Works in the character domain, not the byte domain, so you
*
* @author Ryan Ackley
*/
-
+@Internal
public final class TextPiece extends PropertyNode<TextPiece>
{
private boolean _usesUnicode;
private PieceDescriptor _pd;
- /**
- * @param start Beginning offset in main document stream, in characters.
- * @param end Ending offset in main document stream, in characters.
- * @param text The raw bytes of our text
- */
- public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, int cpStart) {
+    /**
+     * Creates a text piece; retained only for backward compatibility.
+     * Delegates directly to the four-argument constructor.
+     *
+     * @param start
+     *            Beginning offset in main document stream, in characters.
+     * @param end
+     *            Ending offset in main document stream, in characters.
+     * @param text
+     *            The raw bytes of our text
+     * @param pd
+     *            The piece descriptor, passed through unchanged to the
+     *            four-argument constructor
+     * @param cpStart
+     *            Ignored; this value is not used by the constructor
+     * @deprecated Use {@link #TextPiece(int,int,byte[],PieceDescriptor)}
+     *             instead
+     */
+    @Deprecated
+    public TextPiece( int start, int end, byte[] text, PieceDescriptor pd,
+            int cpStart )
+    {
+        this( start, end, text, pd );
+    }
+
+ /**
+ * @param start
+ * Beginning offset in main document stream, in characters.
+ * @param end
+ * Ending offset in main document stream, in characters.
+ * @param text
+ * The raw bytes of our text
+ */
+ public TextPiece( int start, int end, byte[] text, PieceDescriptor pd )
+ {
super(start, end, buildInitSB(text, pd));
_usesUnicode = pd.isUnicode();
_pd = pd;
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
/**
* The piece table for matching up character positions to bits of text. This
*/
public class TextPieceTable implements CharIndexTranslator
{
+ private static final POILogger logger = POILogFactory
+ .getLogger( TextPieceTable.class );
+
// int _multiple;
int _cpMin;
protected ArrayList<TextPiece> _textPieces = new ArrayList<TextPiece>();
// And now build the piece
_textPieces.add( new TextPiece( nodeStartChars, nodeEndChars, buf,
- pieces[x], node.getStart() ) );
+ pieces[x] ) );
}
// In the interest of our sanity, now sort the text pieces
return _cpMin;
}
+    /**
+     * Assembles the document text from the text pieces held by this table.
+     * <p>
+     * The pieces are visited in the order they are stored in this table, and
+     * the text of each piece is written into the result at the piece's start
+     * position. A piece whose actual text length differs from the length
+     * implied by its boundaries is reported with a warning and is still
+     * written using its actual text length.
+     *
+     * @return a newly created builder containing the assembled text
+     */
+    public StringBuilder getText()
+    {
+        final long start = System.currentTimeMillis();
+
+        // assemble the document text piece by piece
+        StringBuilder docText = new StringBuilder();
+        for ( TextPiece textPiece : _textPieces )
+        {
+            String toAppend = textPiece.getStringBuffer().toString();
+            int toAppendLength = toAppend.length();
+
+            if ( toAppendLength != textPiece.getEnd() - textPiece.getStart() )
+            {
+                // report the actual text length, not the boundary span
+                // again, so that the mismatch is visible in the log
+                logger.log(
+                        POILogger.WARN,
+                        "Text piece has boundaries [",
+                        Integer.valueOf( textPiece.getStart() ),
+                        "; ",
+                        Integer.valueOf( textPiece.getEnd() ),
+                        ") but length ",
+                        Integer.valueOf( toAppendLength ) );
+            }
+
+            // the end index is derived from the actual text length, so the
+            // declared end of the piece is not used for placement
+            docText.replace( textPiece.getStart(), textPiece.getStart()
+                    + toAppendLength, toAppend );
+        }
+
+        logger.log( POILogger.DEBUG, "Document text were rebuilded in ",
+                Long.valueOf( System.currentTimeMillis() - start ), " ms (",
+                Integer.valueOf( docText.length() ), " chars)" );
+
+        return docText;
+    }
+
public List<TextPiece> getTextPieces()
{
return _textPieces;
import org.apache.poi.hwpf.model.SEPX;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.SubdocumentType;
-import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
import org.apache.poi.hwpf.sprm.SprmBuffer;
/** The end index in the characterRuns list for this Range. */
protected int _charEnd;
- /** Have we loaded the Text indexes yet */
- protected boolean _textRangeFound;
-
- /** All text pieces that belong to the document this Range belongs to. */
- protected List<TextPiece> _text;
-
- /** The start index in the text list for this Range. */
- protected int _textStart;
-
- /** The end index in the text list for this Range. */
- protected int _textEnd;
-
+ protected StringBuilder _text;
+
// protected Range()
// {
//
_sections = _doc.getSectionTable().getSections();
_paragraphs = _doc.getParagraphTable().getParagraphs();
_characters = _doc.getCharacterTable().getTextRuns();
- _text = _doc.getTextTable().getTextPieces();
+ _text = _doc.getText();
_parent = new WeakReference<Range>(null);
sanityCheckStartEnd();
_parent = new WeakReference<Range>(parent);
sanityCheckStartEnd();
+ assert sanityCheck();
}
/**
}
}
- /**
- * Does any <code>TextPiece</code> in this Range use unicode?
- *
- * @return true if it does and false if it doesn't
- */
- public boolean usesUnicode() {
-
- initText();
-
- for (int i = _textStart; i < _textEnd; i++) {
- TextPiece piece = _text.get(i);
- if (piece.isUnicode())
- return true;
- }
-
- return false;
- }
+ /**
+ * @return always {@code true}
+ * @deprecated Range is not linked to any text piece anymore, so to check if
+ * unicode is used please access {@link TextPieceTable} during
+ * document load time
+ */
+ @Deprecated
+ public boolean usesUnicode()
+ {
+ return true;
+ }
/**
* Gets the text that this Range contains.
* @return The text for this range.
*/
public String text() {
- initText();
-
- StringBuffer sb = new StringBuffer();
-
- for (int x = _textStart; x < _textEnd; x++) {
- TextPiece piece = _text.get(x);
-
- // Figure out where in this piece the text
- // we're after lives
- int rStart = 0;
- int rEnd = piece.characterLength();
- if (_start > piece.getStart()) {
- rStart = _start - piece.getStart();
- }
- if (_end < piece.getEnd()) {
- rEnd -= (piece.getEnd() - _end);
- }
-
- // Luckily TextPieces work in characters, so we don't
- // need to worry about unicode here
- sb.append(piece.substring(rStart, rEnd));
- }
- return sb.toString();
+ return _text.substring( _start, _end );
}
/**
return _charEnd - _charStart;
}
- /**
- * Inserts text into the front of this range.
- *
- * @param text
- * The text to insert
- * @return The character run that text was inserted into.
- */
- public CharacterRun insertBefore(String text)
- // throws UnsupportedEncodingException
- {
- initAll();
-
- TextPiece tp = _text.get(_textStart);
- StringBuffer sb = tp.getStringBuffer();
-
- // Since this is the first item in our list, it is safe to assume that
- // _start >= tp.getStart()
- int insertIndex = _start - tp.getStart();
- sb.insert(insertIndex, text);
-
- int adjustedLength = _doc.getTextTable().adjustForInsert(_textStart, text.length());
- _doc.getCharacterTable().adjustForInsert(_charStart, adjustedLength);
- _doc.getParagraphTable().adjustForInsert(_parStart, adjustedLength);
- _doc.getSectionTable().adjustForInsert(_sectionStart, adjustedLength);
- adjustForInsert(adjustedLength);
+ /**
+ * Inserts text into the front of this range.
+ *
+ * @param text
+ * The text to insert
+ * @return The character run that text was inserted into.
+ */
+ public CharacterRun insertBefore( String text )
+ {
+ initAll();
- // update the FIB.CCPText + friends fields
- adjustFIB(text.length());
+ _text.insert( _start, text );
+ _doc.getCharacterTable().adjustForInsert( _charStart, text.length() );
+ _doc.getParagraphTable().adjustForInsert( _parStart, text.length() );
+ _doc.getSectionTable().adjustForInsert( _sectionStart, text.length() );
+ adjustForInsert( text.length() );
- return getCharacterRun(0);
- }
+ // update the FIB.CCPText + friends fields
+ adjustFIB( text.length() );
- /**
- * Inserts text onto the end of this range
- *
- * @param text
- * The text to insert
- * @return The character run the text was inserted into.
- */
- public CharacterRun insertAfter(String text) {
- initAll();
+ assert sanityCheck();
- int listIndex = _textEnd - 1;
- TextPiece tp = _text.get(listIndex);
- StringBuffer sb = tp.getStringBuffer();
+ return getCharacterRun( 0 );
+ }
- int insertIndex = _end - tp.getStart();
+ /**
+ * Inserts text onto the end of this range
+ *
+ * @param text
+ * The text to insert
+ * @return The character run the text was inserted into.
+ */
+ public CharacterRun insertAfter( String text )
+ {
+ initAll();
- if (tp.getStringBuffer().charAt(_end - 1) == '\r' && text.charAt(0) != '\u0007') {
- insertIndex--;
- }
- sb.insert(insertIndex, text);
- int adjustedLength = _doc.getTextTable().adjustForInsert(listIndex, text.length());
- _doc.getCharacterTable().adjustForInsert(_charEnd - 1, adjustedLength);
- _doc.getParagraphTable().adjustForInsert(_parEnd - 1, adjustedLength);
- _doc.getSectionTable().adjustForInsert(_sectionEnd - 1, adjustedLength);
- adjustForInsert(text.length());
+ _text.insert( _end, text );
- return getCharacterRun(numCharacterRuns() - 1);
+ _doc.getCharacterTable().adjustForInsert( _charEnd - 1, text.length() );
+ _doc.getParagraphTable().adjustForInsert( _parEnd - 1, text.length() );
+ _doc.getSectionTable().adjustForInsert( _sectionEnd - 1, text.length() );
+ adjustForInsert( text.length() );
- }
+ assert sanityCheck();
+ return getCharacterRun( numCharacterRuns() - 1 );
+ }
/**
* Inserts text into the front of this range and it gives that text the
int numSections = _sections.size();
int numRuns = _characters.size();
int numParagraphs = _paragraphs.size();
- int numTextPieces = _text.size();
for (int x = _charStart; x < numRuns; x++) {
CHPX chpx = _characters.get(x);
// + " -> " + sepx.getEnd());
}
- for (int x = _textStart; x < numTextPieces; x++) {
- TextPiece piece = _text.get(x);
- piece.adjustForDelete(_start, _end - _start);
- }
+ _text.delete( _start, _end );
+ Range parent = _parent.get();
+ if ( parent != null )
+ {
+ parent.adjustForInsert( -( _end - _start ) );
+ }
// update the FIB.CCPText + friends field
adjustFIB(-(_end - _start));
* @param rows
* The number of rows.
* @return The empty Table that is now part of the document.
- * @deprecated Use code shall not work with {@link ParagraphProperties}
+ * @deprecated User code shall not work with {@link TableProperties}
*/
@Deprecated
public Table insertBefore(TableProperties props, int rows) {
parProps.setFInTable(true);
parProps.setItap( 1 );
+ final int oldEnd = this._end;
+
int columns = props.getItcMac();
- for (int x = 0; x < rows; x++) {
- Paragraph cell = this.insertBefore(parProps, StyleSheet.NIL_STYLE);
- cell.insertAfter(String.valueOf('\u0007'));
- for (int y = 1; y < columns; y++) {
- cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE);
- cell.insertAfter(String.valueOf('\u0007'));
- }
- cell = cell.insertAfter(parProps, StyleSheet.NIL_STYLE, String.valueOf('\u0007'));
- cell.setTableRowEnd(props);
- }
- return new Table(_start, _start + (rows * (columns + 1)) * 2, this, 1);
- }
+ for ( int x = 0; x < rows; x++ )
+ {
+ Paragraph cell = this.insertBefore( parProps, StyleSheet.NIL_STYLE );
+ cell.insertAfter( String.valueOf( '\u0007' ) );
+ for ( int y = 1; y < columns; y++ )
+ {
+ cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE );
+ cell.insertAfter( String.valueOf( '\u0007' ) );
+ }
+ cell = cell.insertAfter( parProps, StyleSheet.NIL_STYLE,
+ String.valueOf( '\u0007' ) );
+ cell.setTableRowEnd( props );
+ }
+
+ final int newEnd = this._end;
+ final int diff = newEnd - oldEnd;
+
+ return new Table( _start, _start + diff, this, 1 );
+ }
/**
* Inserts a list into the beginning of this range.
*/
public void replaceText(String pPlaceHolder, String pValue, int pOffset) {
int absPlaceHolderIndex = getStartOffset() + pOffset;
- Range subRange = new Range(absPlaceHolderIndex, (absPlaceHolderIndex + pPlaceHolder
- .length()), getDocument());
-
- // this Range isn't a proper parent of the subRange() so we'll have to
- // keep
- // track of an updated endOffset on our own
- int previousEndOffset = subRange.getEndOffset();
+ Range subRange = new Range(absPlaceHolderIndex, (absPlaceHolderIndex + pPlaceHolder
+ .length()), this);
subRange.insertBefore(pValue);
- if (subRange.getEndOffset() != previousEndOffset) {
- adjustForInsert(subRange.getEndOffset() - previousEndOffset);
- }
-
// re-create the sub-range so we can delete it
subRange = new Range((absPlaceHolderIndex + pValue.length()), (absPlaceHolderIndex
- + pPlaceHolder.length() + pValue.length()), getDocument());
+ + pPlaceHolder.length() + pValue.length()), this);
// deletes are automagically propagated
subRange.delete();
* loads all of the list indexes.
*/
protected void initAll() {
- initText();
initCharacterRuns();
initParagraphs();
initSections();
}
}
- /**
- * inits the text piece list indexes.
- */
- private void initText() {
- if (!_textRangeFound) {
- int[] point = findRange(_text, _textStart, _start, _end);
- _textStart = point[0];
- _textEnd = point[1];
- _textRangeFound = true;
- }
- }
-
/**
* inits the section list indexes.
*/
* resets the list indexes.
*/
protected void reset() {
- _textRangeFound = false;
_charRangeFound = false;
_parRangeFound = false;
_sectionRangeFound = false;
* Method for debug purposes. Checks that all resolved elements are inside
* of current range.
*/
- public void sanityCheck()
+ public boolean sanityCheck()
{
+ if ( _start < 0 )
+ throw new AssertionError();
+ if ( _start >= _text.length() )
+ throw new AssertionError();
+ if ( _end < 0 )
+ throw new AssertionError();
+ if ( _end > _text.length() )
+ throw new AssertionError();
+ if ( _start > _end )
+ throw new AssertionError();
+
if ( _charRangeFound )
{
for ( int c = _charStart; c < _charEnd; c++ )
throw new AssertionError();
}
}
+
+ return true;
}
}
package org.apache.poi.hwpf.extractor;
-import java.util.Iterator;
+import junit.framework.TestCase;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFTestDataSamples;
-import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
-import junit.framework.TestCase;
-
/**
* Test the different routes to extracting text
*
* Test textPieces based extraction
*/
public void testExtractFromTextPieces() throws Exception {
- StringBuffer textBuf = new StringBuffer();
-
- Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
- while (textPieces.hasNext()) {
- TextPiece piece = (TextPiece) textPieces.next();
-
- String encoding = "Cp1252";
- if (piece.isUnicode()) {
- encoding = "UTF-16LE";
- }
- String text = new String(piece.getRawBytes(), encoding);
- textBuf.append(text);
- }
-
StringBuffer exp = new StringBuffer();
for (int i = 0; i < p_text.length; i++) {
exp.append(p_text[i]);
}
- assertEquals(exp.toString(), textBuf.toString());
+ assertEquals(exp.toString(), doc.getDocumentText());
}
}
HWPFFileSystem fileSys = new HWPFFileSystem();
- _pAPBinTable.writeTo( fileSys, 0 );
+ _pAPBinTable.writeTo( fileSys, 0, fakeTPT );
ByteArrayOutputStream tableOut = fileSys.getStream( "1Table" );
ByteArrayOutputStream mainOut = fileSys.getStream( "WordDocument" );
throws Exception
{
super.setUp();
+ System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.TRUE.toString() );
_hWPFDocFixture = new HWPFDocFixture(this, HWPFDocFixture.DEFAULT_TEST_FILE);
_hWPFDocFixture.setUp();
throws Exception
{
_hWPFDocFixture.tearDown();
-
_hWPFDocFixture = null;
+
+ System.setProperty( "org.apache.poi.hwpf.preserveTextTable", Boolean.FALSE.toString() );
super.tearDown();
}
assertEquals("One paragraph is ok\7", r.getParagraph(3).text());
assertEquals("\7", r.getParagraph(4).text());
assertEquals("\r", r.getParagraph(5).text());
- for(int i=0; i<=5; i++) {
- assertFalse(r.getParagraph(i).usesUnicode());
- }
-
// Get the table
Table t = r.getTable(p);
assertEquals("Row 3/Cell 3\u0007", r.getParagraph(10).text());
assertEquals("\u0007", r.getParagraph(11).text());
assertEquals("\r", r.getParagraph(12).text());
- for(int i=0; i<=12; i++) {
- assertFalse(r.getParagraph(i).usesUnicode());
- }
Paragraph p;
Paragraph actParagraph = actual.getParagraph( p );
assertEquals( expParagraph.text(), actParagraph.text() );
- assertEquals( expParagraph.isInTable(), actParagraph.isInTable() );
+ assertEquals( "Diffent isInTable flags for paragraphs #" + p
+ + " -- " + expParagraph + " -- " + actParagraph + ".",
+ expParagraph.isInTable(), actParagraph.isInTable() );
assertEquals( expParagraph.isTableRowEnd(),
actParagraph.isTableRowEnd() );
assertEquals(searchText, subRange.text());
subRange.delete();
+ daDoc.getOverallRange().sanityCheck();
+ daDoc.getRange().sanityCheck();
// we need to let the model re-calculate the Range before we evaluate it
range = daDoc.getRange();
// this can lead to a StringBufferOutOfBoundsException, so we will add it
// even though we don't have an assertion for it
Range daRange = daDoc.getRange();
+ daRange.sanityCheck();
daRange.text();
}