*/
public final class HWPFDocument extends HWPFDocumentCore
{
+ private static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
+ private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable";
+
/** And for making sense of CP lengths in the FIB */
@Deprecated
protected CPSplitCalculator _cpSplit;
_pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt);
_text = _tpt.getText();
- _cbt.rebuild( _cft );
- _pbt.rebuild( _text, _cft );
- boolean preserve = false;
+ /*
+ * In this mode we preserve the PAPX/CHPX structure from the file, so text
+ * may be missing from the output and the text order may be corrupted.
+ */
+ boolean preserveBinTables = false;
+ try
+ {
+ preserveBinTables = Boolean.parseBoolean( System
+ .getProperty( PROPERTY_PRESERVE_BIN_TABLES ) );
+ }
+ catch ( Exception exc )
+ {
+ // ignore;
+ }
+
+ if ( !preserveBinTables )
+ {
+ _cbt.rebuild( _cft );
+ _pbt.rebuild( _text, _cft );
+ }
+
+ /*
+ * Property to disable text rebuilding. In this mode changing the text
+ * will lead to unpredictable behavior
+ */
+ boolean preserveTextTable = false;
try
{
- preserve = Boolean.parseBoolean( System
- .getProperty( "org.apache.poi.hwpf.preserveTextTable" ) );
+ preserveTextTable = Boolean.parseBoolean( System
+ .getProperty( PROPERTY_PRESERVE_TEXT_TABLE ) );
}
catch ( Exception exc )
{
// ignore;
}
- if ( !preserve )
+ if ( !preserveTextTable )
{
_cft = new ComplexFileTable();
_tpt = _cft.getTextPieceTable();
_fields = new FieldsImpl(_fieldsTables);
}
+ @Internal
public TextPieceTable getTextTable()
{
return _cft.getTextPieceTable();
}
+ @Internal
@Override
public StringBuilder getText()
{
*
* @return the saved-by table.
*/
+ @Internal
public SavedByTable getSavedByTable()
{
return _sbt;
*
* @return the saved-by table.
*/
+ @Internal
public RevisionMarkAuthorTable getRevisionMarkAuthorTable()
{
return _rmat;
return _pictures;
}
+ @Internal
public EscherRecordHolder getEscherRecordHolder() {
return _escherRecordHolder;
}
* @deprecated use {@link #getOfficeDrawingsMain()} instead
*/
@Deprecated
+ @Internal
public ShapesTable getShapesTable()
{
return _officeArts;
this._dataStream = dataBuf;
}
+ @Internal
public byte[] getDataStream()
{
return _dataStream;
}
+ @Internal
public byte[] getTableStream()
{
return _tableStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.hwpf.model.GenericPropertyNode;
+import org.apache.poi.hwpf.model.PAPFormattedDiskPage;
import org.apache.poi.hwpf.model.PAPX;
+import org.apache.poi.hwpf.model.PlexOfCps;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.sprm.SprmIterator;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.Beta;
import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
/**
* Used by developers to list out key information on a HWPF file. End users will
if ( args.length == 0 )
{
System.err.println( "Use:" );
- System.err
- .println( "\tHWPFLister <filename>\n"
- + "\t\t[--textPieces] [--textPiecesText]\n"
- + "\t\t[--chpx] [--chpxProperties] [--chpxSprms]\n"
- + "\t\t[--papx] [--papxProperties]\n"
- + "\t\t[--paragraphs] [--paragraphsSprms] [--paragraphsText]\n"
- + "\t\t[--bookmarks]\n" + "\t\t[--escher]\n"
- + "\t\t[--fields]\n" + "\t\t[--pictures]\n"
- + "\t\t[--officeDrawings]\n"
- + "\t\t[--writereadback]\n" );
+ System.err.println( "\tHWPFLister <filename>\n"
+ + "\t\t[--textPieces] [--textPiecesText]\n"
+ + "\t\t[--chpx] [--chpxProperties] [--chpxSprms]\n"
+ + "\t\t[--papx] [--papxProperties] [--papxSprms]\n"
+ + "\t\t[--paragraphs] [--paragraphsText]\n"
+ + "\t\t[--bookmarks]\n" + "\t\t[--escher]\n"
+ + "\t\t[--fields]\n" + "\t\t[--pictures]\n"
+ + "\t\t[--officeDrawings]\n" + "\t\t[--writereadback]\n" );
System.exit( 1 );
}
boolean outputChpxSprms = false;
boolean outputParagraphs = false;
- boolean outputParagraphsSprms = false;
boolean outputParagraphsText = false;
boolean outputPapx = false;
+ boolean outputPapxSprms = false;
boolean outputPapxProperties = false;
boolean outputBookmarks = false;
if ( "--paragraphs".equals( arg ) )
outputParagraphs = true;
- if ( "--paragraphsSprms".equals( arg ) )
- outputParagraphsSprms = true;
if ( "--paragraphsText".equals( arg ) )
outputParagraphsText = true;
outputPapx = true;
if ( "--papxProperties".equals( arg ) )
outputPapxProperties = true;
+ if ( "--papxSprms".equals( arg ) )
+ outputPapxSprms = true;
if ( "--bookmarks".equals( arg ) )
outputBookmarks = true;
if ( writereadback )
doc = writeOutAndReadBack( doc );
- HWPFLister lister = new HWPFLister( doc );
- lister.dumpFIB();
+ HWPFDocumentCore original;
+ {
+ System.setProperty( "org.apache.poi.hwpf.preserveBinTables",
+ Boolean.TRUE.toString() );
+ System.setProperty( "org.apache.poi.hwpf.preserveTextTable",
+ Boolean.TRUE.toString() );
+
+ original = loadDoc( new File( args[0] ) );
+ if ( writereadback )
+ original = writeOutAndReadBack( original );
+ }
+
+ HWPFLister listerOriginal = new HWPFLister( original );
+ HWPFLister listerRebuilded = new HWPFLister( doc );
+
+ System.out.println( "== FIB (original) ==" );
+ listerOriginal.dumpFIB();
if ( outputTextPieces )
{
- System.out.println( "== Text pieces ==" );
- lister.dumpTextPieces( outputTextPiecesText );
+ System.out.println( "== Text pieces (original) ==" );
+ listerOriginal.dumpTextPieces( outputTextPiecesText );
}
if ( outputChpx )
{
- System.out.println( "== CHPX ==" );
- lister.dumpChpx( outputChpxProperties, outputChpxSprms );
+ System.out.println( "== CHPX (original) ==" );
+ listerOriginal.dumpChpx( outputChpxProperties, outputChpxSprms );
+
+ System.out.println( "== CHPX (rebuilded) ==" );
+ listerRebuilded.dumpChpx( outputChpxProperties, outputChpxSprms );
}
if ( outputPapx )
{
- System.out.println( "== PAPX ==" );
- lister.dumpPapx( outputPapxProperties );
+ System.out.println( "== PAPX (original) ==" );
+ listerOriginal.dumpPapx( outputPapxProperties, outputPapxSprms );
+
+ System.out.println( "== PAPX (rebuilded) ==" );
+ listerRebuilded.dumpPapx( outputPapxProperties, outputPapxSprms );
}
if ( outputParagraphs )
{
- System.out.println( "== Text paragraphs ==" );
- lister.dumpParagraphs( true );
+ // Both paragraph dumps below read from the rebuilt document, so both
+ // headers are labelled to match the lister that is actually used.
+ System.out.println( "== Text paragraphs (rebuilded) ==" );
+ listerRebuilded.dumpParagraphs( true );
- System.out.println( "== DOM paragraphs ==" );
- lister.dumpParagraphsDom( outputParagraphsSprms, outputPapx,
- outputParagraphsText );
+ System.out.println( "== DOM paragraphs (rebuilded) ==" );
+ listerRebuilded.dumpParagraphsDom( outputParagraphsText );
}
if ( outputBookmarks )
{
- System.out.println( "== BOOKMARKS ==" );
- lister.dumpBookmarks();
+ System.out.println( "== BOOKMARKS (rebuilded) ==" );
+ listerRebuilded.dumpBookmarks();
}
if ( outputEscher )
{
- System.out.println( "== ESCHER PROPERTIES ==" );
- lister.dumpEscher();
+ System.out.println( "== ESCHER PROPERTIES (rebuilded) ==" );
+ listerRebuilded.dumpEscher();
}
if ( outputFields )
{
- System.out.println( "== FIELDS ==" );
- lister.dumpFields();
+ System.out.println( "== FIELDS (rebuilded) ==" );
+ listerRebuilded.dumpFields();
}
if ( outputOfficeDrawings )
{
- System.out.println( "== OFFICE DRAWINGS ==" );
- lister.dumpOfficeDrawings();
+ System.out.println( "== OFFICE DRAWINGS (rebuilded) ==" );
+ listerRebuilded.dumpOfficeDrawings();
}
if ( outputPictures )
{
- System.out.println( "== PICTURES ==" );
- lister.dumpPictures();
+ System.out.println( "== PICTURES (rebuilded) ==" );
+ listerRebuilded.dumpPictures();
}
}
}
}
- public void dumpPapx( boolean withProperties ) throws Exception
+ public void dumpPapx( boolean withProperties, boolean withSprms )
+ throws Exception
{
- // if ( _doc instanceof HWPFDocument )
- // {
- // System.out.println( "binary PAP pages " );
- //
- // HWPFDocument doc = (HWPFDocument) _doc;
- //
- // java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
- // .getDeclaredField( "_mainStream" );
- // fMainStream.setAccessible( true );
- // byte[] mainStream = (byte[]) fMainStream.get( _doc );
- //
- // PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
- // .getFileInformationBlock().getFcPlcfbtePapx(), doc
- // .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
- //
- // List<PAPX> papxs = new ArrayList<PAPX>();
- //
- // int length = binTable.length();
- // for ( int x = 0; x < length; x++ )
- // {
- // GenericPropertyNode node = binTable.getProperty( x );
- //
- // int pageNum = LittleEndian.getInt( node.getBytes() );
- // int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
- // * pageNum;
- //
- // PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
- // mainStream, doc.getDataStream(), pageOffset,
- // doc.getTextTable() );
- //
- // System.out.println( "* PFKP: " + pfkp );
- //
- // for ( PAPX papx : pfkp.getPAPXs() )
- // {
- // System.out.println( "** " + papx );
- // papxs.add( papx );
- // if ( papx != null && true )
- // {
- // SprmIterator sprmIt = new SprmIterator(
- // papx.getGrpprl(), 2 );
- // while ( sprmIt.hasNext() )
- // {
- // SprmOperation sprm = sprmIt.next();
- // System.out.println( "*** " + sprm.toString() );
- // }
- // }
- //
- // }
- // }
- //
- // Collections.sort( papxs );
- // System.out.println( "* Sorted by END" );
- // for ( PAPX papx : papxs )
- // {
- // System.out.println( "** " + papx );
- // SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
- // while ( sprmIt.hasNext() )
- // {
- // SprmOperation sprm = sprmIt.next();
- // System.out.println( "*** " + sprm.toString() );
- // }
- // }
- // }
-
- // for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
- // {
- // System.out.println( papx );
- //
- // if ( withProperties )
- // System.out.println( papx.getParagraphProperties( _doc
- // .getStyleSheet() ) );
- //
- // if ( true )
- // {
- // SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
- // while ( sprmIt.hasNext() )
- // {
- // SprmOperation sprm = sprmIt.next();
- // System.out.println( "\t" + sprm.toString() );
- // }
- // }
- // }
+ if ( _doc instanceof HWPFDocument )
+ {
+ System.out.println( "binary PAP pages " );
+
+ HWPFDocument doc = (HWPFDocument) _doc;
+
+ java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
+ .getDeclaredField( "_mainStream" );
+ fMainStream.setAccessible( true );
+ byte[] mainStream = (byte[]) fMainStream.get( _doc );
+
+ PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
+ .getFileInformationBlock().getFcPlcfbtePapx(), doc
+ .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
+
+ List<PAPX> papxs = new ArrayList<PAPX>();
+
+ int length = binTable.length();
+ for ( int x = 0; x < length; x++ )
+ {
+ GenericPropertyNode node = binTable.getProperty( x );
+
+ int pageNum = LittleEndian.getInt( node.getBytes() );
+ int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
+ * pageNum;
+
+ PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
+ mainStream, doc.getDataStream(), pageOffset,
+ doc.getTextTable() );
+
+ System.out.println( "* PFKP: " + pfkp );
+
+ for ( PAPX papx : pfkp.getPAPXs() )
+ {
+ System.out.println( "** " + papx );
+ papxs.add( papx );
+ if ( papx != null && withSprms )
+ {
+ SprmIterator sprmIt = new SprmIterator(
+ papx.getGrpprl(), 2 );
+ while ( sprmIt.hasNext() )
+ {
+ SprmOperation sprm = sprmIt.next();
+ System.out.println( "*** " + sprm.toString() );
+ }
+ }
+
+ }
+ }
+
+ Collections.sort( papxs );
+ System.out.println( "* Sorted by END" );
+ for ( PAPX papx : papxs )
+ {
+ System.out.println( "** " + papx );
+ if ( papx != null && withSprms )
+ {
+ SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
+ while ( sprmIt.hasNext() )
+ {
+ SprmOperation sprm = sprmIt.next();
+ System.out.println( "*** " + sprm.toString() );
+ }
+ }
+ }
+ }
+
+ // Dump every PAPX held by the document's paragraph table.
+ for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
+ {
+ System.out.println( papx );
+
+ if ( withProperties )
+ System.out.println( papx.getParagraphProperties( _doc
+ .getStyleSheet() ) );
+
+ // Print SPRMs only when requested, matching the binary-page dumps
+ // earlier in this method; previously this branch was unconditional.
+ if ( withSprms )
+ {
+ SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
+ while ( sprmIt.hasNext() )
+ {
+ SprmOperation sprm = sprmIt.next();
+ System.out.println( "\t" + sprm.toString() );
+ }
+ }
+ }
}
public void dumpParagraphs( boolean dumpAssotiatedPapx )
}
}
- public void dumpParagraphsDom( boolean withSprms, boolean withPapx,
- boolean withText )
+ public void dumpParagraphsDom( boolean withText )
{
Range range = _doc.getOverallRange();
for ( int p = 0; p < range.numParagraphs(); p++ )