]> source.dussan.org Git - poi.git/commitdiff
allow to disable document rebuilding and use it in HWPF Lister
authorSergey Vladimirov <sergey@apache.org>
Tue, 9 Aug 2011 08:37:21 +0000 (08:37 +0000)
committerSergey Vladimirov <sergey@apache.org>
Tue, 9 Aug 2011 08:37:21 +0000 (08:37 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155262 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
src/scratchpad/src/org/apache/poi/hwpf/dev/HWPFLister.java

index 296136ccc9fca5a3cdcae0eb56cd154781072056..72843973fb10bc8a94e4ef15407b963635fc3221 100644 (file)
@@ -76,6 +76,9 @@ import org.apache.poi.util.Internal;
  */
 public final class HWPFDocument extends HWPFDocumentCore
 {
+    private static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables";
+    private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable";
+
   /** And for making sense of CP lengths in the FIB */
   @Deprecated
   protected CPSplitCalculator _cpSplit;
@@ -267,20 +270,43 @@ public final class HWPFDocument extends HWPFDocumentCore
     _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), _tpt);
 
         _text = _tpt.getText();
-        _cbt.rebuild( _cft );
-        _pbt.rebuild( _text, _cft );
 
-        boolean preserve = false;
+        /*
+         * in this mode we preserving PAPX/CHPX structure from file, so text may
+         * miss from output, and text order may be corrupted
+         */
+        boolean preserveBinTables = false;
+        try
+        {
+            preserveBinTables = Boolean.parseBoolean( System
+                    .getProperty( PROPERTY_PRESERVE_BIN_TABLES ) );
+        }
+        catch ( Exception exc )
+        {
+            // ignore;
+        }
+
+        if ( !preserveBinTables )
+        {
+            _cbt.rebuild( _cft );
+            _pbt.rebuild( _text, _cft );
+        }
+
+        /*
+         * Property to disable text rebuilding. In this mode changing the text
+         * will lead to unpredictable behavior
+         */
+        boolean preserveTextTable = false;
         try
         {
-            preserve = Boolean.parseBoolean( System
-                    .getProperty( "org.apache.poi.hwpf.preserveTextTable" ) );
+            preserveTextTable = Boolean.parseBoolean( System
+                    .getProperty( PROPERTY_PRESERVE_TEXT_TABLE ) );
         }
         catch ( Exception exc )
         {
             // ignore;
         }
-        if ( !preserve )
+        if ( !preserveTextTable )
         {
             _cft = new ComplexFileTable();
             _tpt = _cft.getTextPieceTable();
@@ -350,11 +376,13 @@ public final class HWPFDocument extends HWPFDocumentCore
     _fields = new FieldsImpl(_fieldsTables);
   }
 
+  @Internal
   public TextPieceTable getTextTable()
   {
     return _cft.getTextPieceTable();
   }
 
+    @Internal
     @Override
     public StringBuilder getText()
     {
@@ -497,6 +525,7 @@ public final class HWPFDocument extends HWPFDocumentCore
    *
    * @return the saved-by table.
    */
+  @Internal
   public SavedByTable getSavedByTable()
   {
     return _sbt;
@@ -507,6 +536,7 @@ public final class HWPFDocument extends HWPFDocumentCore
    *
    * @return the saved-by table.
    */
+  @Internal
   public RevisionMarkAuthorTable getRevisionMarkAuthorTable()
   {
     return _rmat;
@@ -519,6 +549,7 @@ public final class HWPFDocument extends HWPFDocumentCore
          return _pictures;
   }
 
+  @Internal
   public EscherRecordHolder getEscherRecordHolder() {
       return _escherRecordHolder;
   }
@@ -529,6 +560,7 @@ public final class HWPFDocument extends HWPFDocumentCore
      * @deprecated use {@link #getOfficeDrawingsMain()} instead
      */
     @Deprecated
+    @Internal
     public ShapesTable getShapesTable()
     {
         return _officeArts;
@@ -908,10 +940,12 @@ public final class HWPFDocument extends HWPFDocumentCore
         this._dataStream = dataBuf;
     }
 
+  @Internal
   public byte[] getDataStream()
   {
     return _dataStream;
   }
+  @Internal
   public byte[] getTableStream()
   {
        return _tableStream;
index 8328fe9c66f5e0ca13e54ab382f21e75af37457f..028dbf4ef8e62b71ca3eeeea0e2e61833fc1078a 100644 (file)
@@ -23,7 +23,9 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -35,7 +37,10 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.hwpf.model.CHPX;
 import org.apache.poi.hwpf.model.FieldsDocumentPart;
 import org.apache.poi.hwpf.model.FileInformationBlock;
+import org.apache.poi.hwpf.model.GenericPropertyNode;
+import org.apache.poi.hwpf.model.PAPFormattedDiskPage;
 import org.apache.poi.hwpf.model.PAPX;
+import org.apache.poi.hwpf.model.PlexOfCps;
 import org.apache.poi.hwpf.model.StyleSheet;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.sprm.SprmIterator;
@@ -47,9 +52,11 @@ import org.apache.poi.hwpf.usermodel.OfficeDrawing;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Picture;
 import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.common.POIFSConstants;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
 
 /**
  * Used by developers to list out key information on a HWPF file. End users will
@@ -94,16 +101,14 @@ public final class HWPFLister
         if ( args.length == 0 )
         {
             System.err.println( "Use:" );
-            System.err
-                    .println( "\tHWPFLister <filename>\n"
-                            + "\t\t[--textPieces] [--textPiecesText]\n"
-                            + "\t\t[--chpx] [--chpxProperties] [--chpxSprms]\n"
-                            + "\t\t[--papx] [--papxProperties]\n"
-                            + "\t\t[--paragraphs] [--paragraphsSprms] [--paragraphsText]\n"
-                            + "\t\t[--bookmarks]\n" + "\t\t[--escher]\n"
-                            + "\t\t[--fields]\n" + "\t\t[--pictures]\n"
-                            + "\t\t[--officeDrawings]\n"
-                            + "\t\t[--writereadback]\n" );
+            System.err.println( "\tHWPFLister <filename>\n"
+                    + "\t\t[--textPieces] [--textPiecesText]\n"
+                    + "\t\t[--chpx] [--chpxProperties] [--chpxSprms]\n"
+                    + "\t\t[--papx] [--papxProperties] [--papxSprms]\n"
+                    + "\t\t[--paragraphs] [--paragraphsText]\n"
+                    + "\t\t[--bookmarks]\n" + "\t\t[--escher]\n"
+                    + "\t\t[--fields]\n" + "\t\t[--pictures]\n"
+                    + "\t\t[--officeDrawings]\n" + "\t\t[--writereadback]\n" );
             System.exit( 1 );
         }
 
@@ -115,10 +120,10 @@ public final class HWPFLister
         boolean outputChpxSprms = false;
 
         boolean outputParagraphs = false;
-        boolean outputParagraphsSprms = false;
         boolean outputParagraphsText = false;
 
         boolean outputPapx = false;
+        boolean outputPapxSprms = false;
         boolean outputPapxProperties = false;
 
         boolean outputBookmarks = false;
@@ -145,8 +150,6 @@ public final class HWPFLister
 
             if ( "--paragraphs".equals( arg ) )
                 outputParagraphs = true;
-            if ( "--paragraphsSprms".equals( arg ) )
-                outputParagraphsSprms = true;
             if ( "--paragraphsText".equals( arg ) )
                 outputParagraphsText = true;
 
@@ -154,6 +157,8 @@ public final class HWPFLister
                 outputPapx = true;
             if ( "--papxProperties".equals( arg ) )
                 outputPapxProperties = true;
+            if ( "--papxSprms".equals( arg ) )
+                outputPapxSprms = true;
 
             if ( "--bookmarks".equals( arg ) )
                 outputBookmarks = true;
@@ -174,65 +179,85 @@ public final class HWPFLister
         if ( writereadback )
             doc = writeOutAndReadBack( doc );
 
-        HWPFLister lister = new HWPFLister( doc );
-        lister.dumpFIB();
+        HWPFDocumentCore original;
+        {
+            System.setProperty( "org.apache.poi.hwpf.preserveBinTables",
+                    Boolean.TRUE.toString() );
+            System.setProperty( "org.apache.poi.hwpf.preserveTextTable",
+                    Boolean.TRUE.toString() );
+
+            original = loadDoc( new File( args[0] ) );
+            if ( writereadback )
+                original = writeOutAndReadBack( original );
+        }
+
+        HWPFLister listerOriginal = new HWPFLister( original );
+        HWPFLister listerRebuilded = new HWPFLister( doc );
+
+        System.out.println( "== FIB (original) ==" );
+        listerOriginal.dumpFIB();
 
         if ( outputTextPieces )
         {
-            System.out.println( "== Text pieces ==" );
-            lister.dumpTextPieces( outputTextPiecesText );
+            System.out.println( "== Text pieces (original) ==" );
+            listerOriginal.dumpTextPieces( outputTextPiecesText );
         }
 
         if ( outputChpx )
         {
-            System.out.println( "== CHPX ==" );
-            lister.dumpChpx( outputChpxProperties, outputChpxSprms );
+            System.out.println( "== CHPX (original) ==" );
+            listerOriginal.dumpChpx( outputChpxProperties, outputChpxSprms );
+
+            System.out.println( "== CHPX (rebuilded) ==" );
+            listerRebuilded.dumpChpx( outputChpxProperties, outputChpxSprms );
         }
 
         if ( outputPapx )
         {
-            System.out.println( "== PAPX ==" );
-            lister.dumpPapx( outputPapxProperties );
+            System.out.println( "== PAPX (original) ==" );
+            listerOriginal.dumpPapx( outputPapxProperties, outputPapxSprms );
+
+            System.out.println( "== PAPX (rebuilded) ==" );
+            listerRebuilded.dumpPapx( outputPapxProperties, outputPapxSprms );
         }
 
         if ( outputParagraphs )
         {
-            System.out.println( "== Text paragraphs ==" );
-            lister.dumpParagraphs( true );
+            System.out.println( "== Text paragraphs (original) ==" );
+            listerRebuilded.dumpParagraphs( true );
 
-            System.out.println( "== DOM paragraphs ==" );
-            lister.dumpParagraphsDom( outputParagraphsSprms, outputPapx,
-                    outputParagraphsText );
+            System.out.println( "== DOM paragraphs (rebuilded) ==" );
+            listerRebuilded.dumpParagraphsDom( outputParagraphsText );
         }
 
         if ( outputBookmarks )
         {
-            System.out.println( "== BOOKMARKS ==" );
-            lister.dumpBookmarks();
+            System.out.println( "== BOOKMARKS (rebuilded) ==" );
+            listerRebuilded.dumpBookmarks();
         }
 
         if ( outputEscher )
         {
-            System.out.println( "== ESCHER PROPERTIES ==" );
-            lister.dumpEscher();
+            System.out.println( "== ESCHER PROPERTIES (rebuilded) ==" );
+            listerRebuilded.dumpEscher();
         }
 
         if ( outputFields )
         {
-            System.out.println( "== FIELDS ==" );
-            lister.dumpFields();
+            System.out.println( "== FIELDS (rebuilded) ==" );
+            listerRebuilded.dumpFields();
         }
 
         if ( outputOfficeDrawings )
         {
-            System.out.println( "== OFFICE DRAWINGS ==" );
-            lister.dumpOfficeDrawings();
+            System.out.println( "== OFFICE DRAWINGS (rebuilded) ==" );
+            listerRebuilded.dumpOfficeDrawings();
         }
 
         if ( outputPictures )
         {
-            System.out.println( "== PICTURES ==" );
-            lister.dumpPictures();
+            System.out.println( "== PICTURES (rebuilded) ==" );
+            listerRebuilded.dumpPictures();
         }
     }
 
@@ -402,90 +427,94 @@ public final class HWPFLister
         }
     }
 
-    public void dumpPapx( boolean withProperties ) throws Exception
+    public void dumpPapx( boolean withProperties, boolean withSprms )
+            throws Exception
     {
-        // if ( _doc instanceof HWPFDocument )
-        // {
-        // System.out.println( "binary PAP pages " );
-        //
-        // HWPFDocument doc = (HWPFDocument) _doc;
-        //
-        // java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
-        // .getDeclaredField( "_mainStream" );
-        // fMainStream.setAccessible( true );
-        // byte[] mainStream = (byte[]) fMainStream.get( _doc );
-        //
-        // PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
-        // .getFileInformationBlock().getFcPlcfbtePapx(), doc
-        // .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
-        //
-        // List<PAPX> papxs = new ArrayList<PAPX>();
-        //
-        // int length = binTable.length();
-        // for ( int x = 0; x < length; x++ )
-        // {
-        // GenericPropertyNode node = binTable.getProperty( x );
-        //
-        // int pageNum = LittleEndian.getInt( node.getBytes() );
-        // int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
-        // * pageNum;
-        //
-        // PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
-        // mainStream, doc.getDataStream(), pageOffset,
-        // doc.getTextTable() );
-        //
-        // System.out.println( "* PFKP: " + pfkp );
-        //
-        // for ( PAPX papx : pfkp.getPAPXs() )
-        // {
-        // System.out.println( "** " + papx );
-        // papxs.add( papx );
-        // if ( papx != null && true )
-        // {
-        // SprmIterator sprmIt = new SprmIterator(
-        // papx.getGrpprl(), 2 );
-        // while ( sprmIt.hasNext() )
-        // {
-        // SprmOperation sprm = sprmIt.next();
-        // System.out.println( "*** " + sprm.toString() );
-        // }
-        // }
-        //
-        // }
-        // }
-        //
-        // Collections.sort( papxs );
-        // System.out.println( "* Sorted by END" );
-        // for ( PAPX papx : papxs )
-        // {
-        // System.out.println( "** " + papx );
-        // SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
-        // while ( sprmIt.hasNext() )
-        // {
-        // SprmOperation sprm = sprmIt.next();
-        // System.out.println( "*** " + sprm.toString() );
-        // }
-        // }
-        // }
-
-        // for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
-        // {
-        // System.out.println( papx );
-        //
-        // if ( withProperties )
-        // System.out.println( papx.getParagraphProperties( _doc
-        // .getStyleSheet() ) );
-        //
-        // if ( true )
-        // {
-        // SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
-        // while ( sprmIt.hasNext() )
-        // {
-        // SprmOperation sprm = sprmIt.next();
-        // System.out.println( "\t" + sprm.toString() );
-        // }
-        // }
-        // }
+        if ( _doc instanceof HWPFDocument )
+        {
+            System.out.println( "binary PAP pages " );
+
+            HWPFDocument doc = (HWPFDocument) _doc;
+
+            java.lang.reflect.Field fMainStream = HWPFDocumentCore.class
+                    .getDeclaredField( "_mainStream" );
+            fMainStream.setAccessible( true );
+            byte[] mainStream = (byte[]) fMainStream.get( _doc );
+
+            PlexOfCps binTable = new PlexOfCps( doc.getTableStream(), doc
+                    .getFileInformationBlock().getFcPlcfbtePapx(), doc
+                    .getFileInformationBlock().getLcbPlcfbtePapx(), 4 );
+
+            List<PAPX> papxs = new ArrayList<PAPX>();
+
+            int length = binTable.length();
+            for ( int x = 0; x < length; x++ )
+            {
+                GenericPropertyNode node = binTable.getProperty( x );
+
+                int pageNum = LittleEndian.getInt( node.getBytes() );
+                int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE
+                        * pageNum;
+
+                PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
+                        mainStream, doc.getDataStream(), pageOffset,
+                        doc.getTextTable() );
+
+                System.out.println( "* PFKP: " + pfkp );
+
+                for ( PAPX papx : pfkp.getPAPXs() )
+                {
+                    System.out.println( "** " + papx );
+                    papxs.add( papx );
+                    if ( papx != null && withSprms )
+                    {
+                        SprmIterator sprmIt = new SprmIterator(
+                                papx.getGrpprl(), 2 );
+                        while ( sprmIt.hasNext() )
+                        {
+                            SprmOperation sprm = sprmIt.next();
+                            System.out.println( "*** " + sprm.toString() );
+                        }
+                    }
+
+                }
+            }
+
+            Collections.sort( papxs );
+            System.out.println( "* Sorted by END" );
+            for ( PAPX papx : papxs )
+            {
+                System.out.println( "** " + papx );
+                if ( papx != null && withSprms )
+                {
+                    SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
+                    while ( sprmIt.hasNext() )
+                    {
+                        SprmOperation sprm = sprmIt.next();
+                        System.out.println( "*** " + sprm.toString() );
+                    }
+                }
+            }
+        }
+
+        for ( PAPX papx : _doc.getParagraphTable().getParagraphs() )
+        {
+            System.out.println( papx );
+
+            if ( withProperties )
+                System.out.println( papx.getParagraphProperties( _doc
+                        .getStyleSheet() ) );
+
+            if ( true )
+            {
+                SprmIterator sprmIt = new SprmIterator( papx.getGrpprl(), 2 );
+                while ( sprmIt.hasNext() )
+                {
+                    SprmOperation sprm = sprmIt.next();
+                    System.out.println( "\t" + sprm.toString() );
+                }
+            }
+        }
     }
 
     public void dumpParagraphs( boolean dumpAssotiatedPapx )
@@ -526,8 +555,7 @@ public final class HWPFLister
         }
     }
 
-    public void dumpParagraphsDom( boolean withSprms, boolean withPapx,
-            boolean withText )
+    public void dumpParagraphsDom( boolean withText )
     {
         Range range = _doc.getOverallRange();
         for ( int p = 0; p < range.numParagraphs(); p++ )