From 0ccd78dc300764f2f9dfab456ff5e03365ec5cd8 Mon Sep 17 00:00:00 2001 From: Sergey Vladimirov Date: Sun, 30 Oct 2011 08:59:16 +0000 Subject: [PATCH] different workarounds for old Word format git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1195133 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/org/apache/poi/hwpf/HWPFDocument.java | 2 +- .../org/apache/poi/hwpf/HWPFOldDocument.java | 24 ++++++++++++++++++- .../apache/poi/hwpf/model/OldPAPBinTable.java | 4 ---- .../apache/poi/hwpf/model/PAPBinTable.java | 24 ++++++++++++++----- .../src/org/apache/poi/hwpf/model/PAPX.java | 10 ++++++-- .../org/apache/poi/hwpf/usermodel/Range.java | 2 +- .../apache/poi/hwpf/usermodel/TestBugs.java | 5 +++- 7 files changed, 55 insertions(+), 16 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index 4efa858db6..070f0660ee 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -80,7 +80,7 @@ import org.apache.poi.util.Internal; */ public final class HWPFDocument extends HWPFDocumentCore { - private static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables"; + static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables"; private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable"; private static final String STREAM_DATA = "Data"; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 58fb26b1b3..1ea906a282 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -66,9 +66,10 @@ public class HWPFOldDocument extends HWPFDocumentCore { // We need to get hold of the text that makes up the // document, which might be regular or fast-saved + ComplexFileTable cft = null; StringBuffer text = new StringBuffer(); if(_fib.getFibBase().isFComplex()) { - ComplexFileTable cft = new ComplexFileTable( + cft = new ComplexFileTable( _mainStream, _mainStream, complexTableOffset, _fib.getFibBase().getFcMin() ); @@ -113,6 +114,27 @@ public class HWPFOldDocument extends HWPFDocumentCore { _mainStream, sedTableOffset, sedTableSize, _fib.getFibBase().getFcMin(), tpt ); + + /* + * in this mode we preserving PAPX/CHPX structure from file, so text may + * miss from output, and text order may be corrupted + */ + boolean preserveBinTables = false; + try + { + preserveBinTables = Boolean.parseBoolean( System + .getProperty( HWPFDocument.PROPERTY_PRESERVE_BIN_TABLES ) ); + } + catch ( Exception exc ) + { + // ignore; + } + + if ( !preserveBinTables ) + { + _cbt.rebuild( cft ); + _pbt.rebuild( _text, cft ); + } } public Range getOverallRange() diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java index 0a4ae67ae8..0f7c90d529 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java @@ -17,8 +17,6 @@ package org.apache.poi.hwpf.model; -import java.util.Collections; - import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.util.Internal; import org.apache.poi.util.LittleEndian; @@ -57,7 +55,5 @@ public final class OldPAPBinTable extends PAPBinTable _paragraphs.add( papx ); } } - Collections.sort( _paragraphs, PropertyNode.StartComparator.instance ); } } - diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java index 7ab0576d6a..437ec97439 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java @@ -113,6 +113,12 @@ public class PAPBinTable public void rebuild( final StringBuilder docText, ComplexFileTable complexFileTable ) + { + rebuild( docText, complexFileTable, _paragraphs ); + } + + static void rebuild( final StringBuilder docText, + ComplexFileTable complexFileTable, List paragraphs ) { long start = System.currentTimeMillis(); @@ -156,19 +162,19 @@ public class PAPBinTable PAPX papx = new PAPX( textPiece.getStart(), textPiece.getEnd(), newSprmBuffer ); - _paragraphs.add( papx ); + paragraphs.add( papx ); } } logger.log( POILogger.DEBUG, "Merged (?) with PAPX from complex file table in ", Long.valueOf( System.currentTimeMillis() - start ), - " ms (", Integer.valueOf( _paragraphs.size() ), + " ms (", Integer.valueOf( paragraphs.size() ), " elements in total)" ); start = System.currentTimeMillis(); } - List oldPapxSortedByEndPos = new ArrayList( _paragraphs ); + List oldPapxSortedByEndPos = new ArrayList( paragraphs ); Collections.sort( oldPapxSortedByEndPos, PropertyNode.EndComparator.instance ); @@ -179,7 +185,7 @@ public class PAPBinTable final Map papxToFileOrder = new IdentityHashMap(); { int counter = 0; - for ( PAPX papx : _paragraphs ) + for ( PAPX papx : paragraphs ) { papxToFileOrder.put( papx, Integer.valueOf( counter++ ) ); } @@ -270,6 +276,9 @@ public class PAPBinTable SprmBuffer sprmBuffer = null; for ( PAPX papx : papxs ) { + if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 ) + continue; + if ( sprmBuffer == null ) try { @@ -281,7 +290,9 @@ public class PAPBinTable throw new Error( e ); } else + { sprmBuffer.append( papx.getGrpprl(), 2 ); + } } PAPX newPapx = new PAPX( startInclusive, endExclusive, sprmBuffer ); newPapxs.add( newPapx ); @@ -289,11 +300,12 @@ public class PAPBinTable lastParStart = endExclusive; continue; } - this._paragraphs = new ArrayList( newPapxs ); + paragraphs.clear(); + paragraphs.addAll( newPapxs ); logger.log( POILogger.DEBUG, "PAPX rebuilded from document text in ", Long.valueOf( System.currentTimeMillis() - start ), " ms (", - Integer.valueOf( _paragraphs.size() ), " elements)" ); + Integer.valueOf( paragraphs.size() ), " elements)" ); start = System.currentTimeMillis(); } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java index 0f5e847dcc..bfaa8a9e31 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java @@ -112,11 +112,17 @@ public final class PAPX extends BytePropertyNode { public byte[] getGrpprl() { + if (_buf == null) + return new byte[0]; + return ((SprmBuffer)_buf).toByteArray(); } - public short getIstd() - { + public short getIstd() + { + if ( _buf == null ) + return 0; + byte[] buf = getGrpprl(); if (buf.length == 0) { diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java index b690d400d7..4999f128a2 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java @@ -1101,7 +1101,7 @@ public class Range { // TODO -instantiable superclass int endIndex = binarySearchEnd( rpl, startIndex, end ); while ( endIndex < rpl.size() - 1 && rpl.get( endIndex + 1 ).getEnd() <= end ) - endIndex--; + endIndex++; if ( startIndex < 0 || startIndex >= rpl.size() || startIndex > endIndex || endIndex < 0 diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java index 5aa15eeef7..7309f7f44b 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java @@ -28,6 +28,8 @@ import java.util.List; import junit.framework.TestCase; +import org.apache.poi.hwpf.converter.WordToTextConverter; + import org.apache.commons.codec.digest.DigestUtils; import org.apache.poi.POIDataSamples; import org.apache.poi.hwpf.HWPFDocument; @@ -736,7 +738,8 @@ public class TestBugs extends TestCase */ public void testBug51944() throws Exception { - HWPFTestDataSamples.openOldSampleFile( "Bug51944.doc" ); + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile( "Bug51944.doc" ); + WordToTextConverter.getText( doc ); } /** -- 2.39.5