From: Nick Burch Date: Fri, 2 Jul 2010 20:59:30 +0000 (+0000) Subject: More Word 6 / Word 95 Support X-Git-Tag: REL_3_7_BETA2~40 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=2d9df14178b81cc45f03bdf9255f87ddc4412d42;p=poi.git More Word 6 / Word 95 Support HWPFOldDocument now processes a few more table sections, and so we can fake up some basic Ranges. This allows us to do paragraph level text extraction git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@960102 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index f55a853710..acea33a3af 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Paragraph level as well as whole-file text extraction for Word 6/95 files through HWPF Text Extraction support for older Word 6 and Word 95 files via HWPF 49508 - Allow the addition of paragraphs to XWPF Table Cells 49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index 00d1162e90..877075158a 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -31,7 +31,6 @@ import org.apache.poi.hwpf.model.ComplexFileTable; import org.apache.poi.hwpf.model.DocumentProperties; import org.apache.poi.hwpf.model.EscherRecordHolder; import org.apache.poi.hwpf.model.FSPATable; -import org.apache.poi.hwpf.model.FileInformationBlock; import org.apache.poi.hwpf.model.FontTable; import org.apache.poi.hwpf.model.GenericPropertyNode; import org.apache.poi.hwpf.model.ListTables; @@ -83,24 +82,6 @@ public final class HWPFDocument extends HWPFDocumentCore protected TextPieceTable _tpt; - /** Contains formatting properties for text*/ - protected CHPBinTable _cbt; - - /** Contains formatting properties for paragraphs*/ - protected PAPBinTable _pbt; - - /** Contains formatting properties for sections.*/ - protected SectionTable _st; - - /** Holds styles for this document.*/ - protected StyleSheet _ss; - - /** Holds fonts for this document.*/ - protected FontTable _ft; - - /** Hold list tables */ - protected ListTables _lt; - /** Holds the save history for this document. */ protected SavedByTable _sbt; @@ -277,15 +258,11 @@ public final class HWPFDocument extends HWPFDocumentCore } } - public StyleSheet getStyleSheet() + public TextPieceTable getTextTable() { - return _ss; + return _cft.getTextPieceTable(); } - public FileInformationBlock getFileInformationBlock() - { - return _fib; - } public CPSplitCalculator getCPSplitCalculator() { return _cpSplit; @@ -390,11 +367,6 @@ public final class HWPFDocument extends HWPFDocumentCore return length; } - public ListTables getListTables() - { - return _lt; - } - /** * Gets a reference to the saved -by table, which holds the save history for the document. * @@ -591,26 +563,6 @@ public final class HWPFDocument extends HWPFDocumentCore pfs.writeFilesystem(out); } - public CHPBinTable getCharacterTable() - { - return _cbt; - } - - public PAPBinTable getParagraphTable() - { - return _pbt; - } - - public SectionTable getSectionTable() - { - return _st; - } - - public TextPieceTable getTextTable() - { - return _cft.getTextPieceTable(); - } - public byte[] getDataStream() { return _dataStream; @@ -629,11 +581,6 @@ public final class HWPFDocument extends HWPFDocumentCore return _lt.addList(list.getListData(), list.getOverride()); } - public FontTable getFontTable() - { - return _ft; - } - public void delete(int start, int length) { Range r = new Range(start, start + length, this); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java index af17cc2ed2..f9ebc37801 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java @@ -23,7 +23,15 @@ import java.io.PushbackInputStream; import org.apache.poi.EncryptedDocumentException; import org.apache.poi.POIDocument; +import org.apache.poi.hwpf.model.CHPBinTable; import org.apache.poi.hwpf.model.FileInformationBlock; +import org.apache.poi.hwpf.model.FontTable; +import org.apache.poi.hwpf.model.ListTables; +import org.apache.poi.hwpf.model.PAPBinTable; +import org.apache.poi.hwpf.model.SectionTable; +import org.apache.poi.hwpf.model.StyleSheet; +import org.apache.poi.hwpf.model.TextPieceTable; +import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -40,6 +48,24 @@ public abstract class HWPFDocumentCore extends POIDocument /** The FIB */ protected FileInformationBlock _fib; + /** Holds styles for this document.*/ + protected StyleSheet _ss; + + /** Contains formatting properties for text*/ + protected CHPBinTable _cbt; + + /** Contains formatting properties for paragraphs*/ + protected PAPBinTable _pbt; + + /** Contains formatting properties for sections.*/ + protected SectionTable _st; + + /** Holds fonts for this document.*/ + protected FontTable _ft; + + /** Hold list tables */ + protected ListTables _lt; + /** main document stream buffer*/ protected byte[] _mainStream; @@ -123,6 +149,44 @@ public abstract class HWPFDocumentCore extends POIDocument } } + /** + * Returns the range which covers the whole of the + * document, but excludes any headers and footers. + */ + public abstract Range getRange(); + + public abstract TextPieceTable getTextTable(); + + public CHPBinTable getCharacterTable() + { + return _cbt; + } + + public PAPBinTable getParagraphTable() + { + return _pbt; + } + + public SectionTable getSectionTable() + { + return _st; + } + + public StyleSheet getStyleSheet() + { + return _ss; + } + + public ListTables getListTables() + { + return _lt; + } + + public FontTable getFontTable() + { + return _ft; + } + public FileInformationBlock getFileInformationBlock() { return _fib; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 42cff2ace8..211dc9a6b7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -18,15 +18,15 @@ package org.apache.poi.hwpf; import java.io.IOException; import java.io.OutputStream; -import java.util.ArrayList; -import java.util.List; -import org.apache.poi.hwpf.model.CHPX; import org.apache.poi.hwpf.model.ComplexFileTable; import org.apache.poi.hwpf.model.OldCHPBinTable; +import org.apache.poi.hwpf.model.OldPAPBinTable; +import org.apache.poi.hwpf.model.OldSectionTable; import org.apache.poi.hwpf.model.PieceDescriptor; import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.model.TextPieceTable; +import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.LittleEndian; @@ -34,11 +34,9 @@ import org.apache.poi.util.LittleEndian; /** * Provides very simple support for old (Word 6 / Word 95) * files. - * TODO Provide a way to get at the properties associated - * with each block of text */ public class HWPFOldDocument extends HWPFDocumentCore { - private List contents = new ArrayList(); + private TextPieceTable tpt; public HWPFOldDocument(POIFSFileSystem fs) throws IOException { this(fs.getRoot(), fs); @@ -49,14 +47,19 @@ public class HWPFOldDocument extends HWPFDocumentCore { super(directory, fs); // Where are things? + int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88); + int sedTableSize = LittleEndian.getInt(_mainStream, 0x8c); int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8); - int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc); + int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc); + int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0); + int papTableSize = LittleEndian.getInt(_mainStream, 0xc4); + //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60); + //int shfTableSize = LittleEndian.getInt(_mainStream, 0x64); int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160); // We need to get hold of the text that makes up the // document, which might be regular or fast-saved StringBuffer text = new StringBuffer(); - TextPieceTable tpt; if(_fib.isFComplex()) { ComplexFileTable cft = new ComplexFileTable( _mainStream, _mainStream, @@ -68,11 +71,15 @@ public class HWPFOldDocument extends HWPFDocumentCore { text.append( tp.getStringBuffer() ); } } else { + // TODO Discover if these older documents can ever hold Unicode Strings? + // (We think not, because they seem to lack a Piece table) // TODO Build the Piece Descriptor properly - // TODO Can these old documents ever contain Unicode strings? + // (We have to fake it, as they don't seem to have a proper Piece table) PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0); pd.setFilePosition(_fib.getFcMin()); + // Generate a single Text Piece Table, with a single Text Piece + // which covers all the (8 bit only) text in the file tpt = new TextPieceTable(); byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()]; System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length); @@ -85,51 +92,34 @@ public class HWPFOldDocument extends HWPFDocumentCore { } // Now we can fetch the character and paragraph properties - OldCHPBinTable chpTable = new OldCHPBinTable( + _cbt = new OldCHPBinTable( _mainStream, chpTableOffset, chpTableSize, _fib.getFcMin(), tpt ); - - // Finally build up runs - for(CHPX chpx : chpTable.getTextRuns()) { - String str = text.substring(chpx.getStart(), chpx.getEnd()); - contents.add(new TextAndCHPX(str,chpx)); - } + _pbt = new OldPAPBinTable( + _mainStream, papTableOffset, papTableSize, + _fib.getFcMin(), tpt + ); + _st = new OldSectionTable( + _mainStream, sedTableOffset, sedTableSize, + _fib.getFcMin(), tpt + ); + } + + public Range getRange() { + // Life is easy when we have no footers, headers or unicode! + return new Range( + 0, _fib.getFcMac() - _fib.getFcMin(), this + ); + } + + public TextPieceTable getTextTable() + { + return tpt; } @Override public void write(OutputStream out) throws IOException { throw new IllegalStateException("Writing is not available for the older file formats"); } - - /** - * Retrieves all our text, in order, along with the - * CHPX information on each bit. - * Every entry has the same formatting, but as yet - * we've no way to tell what the formatting is... - * Warnings - this will change as soon as we support - * text formatting! - */ - public List getContents() { - return contents; - } - - /** - * Warnings - this will change as soon as we support - * text formatting! - */ - public static class TextAndCHPX { - private String text; - private CHPX chpx; - private TextAndCHPX(String text, CHPX chpx) { - this.text = text; - this.chpx = chpx; - } - public String getText() { - return text; - } - public CHPX getChpx() { - return chpx; - } - } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java index 3ea6d42d46..b40aa396b5 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java @@ -22,7 +22,6 @@ import java.io.InputStream; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hwpf.HWPFOldDocument; -import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -68,12 +67,41 @@ public final class Word6Extractor extends POIOLE2TextExtractor { this.doc = doc; } - @Override + /** + * Get the text from the word file, as an array with one String + * per paragraph + */ + public String[] getParagraphText() { + String[] ret; + + // Extract using the model code + try { + Range r = doc.getRange(); + + ret = WordExtractor.getParagraphText(r); + } catch (Exception e) { + // Something's up with turning the text pieces into paragraphs + // Fall back to ripping out the text pieces + ret = new String[doc.getTextTable().getTextPieces().size()]; + for(int i=0; i _textRuns = new ArrayList(); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java index 3c97652e55..1431cff032 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java @@ -17,9 +17,6 @@ package org.apache.poi.hwpf.model; -import java.util.ArrayList; -import java.util.List; - import org.apache.poi.poifs.common.POIFSConstants; import org.apache.poi.util.LittleEndian; @@ -31,11 +28,8 @@ import org.apache.poi.util.LittleEndian; * In common with the rest of the old support, it * is read only */ -public final class OldCHPBinTable +public final class OldCHPBinTable extends CHPBinTable { - /** List of character properties.*/ - protected ArrayList _textRuns = new ArrayList(); - /** * Constructor used to read an old-style binTable * in from a Word document. @@ -69,9 +63,4 @@ public final class OldCHPBinTable } } } - - public List getTextRuns() - { - return _textRuns; - } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java new file mode 100644 index 0000000000..37ec57557d --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldPAPBinTable.java @@ -0,0 +1,59 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +import org.apache.poi.poifs.common.POIFSConstants; +import org.apache.poi.util.LittleEndian; + +/** + * This class holds all of the paragraph formatting + * properties from Old (Word 6 / Word 95) documents. + * Unlike with Word 97+, it all gets held in the + * same stream. + * In common with the rest of the old support, it + * is read only + */ +public final class OldPAPBinTable extends PAPBinTable +{ + public OldPAPBinTable(byte[] documentStream, int offset, + int size, int fcMin, TextPieceTable tpt) + { + PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); + + int length = binTable.length(); + for (int x = 0; x < length; x++) + { + GenericPropertyNode node = binTable.getProperty(x); + + int pageNum = LittleEndian.getShort(node.getBytes()); + int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum; + + PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream, + documentStream, pageOffset, fcMin, tpt); + + int fkpSize = pfkp.size(); + + for (int y = 0; y < fkpSize; y++) + { + PAPX papx = pfkp.getPAPX(y); + _paragraphs.add(papx); + } + } + } +} + diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java new file mode 100644 index 0000000000..dc992c69b6 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldSectionTable.java @@ -0,0 +1,65 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +import org.apache.poi.util.LittleEndian; + +/** + * This class holds all of the section formatting + * properties from Old (Word 6 / Word 95) documents. + * Unlike with Word 97+, it all gets held in the + * same stream. + * In common with the rest of the old support, it + * is read only + */ +public final class OldSectionTable extends SectionTable +{ + public OldSectionTable(byte[] documentStream, int offset, + int size, int fcMin, + TextPieceTable tpt) + { + PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12); + + int length = sedPlex.length(); + + for (int x = 0; x < length; x++) + { + GenericPropertyNode node = sedPlex.getProperty(x); + SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0); + + int fileOffset = sed.getFc(); + int startAt = node.getStart(); + int endAt = node.getEnd(); + + // check for the optimization + if (fileOffset == 0xffffffff) + { + _sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0])); + } + else + { + // The first short at the offset is the size of the grpprl. + int sepxSize = LittleEndian.getShort(documentStream, fileOffset); + byte[] buf = new byte[sepxSize]; + fileOffset += LittleEndian.SHORT_SIZE; + System.arraycopy(documentStream, fileOffset, buf, 0, buf.length); + _sections.add(new SEPX(sed, startAt, endAt, tpt, buf)); + } + } + } +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java index 06b6d3f6af..894210f4b3 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java @@ -34,7 +34,7 @@ import org.apache.poi.util.LittleEndian; * * @author Ryan Ackley */ -public final class PAPBinTable +public class PAPBinTable { protected ArrayList _paragraphs = new ArrayList(); byte[] _dataStream; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java index 8d77ba5fb2..22777a80e7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPX.java @@ -112,7 +112,11 @@ public final class PAPX extends BytePropertyNode { { return 0; } - return LittleEndian.getShort(buf); + if (buf.length == 1) + { + return (short)LittleEndian.getUnsignedByte(buf, 0); + } + return LittleEndian.getShort(buf); } public SprmBuffer getSprmBuf() @@ -122,6 +126,11 @@ public final class PAPX extends BytePropertyNode { public ParagraphProperties getParagraphProperties(StyleSheet ss) { + if(ss == null) { + // TODO Fix up for Word 6/95 + return new ParagraphProperties(); + } + short istd = getIstd(); ParagraphProperties baseStyle = ss.getParagraphStyle(istd); ParagraphProperties props = ParagraphSprmUncompressor.uncompressPAP(baseStyle, getGrpprl(), 2); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java index 3b47c1e91c..987e49ac89 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/SectionTable.java @@ -27,12 +27,12 @@ import org.apache.poi.hwpf.model.io.*; /** * @author Ryan Ackley */ -public final class SectionTable +public class SectionTable { private static final int SED_SIZE = 12; - protected ArrayList _sections = new ArrayList(); - protected List _text; + protected ArrayList _sections = new ArrayList(); + protected List _text; /** So we can know if things are unicode or not */ private TextPieceTable tpt; @@ -84,7 +84,7 @@ public final class SectionTable boolean matchAt = false; boolean matchHalf = false; for(int i=0; i<_sections.size(); i++) { - SEPX s = (SEPX)_sections.get(i); + SEPX s = _sections.get(i); if(s.getEnd() == mainEndsAt) { matchAt = true; } else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) { @@ -94,7 +94,7 @@ public final class SectionTable if(! matchAt && matchHalf) { System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!"); for(int i=0; i<_sections.size(); i++) { - SEPX s = (SEPX)_sections.get(i); + SEPX s = _sections.get(i); GenericPropertyNode node = sedPlex.getProperty(i); s.setStart( CPtoFC(node.getStart()) ); @@ -106,12 +106,12 @@ public final class SectionTable public void adjustForInsert(int listIndex, int length) { int size = _sections.size(); - SEPX sepx = (SEPX)_sections.get(listIndex); + SEPX sepx = _sections.get(listIndex); sepx.setEnd(sepx.getEnd() + length); for (int x = listIndex + 1; x < size; x++) { - sepx = (SEPX)_sections.get(x); + sepx = _sections.get(x); sepx.setStart(sepx.getStart() + length); sepx.setEnd(sepx.getEnd() + length); } @@ -129,7 +129,7 @@ public final class SectionTable for(int i=_text.size()-1; i>-1; i--) { - TP = (TextPiece)_text.get(i); + TP = _text.get(i); if(CP >= TP.getCP()) break; } @@ -142,7 +142,7 @@ public final class SectionTable return FC; } - public ArrayList getSections() + public ArrayList getSections() { return _sections; } @@ -159,7 +159,7 @@ public final class SectionTable for (int x = 0; x < len; x++) { - SEPX sepx = (SEPX)_sections.get(x); + SEPX sepx = _sections.get(x); byte[] grpprl = sepx.getGrpprl(); // write the sepx to the document stream. starts with a 2 byte size diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java index 1cac46d098..86f8bb54fa 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java @@ -20,6 +20,7 @@ package org.apache.poi.hwpf.usermodel; import org.apache.poi.util.LittleEndian; import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.Paragraph; @@ -77,7 +78,7 @@ public class Range { // TODO -instantiable superclass protected int _end; /** The document this range blongs to. */ - protected HWPFDocument _doc; + protected HWPFDocumentCore _doc; /** Have we loaded the section indexes yet */ boolean _sectionRangeFound; @@ -144,7 +145,7 @@ public class Range { // TODO -instantiable superclass * @param doc * The HWPFDocument the range is based on. */ - public Range(int start, int end, HWPFDocument doc) { + public Range(int start, int end, HWPFDocumentCore doc) { _start = start; _end = end; _doc = doc; @@ -1004,6 +1005,8 @@ public class Range { // TODO -instantiable superclass * The (signed) value that should be added to the FIB CCP fields */ protected void adjustFIB(int adjustment) { + assert (_doc instanceof HWPFDocument); + // update the FIB.CCPText field (this should happen once per adjustment, // so we don't want it in // adjustForInsert() or it would get updated multiple times if the range @@ -1011,7 +1014,7 @@ public class Range { // TODO -instantiable superclass // without this, OpenOffice.org (v. 2.2.x) does not see all the text in // the document - CPSplitCalculator cpS = _doc.getCPSplitCalculator(); + CPSplitCalculator cpS = ((HWPFDocument)_doc).getCPSplitCalculator(); FileInformationBlock fib = _doc.getFileInformationBlock(); // Do for each affected part @@ -1066,7 +1069,7 @@ public class Range { // TODO -instantiable superclass return _end; } - protected HWPFDocument getDocument() { + protected HWPFDocumentCore getDocument() { return _doc; } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index 9634ab9d40..eabb16b7c8 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -256,6 +256,16 @@ public final class TestWordExtractor extends TestCase { assertTrue(text.contains("Paragraph 2")); assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it")); assertTrue(text.contains("Last (4th) paragraph")); + + String[] tp = w6e.getParagraphText(); + assertEquals(7, tp.length); + assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]); + assertEquals("\r\n", tp[1]); + assertEquals("Paragraph 2\r\n", tp[2]); + assertEquals("\r\n", tp[3]); + assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]); + assertEquals("\r\n", tp[5]); + assertEquals("Last (4th) paragraph.\r\n", tp[6]); } public void testWord6() throws Exception { @@ -273,5 +283,9 @@ public final class TestWordExtractor extends TestCase { String text = w6e.getText(); assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); + + String[] tp = w6e.getParagraphText(); + assertEquals(1, tp.length); + assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]); } }