From: Sergey Vladimirov Date: Tue, 9 Aug 2011 12:38:52 +0000 (+0000) Subject: Add Word-to-Text converter and use it as replacement for WordExtractor X-Git-Tag: REL_3_8_BETA4~24 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=49697de696fffcd09b6c1dde7b5ec5b94b647c21;p=poi.git Add Word-to-Text converter and use it as replacement for WordExtractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155336 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 6c77dd532c..aca3037dc9 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Add Word-to-Text converter and use it as replacement for WordExtractor 51604 - replace text fails for doc ( poi 3.8 beta release from download site ) Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF Support for conditional formatting in XSSF diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java index 9ec09fe47a..00930c3fdd 100644 --- a/src/java/org/apache/poi/POIOLE2TextExtractor.java +++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java @@ -19,6 +19,7 @@ package org.apache.poi; import org.apache.poi.hpsf.DocumentSummaryInformation; import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; +import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -39,7 +40,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { public POIOLE2TextExtractor(POIDocument document) { super(document); } - + /** * Returns the document information metadata for the document */ @@ -52,20 +53,28 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { public SummaryInformation getSummaryInformation() { return document.getSummaryInformation(); } - + /** - * Returns an 
HPSF powered text extractor for the + * Returns an HPSF powered text extractor for the * document properties metadata, such as title and author. */ public POITextExtractor getMetadataTextExtractor() { return new HPSFPropertiesExtractor(this); } - /** - * Return the underlying POIFS FileSystem of - * this document. - */ - public POIFSFileSystem getFileSystem() { - return document.directory.getFileSystem(); - } + public DirectoryEntry getRoot() + { + return document.directory; + } + + /** + * Return the underlying POIFS FileSystem of this document. + * + * @deprecated Use {@link #getRoot()} instead + */ + @Deprecated + public POIFSFileSystem getFileSystem() + { + return document.directory.getFileSystem(); + } } diff --git a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java index 8a7643b013..a4ff616125 100644 --- a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java @@ -61,17 +61,27 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; */ public class EventBasedExcelExtractor extends POIOLE2TextExtractor { private DirectoryNode _dir; - private POIFSFileSystem _fs; boolean _includeSheetNames = true; boolean _formulasNotResults = false; - public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) { - super(null); - _dir = dir; - _fs = fs; - } + /** + * @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead + */ + @Deprecated + @SuppressWarnings( "unused" ) + public EventBasedExcelExtractor( DirectoryNode dir, POIFSFileSystem fs ) + { + this( dir ); + } + + public EventBasedExcelExtractor( DirectoryNode dir ) + { + super( null ); + _dir = dir; + } + public EventBasedExcelExtractor(POIFSFileSystem fs) { - this(fs.getRoot(), fs); + this(fs.getRoot()); } /** @@ -79,9 +89,9 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor { * this document. 
*/ public POIFSFileSystem getFileSystem() { - return _fs; + return _dir.getFileSystem(); } - + /** * Would return the document information metadata for the document, * if we supported it @@ -200,7 +210,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor { outputNextStringValue = true; nextRow = frec.getRow(); } else { - thisText = _ft.formatNumberDateCell(frec); + thisText = _ft.formatNumberDateCell(frec); } } break; @@ -234,7 +244,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor { case NumberRecord.sid: NumberRecord numrec = (NumberRecord) record; thisRow = numrec.getRow(); - thisText = _ft.formatNumberDateCell(numrec); + thisText = _ft.formatNumberDateCell(numrec); break; default: break; diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java index 44de2d638d..b620623595 100644 --- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java @@ -24,7 +24,6 @@ import java.io.InputStream; import java.io.PrintStream; import org.apache.poi.POIOLE2TextExtractor; -import org.apache.poi.ss.formula.eval.ErrorEval; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFCellStyle; import org.apache.poi.hssf.usermodel.HSSFComment; @@ -35,12 +34,13 @@ import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.ss.formula.eval.ErrorEval; import org.apache.poi.ss.usermodel.HeaderFooter; /** * A text extractor for Excel files. *

- * Returns the textual content of the file, suitable for + * Returns the textual content of the file, suitable for * indexing by something like Lucene, but not really * intended for display to the user. *

@@ -59,19 +59,27 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p private boolean _includeCellComments = false; private boolean _includeBlankCells = false; private boolean _includeHeadersFooters = true; - + public ExcelExtractor(HSSFWorkbook wb) { super(wb); _wb = wb; _formatter = new HSSFDataFormatter(); } public ExcelExtractor(POIFSFileSystem fs) throws IOException { - this(fs.getRoot(), fs); + this(fs.getRoot()); } - public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { - this(new HSSFWorkbook(dir, fs, true)); + /** + * @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead + */ + @Deprecated + @SuppressWarnings( "unused" ) + public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { + this( dir ); + } + public ExcelExtractor(DirectoryNode dir) throws IOException { + this(new HSSFWorkbook(dir, true)); } - + private static final class CommandParseException extends Exception { public CommandParseException(String msg) { super(msg); @@ -183,7 +191,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p return _headersFooters; } } - + private static void printUsageMessage(PrintStream ps) { ps.println("Use:"); ps.println(" " + ExcelExtractor.class.getName() + " [ [ [...]]] [-i ]"); @@ -201,7 +209,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p * Command line extractor. */ public static void main(String[] args) { - + CommandArgs cmdArgs; try { cmdArgs = new CommandArgs(args); @@ -211,12 +219,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p System.exit(1); return; // suppress compiler error } - + if (cmdArgs.isRequestHelp()) { printUsageMessage(System.out); return; } - + try { InputStream is; if(cmdArgs.getInputFile() == null) { @@ -270,9 +278,9 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p * Default is to include them. 
*/ public void setIncludeHeadersFooters(boolean includeHeadersFooters) { - _includeHeadersFooters = includeHeadersFooters; + _includeHeadersFooters = includeHeadersFooters; } - + /** * Retrieves the text contents of the file */ @@ -282,12 +290,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p // We don't care about the difference between // null (missing) and blank cells _wb.setMissingCellPolicy(HSSFRow.RETURN_BLANK_AS_NULL); - + // Process each sheet in turn for(int i=0;i<_wb.getNumberOfSheets();i++) { HSSFSheet sheet = _wb.getSheetAt(i); if(sheet == null) { continue; } - + if(_includeSheetNames) { String name = _wb.getSheetName(i); if(name != null) { @@ -295,12 +303,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p text.append("\n"); } } - + // Header text, if there is any if(_includeHeadersFooters) { text.append(_extractHeaderFooter(sheet.getHeader())); } - + int firstRow = sheet.getFirstRowNum(); int lastRow = sheet.getLastRowNum(); for(int j=firstRow;j<=lastRow;j++) { @@ -313,7 +321,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p if(_includeBlankCells) { firstCell = 0; } - + for(int k=firstCell;k 0) text.append("\n"); - + return text.toString(); } } diff --git a/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java b/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java index 844070fb50..bb50a75571 100644 --- a/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java +++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java @@ -15,13 +15,14 @@ See the License for the specific language governing permissions and limitations under the License. 
==================================================================== */ - -package org.apache.poi.poifs.filesystem; -import java.io.*; +package org.apache.poi.poifs.filesystem; -import java.util.*; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; import org.apache.poi.hpsf.ClassID; @@ -67,6 +68,12 @@ public interface DirectoryEntry public int getEntryCount(); + /** + * Checks if entry with specified name present + */ + + public boolean hasEntry( final String name ); + /** * get a specified Entry by name * diff --git a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java index d6f86d5ef4..20effd8bdb 100644 --- a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java +++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java @@ -15,7 +15,7 @@ See the License for the specific language governing permissions and limitations under the License. ==================================================================== */ - + package org.apache.poi.poifs.filesystem; @@ -53,7 +53,7 @@ public class DirectoryNode // the POIFSFileSystem we belong to private POIFSFileSystem _ofilesystem; // the NPOIFSFileSytem we belong to - private NPOIFSFileSystem _nfilesystem; + private NPOIFSFileSystem _nfilesystem; // the path described by this document private POIFSDocumentPath _path; @@ -72,7 +72,7 @@ public class DirectoryNode { this(property, parent, filesystem, (NPOIFSFileSystem)null); } - + /** * create a DirectoryNode. 
This method is not public by design; it * is intended strictly for the internal use of this package @@ -87,7 +87,7 @@ public class DirectoryNode { this(property, parent, (POIFSFileSystem)null, nfilesystem); } - + private DirectoryNode(final DirectoryProperty property, final DirectoryNode parent, final POIFSFileSystem ofilesystem, @@ -96,7 +96,7 @@ public class DirectoryNode super(property, parent); this._ofilesystem = ofilesystem; this._nfilesystem = nfilesystem; - + if (parent == null) { _path = new POIFSDocumentPath(); @@ -143,23 +143,23 @@ public class DirectoryNode { return _path; } - + /** * @return the filesystem that this belongs to */ public POIFSFileSystem getFileSystem() { - return _ofilesystem; + return _ofilesystem; } - + /** * @return the filesystem that this belongs to */ public NPOIFSFileSystem getNFileSystem() { - return _nfilesystem; + return _nfilesystem; } - + /** * open a document in the directory's entry's list of entries * @@ -195,7 +195,7 @@ public class DirectoryNode throw new IOException("Entry '" + document.getName() + "' is not a DocumentEntry"); } - + DocumentEntry entry = (DocumentEntry)document; return new DocumentInputStream(entry); } @@ -217,7 +217,7 @@ public class DirectoryNode (( DirectoryProperty ) getProperty()).addChild(property); _ofilesystem.addDocument(document); - + _entries.add(rval); _byname.put(property.getName(), rval); return rval; @@ -240,7 +240,7 @@ public class DirectoryNode (( DirectoryProperty ) getProperty()).addChild(property); _nfilesystem.addDocument(document); - + _entries.add(rval); _byname.put(property.getName(), rval); return rval; @@ -290,7 +290,7 @@ public class DirectoryNode { _entries.remove(entry); _byname.remove(entry.getName()); - + if(_ofilesystem != null) { _ofilesystem.remove(entry); } else { @@ -342,6 +342,11 @@ public class DirectoryNode return _entries.size(); } + public boolean hasEntry( String name ) + { + return name != null && _byname.containsKey( name ); + } + /** * get a specified Entry 
by name * @@ -430,7 +435,7 @@ public class DirectoryNode { DirectoryNode rval; DirectoryProperty property = new DirectoryProperty(name); - + if(_ofilesystem != null) { rval = new DirectoryNode(property, _ofilesystem, this); _ofilesystem.addDirectory(property); @@ -562,7 +567,7 @@ public class DirectoryNode * Returns an Iterator over all the entries */ public Iterator iterator() { - return getEntries(); + return getEntries(); } /* ********** END begin implementation of POIFSViewable ********** */ diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 57b9aa914f..685d92c845 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -66,48 +66,48 @@ import org.apache.xmlbeans.XmlException; public class ExtractorFactory { public static final String CORE_DOCUMENT_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; - - + + /** Should this thread prefer event based over usermodel based extractors? */ private static final ThreadLocal threadPreferEventExtractors = new ThreadLocal() { protected Boolean initialValue() { return Boolean.FALSE; } }; /** Should all threads prefer event based over usermodel based extractors? */ private static Boolean allPreferEventExtractors; - - /** + + /** * Should this thread prefer event based over usermodel based extractors? - * (usermodel extractors tend to be more accurate, but use more memory) - * Default is false. + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is false. */ public static boolean getThreadPrefersEventExtractors() { return threadPreferEventExtractors.get(); } - /** - * Should all threads prefer event based over usermodel based extractors? 
- * (usermodel extractors tend to be more accurate, but use more memory) - * Default is to use the thread level setting, which defaults to false. + /** + * Should all threads prefer event based over usermodel based extractors? + * (usermodel extractors tend to be more accurate, but use more memory) + * Default is to use the thread level setting, which defaults to false. */ public static Boolean getAllThreadsPreferEventExtractors() { return allPreferEventExtractors; } - - /** + + /** * Should this thread prefer event based over usermodel based extractors? - * Will only be used if the All Threads setting is null. + * Will only be used if the All Threads setting is null. */ public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { threadPreferEventExtractors.set(preferEventExtractors); } - /** + /** * Should all threads prefer event based over usermodel based extractors? - * If set, will take preference over the Thread level setting. + * If set, will take preference over the Thread level setting. */ public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { allPreferEventExtractors = preferEventExtractors; } - - + + /** * Should this thread use event based extractors is available? * Checks the all-threads one first, then thread specific. @@ -118,8 +118,8 @@ public class ExtractorFactory { } return threadPreferEventExtractors.get(); } - - + + public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { InputStream inp = null; try { @@ -137,14 +137,14 @@ public class ExtractorFactory { if(inp != null) inp.close(); } } - + public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { // Figure out the kind of stream // If clearly doesn't do mark/reset, wrap up if(! 
inp.markSupported()) { inp = new PushbackInputStream(inp, 8); } - + if(POIFSFileSystem.hasPOIFSHeader(inp)) { return createExtractor(new POIFSFileSystem(inp)); } @@ -153,16 +153,16 @@ public class ExtractorFactory { } throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); } - + public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { - PackageRelationshipCollection core = + PackageRelationshipCollection core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL); if(core.size() != 1) { throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); } PackagePart corePart = pkg.getPart(core.getRelationship(0)); - + // Is it XSSF? for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { if(corePart.getContentType().equals(rel.getContentType())) { @@ -173,84 +173,98 @@ public class ExtractorFactory { } } } - + // Is it XWPF? for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { if(corePart.getContentType().equals(rel.getContentType())) { return new XWPFWordExtractor(pkg); } } - + // Is it XSLF? 
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { if(corePart.getContentType().equals(rel.getContentType())) { return new XSLFPowerPointExtractor(pkg); } } - + throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")"); } - + public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { // Only ever an OLE2 one from the root of the FS - return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs); + return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); } - public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { - // Look for certain entries in the stream, to figure it - // out from - for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) { - Entry entry = entries.next(); - - if(entry.getName().equals("Workbook")) { - if(getPreferEventExtractor()) { - return new EventBasedExcelExtractor(poifsDir, fs); - } else { - return new ExcelExtractor(poifsDir, fs); - } - } - if(entry.getName().equals("WordDocument")) { - // Old or new style word document? 
- try { - return new WordExtractor(poifsDir, fs); - } catch(OldWordFileFormatException e) { - return new Word6Extractor(poifsDir, fs); - } - } - if(entry.getName().equals("PowerPoint Document")) { - return new PowerPointExtractor(poifsDir, fs); - } - if(entry.getName().equals("VisioDocument")) { - return new VisioTextExtractor(poifsDir, fs); - } - if(entry.getName().equals("Quill")) { - return new PublisherTextExtractor(poifsDir, fs); - } - if( - entry.getName().equals("__substg1.0_1000001E") || - entry.getName().equals("__substg1.0_1000001F") || - entry.getName().equals("__substg1.0_0047001E") || - entry.getName().equals("__substg1.0_0047001F") || - entry.getName().equals("__substg1.0_0037001E") || - entry.getName().equals("__substg1.0_0037001F") - ) { - return new OutlookTextExtactor(poifsDir, fs); - } - if(entry.getName().equals("Package")) { - OPCPackage pkg = OPCPackage.open( - poifsDir.createDocumentInputStream(entry.getName()) - ); - return createExtractor(pkg); - } - } - throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); - } - - + + /** + * @deprecated Use {@link #createExtractor(DirectoryNode)} instead + */ + @Deprecated + @SuppressWarnings("unused") + public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) + throws IOException, InvalidFormatException, OpenXML4JException, XmlException + { + return createExtractor(poifsDir); + } + + public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, + InvalidFormatException, OpenXML4JException, XmlException + { + // Look for certain entries in the stream, to figure it + // out from + if (poifsDir.hasEntry("Workbook")) { + if (getPreferEventExtractor()) { + return new EventBasedExcelExtractor(poifsDir); + } + return new ExcelExtractor(poifsDir); + } + + if (poifsDir.hasEntry("WordDocument")) { + // Old or new style word document? 
+ try { + return new WordExtractor(poifsDir); + } catch (OldWordFileFormatException e) { + return new Word6Extractor(poifsDir); + } + } + + if (poifsDir.hasEntry("PowerPoint Document")) { + return new PowerPointExtractor(poifsDir); + } + + if (poifsDir.hasEntry("VisioDocument")) { + return new VisioTextExtractor(poifsDir); + } + + if (poifsDir.hasEntry("Quill")) { + return new PublisherTextExtractor(poifsDir); + } + + if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F") + || poifsDir.hasEntry("__substg1.0_0047001E") + || poifsDir.hasEntry("__substg1.0_0047001F") + || poifsDir.hasEntry("__substg1.0_0037001E") + || poifsDir.hasEntry("__substg1.0_0037001F")) + { + return new OutlookTextExtactor(poifsDir); + } + + for (Iterator entries = poifsDir.getEntries(); entries.hasNext();) { + Entry entry = entries.next(); + + if (entry.getName().equals("Package")) { + OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package")); + return createExtractor(pkg); + } + } + throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); + } + /** * Returns an array of text extractors, one for each of * the embeded documents in the file (if there are any). * If there are no embeded documents, you'll get back an - * empty array. Otherwise, you'll get one open + * empty array. Otherwise, you'll get one open * {@link POITextExtractor} for each embeded file. 
*/ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { @@ -258,16 +272,16 @@ public class ExtractorFactory { ArrayList dirs = new ArrayList(); // For anything else not directly held in as a POIFS directory ArrayList nonPOIFS = new ArrayList(); - + // Find all the embeded directories - POIFSFileSystem fs = ext.getFileSystem(); - if(fs == null) { + DirectoryEntry root = ext.getRoot(); + if(root == null) { throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); } - + if(ext instanceof ExcelExtractor) { // These are in MBD... under the root - Iterator it = fs.getRoot().getEntries(); + Iterator it = root.getEntries(); while(it.hasNext()) { Entry entry = it.next(); if(entry.getName().startsWith("MBD")) { @@ -278,7 +292,7 @@ public class ExtractorFactory { // These are in ObjectPool -> _... under the root try { DirectoryEntry op = (DirectoryEntry) - fs.getRoot().getEntry("ObjectPool"); + root.getEntry("ObjectPool"); Iterator it = op.getEntries(); while(it.hasNext()) { Entry entry = it.next(); @@ -302,7 +316,7 @@ public class ExtractorFactory { } } } - + // Create the extractors if( (dirs == null || dirs.size() == 0) && @@ -310,11 +324,11 @@ public class ExtractorFactory { ){ return new POITextExtractor[0]; } - + ArrayList e = new ArrayList(); for(int i=0; i + { + final int end; + final int start; + final Object structure; + + Structure( Bookmark bookmark ) + { + this.start = bookmark.getStart(); + this.end = bookmark.getEnd(); + this.structure = bookmark; + } + + Structure( Field field ) + { + this.start = field.getFieldStartOffset(); + this.end = field.getFieldEndOffset(); + this.structure = field; + } + + public int compareTo( Structure o ) + { + return start < o.start ? -1 : start == o.start ? 
0 : 1; + } + } + private static final byte BEL_MARK = 7; private static final byte FIELD_BEGIN_MARK = 19; @@ -396,6 +423,13 @@ public abstract class AbstractWordConverter processDrawnObject( doc, characterRun, block ); continue; } + if ( characterRun.isOle2() + && ( wordDocument instanceof HWPFDocument ) ) + { + HWPFDocument doc = (HWPFDocument) wordDocument; + processOle2( doc, characterRun, block ); + continue; + } } if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) @@ -613,10 +647,11 @@ public abstract class AbstractWordConverter CharacterRun characterRun, OfficeDrawing officeDrawing, String path, Element block ); - protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument, - int noteIndex, Element block, Range endnoteTextRange ); + protected abstract void processEndnoteAutonumbered( + HWPFDocument wordDocument, int noteIndex, Element block, + Range endnoteTextRange ); - protected void processField( HWPFDocument hwpfDocument, Range parentRange, + protected void processField( HWPFDocument wordDocument, Range parentRange, int currentTableLevel, Field field, Element currentBlock ) { switch ( field.getType() ) @@ -633,7 +668,7 @@ public abstract class AbstractWordConverter if ( matcher.find() ) { String pageref = matcher.group( 1 ); - processPageref( hwpfDocument, currentBlock, + processPageref( wordDocument, currentBlock, field.secondSubrange( parentRange ), currentTableLevel, pageref ); return; @@ -641,6 +676,36 @@ public abstract class AbstractWordConverter } break; } + case 58: // Embedded Object + { + if ( !field.hasSeparator() ) + { + logger.log( POILogger.WARN, parentRange + " contains " + field + + " with 'Embedded Object' but without separator mark" ); + return; + } + + CharacterRun separator = field + .getMarkSeparatorCharacterRun( parentRange ); + + if ( separator.isOle2() ) + { + // the only supported so far + boolean processed = processOle2( wordDocument, separator, + currentBlock ); + + // if we didn't output OLE - output field value + 
if ( !processed ) + { + processCharacters( wordDocument, currentTableLevel, + field.secondSubrange( parentRange ), currentBlock ); + } + + return; + } + + break; + } case 88: // hyperlink { final Range firstSubrange = field.firstSubrange( parentRange ); @@ -653,7 +718,7 @@ public abstract class AbstractWordConverter if ( matcher.find() ) { String hyperlink = matcher.group( 1 ); - processHyperlink( hwpfDocument, currentBlock, + processHyperlink( wordDocument, currentBlock, field.secondSubrange( parentRange ), currentTableLevel, hyperlink ); return; @@ -665,12 +730,13 @@ public abstract class AbstractWordConverter logger.log( POILogger.WARN, parentRange + " contains " + field + " with unsupported type or format" ); - processCharacters( hwpfDocument, currentTableLevel, + processCharacters( wordDocument, currentTableLevel, field.secondSubrange( parentRange ), currentBlock ); } - protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument, - int noteIndex, Element block, Range footnoteTextRange ); + protected abstract void processFootnoteAutonumbered( + HWPFDocument wordDocument, int noteIndex, Element block, + Range footnoteTextRange ); protected abstract void processHyperlink( HWPFDocumentCore wordDocument, Element currentBlock, Range textRange, int currentTableLevel, @@ -732,6 +798,40 @@ public abstract class AbstractWordConverter } } + private boolean processOle2( HWPFDocument doc, CharacterRun characterRun, + Element block ) + { + Entry entry = doc.getObjectsPool().getObjectById( + "_" + characterRun.getPicOffset() ); + if ( entry == null ) + { + logger.log( POILogger.WARN, "Referenced OLE2 object '", + Integer.valueOf( characterRun.getPicOffset() ), + "' not found in ObjectPool" ); + return false; + } + + try + { + return processOle2( doc, block, entry ); + } + catch ( Exception exc ) + { + logger.log( POILogger.WARN, + "Unable to convert internal OLE2 object '", + Integer.valueOf( characterRun.getPicOffset() ), "': ", exc, + exc ); + return 
false; + } + } + + @SuppressWarnings( "unused" ) + protected boolean processOle2( HWPFDocument wordDocument, Element block, + Entry entry ) throws Exception + { + return false; + } + protected abstract void processPageref( HWPFDocumentCore wordDocument, Element currentBlock, Range textRange, int currentTableLevel, String pageref ); @@ -896,30 +996,4 @@ public abstract class AbstractWordConverter return endMark; } - private static final class Structure implements Comparable - { - final int end; - final int start; - final Object structure; - - Structure( Bookmark bookmark ) - { - this.start = bookmark.getStart(); - this.end = bookmark.getEnd(); - this.structure = bookmark; - } - - Structure( Field field ) - { - this.start = field.getFieldStartOffset(); - this.end = field.getFieldEndOffset(); - this.structure = field; - } - - public int compareTo( Structure o ) - { - return start < o.start ? -1 : start == o.start ? 0 : 1; - } - } - } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java index e7b85b0b74..eeb5d7843f 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java @@ -34,6 +34,7 @@ import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Table; import org.apache.poi.hwpf.usermodel.TableCell; import org.apache.poi.hwpf.usermodel.TableRow; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.Beta; import org.apache.poi.util.IOUtils; @@ -422,6 +423,19 @@ public class AbstractWordUtils return !isEmpty( str ); } + public static HWPFDocumentCore loadDoc( final DirectoryNode root ) + throws IOException + { + try + { + return new HWPFDocument( root ); + } + catch ( OldWordFileFormatException exc ) + { + return new HWPFOldDocument( root ); + } + } + 
public static HWPFDocumentCore loadDoc( File docFile ) throws IOException { final FileInputStream istream = new FileInputStream( docFile ); @@ -438,16 +452,13 @@ public class AbstractWordUtils public static HWPFDocumentCore loadDoc( InputStream inputStream ) throws IOException { - final POIFSFileSystem poifsFileSystem = HWPFDocumentCore - .verifyAndBuildPOIFS( inputStream ); - try - { - return new HWPFDocument( poifsFileSystem ); - } - catch ( OldWordFileFormatException exc ) - { - return new HWPFOldDocument( poifsFileSystem ); - } + return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) ); + } + + public static HWPFDocumentCore loadDoc( + final POIFSFileSystem poifsFileSystem ) throws IOException + { + return loadDoc( poifsFileSystem.getRoot() ); } static String substringBeforeLast( String str, String separator ) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java index dd9cfe8824..7e3e4dfe7d 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java @@ -276,8 +276,8 @@ public class WordToFoConverter extends AbstractWordConverter } @Override - protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex, - Element block, Range endnoteTextRange ) + protected void processEndnoteAutonumbered( HWPFDocument wordDocument, + int noteIndex, Element block, Range endnoteTextRange ) { final String textIndex = String.valueOf( internalLinkCounter .incrementAndGet() ); @@ -297,7 +297,8 @@ public class WordToFoConverter extends AbstractWordConverter setId( backwardLink, forwardLinkName ); endnote.appendChild( backwardLink ); - processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote ); + processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, + endnote ); WordToFoUtils.compactInlines( endnote ); 
this.endnotes.add( endnote ); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java index 040d32a879..e6233d2c3e 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java @@ -63,7 +63,6 @@ import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH; @Beta public class WordToHtmlConverter extends AbstractWordConverter { - /** * Holds properties values, applied to current p element. Those * properties shall not be doubled in children span elements. @@ -282,10 +281,11 @@ public class WordToHtmlConverter extends AbstractWordConverter } @Override - protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex, - Element block, Range endnoteTextRange ) + protected void processEndnoteAutonumbered( HWPFDocument wordDocument, + int noteIndex, Element block, Range endnoteTextRange ) { - processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange ); + processNoteAutonumbered( wordDocument, "end", noteIndex, block, + endnoteTextRange ); } @Override diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java index edea5dab6c..fa9b87fcd7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java @@ -2,10 +2,14 @@ package org.apache.poi.hwpf.converter; import java.io.File; import java.io.FileWriter; +import java.io.StringWriter; +import java.lang.reflect.Method; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; import 
javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; @@ -25,6 +29,8 @@ import org.apache.poi.hwpf.usermodel.Section; import org.apache.poi.hwpf.usermodel.Table; import org.apache.poi.hwpf.usermodel.TableCell; import org.apache.poi.hwpf.usermodel.TableRow; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.util.Beta; import org.w3c.dom.Document; import org.w3c.dom.Element; @@ -33,6 +39,29 @@ import org.w3c.dom.Element; public class WordToTextConverter extends AbstractWordConverter { + public static String getText( DirectoryNode root ) throws Exception + { + final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root ); + return getText( wordDocument ); + } + + public static String getText( File docFile ) throws Exception + { + final HWPFDocumentCore wordDocument = AbstractWordUtils + .loadDoc( docFile ); + return getText( wordDocument ); + } + + public static String getText( final HWPFDocumentCore wordDocument ) + throws Exception + { + WordToTextConverter wordToTextConverter = new WordToTextConverter( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToTextConverter.processDocument( wordDocument ); + return wordToTextConverter.getText(); + } + /** * Java main() interface to interact with {@link WordToTextConverter} * @@ -91,8 +120,24 @@ public class WordToTextConverter extends AbstractWordConverter private Element notes = null; + private boolean outputSummaryInformation = false; + private final TextDocumentFacade textDocumentFacade; + /** + * Creates new instance of {@link WordToTextConverter}. Can be used for + * output several {@link HWPFDocument}s into single text document. 
+ * + * @throws ParserConfigurationException + * if an internal {@link DocumentBuilder} cannot be created + */ + public WordToTextConverter() throws ParserConfigurationException + { + this.textDocumentFacade = new TextDocumentFacade( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + } + /** * Creates new instance of {@link WordToTextConverter}. Can be used for * output several {@link HWPFDocument}s into single text document. @@ -110,6 +155,28 @@ public class WordToTextConverter extends AbstractWordConverter return textDocumentFacade.getDocument(); } + public String getText() throws Exception + { + StringWriter stringWriter = new StringWriter(); + DOMSource domSource = new DOMSource( getDocument() ); + StreamResult streamResult = new StreamResult( stringWriter ); + + TransformerFactory tf = TransformerFactory.newInstance(); + Transformer serializer = tf.newTransformer(); + // TODO set encoding from a command argument + serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" ); + serializer.setOutputProperty( OutputKeys.INDENT, "no" ); + serializer.setOutputProperty( OutputKeys.METHOD, "text" ); + serializer.transform( domSource, streamResult ); + + return stringWriter.toString(); + } + + public boolean isOutputSummaryInformation() + { + return outputSummaryInformation; + } + @Override protected void outputCharacters( Element block, CharacterRun characterRun, String text ) @@ -138,18 +205,24 @@ public class WordToTextConverter extends AbstractWordConverter protected void processDocumentInformation( SummaryInformation summaryInformation ) { - if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) ) - textDocumentFacade.setTitle( summaryInformation.getTitle() ); + if ( isOutputSummaryInformation() ) + { + if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) ) + textDocumentFacade.setTitle( summaryInformation.getTitle() ); - if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) ) - 
textDocumentFacade.addAuthor( summaryInformation.getAuthor() ); + if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) ) + textDocumentFacade.addAuthor( summaryInformation.getAuthor() ); - if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) ) - textDocumentFacade - .addDescription( summaryInformation.getComments() ); + if ( AbstractWordUtils + .isNotEmpty( summaryInformation.getComments() ) ) + textDocumentFacade.addDescription( summaryInformation + .getComments() ); - if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) ) - textDocumentFacade.addKeywords( summaryInformation.getKeywords() ); + if ( AbstractWordUtils + .isNotEmpty( summaryInformation.getKeywords() ) ) + textDocumentFacade.addKeywords( summaryInformation + .getKeywords() ); + } } @Override @@ -222,6 +295,48 @@ public class WordToTextConverter extends AbstractWordConverter note.appendChild( textDocumentFacade.createText( "\n" ) ); } + @Override + protected boolean processOle2( HWPFDocument wordDocument, Element block, + Entry entry ) throws Exception + { + if ( !( entry instanceof DirectoryNode ) ) + return false; + DirectoryNode directoryNode = (DirectoryNode) entry; + + // even if no ExtractorFactory in classpath + if ( directoryNode.hasEntry( "WordDocument" ) ) + { + String text = WordToTextConverter.getText( (DirectoryNode) entry ); + block.appendChild( textDocumentFacade + .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text + + UNICODECHAR_ZERO_WIDTH_SPACE ) ); + return true; + } + + try + { + Class cls = Class + .forName( "org.apache.poi.extractor.ExtractorFactory" ); + Method createExtractor = cls.getMethod( "createExtractor", + DirectoryNode.class ); + Object extractor = createExtractor.invoke( null, directoryNode ); + + Method getText = extractor.getClass().getMethod( "getText" ); + String text = (String) getText.invoke( extractor ); + + block.appendChild( textDocumentFacade + .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text + + 
UNICODECHAR_ZERO_WIDTH_SPACE ) ); + return true; + } + catch ( ClassNotFoundException exc ) + { + // no extractor in classpath + } + + return false; + } + @Override protected void processPageref( HWPFDocumentCore wordDocument, Element currentBlock, Range textRange, int currentTableLevel, @@ -254,7 +369,7 @@ public class WordToTextConverter extends AbstractWordConverter textDocumentFacade.body.appendChild( sectionElement ); } - protected void processTable( HWPFDocumentCore hwpfDocument, Element flow, + protected void processTable( HWPFDocumentCore wordDocument, Element flow, Table table ) { final int tableRows = table.numRows(); @@ -275,8 +390,8 @@ public class WordToTextConverter extends AbstractWordConverter tableCellElement.appendChild( textDocumentFacade .createText( "\t" ) ); - processParagraphes( hwpfDocument, tableCellElement, tableCell, - table.getTableLevel() ); + processCharacters( wordDocument, table.getTableLevel(), + tableCell, tableCellElement ); tableRowElement.appendChild( tableCellElement ); } @@ -285,4 +400,9 @@ public class WordToTextConverter extends AbstractWordConverter } } + public void setOutputSummaryInformation( boolean outputDocumentInformation ) + { + this.outputSummaryInformation = outputDocumentInformation; + } + } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java index 123f501781..b5331e8be1 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java @@ -19,6 +19,10 @@ package org.apache.poi.hwpf.extractor; import java.io.IOException; import java.io.InputStream; +import java.io.StringWriter; + +import org.apache.poi.hwpf.converter.WordToTextConverter; +import org.apache.poi.hwpf.usermodel.HeaderStories; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hwpf.HWPFOldDocument; @@ -47,16 +51,32 @@ public final class 
Word6Extractor extends POIOLE2TextExtractor { this( new POIFSFileSystem(is) ); } - /** - * Create a new Word Extractor - * @param fs POIFSFileSystem containing the word file - */ - public Word6Extractor(POIFSFileSystem fs) throws IOException { - this(fs.getRoot(), fs); - } - public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { - this(new HWPFOldDocument(dir,fs)); - } + /** + * Create a new Word Extractor + * + * @param fs + * POIFSFileSystem containing the word file + */ + public Word6Extractor( POIFSFileSystem fs ) throws IOException + { + this( fs.getRoot() ); + } + + /** + * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead + */ + @Deprecated + @SuppressWarnings( "unused" ) + public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) + throws IOException + { + this( dir ); + } + + public Word6Extractor( DirectoryNode dir ) throws IOException + { + this( new HWPFOldDocument( dir ) ); + } /** * Create a new Word Extractor @@ -71,6 +91,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor { * Get the text from the word file, as an array with one String * per paragraph */ + @Deprecated public String[] getParagraphText() { String[] ret; @@ -95,13 +116,25 @@ public final class Word6Extractor extends POIOLE2TextExtractor { return ret; } - public String getText() { - StringBuffer text = new StringBuffer(); - - for(String t : getParagraphText()) { - text.append(t); + public String getText() + { + try + { + WordToTextConverter wordToTextConverter = new WordToTextConverter(); + wordToTextConverter.processDocument( doc ); + return wordToTextConverter.getText(); } + catch ( Exception exc ) + { + // fall-back + StringBuffer text = new StringBuffer(); - return text.toString(); + for ( String t : getParagraphText() ) + { + text.append( t ); + } + + return text.toString(); + } } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java 
b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java index 464b11a406..8438df4f04 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java @@ -20,9 +20,12 @@ package org.apache.poi.hwpf.extractor; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.StringWriter; import java.util.ArrayList; import java.util.Arrays; +import org.apache.poi.hwpf.converter.WordToTextConverter; + import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.usermodel.HeaderStories; @@ -33,231 +36,300 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * Class to extract the text from a Word Document. - * - * You should use either getParagraphText() or getText() unless - * you have a strong reason otherwise. - * + * + * You should use either getParagraphText() or getText() unless you have a + * strong reason otherwise. 
+ * * @author Nick Burch */ -public final class WordExtractor extends POIOLE2TextExtractor { - private POIFSFileSystem fs; - private HWPFDocument doc; - - /** - * Create a new Word Extractor - * @param is InputStream containing the word file - */ - public WordExtractor(InputStream is) throws IOException { - this( HWPFDocument.verifyAndBuildPOIFS(is) ); - } - - /** - * Create a new Word Extractor - * @param fs POIFSFileSystem containing the word file - */ - public WordExtractor(POIFSFileSystem fs) throws IOException { - this(new HWPFDocument(fs)); - this.fs = fs; - } - public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException { - this(new HWPFDocument(dir, fs)); - this.fs = fs; - } - - /** - * Create a new Word Extractor - * @param doc The HWPFDocument to extract from - */ - public WordExtractor(HWPFDocument doc) { - super(doc); - this.doc = doc; - } - - /** - * Command line extractor, so people will stop moaning that - * they can't just run this. - */ - public static void main(String[] args) throws IOException { - if(args.length == 0) { - System.err.println("Use:"); - System.err.println(" java org.apache.poi.hwpf.extractor.WordExtractor "); - System.exit(1); - } - - // Process the first argument as a file - FileInputStream fin = new FileInputStream(args[0]); - WordExtractor extractor = new WordExtractor(fin); - System.out.println(extractor.getText()); - } - - /** - * Get the text from the word file, as an array with one String - * per paragraph - */ - public String[] getParagraphText() { - String[] ret; - - // Extract using the model code - try { - Range r = doc.getRange(); - - ret = getParagraphText(r); - } catch (Exception e) { - // Something's up with turning the text pieces into paragraphs - // Fall back to ripping out the text pieces - ret = new String[1]; - ret[0] = getTextFromPieces(); - } +public final class WordExtractor extends POIOLE2TextExtractor +{ + private HWPFDocument doc; + + /** + * Create a new Word Extractor + * + * @param 
is + * InputStream containing the word file + */ + public WordExtractor( InputStream is ) throws IOException + { + this( HWPFDocument.verifyAndBuildPOIFS( is ) ); + } + + /** + * Create a new Word Extractor + * + * @param fs + * POIFSFileSystem containing the word file + */ + public WordExtractor( POIFSFileSystem fs ) throws IOException + { + this( new HWPFDocument( fs ) ); + } + + /** + * @deprecated Use {@link #WordExtractor(DirectoryNode)} instead + */ + @Deprecated + public WordExtractor( DirectoryNode dir, POIFSFileSystem fs ) + throws IOException + { + this( dir ); + } + + public WordExtractor( DirectoryNode dir ) throws IOException + { + this( new HWPFDocument( dir ) ); + } + + /** + * Create a new Word Extractor + * + * @param doc + * The HWPFDocument to extract from + */ + public WordExtractor( HWPFDocument doc ) + { + super( doc ); + this.doc = doc; + } + + /** + * Command line extractor, so people will stop moaning that they can't just + * run this. + */ + public static void main( String[] args ) throws IOException + { + if ( args.length == 0 ) + { + System.err.println( "Use:" ); + System.err + .println( " java org.apache.poi.hwpf.extractor.WordExtractor " ); + System.exit( 1 ); + } - return ret; + // Process the first argument as a file + FileInputStream fin = new FileInputStream( args[0] ); + WordExtractor extractor = new WordExtractor( fin ); + System.out.println( extractor.getText() ); + } + + /** + * Get the text from the word file, as an array with one String per + * paragraph + */ + public String[] getParagraphText() + { + String[] ret; + + // Extract using the model code + try + { + Range r = doc.getRange(); + + ret = getParagraphText( r ); + } + catch ( Exception e ) + { + // Something's up with turning the text pieces into paragraphs + // Fall back to ripping out the text pieces + ret = new String[1]; + ret[0] = getTextFromPieces(); } - public String[] getFootnoteText() { - Range r = doc.getFootnoteRange(); + return ret; + } - return 
getParagraphText(r); - } + public String[] getFootnoteText() + { + Range r = doc.getFootnoteRange(); - public String[] getMainTextboxText() { - Range r = doc.getMainTextboxRange(); + return getParagraphText( r ); + } - return getParagraphText(r); - } + public String[] getMainTextboxText() + { + Range r = doc.getMainTextboxRange(); + + return getParagraphText( r ); + } + + public String[] getEndnoteText() + { + Range r = doc.getEndnoteRange(); + + return getParagraphText( r ); + } + + public String[] getCommentsText() + { + Range r = doc.getCommentsRange(); + + return getParagraphText( r ); + } - public String[] getEndnoteText() { - Range r = doc.getEndnoteRange(); + protected static String[] getParagraphText( Range r ) + { + String[] ret; + ret = new String[r.numParagraphs()]; + for ( int i = 0; i < ret.length; i++ ) + { + Paragraph p = r.getParagraph( i ); + ret[i] = p.text(); - return getParagraphText(r); + // Fix the line ending + if ( ret[i].endsWith( "\r" ) ) + { + ret[i] = ret[i] + "\n"; + } + } + return ret; + } + + /** + * Add the header/footer text, if it's not empty + */ + private void appendHeaderFooter( String text, StringBuffer out ) + { + if ( text == null || text.length() == 0 ) + return; + + text = text.replace( '\r', '\n' ); + if ( !text.endsWith( "\n" ) ) + { + out.append( text ); + out.append( '\n' ); + return; + } + if ( text.endsWith( "\n\n" ) ) + { + out.append( text.substring( 0, text.length() - 1 ) ); + return; + } + out.append( text ); + return; + } + + /** + * Grab the text from the headers + */ + @Deprecated + public String getHeaderText() + { + HeaderStories hs = new HeaderStories( doc ); + + StringBuffer ret = new StringBuffer(); + if ( hs.getFirstHeader() != null ) + { + appendHeaderFooter( hs.getFirstHeader(), ret ); + } + if ( hs.getEvenHeader() != null ) + { + appendHeaderFooter( hs.getEvenHeader(), ret ); + } + if ( hs.getOddHeader() != null ) + { + appendHeaderFooter( hs.getOddHeader(), ret ); } - public String[] getCommentsText() 
{ - Range r = doc.getCommentsRange(); + return ret.toString(); + } + + /** + * Grab the text from the footers + */ + @Deprecated + public String getFooterText() + { + HeaderStories hs = new HeaderStories( doc ); + + StringBuffer ret = new StringBuffer(); + if ( hs.getFirstFooter() != null ) + { + appendHeaderFooter( hs.getFirstFooter(), ret ); + } + if ( hs.getEvenFooter() != null ) + { + appendHeaderFooter( hs.getEvenFooter(), ret ); + } + if ( hs.getOddFooter() != null ) + { + appendHeaderFooter( hs.getOddFooter(), ret ); + } - return getParagraphText(r); + return ret.toString(); + } + + /** + * Grab the text out of the text pieces. Might also include various bits of + * crud, but will work in cases where the text piece -> paragraph mapping is + * broken. Fast too. + */ + public String getTextFromPieces() + { + String text = doc.getDocumentText(); + + // Fix line endings (Note - won't get all of them + text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" ); + text = text.replaceAll( "\r\r", "\r\n\r\n" ); + + if ( text.endsWith( "\r" ) ) + { + text += "\n"; } - protected static String[] getParagraphText(Range r) { - String[] ret; - ret = new String[r.numParagraphs()]; - for (int i = 0; i < ret.length; i++) { - Paragraph p = r.getParagraph(i); - ret[i] = p.text(); - - // Fix the line ending - if (ret[i].endsWith("\r")) { - ret[i] = ret[i] + "\n"; - } + return text; + } + + /** + * Grab the text, based on the WordToTextConverter. Shouldn't include any + * crud, but slower than getTextFromPieces(). 
+ */ + public String getText() + { + try + { + final StringWriter stringWriter = new StringWriter(); + @SuppressWarnings( "unused" ) + WordToTextConverter wordToTextConverter = new WordToTextConverter() + { + { + HeaderStories hs = new HeaderStories( doc ); + + if ( hs.getFirstHeaderSubrange() != null ) + processDocumentPart( doc, hs.getFirstHeaderSubrange() ); + if ( hs.getEvenHeaderSubrange() != null ) + processDocumentPart( doc, hs.getEvenHeaderSubrange() ); + if ( hs.getOddHeaderSubrange() != null ) + processDocumentPart( doc, hs.getOddHeaderSubrange() ); + + processDocument( doc ); + processDocumentPart( doc, doc.getMainTextboxRange() ); + + if ( hs.getFirstFooterSubrange() != null ) + processDocumentPart( doc, hs.getFirstFooterSubrange() ); + if ( hs.getEvenFooterSubrange() != null ) + processDocumentPart( doc, hs.getEvenFooterSubrange() ); + if ( hs.getOddFooterSubrange() != null ) + processDocumentPart( doc, hs.getOddFooterSubrange() ); + + stringWriter.append( getText() ); } - return ret; + }; + return stringWriter.toString(); } - - /** - * Add the header/footer text, if it's not empty - */ - private void appendHeaderFooter(String text, StringBuffer out) { - if(text == null || text.length() == 0) - return; - - text = text.replace('\r', '\n'); - if(! 
text.endsWith("\n")) { - out.append(text); - out.append('\n'); - return; - } - if(text.endsWith("\n\n")) { - out.append(text.substring(0, text.length()-1)); - return; - } - out.append(text); - return; - } - /** - * Grab the text from the headers - */ - public String getHeaderText() { - HeaderStories hs = new HeaderStories(doc); - - StringBuffer ret = new StringBuffer(); - if(hs.getFirstHeader() != null) { - appendHeaderFooter(hs.getFirstHeader(), ret); - } - if(hs.getEvenHeader() != null) { - appendHeaderFooter(hs.getEvenHeader(), ret); - } - if(hs.getOddHeader() != null) { - appendHeaderFooter(hs.getOddHeader(), ret); - } - - return ret.toString(); - } - /** - * Grab the text from the footers - */ - public String getFooterText() { - HeaderStories hs = new HeaderStories(doc); - - StringBuffer ret = new StringBuffer(); - if(hs.getFirstFooter() != null) { - appendHeaderFooter(hs.getFirstFooter(), ret); - } - if(hs.getEvenFooter() != null) { - appendHeaderFooter(hs.getEvenFooter(), ret); - } - if(hs.getOddFooter() != null) { - appendHeaderFooter(hs.getOddFooter(), ret); - } - - return ret.toString(); - } - - /** - * Grab the text out of the text pieces. Might also include various - * bits of crud, but will work in cases where the text piece -> paragraph - * mapping is broken. Fast too. - */ - public String getTextFromPieces() { - String text = doc.getDocumentText(); - - // Fix line endings (Note - won't get all of them - text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n"); - text = text.replaceAll("\r\r", "\r\n\r\n"); - - if(text.endsWith("\r")) { - text += "\n"; - } - - return text; - } - - /** - * Grab the text, based on the paragraphs. Shouldn't include any crud, - * but slightly slower than getTextFromPieces(). 
- */ - public String getText() { - StringBuffer ret = new StringBuffer(); - - ret.append(getHeaderText()); - - ArrayList text = new ArrayList(); - text.addAll(Arrays.asList(getParagraphText())); - text.addAll(Arrays.asList(getMainTextboxText())); - text.addAll(Arrays.asList(getFootnoteText())); - text.addAll(Arrays.asList(getEndnoteText())); - - for(String p : text) { - ret.append(p); - } - - ret.append(getFooterText()); - - return ret.toString(); - } - - /** - * Removes any fields (eg macros, page markers etc) - * from the string. - */ - public static String stripFields(String text) { - return Range.stripFields(text); - } + catch ( Exception exc ) + { + throw new RuntimeException( exc ); + } + } + + /** + * Removes any fields (eg macros, page markers etc) from the string. + */ + public static String stripFields( String text ) + { + return Range.stripFields( text ); + } } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java index 72b5220dde..f50abd5b4f 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java @@ -17,17 +17,23 @@ public interface Field */ int getFieldStartOffset(); + CharacterRun getMarkEndCharacterRun( Range parent ); + /** * @return character position of end field mark */ int getMarkEndOffset(); + CharacterRun getMarkSeparatorCharacterRun( Range parent ); + /** * @return character position of separator field mark (if present, * {@link NullPointerException} otherwise) */ int getMarkSeparatorOffset(); + CharacterRun getMarkStartCharacterRun( Range parent ); + /** * @return character position of start field mark */ diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java index 4832b23f96..37131c3e15 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java +++ 
b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java @@ -112,6 +112,12 @@ class FieldImpl implements Field return startPlex.getFcStart(); } + public CharacterRun getMarkEndCharacterRun( Range parent ) + { + return new Range( getMarkEndOffset(), getMarkEndOffset() + 1, parent ) + .getCharacterRun( 0 ); + } + /** * @return character position of end field mark */ @@ -120,6 +126,15 @@ class FieldImpl implements Field return endPlex.getFcStart(); } + public CharacterRun getMarkSeparatorCharacterRun( Range parent ) + { + if ( !hasSeparator() ) + return null; + + return new Range( getMarkSeparatorOffset(), + getMarkSeparatorOffset() + 1, parent ).getCharacterRun( 0 ); + } + /** * @return character position of separator field mark (if present, * {@link NullPointerException} otherwise) @@ -129,6 +144,12 @@ class FieldImpl implements Field return separatorPlex.getFcStart(); } + public CharacterRun getMarkStartCharacterRun( Range parent ) + { + return new Range( getMarkStartOffset(), getMarkStartOffset() + 1, + parent ).getCharacterRun( 0 ); + } + /** * @return character position of start field mark */ diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java index fed191c7d4..5b8c80501c 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java @@ -82,35 +82,96 @@ public final class HeaderStories { fib.getPlcfHddSize(), 0 ); } - public String getFootnoteSeparator() { - return getAt(0); - } - public String getFootnoteContSeparator() { - return getAt(1); - } - public String getFootnoteContNote() { - return getAt(2); - } - public String getEndnoteSeparator() { - return getAt(3); - } - public String getEndnoteContSeparator() { - return getAt(4); - } - public String getEndnoteContNote() { - return getAt(5); - } + @Deprecated + public String getFootnoteSeparator() + { + return 
getAt( 0 ); + } + + @Deprecated + public String getFootnoteContSeparator() + { + return getAt( 1 ); + } + + @Deprecated + public String getFootnoteContNote() + { + return getAt( 2 ); + } + + @Deprecated + public String getEndnoteSeparator() + { + return getAt( 3 ); + } + @Deprecated + public String getEndnoteContSeparator() + { + return getAt( 4 ); + } + + @Deprecated + public String getEndnoteContNote() + { + return getAt( 5 ); + } + + public Range getFootnoteSeparatorSubrange() + { + return getSubrangeAt( 0 ); + } + + public Range getFootnoteContSeparatorSubrange() + { + return getSubrangeAt( 1 ); + } + + public Range getFootnoteContNoteSubrange() + { + return getSubrangeAt( 2 ); + } + + public Range getEndnoteSeparatorSubrange() + { + return getSubrangeAt( 3 ); + } + + public Range getEndnoteContSeparatorSubrange() + { + return getSubrangeAt( 4 ); + } + public Range getEndnoteContNoteSubrange() + { + return getSubrangeAt( 5 ); + } + + @Deprecated public String getEvenHeader() { return getAt(6+0); } + @Deprecated public String getOddHeader() { return getAt(6+1); } + @Deprecated public String getFirstHeader() { return getAt(6+4); } + + + public Range getEvenHeaderSubrange() { + return getSubrangeAt(6+0); + } + public Range getOddHeaderSubrange() { + return getSubrangeAt(6+1); + } + public Range getFirstHeaderSubrange() { + return getSubrangeAt(6+4); + } + /** * Returns the correct, defined header for the given * one based page @@ -135,16 +196,39 @@ public final class HeaderStories { return getOddHeader(); } + @Deprecated + public String getEvenFooter() + { + return getAt( 6 + 2 ); + } + + @Deprecated + public String getOddFooter() + { + return getAt( 6 + 3 ); + } + + @Deprecated + public String getFirstFooter() + { + return getAt( 6 + 5 ); + } + + public Range getEvenFooterSubrange() + { + return getSubrangeAt( 6 + 2 ); + } + + public Range getOddFooterSubrange() + { + return getSubrangeAt( 6 + 3 ); + } + + public Range getFirstFooterSubrange() + { + return 
getSubrangeAt( 6 + 5 ); + } - public String getEvenFooter() { - return getAt(6+2); - } - public String getOddFooter() { - return getAt(6+3); - } - public String getFirstFooter() { - return getAt(6+5); - } /** * Returns the correct, defined footer for the given * one based page @@ -174,6 +258,7 @@ public final class HeaderStories { * Get the string that's pointed to by the * given plcfHdd index */ + @Deprecated private String getAt(int plcfHddIndex) { if(plcfHdd == null) return null; @@ -209,6 +294,32 @@ public final class HeaderStories { return text; } + private Range getSubrangeAt( int plcfHddIndex ) + { + if ( plcfHdd == null ) + return null; + + GenericPropertyNode prop = plcfHdd.getProperty( plcfHddIndex ); + if ( prop.getStart() == prop.getEnd() ) + { + // Empty story + return null; + } + if ( prop.getEnd() < prop.getStart() ) + { + // Broken properties? + return null; + } + + final int headersLength = headerStories.getEndOffset() + - headerStories.getStartOffset(); + int start = Math.min( prop.getStart(), headersLength ); + int end = Math.min( prop.getEnd(), headersLength ); + + return new Range( headerStories.getStartOffset() + start, + headerStories.getStartOffset() + end, headerStories ); + } + public Range getRange() { return headerStories; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java new file mode 100644 index 0000000000..2f71f39ecc --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java @@ -0,0 +1,34 @@ +package org.apache.poi.hwpf.usermodel; + +import java.io.FileNotFoundException; + +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.util.Internal; + +@Internal +public class ObjectPoolImpl implements ObjectsPool +{ + private DirectoryEntry _objectPool; + + public ObjectPoolImpl( DirectoryEntry _objectPool ) + { + super(); + 
this._objectPool = _objectPool; + } + + public Entry getObjectById( String objId ) + { + if ( _objectPool == null ) + return null; + + try + { + return _objectPool.getEntry( objId ); + } + catch ( FileNotFoundException exc ) + { + return null; + } + } +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java new file mode 100644 index 0000000000..fa91d37b7f --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java @@ -0,0 +1,8 @@ +package org.apache.poi.hwpf.usermodel; + +import org.apache.poi.poifs.filesystem.Entry; + +public interface ObjectsPool +{ + public Entry getObjectById( String objId ); +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java index 3f23eea0c2..c99c4eb28d 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java @@ -36,6 +36,8 @@ import org.apache.poi.hwpf.sprm.CharacterSprmCompressor; import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor; import org.apache.poi.hwpf.sprm.SprmBuffer; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * This class is the central class of the HWPF object model. 
All properties that @@ -52,6 +54,8 @@ import org.apache.poi.util.LittleEndian; */ public class Range { // TODO -instantiable superclass + private POILogger logger = POILogFactory.getLogger( Range.class ); + public static final int TYPE_PARAGRAPH = 0; public static final int TYPE_CHARACTER = 1; public static final int TYPE_SECTION = 2; @@ -888,9 +892,12 @@ public class Range { // TODO -instantiable superclass initAll(); if ( tableEndInclusive >= this._parEnd ) { - throw new ArrayIndexOutOfBoundsException( - "The table's bounds fall outside of this Range" ); + logger.log( POILogger.WARN, "The table's bounds ", "[" + + this._parStart + "; " + tableEndInclusive + ")", + " fall outside of this Range paragraphs numbers ", "[" + + this._parStart + "; " + this._parEnd + ")" ); } + if ( tableEndInclusive < 0 ) { throw new ArrayIndexOutOfBoundsException( diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java new file mode 100644 index 0000000000..ca1d809aec --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java @@ -0,0 +1,22 @@ +package org.apache.poi.hwpf.converter; + +import junit.framework.TestCase; +import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFTestDataSamples; + +public class TestWordToTextConverter extends TestCase +{ + + /** + * [FAILING] Bug 47731 - Word Extractor considers text copied from some + * website as an embedded object + */ + public void testBug47731() throws Exception + { + HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" ); + String foundText = WordToTextConverter.getText( doc ); + + assertTrue( foundText + .contains( "Soak the rice in water for three to four hours" ) ); + } +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java 
b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java index cbc7db1d14..48c20dfbe3 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java @@ -33,6 +33,16 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; * @author Nick Burch (nick at torchbox dot com) */ public final class TestWordExtractor extends TestCase { + /** Compares two strings after converting CRLF and lone CR to LF and trimming both. */ + public static void assertEquals( String expected, String actual ) + { + String newExpected = expected.replace( "\r\n", "\n" ) + .replace( '\r', '\n' ).trim(); + String newActual = actual.replace( "\r\n", "\n" ) + .replace( '\r', '\n' ).trim(); + TestCase.assertEquals( newExpected, newActual ); + } + private String[] p_text1 = new String[] { "This is a simple word document\r\n", "\r\n", @@ -107,12 +117,14 @@ public final class TestWordExtractor extends TestCase { public void testGetText() { assertEquals(p_text1_block, extractor.getText()); - // For the 2nd, should give similar answers for - // the two methods, differing only in line endings - assertEquals( - extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), - extractor2.getText().replaceAll("[\\r\\n]", "")); - } + // The 2nd document was previously checked for equal output from the + // two methods, ignoring line endings. That comparison is disabled: + + // the two results must differ, because of garbage + // assertEquals( + // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), + // extractor2.getText().replaceAll("[\\r\\n]", "")); + } /** * Test textPieces based extraction @@ -330,7 +342,7 @@ public final class TestWordExtractor extends TestCase { // Open directly for(DirectoryNode dir : files) { - WordExtractor extractor = new WordExtractor(dir, null); + WordExtractor extractor = new WordExtractor(dir); assertEquals(p_text1_block, extractor.getText()); } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java 
b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java index 0ecfbf4a1f..01810569a5 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java @@ -43,6 +43,15 @@ import org.apache.poi.util.IOUtils; public class TestBugs extends TestCase { + public static void assertEquals( String expected, String actual ) + { + String newExpected = expected.replaceAll( "\r\n", "\n" ) + .replaceAll( "\r", "\n" ).trim(); + String newActual = actual.replaceAll( "\r\n", "\n" ) + .replaceAll( "\r", "\n" ).trim(); + TestCase.assertEquals( newExpected, newActual ); + } + private static void assertTableStructures( Range expected, Range actual ) { assertEquals( expected.numParagraphs(), actual.numParagraphs() );