]> source.dussan.org Git - poi.git/commitdiff
Add Word-to-Text converter and use it as replacement for WordExtractor
authorSergey Vladimirov <sergey@apache.org>
Tue, 9 Aug 2011 12:38:52 +0000 (12:38 +0000)
committerSergey Vladimirov <sergey@apache.org>
Tue, 9 Aug 2011 12:38:52 +0000 (12:38 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155336 13f79535-47bb-0310-9956-ffa450edef68

26 files changed:
src/documentation/content/xdocs/status.xml
src/java/org/apache/poi/POIOLE2TextExtractor.java
src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java [new file with mode: 0644]
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java

index 6c77dd532c1d91c4d89e68cde1aa4bc2f8ae2334..aca3037dc9ac9ed9c97d18f7a497f727b84323c1 100644 (file)
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.8-beta4" date="2011-??-??">
+           <action dev="poi-developers" type="add">Add Word-to-Text converter and use it as replacement for WordExtractor</action>
            <action dev="poi-developers" type="fix">51604 - replace text fails for doc ( poi 3.8 beta release from download site )</action>
            <action dev="poi-developers" type="fix">Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF</action>
            <action dev="poi-developers" type="add">Support for conditional formatting in XSSF</action>
index 9ec09fe47a19c25415f848de038cb241488f9b46..00930c3fddde88d0019bdde9d4c82362e78f2dff 100644 (file)
@@ -19,6 +19,7 @@ package org.apache.poi;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -39,7 +40,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
        public POIOLE2TextExtractor(POIDocument document) {
                super(document);
        }
-       
+
        /**
         * Returns the document information metadata for the document
         */
@@ -52,20 +53,28 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
        public SummaryInformation getSummaryInformation() {
                return document.getSummaryInformation();
        }
-       
+
        /**
-        * Returns an HPSF powered text extractor for the 
+        * Returns an HPSF powered text extractor for the
         *  document properties metadata, such as title and author.
         */
        public POITextExtractor getMetadataTextExtractor() {
                return new HPSFPropertiesExtractor(this);
        }
 
-       /**
-        * Return the underlying POIFS FileSystem of
-        *  this document.
-        */
-       public POIFSFileSystem getFileSystem() {
-               return document.directory.getFileSystem();
-       }
+    public DirectoryEntry getRoot()
+    {
+        return document.directory;
+    }
+
+    /**
+     * Return the underlying POIFS FileSystem of this document.
+     *
+     * @deprecated Use {@link #getRoot()} instead
+     */
+    @Deprecated
+    public POIFSFileSystem getFileSystem()
+    {
+        return document.directory.getFileSystem();
+    }
 }
index 8a7643b0132637cfb25d7c7fa0360f8d5145eec6..a4ff616125526aed8cfd58b533c98afcc9428d15 100644 (file)
@@ -61,17 +61,27 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  */
 public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
    private DirectoryNode _dir;
-       private POIFSFileSystem _fs;
        boolean _includeSheetNames = true;
        boolean _formulasNotResults = false;
 
-       public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
-               super(null);
-               _dir = dir;
-               _fs = fs;
-       }
+    /**
+     * @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public EventBasedExcelExtractor( DirectoryNode dir, POIFSFileSystem fs )
+    {
+        this( dir );
+    }
+
+    public EventBasedExcelExtractor( DirectoryNode dir )
+    {
+        super( null );
+        _dir = dir;
+    }
+
    public EventBasedExcelExtractor(POIFSFileSystem fs) {
-      this(fs.getRoot(), fs);
+      this(fs.getRoot());
    }
 
    /**
@@ -79,9 +89,9 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
     *  this document.
     */
    public POIFSFileSystem getFileSystem() {
-      return _fs;
+      return _dir.getFileSystem();
    }
-   
+
        /**
         * Would return the document information metadata for the document,
         *  if we supported it
@@ -200,7 +210,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
                                                outputNextStringValue = true;
                                                nextRow = frec.getRow();
                                        } else {
-                                               thisText = _ft.formatNumberDateCell(frec); 
+                                               thisText = _ft.formatNumberDateCell(frec);
                                        }
                                }
                                break;
@@ -234,7 +244,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
                        case NumberRecord.sid:
                                NumberRecord numrec = (NumberRecord) record;
                                thisRow = numrec.getRow();
-                               thisText = _ft.formatNumberDateCell(numrec); 
+                               thisText = _ft.formatNumberDateCell(numrec);
                                break;
                        default:
                                break;
index 44de2d638d2d391e7e3d7ddd5f4a1cfd1d9122d8..b620623595cc6847506d52b0c3a86dcfdd1ae576 100644 (file)
@@ -24,7 +24,6 @@ import java.io.InputStream;
 import java.io.PrintStream;
 
 import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.ss.formula.eval.ErrorEval;
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFCellStyle;
 import org.apache.poi.hssf.usermodel.HSSFComment;
@@ -35,12 +34,13 @@ import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.formula.eval.ErrorEval;
 import org.apache.poi.ss.usermodel.HeaderFooter;
 
 /**
  * A text extractor for Excel files.
  * <p>
- * Returns the textual content of the file, suitable for 
+ * Returns the textual content of the file, suitable for
  *  indexing by something like Lucene, but not really
  *  intended for display to the user.
  * </p>
@@ -59,19 +59,27 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
        private boolean _includeCellComments = false;
        private boolean _includeBlankCells = false;
        private boolean _includeHeadersFooters = true;
-       
+
        public ExcelExtractor(HSSFWorkbook wb) {
                super(wb);
                _wb = wb;
                _formatter = new HSSFDataFormatter();
        }
        public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-               this(fs.getRoot(), fs);
+               this(fs.getRoot());
        }
-       public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-               this(new HSSFWorkbook(dir, fs, true));
+       /**
+     * @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+        this( dir );
+    }
+    public ExcelExtractor(DirectoryNode dir) throws IOException {
+               this(new HSSFWorkbook(dir, true));
        }
-       
+
        private static final class CommandParseException extends Exception {
                public CommandParseException(String msg) {
                        super(msg);
@@ -183,7 +191,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                        return _headersFooters;
                }
        }
-       
+
        private static void printUsageMessage(PrintStream ps) {
                ps.println("Use:");
                ps.println("    " + ExcelExtractor.class.getName() + " [<flag> <value> [<flag> <value> [...]]] [-i <filename.xls>]");
@@ -201,7 +209,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
         * Command line extractor.
         */
        public static void main(String[] args) {
-               
+
                CommandArgs cmdArgs;
                try {
                        cmdArgs = new CommandArgs(args);
@@ -211,12 +219,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                        System.exit(1);
                        return; // suppress compiler error
                }
-               
+
                if (cmdArgs.isRequestHelp()) {
                        printUsageMessage(System.out);
                        return;
                }
-               
+
                try {
                        InputStream is;
                        if(cmdArgs.getInputFile() == null) {
@@ -270,9 +278,9 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
         * Default is to include them.
         */
        public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
-               _includeHeadersFooters = includeHeadersFooters; 
+               _includeHeadersFooters = includeHeadersFooters;
        }
-       
+
        /**
         * Retrieves the text contents of the file
         */
@@ -282,12 +290,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                // We don't care about the difference between
                //  null (missing) and blank cells
                _wb.setMissingCellPolicy(HSSFRow.RETURN_BLANK_AS_NULL);
-               
+
                // Process each sheet in turn
                for(int i=0;i<_wb.getNumberOfSheets();i++) {
                        HSSFSheet sheet = _wb.getSheetAt(i);
                        if(sheet == null) { continue; }
-                       
+
                        if(_includeSheetNames) {
                                String name = _wb.getSheetName(i);
                                if(name != null) {
@@ -295,12 +303,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                                        text.append("\n");
                                }
                        }
-                       
+
                        // Header text, if there is any
                        if(_includeHeadersFooters) {
                                text.append(_extractHeaderFooter(sheet.getHeader()));
                        }
-                       
+
                        int firstRow = sheet.getFirstRowNum();
                        int lastRow = sheet.getLastRowNum();
                        for(int j=firstRow;j<=lastRow;j++) {
@@ -313,7 +321,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                                if(_includeBlankCells) {
                                        firstCell = 0;
                                }
-                               
+
                                for(int k=firstCell;k<lastCell;k++) {
                                        HSSFCell cell = row.getCell(k);
                                        boolean outputContents = true;
@@ -368,14 +376,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                                                                                case HSSFCell.CELL_TYPE_ERROR:
                                                                                        text.append(ErrorEval.getText(cell.getErrorCellValue()));
                                                                                        break;
-                                                                                       
+
                                                                        }
                                                                }
                                                                break;
                                                        default:
                                                                throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")");
                                                }
-                                               
+
                                                // Output the comment, if requested and exists
                                                HSSFComment comment = cell.getCellComment();
                                                if(_includeCellComments && comment != null) {
@@ -385,29 +393,29 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                                                        text.append(" Comment by "+comment.getAuthor()+": "+commentText);
                                                }
                                        }
-                                       
+
                                        // Output a tab if we're not on the last cell
                                        if(outputContents && k < (lastCell-1)) {
                                                text.append("\t");
                                        }
                                }
-                               
+
                                // Finish off the row
                                text.append("\n");
                        }
-                       
+
                        // Finally Footer text, if there is any
                        if(_includeHeadersFooters) {
                                text.append(_extractHeaderFooter(sheet.getFooter()));
                        }
                }
-               
+
                return text.toString();
        }
-       
+
        public static String _extractHeaderFooter(HeaderFooter hf) {
                StringBuffer text = new StringBuffer();
-               
+
                if(hf.getLeft() != null) {
                        text.append(hf.getLeft());
                }
@@ -423,7 +431,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                }
                if(text.length() > 0)
                        text.append("\n");
-               
+
                return text.toString();
        }
 }
index 844070fb508a90b7f08250ccf421f9ed660f79a2..bb50a7557112ef97422d8660189f87c24f61e969 100644 (file)
    See the License for the specific language governing permissions and
    limitations under the License.
 ==================================================================== */
-        
 
-package org.apache.poi.poifs.filesystem;
 
-import java.io.*;
+package org.apache.poi.poifs.filesystem;
 
-import java.util.*;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
 
 import org.apache.poi.hpsf.ClassID;
 
@@ -67,6 +68,12 @@ public interface DirectoryEntry
 
     public int getEntryCount();
 
+    /**
+     * Checks if entry with specified name present
+     */
+
+    public boolean hasEntry( final String name );
+
     /**
      * get a specified Entry by name
      *
index d6f86d5ef4ecf37a93b6f3dbd2d0c1a26e75a694..20effd8bdb851a817c5ea2dde6b3a4a7287ce06d 100644 (file)
@@ -15,7 +15,7 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 ==================================================================== */
-        
+
 
 package org.apache.poi.poifs.filesystem;
 
@@ -53,7 +53,7 @@ public class DirectoryNode
     // the POIFSFileSystem we belong to
     private POIFSFileSystem   _ofilesystem;
     // the NPOIFSFileSytem we belong to
-    private NPOIFSFileSystem  _nfilesystem; 
+    private NPOIFSFileSystem  _nfilesystem;
 
     // the path described by this document
     private POIFSDocumentPath _path;
@@ -72,7 +72,7 @@ public class DirectoryNode
     {
        this(property, parent, filesystem, (NPOIFSFileSystem)null);
     }
-    
+
     /**
      * create a DirectoryNode. This method is not public by design; it
      * is intended strictly for the internal use of this package
@@ -87,7 +87,7 @@ public class DirectoryNode
     {
        this(property, parent, (POIFSFileSystem)null, nfilesystem);
     }
-    
+
     private DirectoryNode(final DirectoryProperty property,
                           final DirectoryNode parent,
                           final POIFSFileSystem ofilesystem,
@@ -96,7 +96,7 @@ public class DirectoryNode
         super(property, parent);
         this._ofilesystem = ofilesystem;
         this._nfilesystem = nfilesystem;
-        
+
         if (parent == null)
         {
             _path = new POIFSDocumentPath();
@@ -143,23 +143,23 @@ public class DirectoryNode
     {
         return _path;
     }
-    
+
     /**
      * @return the filesystem that this belongs to
      */
     public POIFSFileSystem getFileSystem()
     {
-        return _ofilesystem; 
+        return _ofilesystem;
     }
-    
+
     /**
      * @return the filesystem that this belongs to
      */
     public NPOIFSFileSystem getNFileSystem()
     {
-        return _nfilesystem; 
+        return _nfilesystem;
     }
-    
+
     /**
      * open a document in the directory's entry's list of entries
      *
@@ -195,7 +195,7 @@ public class DirectoryNode
             throw new IOException("Entry '" + document.getName()
                                   + "' is not a DocumentEntry");
         }
-        
+
         DocumentEntry entry = (DocumentEntry)document;
         return new DocumentInputStream(entry);
     }
@@ -217,7 +217,7 @@ public class DirectoryNode
 
         (( DirectoryProperty ) getProperty()).addChild(property);
         _ofilesystem.addDocument(document);
-        
+
         _entries.add(rval);
         _byname.put(property.getName(), rval);
         return rval;
@@ -240,7 +240,7 @@ public class DirectoryNode
 
         (( DirectoryProperty ) getProperty()).addChild(property);
         _nfilesystem.addDocument(document);
-        
+
         _entries.add(rval);
         _byname.put(property.getName(), rval);
         return rval;
@@ -290,7 +290,7 @@ public class DirectoryNode
         {
             _entries.remove(entry);
                   _byname.remove(entry.getName());
-                  
+
                   if(_ofilesystem != null) {
                _ofilesystem.remove(entry);
                   } else {
@@ -342,6 +342,11 @@ public class DirectoryNode
         return _entries.size();
     }
 
+    public boolean hasEntry( String name )
+    {
+        return name != null && _byname.containsKey( name );
+    }
+
     /**
      * get a specified Entry by name
      *
@@ -430,7 +435,7 @@ public class DirectoryNode
     {
         DirectoryNode rval;
         DirectoryProperty property = new DirectoryProperty(name);
-        
+
         if(_ofilesystem != null) {
            rval = new DirectoryNode(property, _ofilesystem, this);
            _ofilesystem.addDirectory(property);
@@ -562,7 +567,7 @@ public class DirectoryNode
      * Returns an Iterator over all the entries
      */
     public Iterator<Entry> iterator() {
-        return getEntries(); 
+        return getEntries();
     }
 
     /* **********  END  begin implementation of POIFSViewable ********** */
index 57b9aa914fa446870da8c1ab53b75c65774527f5..685d92c84529adf8c3d9dff54b2a110c7d067cf8 100644 (file)
@@ -66,48 +66,48 @@ import org.apache.xmlbeans.XmlException;
 public class ExtractorFactory {
        public static final String CORE_DOCUMENT_REL =
                "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
-       
-       
+
+
        /** Should this thread prefer event based over usermodel based extractors? */
        private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
       protected Boolean initialValue() { return Boolean.FALSE; }
        };
        /** Should all threads prefer event based over usermodel based extractors? */
        private static Boolean allPreferEventExtractors;
-       
-   /** 
+
+   /**
     * Should this thread prefer event based over usermodel based extractors?
-    * (usermodel extractors tend to be more accurate, but use more memory) 
-    * Default is false. 
+    * (usermodel extractors tend to be more accurate, but use more memory)
+    * Default is false.
     */
        public static boolean getThreadPrefersEventExtractors() {
           return threadPreferEventExtractors.get();
        }
-   /** 
-    * Should all threads prefer event based over usermodel based extractors? 
-    * (usermodel extractors tend to be more accurate, but use more memory) 
-    * Default is to use the thread level setting, which defaults to false. 
+   /**
+    * Should all threads prefer event based over usermodel based extractors?
+    * (usermodel extractors tend to be more accurate, but use more memory)
+    * Default is to use the thread level setting, which defaults to false.
     */
        public static Boolean getAllThreadsPreferEventExtractors() {
           return allPreferEventExtractors;
        }
-       
-   /** 
+
+   /**
     * Should this thread prefer event based over usermodel based extractors?
-    * Will only be used if the All Threads setting is null. 
+    * Will only be used if the All Threads setting is null.
     */
    public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
       threadPreferEventExtractors.set(preferEventExtractors);
    }
-   /** 
+   /**
     * Should all threads prefer event based over usermodel based extractors?
-    * If set, will take preference over the Thread level setting. 
+    * If set, will take preference over the Thread level setting.
     */
    public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
       allPreferEventExtractors = preferEventExtractors;
    }
-       
-   
+
+
    /**
     * Should this thread use event based extractors is available?
     * Checks the all-threads one first, then thread specific.
@@ -118,8 +118,8 @@ public class ExtractorFactory {
       }
       return threadPreferEventExtractors.get();
    }
-   
-       
+
+
        public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
                InputStream inp = null;
         try {
@@ -137,14 +137,14 @@ public class ExtractorFactory {
             if(inp != null) inp.close();
         }
     }
-       
+
        public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
                // Figure out the kind of stream
                // If clearly doesn't do mark/reset, wrap up
                if(! inp.markSupported()) {
                        inp = new PushbackInputStream(inp, 8);
                }
-               
+
                if(POIFSFileSystem.hasPOIFSHeader(inp)) {
                        return createExtractor(new POIFSFileSystem(inp));
                }
@@ -153,16 +153,16 @@ public class ExtractorFactory {
                }
                throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
        }
-       
+
        public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
-       PackageRelationshipCollection core = 
+       PackageRelationshipCollection core =
             pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
        if(core.size() != 1) {
           throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
        }
 
        PackagePart corePart = pkg.getPart(core.getRelationship(0));
-        
+
        // Is it XSSF?
        for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
           if(corePart.getContentType().equals(rel.getContentType())) {
@@ -173,84 +173,98 @@ public class ExtractorFactory {
              }
           }
        }
-        
+
        // Is it XWPF?
        for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
           if(corePart.getContentType().equals(rel.getContentType())) {
              return new XWPFWordExtractor(pkg);
           }
        }
-       
+
        // Is it XSLF?
        for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
           if(corePart.getContentType().equals(rel.getContentType())) {
              return new XSLFPowerPointExtractor(pkg);
           }
        }
-       
+
        throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
        }
-       
+
        public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
           // Only ever an OLE2 one from the root of the FS
-               return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
+               return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
        }
-       public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
-               // Look for certain entries in the stream, to figure it
-               //  out from
-               for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
-                       Entry entry = entries.next();
-                       
-                       if(entry.getName().equals("Workbook")) {
-                          if(getPreferEventExtractor()) {
-               return new EventBasedExcelExtractor(poifsDir, fs);
-                          } else {
-                             return new ExcelExtractor(poifsDir, fs);
-                          }
-                       }
-                       if(entry.getName().equals("WordDocument")) {
-                           // Old or new style word document?
-                           try {
-                               return new WordExtractor(poifsDir, fs);
-                           } catch(OldWordFileFormatException e) {
-                               return new Word6Extractor(poifsDir, fs);
-                           }
-                       }
-                       if(entry.getName().equals("PowerPoint Document")) {
-                               return new PowerPointExtractor(poifsDir, fs);
-                       }
-                       if(entry.getName().equals("VisioDocument")) {
-                               return new VisioTextExtractor(poifsDir, fs);
-                       }
-         if(entry.getName().equals("Quill")) {
-            return new PublisherTextExtractor(poifsDir, fs);
-         }
-                       if(
-                entry.getName().equals("__substg1.0_1000001E") ||
-                entry.getName().equals("__substg1.0_1000001F") ||
-                entry.getName().equals("__substg1.0_0047001E") ||
-                entry.getName().equals("__substg1.0_0047001F") ||
-                entry.getName().equals("__substg1.0_0037001E") ||
-                entry.getName().equals("__substg1.0_0037001F")
-                       ) {
-                          return new OutlookTextExtactor(poifsDir, fs);
-                       }
-                       if(entry.getName().equals("Package")) {
-                          OPCPackage pkg = OPCPackage.open(
-                                poifsDir.createDocumentInputStream(entry.getName())
-                          );
-                          return createExtractor(pkg);
-                       }
-               }
-               throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
-       }
-       
-       
+
+    /**
+     * @deprecated Use {@link #createExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings("unused")
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs)
+            throws IOException, InvalidFormatException, OpenXML4JException, XmlException
+    {
+        return createExtractor(poifsDir);
+    }
+
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
+            InvalidFormatException, OpenXML4JException, XmlException
+    {
+        // Look for certain entries in the stream, to figure it
+        // out from
+        if (poifsDir.hasEntry("Workbook")) {
+            if (getPreferEventExtractor()) {
+                return new EventBasedExcelExtractor(poifsDir);
+            }
+            return new ExcelExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("WordDocument")) {
+            // Old or new style word document?
+            try {
+                return new WordExtractor(poifsDir);
+            } catch (OldWordFileFormatException e) {
+                return new Word6Extractor(poifsDir);
+            }
+        }
+
+        if (poifsDir.hasEntry("PowerPoint Document")) {
+            return new PowerPointExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("VisioDocument")) {
+            return new VisioTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("Quill")) {
+            return new PublisherTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F")
+                || poifsDir.hasEntry("__substg1.0_0047001E")
+                || poifsDir.hasEntry("__substg1.0_0047001F")
+                || poifsDir.hasEntry("__substg1.0_0037001E")
+                || poifsDir.hasEntry("__substg1.0_0037001F"))
+        {
+            return new OutlookTextExtactor(poifsDir);
+        }
+
+        for (Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext();) {
+            Entry entry = entries.next();
+
+            if (entry.getName().equals("Package")) {
+                OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
+                return createExtractor(pkg);
+            }
+        }
+        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+    }
+
        /**
         * Returns an array of text extractors, one for each of
         *  the embeded documents in the file (if there are any).
         * If there are no embeded documents, you'll get back an
-        *  empty array. Otherwise, you'll get one open 
+        *  empty array. Otherwise, you'll get one open
         *  {@link POITextExtractor} for each embeded file.
         */
        public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
@@ -258,16 +272,16 @@ public class ExtractorFactory {
                ArrayList<Entry> dirs = new ArrayList<Entry>();
                // For anything else not directly held in as a POIFS directory
                ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
-               
+
       // Find all the embeded directories
-               POIFSFileSystem fs = ext.getFileSystem();
-               if(fs == null) {
+               DirectoryEntry root = ext.getRoot();
+               if(root == null) {
                        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
                }
-               
+
                if(ext instanceof ExcelExtractor) {
                        // These are in MBD... under the root
-                       Iterator<Entry> it = fs.getRoot().getEntries();
+                       Iterator<Entry> it = root.getEntries();
                        while(it.hasNext()) {
                                Entry entry = it.next();
                                if(entry.getName().startsWith("MBD")) {
@@ -278,7 +292,7 @@ public class ExtractorFactory {
                        // These are in ObjectPool -> _... under the root
                        try {
                                DirectoryEntry op = (DirectoryEntry)
-                                       fs.getRoot().getEntry("ObjectPool");
+                                       root.getEntry("ObjectPool");
                                Iterator<Entry> it = op.getEntries();
                                while(it.hasNext()) {
                                        Entry entry = it.next();
@@ -302,7 +316,7 @@ public class ExtractorFactory {
                      }
                   }
                }
-               
+
                // Create the extractors
                if(
                      (dirs == null || dirs.size() == 0) &&
@@ -310,11 +324,11 @@ public class ExtractorFactory {
                ){
                        return new POITextExtractor[0];
                }
-               
+
                ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
                for(int i=0; i<dirs.size(); i++) {
                        e.add( createExtractor(
-                                       (DirectoryNode)dirs.get(i), ext.getFileSystem()
+                                       (DirectoryNode)dirs.get(i)
                        ) );
                }
                for(int i=0; i<nonPOIFS.size(); i++) {
@@ -336,7 +350,7 @@ public class ExtractorFactory {
         * Returns an array of text extractors, one for each of
         *  the embeded documents in the file (if there are any).
         * If there are no embeded documents, you'll get back an
-        *  empty array. Otherwise, you'll get one open 
+        *  empty array. Otherwise, you'll get one open
         *  {@link POITextExtractor} for each embeded file.
         */
        public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
index 72843973fb10bc8a94e4ef15407b963635fc3221..3beb3ffe73e75ba1285bd5a60e38c74678aa26ad 100644 (file)
@@ -23,6 +23,8 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
 import org.apache.poi.hwpf.model.BookmarksTables;
 import org.apache.poi.hwpf.model.CHPBinTable;
 import org.apache.poi.hwpf.model.CPSplitCalculator;
@@ -190,7 +192,9 @@ public final class HWPFDocument extends HWPFDocumentCore
    * @param pfilesystem The POIFSFileSystem that contains the Word document.
    * @throws IOException If there is an unexpected IOException from the passed
    *         in POIFSFileSystem.
+   * @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
    */
+  @Deprecated
   public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
   {
      this(directory);
index 50171c37e77b1018a30552552a18e15add00ecfc..fc2bb15e68641e19fba7db48ba766118b33e1295 100644 (file)
 
 package org.apache.poi.hwpf;
 
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
 
+import org.apache.poi.hwpf.usermodel.ObjectsPool;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
 import org.apache.poi.EncryptedDocumentException;
 import org.apache.poi.POIDocument;
 import org.apache.poi.hwpf.model.CHPBinTable;
@@ -46,6 +53,9 @@ import org.apache.poi.util.Internal;
  */
 public abstract class HWPFDocumentCore extends POIDocument
 {
+  /** Holds OLE2 objects */
+  protected ObjectPoolImpl _objectPool;
+
   /** The FIB */
   protected FileInformationBlock _fib;
 
@@ -148,7 +158,21 @@ public abstract class HWPFDocumentCore extends POIDocument
     if(_fib.isFEncrypted()) {
        throw new EncryptedDocumentException("Cannot process encrypted word files!");
     }
-  }
+
+        {
+            DirectoryEntry objectPoolEntry;
+            try
+            {
+                objectPoolEntry = (DirectoryEntry) directory
+                        .getEntry( "ObjectPool" );
+            }
+            catch ( FileNotFoundException exc )
+            {
+                objectPoolEntry = directory.createDirectory( "ObjectPool" );
+            }
+            _objectPool = new ObjectPoolImpl( objectPoolEntry );
+        }
+    }
 
     /**
      * Returns the range which covers the whole of the document, but excludes
@@ -211,5 +235,10 @@ public abstract class HWPFDocumentCore extends POIDocument
     return _fib;
   }
 
+    public ObjectsPool getObjectsPool()
+    {
+        return _objectPool;
+    }
+
     public abstract TextPieceTable getTextTable();
 }
index 182e6344458af957885ada978c3398ebef38c9d0..2d636e06c798a9409029a81db0bb956fb443feaa 100644 (file)
@@ -44,6 +44,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
         this(fs.getRoot());
     }
 
+    @Deprecated
     public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
             throws IOException {
        this(directory);
index e71aed0ac69f49ca7210a669888f87808de07758..cc104b84aaa03fe7cdd35fe764776ed85202da27 100644 (file)
@@ -47,6 +47,7 @@ import org.apache.poi.hwpf.usermodel.Section;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
@@ -56,6 +57,32 @@ import org.w3c.dom.Element;
 @Beta
 public abstract class AbstractWordConverter
 {
+    private static final class Structure implements Comparable<Structure>
+    {
+        final int end;
+        final int start;
+        final Object structure;
+
+        Structure( Bookmark bookmark )
+        {
+            this.start = bookmark.getStart();
+            this.end = bookmark.getEnd();
+            this.structure = bookmark;
+        }
+
+        Structure( Field field )
+        {
+            this.start = field.getFieldStartOffset();
+            this.end = field.getFieldEndOffset();
+            this.structure = field;
+        }
+
+        public int compareTo( Structure o )
+        {
+            return start < o.start ? -1 : start == o.start ? 0 : 1;
+        }
+    }
+
     private static final byte BEL_MARK = 7;
 
     private static final byte FIELD_BEGIN_MARK = 19;
@@ -396,6 +423,13 @@ public abstract class AbstractWordConverter
                     processDrawnObject( doc, characterRun, block );
                     continue;
                 }
+                if ( characterRun.isOle2()
+                        && ( wordDocument instanceof HWPFDocument ) )
+                {
+                    HWPFDocument doc = (HWPFDocument) wordDocument;
+                    processOle2( doc, characterRun, block );
+                    continue;
+                }
             }
 
             if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
@@ -613,10 +647,11 @@ public abstract class AbstractWordConverter
             CharacterRun characterRun, OfficeDrawing officeDrawing,
             String path, Element block );
 
-    protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range endnoteTextRange );
+    protected abstract void processEndnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range endnoteTextRange );
 
-    protected void processField( HWPFDocument hwpfDocument, Range parentRange,
+    protected void processField( HWPFDocument wordDocument, Range parentRange,
             int currentTableLevel, Field field, Element currentBlock )
     {
         switch ( field.getType() )
@@ -633,7 +668,7 @@ public abstract class AbstractWordConverter
                 if ( matcher.find() )
                 {
                     String pageref = matcher.group( 1 );
-                    processPageref( hwpfDocument, currentBlock,
+                    processPageref( wordDocument, currentBlock,
                             field.secondSubrange( parentRange ),
                             currentTableLevel, pageref );
                     return;
@@ -641,6 +676,36 @@ public abstract class AbstractWordConverter
             }
             break;
         }
+        case 58: // Embedded Object
+        {
+            if ( !field.hasSeparator() )
+            {
+                logger.log( POILogger.WARN, parentRange + " contains " + field
+                        + " with 'Embedded Object' but without separator mark" );
+                return;
+            }
+
+            CharacterRun separator = field
+                    .getMarkSeparatorCharacterRun( parentRange );
+
+            if ( separator.isOle2() )
+            {
+                // the only supported so far
+                boolean processed = processOle2( wordDocument, separator,
+                        currentBlock );
+
+                // if we didn't output OLE - output field value
+                if ( !processed )
+                {
+                    processCharacters( wordDocument, currentTableLevel,
+                            field.secondSubrange( parentRange ), currentBlock );
+                }
+
+                return;
+            }
+
+            break;
+        }
         case 88: // hyperlink
         {
             final Range firstSubrange = field.firstSubrange( parentRange );
@@ -653,7 +718,7 @@ public abstract class AbstractWordConverter
                 if ( matcher.find() )
                 {
                     String hyperlink = matcher.group( 1 );
-                    processHyperlink( hwpfDocument, currentBlock,
+                    processHyperlink( wordDocument, currentBlock,
                             field.secondSubrange( parentRange ),
                             currentTableLevel, hyperlink );
                     return;
@@ -665,12 +730,13 @@ public abstract class AbstractWordConverter
 
         logger.log( POILogger.WARN, parentRange + " contains " + field
                 + " with unsupported type or format" );
-        processCharacters( hwpfDocument, currentTableLevel,
+        processCharacters( wordDocument, currentTableLevel,
                 field.secondSubrange( parentRange ), currentBlock );
     }
 
-    protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range footnoteTextRange );
+    protected abstract void processFootnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range footnoteTextRange );
 
     protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
             Element currentBlock, Range textRange, int currentTableLevel,
@@ -732,6 +798,40 @@ public abstract class AbstractWordConverter
         }
     }
 
+    private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
+            Element block )
+    {
+        Entry entry = doc.getObjectsPool().getObjectById(
+                "_" + characterRun.getPicOffset() );
+        if ( entry == null )
+        {
+            logger.log( POILogger.WARN, "Referenced OLE2 object '",
+                    Integer.valueOf( characterRun.getPicOffset() ),
+                    "' not found in ObjectPool" );
+            return false;
+        }
+
+        try
+        {
+            return processOle2( doc, block, entry );
+        }
+        catch ( Exception exc )
+        {
+            logger.log( POILogger.WARN,
+                    "Unable to convert internal OLE2 object '",
+                    Integer.valueOf( characterRun.getPicOffset() ), "': ", exc,
+                    exc );
+            return false;
+        }
+    }
+
+    @SuppressWarnings( "unused" )
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        return false;
+    }
+
     protected abstract void processPageref( HWPFDocumentCore wordDocument,
             Element currentBlock, Range textRange, int currentTableLevel,
             String pageref );
@@ -896,30 +996,4 @@ public abstract class AbstractWordConverter
         return endMark;
     }
 
-    private static final class Structure implements Comparable<Structure>
-    {
-        final int end;
-        final int start;
-        final Object structure;
-
-        Structure( Bookmark bookmark )
-        {
-            this.start = bookmark.getStart();
-            this.end = bookmark.getEnd();
-            this.structure = bookmark;
-        }
-
-        Structure( Field field )
-        {
-            this.start = field.getFieldStartOffset();
-            this.end = field.getFieldEndOffset();
-            this.structure = field;
-        }
-
-        public int compareTo( Structure o )
-        {
-            return start < o.start ? -1 : start == o.start ? 0 : 1;
-        }
-    }
-
 }
index e7b85b0b748efbe9347e49a0f22da26e34126558..eeb5d7843f94757657f17d557ee61bcff589d147 100644 (file)
@@ -34,6 +34,7 @@ import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.IOUtils;
@@ -422,6 +423,19 @@ public class AbstractWordUtils
         return !isEmpty( str );
     }
 
+    public static HWPFDocumentCore loadDoc( final DirectoryNode root )
+            throws IOException
+    {
+        try
+        {
+            return new HWPFDocument( root );
+        }
+        catch ( OldWordFileFormatException exc )
+        {
+            return new HWPFOldDocument( root );
+        }
+    }
+
     public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
     {
         final FileInputStream istream = new FileInputStream( docFile );
@@ -438,16 +452,13 @@ public class AbstractWordUtils
     public static HWPFDocumentCore loadDoc( InputStream inputStream )
             throws IOException
     {
-        final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
-                .verifyAndBuildPOIFS( inputStream );
-        try
-        {
-            return new HWPFDocument( poifsFileSystem );
-        }
-        catch ( OldWordFileFormatException exc )
-        {
-            return new HWPFOldDocument( poifsFileSystem );
-        }
+        return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) );
+    }
+
+    public static HWPFDocumentCore loadDoc(
+            final POIFSFileSystem poifsFileSystem ) throws IOException
+    {
+        return loadDoc( poifsFileSystem.getRoot() );
     }
 
     static String substringBeforeLast( String str, String separator )
index dd9cfe8824055376b2d6f9b1e4d527d6ac759fc7..7e3e4dfe7df3debb4e300320fc6d8368edfb7f0c 100644 (file)
@@ -276,8 +276,8 @@ public class WordToFoConverter extends AbstractWordConverter
     }
 
     @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
     {
         final String textIndex = String.valueOf( internalLinkCounter
                 .incrementAndGet() );
@@ -297,7 +297,8 @@ public class WordToFoConverter extends AbstractWordConverter
         setId( backwardLink, forwardLinkName );
         endnote.appendChild( backwardLink );
 
-        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
+        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
+                endnote );
 
         WordToFoUtils.compactInlines( endnote );
         this.endnotes.add( endnote );
index 040d32a8793331b829819a7b10ac0018411a5eb4..e6233d2c3eefa8f2ab7e4697ba36de171819f866 100644 (file)
@@ -63,7 +63,6 @@ import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;
 @Beta
 public class WordToHtmlConverter extends AbstractWordConverter
 {
-
     /**
      * Holds properties values, applied to current <tt>p</tt> element. Those
      * properties shall not be doubled in children <tt>span</tt> elements.
@@ -282,10 +281,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
     }
 
     @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
     {
-        processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
+        processNoteAutonumbered( wordDocument, "end", noteIndex, block,
+                endnoteTextRange );
     }
 
     @Override
index edea5dab6c5d6249e8755e7c74ca14a2e2e29ec5..fa9b87fcd74f2ddb076c7478e37d6230bbbb9d5c 100644 (file)
@@ -2,10 +2,14 @@ package org.apache.poi.hwpf.converter;
 
 import java.io.File;
 import java.io.FileWriter;
+import java.io.StringWriter;
+import java.lang.reflect.Method;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerFactory;
@@ -25,6 +29,8 @@ import org.apache.poi.hwpf.usermodel.Section;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.util.Beta;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@@ -33,6 +39,29 @@ import org.w3c.dom.Element;
 public class WordToTextConverter extends AbstractWordConverter
 {
 
+    public static String getText( DirectoryNode root ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root );
+        return getText( wordDocument );
+    }
+
+    public static String getText( File docFile ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils
+                .loadDoc( docFile );
+        return getText( wordDocument );
+    }
+
+    public static String getText( final HWPFDocumentCore wordDocument )
+            throws Exception
+    {
+        WordToTextConverter wordToTextConverter = new WordToTextConverter(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToTextConverter.processDocument( wordDocument );
+        return wordToTextConverter.getText();
+    }
+
     /**
      * Java main() interface to interact with {@link WordToTextConverter}
      * 
@@ -91,8 +120,24 @@ public class WordToTextConverter extends AbstractWordConverter
 
     private Element notes = null;
 
+    private boolean outputSummaryInformation = false;
+
     private final TextDocumentFacade textDocumentFacade;
 
+    /**
+     * Creates new instance of {@link WordToTextConverter}. Can be used for
+     * output several {@link HWPFDocument}s into single text document.
+     * 
+     * @throws ParserConfigurationException
+     *             if an internal {@link DocumentBuilder} cannot be created
+     */
+    public WordToTextConverter() throws ParserConfigurationException
+    {
+        this.textDocumentFacade = new TextDocumentFacade(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+    }
+
     /**
      * Creates new instance of {@link WordToTextConverter}. Can be used for
      * output several {@link HWPFDocument}s into single text document.
@@ -110,6 +155,28 @@ public class WordToTextConverter extends AbstractWordConverter
         return textDocumentFacade.getDocument();
     }
 
+    public String getText() throws Exception
+    {
+        StringWriter stringWriter = new StringWriter();
+        DOMSource domSource = new DOMSource( getDocument() );
+        StreamResult streamResult = new StreamResult( stringWriter );
+
+        TransformerFactory tf = TransformerFactory.newInstance();
+        Transformer serializer = tf.newTransformer();
+        // TODO set encoding from a command argument
+        serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+        serializer.setOutputProperty( OutputKeys.INDENT, "no" );
+        serializer.setOutputProperty( OutputKeys.METHOD, "text" );
+        serializer.transform( domSource, streamResult );
+
+        return stringWriter.toString();
+    }
+
+    public boolean isOutputSummaryInformation()
+    {
+        return outputSummaryInformation;
+    }
+
     @Override
     protected void outputCharacters( Element block, CharacterRun characterRun,
             String text )
@@ -138,18 +205,24 @@ public class WordToTextConverter extends AbstractWordConverter
     protected void processDocumentInformation(
             SummaryInformation summaryInformation )
     {
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
-            textDocumentFacade.setTitle( summaryInformation.getTitle() );
+        if ( isOutputSummaryInformation() )
+        {
+            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
+                textDocumentFacade.setTitle( summaryInformation.getTitle() );
 
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
-            textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
+            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
+                textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
 
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
-            textDocumentFacade
-                    .addDescription( summaryInformation.getComments() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getComments() ) )
+                textDocumentFacade.addDescription( summaryInformation
+                        .getComments() );
 
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
-            textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getKeywords() ) )
+                textDocumentFacade.addKeywords( summaryInformation
+                        .getKeywords() );
+        }
     }
 
     @Override
@@ -222,6 +295,48 @@ public class WordToTextConverter extends AbstractWordConverter
         note.appendChild( textDocumentFacade.createText( "\n" ) );
     }
 
+    @Override
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        if ( !( entry instanceof DirectoryNode ) )
+            return false;
+        DirectoryNode directoryNode = (DirectoryNode) entry;
+
+        // even if no ExtractorFactory in classpath
+        if ( directoryNode.hasEntry( "WordDocument" ) )
+        {
+            String text = WordToTextConverter.getText( (DirectoryNode) entry );
+            block.appendChild( textDocumentFacade
+                    .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
+                            + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+            return true;
+        }
+
+        try
+        {
+            Class<?> cls = Class
+                    .forName( "org.apache.poi.extractor.ExtractorFactory" );
+            Method createExtractor = cls.getMethod( "createExtractor",
+                    DirectoryNode.class );
+            Object extractor = createExtractor.invoke( null, directoryNode );
+
+            Method getText = extractor.getClass().getMethod( "getText" );
+            String text = (String) getText.invoke( extractor );
+
+            block.appendChild( textDocumentFacade
+                    .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
+                            + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+            return true;
+        }
+        catch ( ClassNotFoundException exc )
+        {
+            // no extractor in classpath
+        }
+
+        return false;
+    }
+
     @Override
     protected void processPageref( HWPFDocumentCore wordDocument,
             Element currentBlock, Range textRange, int currentTableLevel,
@@ -254,7 +369,7 @@ public class WordToTextConverter extends AbstractWordConverter
         textDocumentFacade.body.appendChild( sectionElement );
     }
 
-    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+    protected void processTable( HWPFDocumentCore wordDocument, Element flow,
             Table table )
     {
         final int tableRows = table.numRows();
@@ -275,8 +390,8 @@ public class WordToTextConverter extends AbstractWordConverter
                     tableCellElement.appendChild( textDocumentFacade
                             .createText( "\t" ) );
 
-                processParagraphes( hwpfDocument, tableCellElement, tableCell,
-                        table.getTableLevel() );
+                processCharacters( wordDocument, table.getTableLevel(),
+                        tableCell, tableCellElement );
                 tableRowElement.appendChild( tableCellElement );
             }
 
@@ -285,4 +400,9 @@ public class WordToTextConverter extends AbstractWordConverter
         }
     }
 
+    public void setOutputSummaryInformation( boolean outputDocumentInformation )
+    {
+        this.outputSummaryInformation = outputDocumentInformation;
+    }
+
 }
index 123f501781e4a63b59c7c90fc4529ebfc239456c..b5331e8be11c208281059777c9343764f24dfa57 100644 (file)
@@ -19,6 +19,10 @@ package org.apache.poi.hwpf.extractor;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
 
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFOldDocument;
@@ -47,16 +51,32 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
                this( new POIFSFileSystem(is) );
        }
 
-       /**
-        * Create a new Word Extractor
-        * @param fs POIFSFileSystem containing the word file
-        */
-       public Word6Extractor(POIFSFileSystem fs) throws IOException {
-               this(fs.getRoot(), fs);
-       }
-       public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-           this(new HWPFOldDocument(dir,fs));
-       }
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
+     */
+    public Word6Extractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( fs.getRoot() );
+    }
+
+    /**
+     * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    public Word6Extractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFOldDocument( dir ) );
+    }
 
        /**
         * Create a new Word Extractor
@@ -71,6 +91,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
      * Get the text from the word file, as an array with one String
      *  per paragraph
      */
+       @Deprecated
        public String[] getParagraphText() {
            String[] ret;
 
@@ -95,13 +116,25 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
            return ret;
        }
 
-    public String getText() {
-        StringBuffer text = new StringBuffer();
-        
-        for(String t : getParagraphText()) {
-            text.append(t);
+    public String getText()
+    {
+        try
+        {
+            WordToTextConverter wordToTextConverter = new WordToTextConverter();
+            wordToTextConverter.processDocument( doc );
+            return wordToTextConverter.getText();
         }
+        catch ( Exception exc )
+        {
+            // fall-back
+            StringBuffer text = new StringBuffer();
 
-        return text.toString();
+            for ( String t : getParagraphText() )
+            {
+                text.append( t );
+            }
+
+            return text.toString();
+        }
     }
 }
index 464b11a406907aa0883d3584cfadad8d43275fba..8438df4f0402f7741e885efb247824782298645a 100644 (file)
@@ -20,9 +20,12 @@ package org.apache.poi.hwpf.extractor;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.usermodel.HeaderStories;
@@ -33,231 +36,300 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
  * Class to extract the text from a Word Document.
- *
- * You should use either getParagraphText() or getText() unless
- *  you have a strong reason otherwise.
- *
+ * 
+ * You should use either getParagraphText() or getText() unless you have a
+ * strong reason otherwise.
+ * 
  * @author Nick Burch
  */
-public final class WordExtractor extends POIOLE2TextExtractor {
-       private POIFSFileSystem fs;
-       private HWPFDocument doc;
-
-       /**
-        * Create a new Word Extractor
-        * @param is InputStream containing the word file
-        */
-       public WordExtractor(InputStream is) throws IOException {
-               this( HWPFDocument.verifyAndBuildPOIFS(is) );
-       }
-
-       /**
-        * Create a new Word Extractor
-        * @param fs POIFSFileSystem containing the word file
-        */
-       public WordExtractor(POIFSFileSystem fs) throws IOException {
-               this(new HWPFDocument(fs));
-               this.fs = fs;
-       }
-       public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-               this(new HWPFDocument(dir, fs));
-               this.fs = fs;
-       }
-
-       /**
-        * Create a new Word Extractor
-        * @param doc The HWPFDocument to extract from
-        */
-       public WordExtractor(HWPFDocument doc) {
-               super(doc);
-               this.doc = doc;
-       }
-
-       /**
-        * Command line extractor, so people will stop moaning that
-        *  they can't just run this.
-        */
-       public static void main(String[] args) throws IOException {
-               if(args.length == 0) {
-                       System.err.println("Use:");
-                       System.err.println("   java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
-                       System.exit(1);
-               }
-
-               // Process the first argument as a file
-               FileInputStream fin = new FileInputStream(args[0]);
-               WordExtractor extractor = new WordExtractor(fin);
-               System.out.println(extractor.getText());
-       }
-
-       /**
-        * Get the text from the word file, as an array with one String
-        *  per paragraph
-        */
-        public String[] getParagraphText() {
-                String[] ret;
-
-                // Extract using the model code
-                try {
-                        Range r = doc.getRange();
-
-                        ret = getParagraphText(r);
-                } catch (Exception e) {
-                        // Something's up with turning the text pieces into paragraphs
-                        // Fall back to ripping out the text pieces
-                        ret = new String[1];
-                        ret[0] = getTextFromPieces();
-                }
+public final class WordExtractor extends POIOLE2TextExtractor
+{
+    private HWPFDocument doc;
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param is
+     *            InputStream containing the word file
+     */
+    public WordExtractor( InputStream is ) throws IOException
+    {
+        this( HWPFDocument.verifyAndBuildPOIFS( is ) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
+     */
+    public WordExtractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( new HWPFDocument( fs ) );
+    }
+
+    /**
+     * @deprecated Use {@link #WordExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    public WordExtractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    public WordExtractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFDocument( dir ) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param doc
+     *            The HWPFDocument to extract from
+     */
+    public WordExtractor( HWPFDocument doc )
+    {
+        super( doc );
+        this.doc = doc;
+    }
+
+    /**
+     * Command line extractor, so people will stop moaning that they can't just
+     * run this.
+     */
+    public static void main( String[] args ) throws IOException
+    {
+        if ( args.length == 0 )
+        {
+            System.err.println( "Use:" );
+            System.err
+                    .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
+            System.exit( 1 );
+        }
 
-                return ret;
+        // Process the first argument as a file
+        FileInputStream fin = new FileInputStream( args[0] );
+        WordExtractor extractor = new WordExtractor( fin );
+        System.out.println( extractor.getText() );
+    }
+
+    /**
+     * Get the text from the word file, as an array with one String per
+     * paragraph
+     */
+    public String[] getParagraphText()
+    {
+        String[] ret;
+
+        // Extract using the model code
+        try
+        {
+            Range r = doc.getRange();
+
+            ret = getParagraphText( r );
+        }
+        catch ( Exception e )
+        {
+            // Something's up with turning the text pieces into paragraphs
+            // Fall back to ripping out the text pieces
+            ret = new String[1];
+            ret[0] = getTextFromPieces();
         }
 
-        public String[] getFootnoteText() {
-                Range r = doc.getFootnoteRange();
+        return ret;
+    }
 
-                return getParagraphText(r);
-        }
+    public String[] getFootnoteText()
+    {
+        Range r = doc.getFootnoteRange();
 
-        public String[] getMainTextboxText() {
-                Range r = doc.getMainTextboxRange();
+        return getParagraphText( r );
+    }
 
-                return getParagraphText(r);
-        }
+    public String[] getMainTextboxText()
+    {
+        Range r = doc.getMainTextboxRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getEndnoteText()
+    {
+        Range r = doc.getEndnoteRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getCommentsText()
+    {
+        Range r = doc.getCommentsRange();
+
+        return getParagraphText( r );
+    }
 
-        public String[] getEndnoteText() {
-                Range r = doc.getEndnoteRange();
+    protected static String[] getParagraphText( Range r )
+    {
+        String[] ret;
+        ret = new String[r.numParagraphs()];
+        for ( int i = 0; i < ret.length; i++ )
+        {
+            Paragraph p = r.getParagraph( i );
+            ret[i] = p.text();
 
-                return getParagraphText(r);
+            // Fix the line ending
+            if ( ret[i].endsWith( "\r" ) )
+            {
+                ret[i] = ret[i] + "\n";
+            }
+        }
+        return ret;
+    }
+
+    /**
+     * Add the header/footer text, if it's not empty
+     */
+    private void appendHeaderFooter( String text, StringBuffer out )
+    {
+        if ( text == null || text.length() == 0 )
+            return;
+
+        text = text.replace( '\r', '\n' );
+        if ( !text.endsWith( "\n" ) )
+        {
+            out.append( text );
+            out.append( '\n' );
+            return;
+        }
+        if ( text.endsWith( "\n\n" ) )
+        {
+            out.append( text.substring( 0, text.length() - 1 ) );
+            return;
+        }
+        out.append( text );
+        return;
+    }
+
+    /**
+     * Grab the text from the headers
+     */
+    @Deprecated
+    public String getHeaderText()
+    {
+        HeaderStories hs = new HeaderStories( doc );
+
+        StringBuffer ret = new StringBuffer();
+        if ( hs.getFirstHeader() != null )
+        {
+            appendHeaderFooter( hs.getFirstHeader(), ret );
+        }
+        if ( hs.getEvenHeader() != null )
+        {
+            appendHeaderFooter( hs.getEvenHeader(), ret );
+        }
+        if ( hs.getOddHeader() != null )
+        {
+            appendHeaderFooter( hs.getOddHeader(), ret );
         }
 
-        public String[] getCommentsText() {
-                Range r = doc.getCommentsRange();
+        return ret.toString();
+    }
+
+    /**
+     * Grab the text from the footers
+     */
+    @Deprecated
+    public String getFooterText()
+    {
+        HeaderStories hs = new HeaderStories( doc );
+
+        StringBuffer ret = new StringBuffer();
+        if ( hs.getFirstFooter() != null )
+        {
+            appendHeaderFooter( hs.getFirstFooter(), ret );
+        }
+        if ( hs.getEvenFooter() != null )
+        {
+            appendHeaderFooter( hs.getEvenFooter(), ret );
+        }
+        if ( hs.getOddFooter() != null )
+        {
+            appendHeaderFooter( hs.getOddFooter(), ret );
+        }
 
-                return getParagraphText(r);
+        return ret.toString();
+    }
+
+    /**
+     * Grab the text out of the text pieces. Might also include various bits of
+     * crud, but will work in cases where the text piece -> paragraph mapping is
+     * broken. Fast too.
+     */
+    public String getTextFromPieces()
+    {
+        String text = doc.getDocumentText();
+
+        // Fix line endings (Note - won't get all of them
+        text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
+        text = text.replaceAll( "\r\r", "\r\n\r\n" );
+
+        if ( text.endsWith( "\r" ) )
+        {
+            text += "\n";
         }
 
-        protected static String[] getParagraphText(Range r) {
-                String[] ret;
-                ret = new String[r.numParagraphs()];
-                for (int i = 0; i < ret.length; i++) {
-                        Paragraph p = r.getParagraph(i);
-                        ret[i] = p.text();
-
-                        // Fix the line ending
-                        if (ret[i].endsWith("\r")) {
-                                ret[i] = ret[i] + "\n";
-                        }
+        return text;
+    }
+
+    /**
+     * Grab the text, based on the WordToTextConverter. Shouldn't include any
+     * crud, but slower than getTextFromPieces().
+     */
+    public String getText()
+    {
+        try
+        {
+            final StringWriter stringWriter = new StringWriter();
+            @SuppressWarnings( "unused" )
+            WordToTextConverter wordToTextConverter = new WordToTextConverter()
+            {
+                {
+                    HeaderStories hs = new HeaderStories( doc );
+
+                    if ( hs.getFirstHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstHeaderSubrange() );
+                    if ( hs.getEvenHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenHeaderSubrange() );
+                    if ( hs.getOddHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getOddHeaderSubrange() );
+
+                    processDocument( doc );
+                    processDocumentPart( doc, doc.getMainTextboxRange() );
+
+                    if ( hs.getFirstFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstFooterSubrange() );
+                    if ( hs.getEvenFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenFooterSubrange() );
+                    if ( hs.getOddFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getOddFooterSubrange() );
+
+                    stringWriter.append( getText() );
                 }
-                return ret;
+            };
+            return stringWriter.toString();
         }
-
-        /**
-        * Add the header/footer text, if it's not empty
-        */
-       private void appendHeaderFooter(String text, StringBuffer out) {
-               if(text == null || text.length() == 0)
-                       return;
-
-               text = text.replace('\r', '\n');
-               if(! text.endsWith("\n")) {
-                       out.append(text);
-                       out.append('\n');
-                       return;
-               }
-               if(text.endsWith("\n\n")) {
-                       out.append(text.substring(0, text.length()-1));
-                       return;
-               }
-               out.append(text);
-               return;
-       }
-       /**
-        * Grab the text from the headers
-        */
-       public String getHeaderText() {
-               HeaderStories hs = new HeaderStories(doc);
-
-               StringBuffer ret = new StringBuffer();
-               if(hs.getFirstHeader() != null) {
-                       appendHeaderFooter(hs.getFirstHeader(), ret);
-               }
-               if(hs.getEvenHeader() != null) {
-                       appendHeaderFooter(hs.getEvenHeader(), ret);
-               }
-               if(hs.getOddHeader() != null) {
-                       appendHeaderFooter(hs.getOddHeader(), ret);
-               }
-
-               return ret.toString();
-       }
-       /**
-        * Grab the text from the footers
-        */
-       public String getFooterText() {
-               HeaderStories hs = new HeaderStories(doc);
-
-               StringBuffer ret = new StringBuffer();
-               if(hs.getFirstFooter() != null) {
-                       appendHeaderFooter(hs.getFirstFooter(), ret);
-               }
-               if(hs.getEvenFooter() != null) {
-                       appendHeaderFooter(hs.getEvenFooter(), ret);
-               }
-               if(hs.getOddFooter() != null) {
-                       appendHeaderFooter(hs.getOddFooter(), ret);
-               }
-
-               return ret.toString();
-       }
-
-       /**
-        * Grab the text out of the text pieces. Might also include various
-        *  bits of crud, but will work in cases where the text piece -> paragraph
-        *  mapping is broken. Fast too.
-        */
-       public String getTextFromPieces() {
-       String text = doc.getDocumentText();
-
-       // Fix line endings (Note - won't get all of them
-       text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
-       text = text.replaceAll("\r\r", "\r\n\r\n");
-
-       if(text.endsWith("\r")) {
-               text += "\n";
-       }
-
-       return text;
-       }
-
-       /**
-        * Grab the text, based on the paragraphs. Shouldn't include any crud,
-        *  but slightly slower than getTextFromPieces().
-        */
-       public String getText() {
-          StringBuffer ret = new StringBuffer();
-
-          ret.append(getHeaderText());
-
-          ArrayList<String> text = new ArrayList<String>();
-          text.addAll(Arrays.asList(getParagraphText()));
-          text.addAll(Arrays.asList(getMainTextboxText()));
-          text.addAll(Arrays.asList(getFootnoteText()));
-          text.addAll(Arrays.asList(getEndnoteText()));
-
-          for(String p : text) {
-             ret.append(p);
-          }
-
-          ret.append(getFooterText());
-
-          return ret.toString();
-       }
-
-       /**
-        * Removes any fields (eg macros, page markers etc)
-        *  from the string.
-        */
-       public static String stripFields(String text) {
-               return Range.stripFields(text);
-       }
+        catch ( Exception exc )
+        {
+            throw new RuntimeException( exc );
+        }
+    }
+
+    /**
+     * Removes any fields (eg macros, page markers etc) from the string.
+     */
+    public static String stripFields( String text )
+    {
+        return Range.stripFields( text );
+    }
 }
index 72b5220ddea9f028cd620610dc18b0fbe490b6f2..f50abd5b4f46df833b80a3cd3fa37b359cf9fd41 100644 (file)
@@ -17,17 +17,23 @@ public interface Field
      */
     int getFieldStartOffset();
 
+    CharacterRun getMarkEndCharacterRun( Range parent );
+
     /**
      * @return character position of end field mark
      */
     int getMarkEndOffset();
 
+    CharacterRun getMarkSeparatorCharacterRun( Range parent );
+
     /**
      * @return character position of separator field mark (if present,
      *         {@link NullPointerException} otherwise)
      */
     int getMarkSeparatorOffset();
 
+    CharacterRun getMarkStartCharacterRun( Range parent );
+
     /**
      * @return character position of start field mark
      */
index 4832b23f96eb15405bdc0aa70d232277b590e8c3..37131c3e1529be3fd1118b8fe6954b4378e166ec 100644 (file)
@@ -112,6 +112,12 @@ class FieldImpl implements Field
         return startPlex.getFcStart();
     }
 
+    public CharacterRun getMarkEndCharacterRun( Range parent )
+    {
+        return new Range( getMarkEndOffset(), getMarkEndOffset() + 1, parent )
+                .getCharacterRun( 0 );
+    }
+
     /**
      * @return character position of end field mark
      */
@@ -120,6 +126,15 @@ class FieldImpl implements Field
         return endPlex.getFcStart();
     }
 
+    public CharacterRun getMarkSeparatorCharacterRun( Range parent )
+    {
+        if ( !hasSeparator() )
+            return null;
+
+        return new Range( getMarkSeparatorOffset(),
+                getMarkSeparatorOffset() + 1, parent ).getCharacterRun( 0 );
+    }
+
     /**
      * @return character position of separator field mark (if present,
      *         {@link NullPointerException} otherwise)
@@ -129,6 +144,12 @@ class FieldImpl implements Field
         return separatorPlex.getFcStart();
     }
 
+    public CharacterRun getMarkStartCharacterRun( Range parent )
+    {
+        return new Range( getMarkStartOffset(), getMarkStartOffset() + 1,
+                parent ).getCharacterRun( 0 );
+    }
+
     /**
      * @return character position of start field mark
      */
index fed191c7d4975287a6715865ec812940b14195d5..5b8c80501c2ec29d757a6a00aae0d517e26f7ac6 100644 (file)
@@ -82,35 +82,96 @@ public final class HeaderStories {
                 fib.getPlcfHddSize(), 0 );
     }
 
-       public String getFootnoteSeparator() {
-               return getAt(0);
-       }
-       public String getFootnoteContSeparator() {
-               return getAt(1);
-       }
-       public String getFootnoteContNote() {
-               return getAt(2);
-       }
-       public String getEndnoteSeparator() {
-               return getAt(3);
-       }
-       public String getEndnoteContSeparator() {
-               return getAt(4);
-       }
-       public String getEndnoteContNote() {
-               return getAt(5);
-       }
+    @Deprecated
+    public String getFootnoteSeparator()
+    {
+        return getAt( 0 );
+    }
+
+    @Deprecated
+    public String getFootnoteContSeparator()
+    {
+        return getAt( 1 );
+    }
+
+    @Deprecated
+    public String getFootnoteContNote()
+    {
+        return getAt( 2 );
+    }
+
+    @Deprecated
+    public String getEndnoteSeparator()
+    {
+        return getAt( 3 );
+    }
 
+    @Deprecated
+    public String getEndnoteContSeparator()
+    {
+        return getAt( 4 );
+    }
+
+    @Deprecated
+    public String getEndnoteContNote()
+    {
+        return getAt( 5 );
+    }
+
+    public Range getFootnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 0 );
+    }
+
+    public Range getFootnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 1 );
+    }
+
+    public Range getFootnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 2 );
+    }
+
+    public Range getEndnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 3 );
+    }
+
+    public Range getEndnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 4 );
+    }
 
+    public Range getEndnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 5 );
+    }
+
+       @Deprecated
        public String getEvenHeader() {
                return getAt(6+0);
        }
+    @Deprecated
        public String getOddHeader() {
                return getAt(6+1);
        }
+    @Deprecated
        public String getFirstHeader() {
                return getAt(6+4);
        }
+       
+
+    public Range getEvenHeaderSubrange() {
+        return getSubrangeAt(6+0);
+    }
+    public Range getOddHeaderSubrange() {
+        return getSubrangeAt(6+1);
+    }
+    public Range getFirstHeaderSubrange() {
+        return getSubrangeAt(6+4);
+    }
+    
        /**
         * Returns the correct, defined header for the given
         *  one based page
@@ -135,16 +196,39 @@ public final class HeaderStories {
                return getOddHeader();
        }
 
+       @Deprecated
+    public String getEvenFooter()
+    {
+        return getAt( 6 + 2 );
+    }
+
+    @Deprecated
+    public String getOddFooter()
+    {
+        return getAt( 6 + 3 );
+    }
+
+    @Deprecated
+    public String getFirstFooter()
+    {
+        return getAt( 6 + 5 );
+    }
+
+    public Range getEvenFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 2 );
+    }
+
+    public Range getOddFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 3 );
+    }
+
+    public Range getFirstFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 5 );
+    }
 
-       public String getEvenFooter() {
-               return getAt(6+2);
-       }
-       public String getOddFooter() {
-               return getAt(6+3);
-       }
-       public String getFirstFooter() {
-               return getAt(6+5);
-       }
        /**
         * Returns the correct, defined footer for the given
         *  one based page
@@ -174,6 +258,7 @@ public final class HeaderStories {
         * Get the string that's pointed to by the
         *  given plcfHdd index
         */
+    @Deprecated
        private String getAt(int plcfHddIndex) {
                if(plcfHdd == null) return null;
 
@@ -209,6 +294,32 @@ public final class HeaderStories {
                return text;
        }
 
+    private Range getSubrangeAt( int plcfHddIndex )
+    {
+        if ( plcfHdd == null )
+            return null;
+
+        GenericPropertyNode prop = plcfHdd.getProperty( plcfHddIndex );
+        if ( prop.getStart() == prop.getEnd() )
+        {
+            // Empty story
+            return null;
+        }
+        if ( prop.getEnd() < prop.getStart() )
+        {
+            // Broken properties?
+            return null;
+        }
+
+        final int headersLength = headerStories.getEndOffset()
+                - headerStories.getStartOffset();
+        int start = Math.min( prop.getStart(), headersLength );
+        int end = Math.min( prop.getEnd(), headersLength );
+
+        return new Range( headerStories.getStartOffset() + start,
+                headerStories.getStartOffset() + end, headerStories );
+    }
+
        public Range getRange() {
                return headerStories;
        }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java
new file mode 100644 (file)
index 0000000..2f71f39
--- /dev/null
@@ -0,0 +1,34 @@
+package org.apache.poi.hwpf.usermodel;
+
+import java.io.FileNotFoundException;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.util.Internal;
+
+@Internal
+public class ObjectPoolImpl implements ObjectsPool
+{
+    private DirectoryEntry _objectPool;
+
+    public ObjectPoolImpl( DirectoryEntry _objectPool )
+    {
+        super();
+        this._objectPool = _objectPool;
+    }
+
+    public Entry getObjectById( String objId )
+    {
+        if ( _objectPool == null )
+            return null;
+
+        try
+        {
+            return _objectPool.getEntry( objId );
+        }
+        catch ( FileNotFoundException exc )
+        {
+            return null;
+        }
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java
new file mode 100644 (file)
index 0000000..fa91d37
--- /dev/null
@@ -0,0 +1,8 @@
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.poifs.filesystem.Entry;
+
+public interface ObjectsPool
+{
+    public Entry getObjectById( String objId );
+}
index 3f23eea0c2d82d5e5357d839c76d0876b9c37a8e..c99c4eb28d78a35e8488a37d632d6478b76c0ce2 100644 (file)
@@ -36,6 +36,8 @@ import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
 import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
 import org.apache.poi.hwpf.sprm.SprmBuffer;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 
 /**
  * This class is the central class of the HWPF object model. All properties that
@@ -52,6 +54,8 @@ import org.apache.poi.util.LittleEndian;
  */
 public class Range { // TODO -instantiable superclass
 
+    private POILogger logger = POILogFactory.getLogger( Range.class );
+    
        public static final int TYPE_PARAGRAPH = 0;
        public static final int TYPE_CHARACTER = 1;
        public static final int TYPE_SECTION = 2;
@@ -888,9 +892,12 @@ public class Range { // TODO -instantiable superclass
         initAll();
         if ( tableEndInclusive >= this._parEnd )
         {
-            throw new ArrayIndexOutOfBoundsException(
-                    "The table's bounds fall outside of this Range" );
+            logger.log( POILogger.WARN, "The table's bounds ", "["
+                    + this._parStart + "; " + tableEndInclusive + ")",
+                    " fall outside of this Range paragraphs numbers ", "["
+                            + this._parStart + "; " + this._parEnd + ")" );
         }
+
         if ( tableEndInclusive < 0 )
         {
             throw new ArrayIndexOutOfBoundsException(
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java
new file mode 100644 (file)
index 0000000..ca1d809
--- /dev/null
@@ -0,0 +1,22 @@
+package org.apache.poi.hwpf.converter;
+
+import junit.framework.TestCase;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+public class TestWordToTextConverter extends TestCase
+{
+
+    /**
+     * [FAILING] Bug 47731 - Word Extractor considers text copied from some
+     * website as an embedded object
+     */
+    public void testBug47731() throws Exception
+    {
+        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
+        String foundText = WordToTextConverter.getText( doc );
+
+        assertTrue( foundText
+                .contains( "Soak the rice in water for three to four hours" ) );
+    }
+}
index cbc7db1d142f722358ebc2c99161440a8b9227bf..48c20dfbe39983ec9454661bd3d4f58828993071 100644 (file)
@@ -33,6 +33,16 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  * @author Nick Burch (nick at torchbox dot com)
  */
 public final class TestWordExtractor extends TestCase {
+
+    public static void assertEquals( String expected, String actual )
+    {
+        String newExpected = expected.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        String newActual = actual.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        TestCase.assertEquals( newExpected, newActual );
+    }
+
        private String[] p_text1 = new String[] {
                        "This is a simple word document\r\n",
                        "\r\n",
@@ -107,12 +117,14 @@ public final class TestWordExtractor extends TestCase {
        public void testGetText() {
                assertEquals(p_text1_block, extractor.getText());
 
-               // For the 2nd, should give similar answers for
-               //  the two methods, differing only in line endings
-               assertEquals(
-                     extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), 
-                     extractor2.getText().replaceAll("[\\r\\n]", ""));
-       }
+        // For the 2nd, should give similar answers for
+        // the two methods, differing only in line endings
+
+        // nope, they must have different results, because of garbage
+        // assertEquals(
+        // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
+        // extractor2.getText().replaceAll("[\\r\\n]", ""));
+    }
 
        /**
         * Test textPieces based extraction
@@ -330,7 +342,7 @@ public final class TestWordExtractor extends TestCase {
        
        // Open directly 
        for(DirectoryNode dir : files) {
-          WordExtractor extractor = new WordExtractor(dir, null);
+          WordExtractor extractor = new WordExtractor(dir);
           assertEquals(p_text1_block, extractor.getText());
        }
 
index 0ecfbf4a1f4a5dd0b3510c2eb310b73a87c12a69..01810569a5ec7b99c469243f3f6011dca14ec50d 100644 (file)
@@ -43,6 +43,15 @@ import org.apache.poi.util.IOUtils;
 public class TestBugs extends TestCase
 {
 
+    public static void assertEquals( String expected, String actual )
+    {
+        String newExpected = expected.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        String newActual = actual.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        TestCase.assertEquals( newExpected, newActual );
+    }
+
     private static void assertTableStructures( Range expected, Range actual )
     {
         assertEquals( expected.numParagraphs(), actual.numParagraphs() );