Add Word-to-Text converter and use it as replacement for WordExtractor

author Sergey Vladimirov <sergey@apache.org>

Tue, 9 Aug 2011 12:38:52 +0000 (12:38 +0000)

committer Sergey Vladimirov <sergey@apache.org>

Tue, 9 Aug 2011 12:38:52 +0000 (12:38 +0000)
author Sergey Vladimirov <sergey@apache.org>
Tue, 9 Aug 2011 12:38:52 +0000 (12:38 +0000)
committer Sergey Vladimirov <sergey@apache.org>
Tue, 9 Aug 2011 12:38:52 +0000 (12:38 +0000)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 6c77dd532c1d91c4d89e68cde1aa4bc2f8ae2334..aca3037dc9ac9ed9c97d18f7a497f727b84323c1 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
  
      <changes>
          <release version="3.8-beta4" date="2011-??-??">
+           <action dev="poi-developers" type="add">Add Word-to-Text converter and use it as replacement for WordExtractor</action>
             <action dev="poi-developers" type="fix">51604 - replace text fails for doc ( poi 3.8 beta release from download site )</action>
             <action dev="poi-developers" type="fix">Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF</action>
             <action dev="poi-developers" type="add">Support for conditional formatting in XSSF</action>
diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java

index 9ec09fe47a19c25415f848de038cb241488f9b46..00930c3fddde88d0019bdde9d4c82362e78f2dff 100644 (file)
--- a/src/java/org/apache/poi/POIOLE2TextExtractor.java
+++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java
@@ -19,6 +19,7 @@ package org.apache.poi;
  import org.apache.poi.hpsf.DocumentSummaryInformation;
  import org.apache.poi.hpsf.SummaryInformation;
  import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  /**
@@ -39,7 +40,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
         public POIOLE2TextExtractor(POIDocument document) {
                 super(document);
         }
-       
+
         /**
          * Returns the document information metadata for the document
          */
@@ -52,20 +53,28 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
         public SummaryInformation getSummaryInformation() {
                 return document.getSummaryInformation();
         }
-       
+
         /**
-        * Returns an HPSF powered text extractor for the 
+        * Returns an HPSF powered text extractor for the
          *  document properties metadata, such as title and author.
          */
         public POITextExtractor getMetadataTextExtractor() {
                 return new HPSFPropertiesExtractor(this);
         }
  
-       /**
-        * Return the underlying POIFS FileSystem of
-        *  this document.
-        */
-       public POIFSFileSystem getFileSystem() {
-               return document.directory.getFileSystem();
-       }
+    public DirectoryEntry getRoot()
+    {
+        return document.directory;
+    }
+
+    /**
+     * Return the underlying POIFS FileSystem of this document.
+     *
+     * @deprecated Use {@link #getRoot()} instead
+     */
+    @Deprecated
+    public POIFSFileSystem getFileSystem()
+    {
+        return document.directory.getFileSystem();
+    }
  }
diff --git a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java

index 8a7643b0132637cfb25d7c7fa0360f8d5145eec6..a4ff616125526aed8cfd58b533c98afcc9428d15 100644 (file)
--- a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
@@ -61,17 +61,27 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
   */
  public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
     private DirectoryNode _dir;
-       private POIFSFileSystem _fs;
         boolean _includeSheetNames = true;
         boolean _formulasNotResults = false;
  
-       public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
-               super(null);
-               _dir = dir;
-               _fs = fs;
-       }
+    /**
+     * @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public EventBasedExcelExtractor( DirectoryNode dir, POIFSFileSystem fs )
+    {
+        this( dir );
+    }
+
+    public EventBasedExcelExtractor( DirectoryNode dir )
+    {
+        super( null );
+        _dir = dir;
+    }
+
     public EventBasedExcelExtractor(POIFSFileSystem fs) {
-      this(fs.getRoot(), fs);
+      this(fs.getRoot());
     }
  
     /**
@@ -79,9 +89,9 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
      *  this document.
      */
     public POIFSFileSystem getFileSystem() {
-      return _fs;
+      return _dir.getFileSystem();
     }
-   
+
         /**
          * Would return the document information metadata for the document,
          *  if we supported it
@@ -200,7 +210,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
                                                 outputNextStringValue = true;
                                                 nextRow = frec.getRow();
                                         } else {
-                                               thisText = _ft.formatNumberDateCell(frec); 
+                                               thisText = _ft.formatNumberDateCell(frec);
                                         }
                                 }
                                 break;
@@ -234,7 +244,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
                         case NumberRecord.sid:
                                 NumberRecord numrec = (NumberRecord) record;
                                 thisRow = numrec.getRow();
-                               thisText = _ft.formatNumberDateCell(numrec); 
+                               thisText = _ft.formatNumberDateCell(numrec);
                                 break;
                         default:
                                 break;
diff --git a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java

index 44de2d638d2d391e7e3d7ddd5f4a1cfd1d9122d8..b620623595cc6847506d52b0c3a86dcfdd1ae576 100644 (file)
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@@ -24,7 +24,6 @@ import java.io.InputStream;
  import java.io.PrintStream;
  
  import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.ss.formula.eval.ErrorEval;
  import org.apache.poi.hssf.usermodel.HSSFCell;
  import org.apache.poi.hssf.usermodel.HSSFCellStyle;
  import org.apache.poi.hssf.usermodel.HSSFComment;
@@ -35,12 +34,13 @@ import org.apache.poi.hssf.usermodel.HSSFSheet;
  import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.formula.eval.ErrorEval;
  import org.apache.poi.ss.usermodel.HeaderFooter;
  
  /**
   * A text extractor for Excel files.
   * <p>
- * Returns the textual content of the file, suitable for 
+ * Returns the textual content of the file, suitable for
   *  indexing by something like Lucene, but not really
   *  intended for display to the user.
   * </p>
@@ -59,19 +59,27 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
         private boolean _includeCellComments = false;
         private boolean _includeBlankCells = false;
         private boolean _includeHeadersFooters = true;
-       
+
         public ExcelExtractor(HSSFWorkbook wb) {
                 super(wb);
                 _wb = wb;
                 _formatter = new HSSFDataFormatter();
         }
         public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-               this(fs.getRoot(), fs);
+               this(fs.getRoot());
         }
-       public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-               this(new HSSFWorkbook(dir, fs, true));
+       /**
+     * @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+        this( dir );
+    }
+    public ExcelExtractor(DirectoryNode dir) throws IOException {
+               this(new HSSFWorkbook(dir, true));
         }
-       
+
         private static final class CommandParseException extends Exception {
                 public CommandParseException(String msg) {
                         super(msg);
@@ -183,7 +191,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                         return _headersFooters;
                 }
         }
-       
+
         private static void printUsageMessage(PrintStream ps) {
                 ps.println("Use:");
                 ps.println("    " + ExcelExtractor.class.getName() + " [<flag> <value> [<flag> <value> [...]]] [-i <filename.xls>]");
@@ -201,7 +209,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
          * Command line extractor.
          */
         public static void main(String[] args) {
-               
+
                 CommandArgs cmdArgs;
                 try {
                         cmdArgs = new CommandArgs(args);
@@ -211,12 +219,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                         System.exit(1);
                         return; // suppress compiler error
                 }
-               
+
                 if (cmdArgs.isRequestHelp()) {
                         printUsageMessage(System.out);
                         return;
                 }
-               
+
                 try {
                         InputStream is;
                         if(cmdArgs.getInputFile() == null) {
@@ -270,9 +278,9 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
          * Default is to include them.
          */
         public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
-               _includeHeadersFooters = includeHeadersFooters; 
+               _includeHeadersFooters = includeHeadersFooters;
         }
-       
+
         /**
          * Retrieves the text contents of the file
          */
@@ -282,12 +290,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                 // We don't care about the difference between
                 //  null (missing) and blank cells
                 _wb.setMissingCellPolicy(HSSFRow.RETURN_BLANK_AS_NULL);
-               
+
                 // Process each sheet in turn
                 for(int i=0;i<_wb.getNumberOfSheets();i++) {
                         HSSFSheet sheet = _wb.getSheetAt(i);
                         if(sheet == null) { continue; }
-                       
+
                         if(_includeSheetNames) {
                                 String name = _wb.getSheetName(i);
                                 if(name != null) {
@@ -295,12 +303,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                                         text.append("\n");
                                 }
                         }
-                       
+
                         // Header text, if there is any
                         if(_includeHeadersFooters) {
                                 text.append(_extractHeaderFooter(sheet.getHeader()));
                         }
-                       
+
                         int firstRow = sheet.getFirstRowNum();
                         int lastRow = sheet.getLastRowNum();
                         for(int j=firstRow;j<=lastRow;j++) {
@@ -313,7 +321,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                                 if(_includeBlankCells) {
                                         firstCell = 0;
                                 }
-                               
+
                                 for(int k=firstCell;k<lastCell;k++) {
                                         HSSFCell cell = row.getCell(k);
                                         boolean outputContents = true;
@@ -368,14 +376,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                                                                                 case HSSFCell.CELL_TYPE_ERROR:
                                                                                         text.append(ErrorEval.getText(cell.getErrorCellValue()));
                                                                                         break;
-                                                                                       
+
                                                                         }
                                                                 }
                                                                 break;
                                                         default:
                                                                 throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")");
                                                 }
-                                               
+
                                                 // Output the comment, if requested and exists
                                                 HSSFComment comment = cell.getCellComment();
                                                 if(_includeCellComments && comment != null) {
@@ -385,29 +393,29 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                                                         text.append(" Comment by "+comment.getAuthor()+": "+commentText);
                                                 }
                                         }
-                                       
+
                                         // Output a tab if we're not on the last cell
                                         if(outputContents && k < (lastCell-1)) {
                                                 text.append("\t");
                                         }
                                 }
-                               
+
                                 // Finish off the row
                                 text.append("\n");
                         }
-                       
+
                         // Finally Footer text, if there is any
                         if(_includeHeadersFooters) {
                                 text.append(_extractHeaderFooter(sheet.getFooter()));
                         }
                 }
-               
+
                 return text.toString();
         }
-       
+
         public static String _extractHeaderFooter(HeaderFooter hf) {
                 StringBuffer text = new StringBuffer();
-               
+
                 if(hf.getLeft() != null) {
                         text.append(hf.getLeft());
                 }
@@ -423,7 +431,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
                 }
                 if(text.length() > 0)
                         text.append("\n");
-               
+
                 return text.toString();
         }
  }
diff --git a/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java b/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java

index 844070fb508a90b7f08250ccf421f9ed660f79a2..bb50a7557112ef97422d8660189f87c24f61e969 100644 (file)
--- a/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
+++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
@@ -15,13 +15,14 @@
     See the License for the specific language governing permissions and
     limitations under the License.
  ==================================================================== */
-        
  
-package org.apache.poi.poifs.filesystem;
  
-import java.io.*;
+package org.apache.poi.poifs.filesystem;
  
-import java.util.*;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
  
  import org.apache.poi.hpsf.ClassID;
  
@@ -67,6 +68,12 @@ public interface DirectoryEntry
  
      public int getEntryCount();
  
+    /**
+     * Checks whether an entry with the specified name is present
+     */
+
+    public boolean hasEntry( final String name );
+
      /**
       * get a specified Entry by name
       *
diff --git a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java

index d6f86d5ef4ecf37a93b6f3dbd2d0c1a26e75a694..20effd8bdb851a817c5ea2dde6b3a4a7287ce06d 100644 (file)
--- a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
+++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
@@ -15,7 +15,7 @@
     See the License for the specific language governing permissions and
     limitations under the License.
  ==================================================================== */
-        
+
  
  package org.apache.poi.poifs.filesystem;
  
@@ -53,7 +53,7 @@ public class DirectoryNode
      // the POIFSFileSystem we belong to
      private POIFSFileSystem   _ofilesystem;
      // the NPOIFSFileSytem we belong to
-    private NPOIFSFileSystem  _nfilesystem; 
+    private NPOIFSFileSystem  _nfilesystem;
  
      // the path described by this document
      private POIFSDocumentPath _path;
@@ -72,7 +72,7 @@ public class DirectoryNode
      {
         this(property, parent, filesystem, (NPOIFSFileSystem)null);
      }
-    
+
      /**
       * create a DirectoryNode. This method is not public by design; it
       * is intended strictly for the internal use of this package
@@ -87,7 +87,7 @@ public class DirectoryNode
      {
         this(property, parent, (POIFSFileSystem)null, nfilesystem);
      }
-    
+
      private DirectoryNode(final DirectoryProperty property,
                            final DirectoryNode parent,
                            final POIFSFileSystem ofilesystem,
@@ -96,7 +96,7 @@ public class DirectoryNode
          super(property, parent);
          this._ofilesystem = ofilesystem;
          this._nfilesystem = nfilesystem;
-        
+
          if (parent == null)
          {
              _path = new POIFSDocumentPath();
@@ -143,23 +143,23 @@ public class DirectoryNode
      {
          return _path;
      }
-    
+
      /**
       * @return the filesystem that this belongs to
       */
      public POIFSFileSystem getFileSystem()
      {
-        return _ofilesystem; 
+        return _ofilesystem;
      }
-    
+
      /**
       * @return the filesystem that this belongs to
       */
      public NPOIFSFileSystem getNFileSystem()
      {
-        return _nfilesystem; 
+        return _nfilesystem;
      }
-    
+
      /**
       * open a document in the directory's entry's list of entries
       *
@@ -195,7 +195,7 @@ public class DirectoryNode
              throw new IOException("Entry '" + document.getName()
                                    + "' is not a DocumentEntry");
          }
-        
+
          DocumentEntry entry = (DocumentEntry)document;
          return new DocumentInputStream(entry);
      }
@@ -217,7 +217,7 @@ public class DirectoryNode
  
          (( DirectoryProperty ) getProperty()).addChild(property);
          _ofilesystem.addDocument(document);
-        
+
          _entries.add(rval);
          _byname.put(property.getName(), rval);
          return rval;
@@ -240,7 +240,7 @@ public class DirectoryNode
  
          (( DirectoryProperty ) getProperty()).addChild(property);
          _nfilesystem.addDocument(document);
-        
+
          _entries.add(rval);
          _byname.put(property.getName(), rval);
          return rval;
@@ -290,7 +290,7 @@ public class DirectoryNode
          {
              _entries.remove(entry);
                    _byname.remove(entry.getName());
-                  
+
                    if(_ofilesystem != null) {
                 _ofilesystem.remove(entry);
                    } else {
@@ -342,6 +342,11 @@ public class DirectoryNode
          return _entries.size();
      }
  
+    public boolean hasEntry( String name )
+    {
+        return name != null && _byname.containsKey( name );
+    }
+
      /**
       * get a specified Entry by name
       *
@@ -430,7 +435,7 @@ public class DirectoryNode
      {
          DirectoryNode rval;
          DirectoryProperty property = new DirectoryProperty(name);
-        
+
          if(_ofilesystem != null) {
             rval = new DirectoryNode(property, _ofilesystem, this);
             _ofilesystem.addDirectory(property);
@@ -562,7 +567,7 @@ public class DirectoryNode
       * Returns an Iterator over all the entries
       */
      public Iterator<Entry> iterator() {
-        return getEntries(); 
+        return getEntries();
      }
  
      /* **********  END  begin implementation of POIFSViewable ********** */
diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java

index 57b9aa914fa446870da8c1ab53b75c65774527f5..685d92c84529adf8c3d9dff54b2a110c7d067cf8 100644 (file)
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -66,48 +66,48 @@ import org.apache.xmlbeans.XmlException;
  public class ExtractorFactory {
         public static final String CORE_DOCUMENT_REL =
                 "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
-       
-       
+
+
         /** Should this thread prefer event based over usermodel based extractors? */
         private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
        protected Boolean initialValue() { return Boolean.FALSE; }
         };
         /** Should all threads prefer event based over usermodel based extractors? */
         private static Boolean allPreferEventExtractors;
-       
-   /** 
+
+   /**
      * Should this thread prefer event based over usermodel based extractors?
-    * (usermodel extractors tend to be more accurate, but use more memory) 
-    * Default is false. 
+    * (usermodel extractors tend to be more accurate, but use more memory)
+    * Default is false.
      */
         public static boolean getThreadPrefersEventExtractors() {
            return threadPreferEventExtractors.get();
         }
-   /** 
-    * Should all threads prefer event based over usermodel based extractors? 
-    * (usermodel extractors tend to be more accurate, but use more memory) 
-    * Default is to use the thread level setting, which defaults to false. 
+   /**
+    * Should all threads prefer event based over usermodel based extractors?
+    * (usermodel extractors tend to be more accurate, but use more memory)
+    * Default is to use the thread level setting, which defaults to false.
      */
         public static Boolean getAllThreadsPreferEventExtractors() {
            return allPreferEventExtractors;
         }
-       
-   /** 
+
+   /**
      * Should this thread prefer event based over usermodel based extractors?
-    * Will only be used if the All Threads setting is null. 
+    * Will only be used if the All Threads setting is null.
      */
     public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
        threadPreferEventExtractors.set(preferEventExtractors);
     }
-   /** 
+   /**
      * Should all threads prefer event based over usermodel based extractors?
-    * If set, will take preference over the Thread level setting. 
+    * If set, will take preference over the Thread level setting.
      */
     public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
        allPreferEventExtractors = preferEventExtractors;
     }
-       
-   
+
+
     /**
      * Should this thread use event based extractors is available?
      * Checks the all-threads one first, then thread specific.
@@ -118,8 +118,8 @@ public class ExtractorFactory {
        }
        return threadPreferEventExtractors.get();
     }
-   
-       
+
+
         public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
                 InputStream inp = null;
          try {
@@ -137,14 +137,14 @@ public class ExtractorFactory {
              if(inp != null) inp.close();
          }
      }
-       
+
         public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
                 // Figure out the kind of stream
                 // If clearly doesn't do mark/reset, wrap up
                 if(! inp.markSupported()) {
                         inp = new PushbackInputStream(inp, 8);
                 }
-               
+
                 if(POIFSFileSystem.hasPOIFSHeader(inp)) {
                         return createExtractor(new POIFSFileSystem(inp));
                 }
@@ -153,16 +153,16 @@ public class ExtractorFactory {
                 }
                 throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
         }
-       
+
         public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
-       PackageRelationshipCollection core = 
+       PackageRelationshipCollection core =
              pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
         if(core.size() != 1) {
            throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
         }
  
         PackagePart corePart = pkg.getPart(core.getRelationship(0));
-        
+
         // Is it XSSF?
         for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
            if(corePart.getContentType().equals(rel.getContentType())) {
@@ -173,84 +173,98 @@ public class ExtractorFactory {
               }
            }
         }
-        
+
         // Is it XWPF?
         for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
            if(corePart.getContentType().equals(rel.getContentType())) {
               return new XWPFWordExtractor(pkg);
            }
         }
-       
+
         // Is it XSLF?
         for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
            if(corePart.getContentType().equals(rel.getContentType())) {
               return new XSLFPowerPointExtractor(pkg);
            }
         }
-       
+
         throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
         }
-       
+
         public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
            // Only ever an OLE2 one from the root of the FS
-               return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
+               return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
         }
-       public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
-               // Look for certain entries in the stream, to figure it
-               //  out from
-               for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
-                       Entry entry = entries.next();
-                       
-                       if(entry.getName().equals("Workbook")) {
-                          if(getPreferEventExtractor()) {
-               return new EventBasedExcelExtractor(poifsDir, fs);
-                          } else {
-                             return new ExcelExtractor(poifsDir, fs);
-                          }
-                       }
-                       if(entry.getName().equals("WordDocument")) {
-                           // Old or new style word document?
-                           try {
-                               return new WordExtractor(poifsDir, fs);
-                           } catch(OldWordFileFormatException e) {
-                               return new Word6Extractor(poifsDir, fs);
-                           }
-                       }
-                       if(entry.getName().equals("PowerPoint Document")) {
-                               return new PowerPointExtractor(poifsDir, fs);
-                       }
-                       if(entry.getName().equals("VisioDocument")) {
-                               return new VisioTextExtractor(poifsDir, fs);
-                       }
-         if(entry.getName().equals("Quill")) {
-            return new PublisherTextExtractor(poifsDir, fs);
-         }
-                       if(
-                entry.getName().equals("__substg1.0_1000001E") ||
-                entry.getName().equals("__substg1.0_1000001F") ||
-                entry.getName().equals("__substg1.0_0047001E") ||
-                entry.getName().equals("__substg1.0_0047001F") ||
-                entry.getName().equals("__substg1.0_0037001E") ||
-                entry.getName().equals("__substg1.0_0037001F")
-                       ) {
-                          return new OutlookTextExtactor(poifsDir, fs);
-                       }
-                       if(entry.getName().equals("Package")) {
-                          OPCPackage pkg = OPCPackage.open(
-                                poifsDir.createDocumentInputStream(entry.getName())
-                          );
-                          return createExtractor(pkg);
-                       }
-               }
-               throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
-       }
-       
-       
+
+    /**
+     * @deprecated Use {@link #createExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings("unused")
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs)
+            throws IOException, InvalidFormatException, OpenXML4JException, XmlException
+    {
+        return createExtractor(poifsDir);
+    }
+
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
+            InvalidFormatException, OpenXML4JException, XmlException
+    {
+        // Look for well-known entry names in the directory to work
+        // out which kind of document this is
+        if (poifsDir.hasEntry("Workbook")) {
+            if (getPreferEventExtractor()) {
+                return new EventBasedExcelExtractor(poifsDir);
+            }
+            return new ExcelExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("WordDocument")) {
+            // Old or new style word document?
+            try {
+                return new WordExtractor(poifsDir);
+            } catch (OldWordFileFormatException e) {
+                return new Word6Extractor(poifsDir);
+            }
+        }
+
+        if (poifsDir.hasEntry("PowerPoint Document")) {
+            return new PowerPointExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("VisioDocument")) {
+            return new VisioTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("Quill")) {
+            return new PublisherTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F")
+                || poifsDir.hasEntry("__substg1.0_0047001E")
+                || poifsDir.hasEntry("__substg1.0_0047001F")
+                || poifsDir.hasEntry("__substg1.0_0037001E")
+                || poifsDir.hasEntry("__substg1.0_0037001F"))
+        {
+            return new OutlookTextExtactor(poifsDir);
+        }
+
+        for (Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext();) {
+            Entry entry = entries.next();
+
+            if (entry.getName().equals("Package")) {
+                OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
+                return createExtractor(pkg);
+            }
+        }
+        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+    }
+
         /**
          * Returns an array of text extractors, one for each of
          *  the embeded documents in the file (if there are any).
          * If there are no embeded documents, you'll get back an
-        *  empty array. Otherwise, you'll get one open 
+        *  empty array. Otherwise, you'll get one open
          *  {@link POITextExtractor} for each embeded file.
          */
         public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
@@ -258,16 +272,16 @@ public class ExtractorFactory {
                 ArrayList<Entry> dirs = new ArrayList<Entry>();
                 // For anything else not directly held in as a POIFS directory
                 ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
-               
+
        // Find all the embeded directories
-               POIFSFileSystem fs = ext.getFileSystem();
-               if(fs == null) {
+               DirectoryEntry root = ext.getRoot();
+               if(root == null) {
                         throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
                 }
-               
+
                 if(ext instanceof ExcelExtractor) {
                         // These are in MBD... under the root
-                       Iterator<Entry> it = fs.getRoot().getEntries();
+                       Iterator<Entry> it = root.getEntries();
                         while(it.hasNext()) {
                                 Entry entry = it.next();
                                 if(entry.getName().startsWith("MBD")) {
@@ -278,7 +292,7 @@ public class ExtractorFactory {
                         // These are in ObjectPool -> _... under the root
                         try {
                                 DirectoryEntry op = (DirectoryEntry)
-                                       fs.getRoot().getEntry("ObjectPool");
+                                       root.getEntry("ObjectPool");
                                 Iterator<Entry> it = op.getEntries();
                                 while(it.hasNext()) {
                                         Entry entry = it.next();
@@ -302,7 +316,7 @@ public class ExtractorFactory {
                       }
                    }
                 }
-               
+
                 // Create the extractors
                 if(
                       (dirs == null || dirs.size() == 0) &&
@@ -310,11 +324,11 @@ public class ExtractorFactory {
                 ){
                         return new POITextExtractor[0];
                 }
-               
+
                 ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
                 for(int i=0; i<dirs.size(); i++) {
                         e.add( createExtractor(
-                                       (DirectoryNode)dirs.get(i), ext.getFileSystem()
+                                       (DirectoryNode)dirs.get(i)
                         ) );
                 }
                 for(int i=0; i<nonPOIFS.size(); i++) {
@@ -336,7 +350,7 @@ public class ExtractorFactory {
          * Returns an array of text extractors, one for each of
          *  the embeded documents in the file (if there are any).
          * If there are no embeded documents, you'll get back an
-        *  empty array. Otherwise, you'll get one open 
+        *  empty array. Otherwise, you'll get one open
          *  {@link POITextExtractor} for each embeded file.
          */
         public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java

index 72843973fb10bc8a94e4ef15407b963635fc3221..3beb3ffe73e75ba1285bd5a60e38c74678aa26ad 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@@ -23,6 +23,8 @@ import java.io.IOException;
  import java.io.InputStream;
  import java.io.OutputStream;
  
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
  import org.apache.poi.hwpf.model.BookmarksTables;
  import org.apache.poi.hwpf.model.CHPBinTable;
  import org.apache.poi.hwpf.model.CPSplitCalculator;
@@ -190,7 +192,9 @@ public final class HWPFDocument extends HWPFDocumentCore
     * @param pfilesystem The POIFSFileSystem that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the passed
     *         in POIFSFileSystem.
+   * @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
     */
+  @Deprecated
    public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
    {
       this(directory);
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java

index 50171c37e77b1018a30552552a18e15add00ecfc..fc2bb15e68641e19fba7db48ba766118b33e1295 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
@@ -17,10 +17,17 @@
  
  package org.apache.poi.hwpf;
  
+import java.io.FileNotFoundException;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.PushbackInputStream;
  
+import org.apache.poi.hwpf.usermodel.ObjectsPool;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
  import org.apache.poi.EncryptedDocumentException;
  import org.apache.poi.POIDocument;
  import org.apache.poi.hwpf.model.CHPBinTable;
@@ -46,6 +53,9 @@ import org.apache.poi.util.Internal;
   */
  public abstract class HWPFDocumentCore extends POIDocument
  {
+  /** Holds OLE2 objects */
+  protected ObjectPoolImpl _objectPool;
+
    /** The FIB */
    protected FileInformationBlock _fib;
  
@@ -148,7 +158,21 @@ public abstract class HWPFDocumentCore extends POIDocument
      if(_fib.isFEncrypted()) {
         throw new EncryptedDocumentException("Cannot process encrypted word files!");
      }
-  }
+
+        {
+            DirectoryEntry objectPoolEntry;
+            try
+            {
+                objectPoolEntry = (DirectoryEntry) directory
+                        .getEntry( "ObjectPool" );
+            }
+            catch ( FileNotFoundException exc )
+            {
+                objectPoolEntry = directory.createDirectory( "ObjectPool" );
+            }
+            _objectPool = new ObjectPoolImpl( objectPoolEntry );
+        }
+    }
  
      /**
       * Returns the range which covers the whole of the document, but excludes
@@ -211,5 +235,10 @@ public abstract class HWPFDocumentCore extends POIDocument
      return _fib;
    }
  
+    /**
+     * Gives access to the pool holding this document's OLE2 objects.
+     *
+     * @return the value of the {@code _objectPool} field
+     */
+    public ObjectsPool getObjectsPool()
+    {
+        return _objectPool;
+    }
+
      public abstract TextPieceTable getTextTable();
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java

index 182e6344458af957885ada978c3398ebef38c9d0..2d636e06c798a9409029a81db0bb956fb443feaa 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@@ -44,6 +44,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
          this(fs.getRoot());
      }
  
+    @Deprecated
      public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
              throws IOException {
         this(directory);
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java

index e71aed0ac69f49ca7210a669888f87808de07758..cc104b84aaa03fe7cdd35fe764776ed85202da27 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
@@ -47,6 +47,7 @@ import org.apache.poi.hwpf.usermodel.Section;
  import org.apache.poi.hwpf.usermodel.Table;
  import org.apache.poi.hwpf.usermodel.TableCell;
  import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.Entry;
  import org.apache.poi.util.Beta;
  import org.apache.poi.util.POILogFactory;
  import org.apache.poi.util.POILogger;
@@ -56,6 +57,32 @@ import org.w3c.dom.Element;
  @Beta
  public abstract class AbstractWordConverter
  {
+    /**
+     * Wraps a {@link Bookmark} or a {@link Field} together with the start and
+     * end positions it reports. Instances compare by start position only.
+     */
+    private static final class Structure implements Comparable<Structure>
+    {
+        final int end;
+        final int start;
+        final Object structure;
+
+        /** Captures the start and end reported by the given bookmark. */
+        Structure( Bookmark bookmark )
+        {
+            start = bookmark.getStart();
+            end = bookmark.getEnd();
+            structure = bookmark;
+        }
+
+        /** Captures the start and end offsets reported by the given field. */
+        Structure( Field field )
+        {
+            start = field.getFieldStartOffset();
+            end = field.getFieldEndOffset();
+            structure = field;
+        }
+
+        /** Orders by ascending {@link #start}; {@link #end} is ignored. */
+        public int compareTo( Structure other )
+        {
+            if ( start == other.start )
+                return 0;
+            return start < other.start ? -1 : 1;
+        }
+    }
+
      private static final byte BEL_MARK = 7;
  
      private static final byte FIELD_BEGIN_MARK = 19;
@@ -396,6 +423,13 @@ public abstract class AbstractWordConverter
                      processDrawnObject( doc, characterRun, block );
                      continue;
                  }
+                if ( characterRun.isOle2()
+                        && ( wordDocument instanceof HWPFDocument ) )
+                {
+                    HWPFDocument doc = (HWPFDocument) wordDocument;
+                    processOle2( doc, characterRun, block );
+                    continue;
+                }
              }
  
              if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
@@ -613,10 +647,11 @@ public abstract class AbstractWordConverter
              CharacterRun characterRun, OfficeDrawing officeDrawing,
              String path, Element block );
  
-    protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range endnoteTextRange );
+    protected abstract void processEndnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range endnoteTextRange );
  
-    protected void processField( HWPFDocument hwpfDocument, Range parentRange,
+    protected void processField( HWPFDocument wordDocument, Range parentRange,
              int currentTableLevel, Field field, Element currentBlock )
      {
          switch ( field.getType() )
@@ -633,7 +668,7 @@ public abstract class AbstractWordConverter
                  if ( matcher.find() )
                  {
                      String pageref = matcher.group( 1 );
-                    processPageref( hwpfDocument, currentBlock,
+                    processPageref( wordDocument, currentBlock,
                              field.secondSubrange( parentRange ),
                              currentTableLevel, pageref );
                      return;
@@ -641,6 +676,36 @@ public abstract class AbstractWordConverter
              }
              break;
          }
+        case 58: // Embedded Object
+        {
+            if ( !field.hasSeparator() )
+            {
+                logger.log( POILogger.WARN, parentRange + " contains " + field
+                        + " with 'Embedded Object' but without separator mark" );
+                return;
+            }
+
+            CharacterRun separator = field
+                    .getMarkSeparatorCharacterRun( parentRange );
+
+            if ( separator.isOle2() )
+            {
+                // the only supported so far
+                boolean processed = processOle2( wordDocument, separator,
+                        currentBlock );
+
+                // if we didn't output OLE - output field value
+                if ( !processed )
+                {
+                    processCharacters( wordDocument, currentTableLevel,
+                            field.secondSubrange( parentRange ), currentBlock );
+                }
+
+                return;
+            }
+
+            break;
+        }
          case 88: // hyperlink
          {
              final Range firstSubrange = field.firstSubrange( parentRange );
@@ -653,7 +718,7 @@ public abstract class AbstractWordConverter
                  if ( matcher.find() )
                  {
                      String hyperlink = matcher.group( 1 );
-                    processHyperlink( hwpfDocument, currentBlock,
+                    processHyperlink( wordDocument, currentBlock,
                              field.secondSubrange( parentRange ),
                              currentTableLevel, hyperlink );
                      return;
@@ -665,12 +730,13 @@ public abstract class AbstractWordConverter
  
          logger.log( POILogger.WARN, parentRange + " contains " + field
                  + " with unsupported type or format" );
-        processCharacters( hwpfDocument, currentTableLevel,
+        processCharacters( wordDocument, currentTableLevel,
                  field.secondSubrange( parentRange ), currentBlock );
      }
  
-    protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range footnoteTextRange );
+    protected abstract void processFootnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range footnoteTextRange );
  
      protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
              Element currentBlock, Range textRange, int currentTableLevel,
@@ -732,6 +798,40 @@ public abstract class AbstractWordConverter
          }
      }
  
+    /**
+     * Resolves the OLE2 object referenced by the given character run and
+     * passes it to {@link #processOle2(HWPFDocument, Element, Entry)}. The
+     * object is looked up in the document's objects pool under the id
+     * <tt>"_" + picOffset</tt>.
+     *
+     * @return the result of the delegate call, or <tt>false</tt> (after
+     *         logging a warning) if the object is missing from the pool or
+     *         the delegate threw an exception
+     */
+    private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
+            Element block )
+    {
+        final Entry poolEntry = doc.getObjectsPool().getObjectById(
+                "_" + characterRun.getPicOffset() );
+
+        if ( poolEntry != null )
+        {
+            try
+            {
+                return processOle2( doc, block, poolEntry );
+            }
+            catch ( Exception exc )
+            {
+                // conversion failures are reported but never propagated
+                logger.log( POILogger.WARN,
+                        "Unable to convert internal OLE2 object '",
+                        Integer.valueOf( characterRun.getPicOffset() ), "': ",
+                        exc, exc );
+                return false;
+            }
+        }
+
+        logger.log( POILogger.WARN, "Referenced OLE2 object '",
+                Integer.valueOf( characterRun.getPicOffset() ),
+                "' not found in ObjectPool" );
+        return false;
+    }
+
+    /**
+     * Hook for converting an embedded OLE2 object into the output. This
+     * default implementation outputs nothing and always returns
+     * <tt>false</tt>.
+     *
+     * @param wordDocument
+     *            document being converted (ignored here)
+     * @param block
+     *            element that would receive the output (ignored here)
+     * @param entry
+     *            entry of the embedded object (ignored here)
+     * @return <tt>true</tt> if the object was output, <tt>false</tt>
+     *         otherwise
+     * @throws Exception
+     *             declared for overriding implementations; never thrown here
+     */
+    @SuppressWarnings( "unused" )
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        return false;
+    }
+
      protected abstract void processPageref( HWPFDocumentCore wordDocument,
              Element currentBlock, Range textRange, int currentTableLevel,
              String pageref );
@@ -896,30 +996,4 @@ public abstract class AbstractWordConverter
          return endMark;
      }
  
-    private static final class Structure implements Comparable<Structure>
-    {
-        final int end;
-        final int start;
-        final Object structure;
-
-        Structure( Bookmark bookmark )
-        {
-            this.start = bookmark.getStart();
-            this.end = bookmark.getEnd();
-            this.structure = bookmark;
-        }
-
-        Structure( Field field )
-        {
-            this.start = field.getFieldStartOffset();
-            this.end = field.getFieldEndOffset();
-            this.structure = field;
-        }
-
-        public int compareTo( Structure o )
-        {
-            return start < o.start ? -1 : start == o.start ? 0 : 1;
-        }
-    }
-
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java

index e7b85b0b748efbe9347e49a0f22da26e34126558..eeb5d7843f94757657f17d557ee61bcff589d147 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
@@ -34,6 +34,7 @@ import org.apache.poi.hwpf.usermodel.Paragraph;
  import org.apache.poi.hwpf.usermodel.Table;
  import org.apache.poi.hwpf.usermodel.TableCell;
  import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  import org.apache.poi.util.Beta;
  import org.apache.poi.util.IOUtils;
@@ -422,6 +423,19 @@ public class AbstractWordUtils
          return !isEmpty( str );
      }
  
+    /**
+     * Loads a Word document from the given directory node. The node is first
+     * opened as a {@link HWPFDocument}; if that attempt is rejected with an
+     * {@link OldWordFileFormatException}, it is opened as a
+     * {@link HWPFOldDocument} instead.
+     *
+     * @param root
+     *            directory node holding the document
+     * @return the loaded document
+     * @throws IOException
+     *             propagated from the document constructors
+     */
+    public static HWPFDocumentCore loadDoc( final DirectoryNode root )
+            throws IOException
+    {
+        HWPFDocumentCore loaded;
+        try
+        {
+            loaded = new HWPFDocument( root );
+        }
+        catch ( OldWordFileFormatException oldFormat )
+        {
+            // retry with the class dedicated to the old file format
+            loaded = new HWPFOldDocument( root );
+        }
+        return loaded;
+    }
+
      public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
      {
          final FileInputStream istream = new FileInputStream( docFile );
@@ -438,16 +452,13 @@ public class AbstractWordUtils
      public static HWPFDocumentCore loadDoc( InputStream inputStream )
              throws IOException
      {
-        final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
-                .verifyAndBuildPOIFS( inputStream );
-        try
-        {
-            return new HWPFDocument( poifsFileSystem );
-        }
-        catch ( OldWordFileFormatException exc )
-        {
-            return new HWPFOldDocument( poifsFileSystem );
-        }
+        return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) );
+    }
+
+    public static HWPFDocumentCore loadDoc(
+            final POIFSFileSystem poifsFileSystem ) throws IOException
+    {
+        return loadDoc( poifsFileSystem.getRoot() );
      }
  
      static String substringBeforeLast( String str, String separator )
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java

index dd9cfe8824055376b2d6f9b1e4d527d6ac759fc7..7e3e4dfe7df3debb4e300320fc6d8368edfb7f0c 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
@@ -276,8 +276,8 @@ public class WordToFoConverter extends AbstractWordConverter
      }
  
      @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
      {
          final String textIndex = String.valueOf( internalLinkCounter
                  .incrementAndGet() );
@@ -297,7 +297,8 @@ public class WordToFoConverter extends AbstractWordConverter
          setId( backwardLink, forwardLinkName );
          endnote.appendChild( backwardLink );
  
-        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
+        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
+                endnote );
  
          WordToFoUtils.compactInlines( endnote );
          this.endnotes.add( endnote );
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java

index 040d32a8793331b829819a7b10ac0018411a5eb4..e6233d2c3eefa8f2ab7e4697ba36de171819f866 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
@@ -63,7 +63,6 @@ import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;
  @Beta
  public class WordToHtmlConverter extends AbstractWordConverter
  {
-
      /**
       * Holds properties values, applied to current <tt>p</tt> element. Those
       * properties shall not be doubled in children <tt>span</tt> elements.
@@ -282,10 +281,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
      }
  
      @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
      {
-        processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
+        processNoteAutonumbered( wordDocument, "end", noteIndex, block,
+                endnoteTextRange );
      }
  
      @Override
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java

index edea5dab6c5d6249e8755e7c74ca14a2e2e29ec5..fa9b87fcd74f2ddb076c7478e37d6230bbbb9d5c 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
@@ -2,10 +2,14 @@ package org.apache.poi.hwpf.converter;
  
  import java.io.File;
  import java.io.FileWriter;
+import java.io.StringWriter;
+import java.lang.reflect.Method;
  import java.util.List;
  import java.util.concurrent.atomic.AtomicInteger;
  
+import javax.xml.parsers.DocumentBuilder;
  import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
  import javax.xml.transform.OutputKeys;
  import javax.xml.transform.Transformer;
  import javax.xml.transform.TransformerFactory;
@@ -25,6 +29,8 @@ import org.apache.poi.hwpf.usermodel.Section;
  import org.apache.poi.hwpf.usermodel.Table;
  import org.apache.poi.hwpf.usermodel.TableCell;
  import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
  import org.apache.poi.util.Beta;
  import org.w3c.dom.Document;
  import org.w3c.dom.Element;
@@ -33,6 +39,29 @@ import org.w3c.dom.Element;
  public class WordToTextConverter extends AbstractWordConverter
  {
  
+    /**
+     * Loads the Word document stored in the given directory node and
+     * converts it to text.
+     *
+     * @param root
+     *            directory node holding the document
+     * @return the text produced by the converter
+     * @throws Exception
+     *             if loading or conversion fails
+     */
+    public static String getText( DirectoryNode root ) throws Exception
+    {
+        return getText( AbstractWordUtils.loadDoc( root ) );
+    }
+
+    /**
+     * Loads the Word document from the given file and converts it to text.
+     *
+     * @param docFile
+     *            file to load
+     * @return the text produced by the converter
+     * @throws Exception
+     *             if loading or conversion fails
+     */
+    public static String getText( File docFile ) throws Exception
+    {
+        return getText( AbstractWordUtils.loadDoc( docFile ) );
+    }
+
+    /**
+     * Converts an already loaded Word document to text, using a fresh
+     * converter backed by a newly created DOM document.
+     *
+     * @param wordDocument
+     *            document to convert
+     * @return the text produced by the converter
+     * @throws Exception
+     *             if the DOM document cannot be created or conversion fails
+     */
+    public static String getText( final HWPFDocumentCore wordDocument )
+            throws Exception
+    {
+        final DocumentBuilder builder = DocumentBuilderFactory.newInstance()
+                .newDocumentBuilder();
+        final Document target = builder.newDocument();
+
+        final WordToTextConverter converter = new WordToTextConverter( target );
+        converter.processDocument( wordDocument );
+        return converter.getText();
+    }
+
      /**
       * Java main() interface to interact with {@link WordToTextConverter}
       * 
@@ -91,8 +120,24 @@ public class WordToTextConverter extends AbstractWordConverter
  
      private Element notes = null;
  
+    private boolean outputSummaryInformation = false;
+
      private final TextDocumentFacade textDocumentFacade;
  
+    /**
+     * Creates a converter that writes into a newly created, empty DOM
+     * document. The same instance can be used to output several
+     * {@link HWPFDocument}s into a single text document.
+     *
+     * @throws ParserConfigurationException
+     *             if an internal {@link DocumentBuilder} cannot be created
+     */
+    public WordToTextConverter() throws ParserConfigurationException
+    {
+        final DocumentBuilder builder = DocumentBuilderFactory.newInstance()
+                .newDocumentBuilder();
+        final Document emptyDocument = builder.newDocument();
+        this.textDocumentFacade = new TextDocumentFacade( emptyDocument );
+    }
+
      /**
       * Creates new instance of {@link WordToTextConverter}. Can be used for
       * output several {@link HWPFDocument}s into single text document.
@@ -110,6 +155,28 @@ public class WordToTextConverter extends AbstractWordConverter
          return textDocumentFacade.getDocument();
      }
  
+    /**
+     * Serializes the document built so far into a string, using an identity
+     * transformer configured for the <tt>text</tt> output method without
+     * indentation.
+     *
+     * @return the serialized text
+     * @throws Exception
+     *             if the transformer cannot be created or serialization fails
+     */
+    public String getText() throws Exception
+    {
+        final StringWriter buffer = new StringWriter();
+        final DOMSource source = new DOMSource( getDocument() );
+
+        final Transformer textSerializer = TransformerFactory.newInstance()
+                .newTransformer();
+        textSerializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+        textSerializer.setOutputProperty( OutputKeys.INDENT, "no" );
+        textSerializer.setOutputProperty( OutputKeys.METHOD, "text" );
+
+        textSerializer.transform( source, new StreamResult( buffer ) );
+        return buffer.toString();
+    }
+
+    /**
+     * Tells whether summary information (title, author, comments and
+     * keywords) is copied into the output document.
+     *
+     * @return the current value of the flag
+     */
+    public boolean isOutputSummaryInformation()
+    {
+        return outputSummaryInformation;
+    }
+
      @Override
      protected void outputCharacters( Element block, CharacterRun characterRun,
              String text )
@@ -138,18 +205,24 @@ public class WordToTextConverter extends AbstractWordConverter
      protected void processDocumentInformation(
              SummaryInformation summaryInformation )
      {
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
-            textDocumentFacade.setTitle( summaryInformation.getTitle() );
+        if ( isOutputSummaryInformation() )
+        {
+            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
+                textDocumentFacade.setTitle( summaryInformation.getTitle() );
  
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
-            textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
+            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
+                textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
  
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
-            textDocumentFacade
-                    .addDescription( summaryInformation.getComments() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getComments() ) )
+                textDocumentFacade.addDescription( summaryInformation
+                        .getComments() );
  
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
-            textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getKeywords() ) )
+                textDocumentFacade.addKeywords( summaryInformation
+                        .getKeywords() );
+        }
      }
  
      @Override
@@ -222,6 +295,48 @@ public class WordToTextConverter extends AbstractWordConverter
          note.appendChild( textDocumentFacade.createText( "\n" ) );
      }
  
+    /**
+     * Outputs the text of an embedded OLE2 object, surrounded by zero-width
+     * spaces, into the given block. Embedded Word documents are converted
+     * directly; any other directory is handed to
+     * <tt>org.apache.poi.extractor.ExtractorFactory</tt>, which is looked up
+     * by reflection.
+     *
+     * @return <tt>true</tt> if text was appended; <tt>false</tt> if the entry
+     *         is not a {@link DirectoryNode} or the factory class cannot be
+     *         found
+     */
+    @Override
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        if ( !( entry instanceof DirectoryNode ) )
+            return false;
+        final DirectoryNode embeddedRoot = (DirectoryNode) entry;
+
+        String embeddedText;
+        if ( embeddedRoot.hasEntry( "WordDocument" ) )
+        {
+            // handled here, so it works even if no ExtractorFactory in
+            // classpath
+            embeddedText = WordToTextConverter.getText( embeddedRoot );
+        }
+        else
+        {
+            try
+            {
+                final Class<?> factory = Class
+                        .forName( "org.apache.poi.extractor.ExtractorFactory" );
+                final Object extractor = factory.getMethod( "createExtractor",
+                        DirectoryNode.class ).invoke( null, embeddedRoot );
+                embeddedText = (String) extractor.getClass()
+                        .getMethod( "getText" ).invoke( extractor );
+            }
+            catch ( ClassNotFoundException exc )
+            {
+                // no extractor in classpath
+                return false;
+            }
+        }
+
+        block.appendChild( textDocumentFacade
+                .createText( UNICODECHAR_ZERO_WIDTH_SPACE + embeddedText
+                        + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+        return true;
+    }
+
      @Override
      protected void processPageref( HWPFDocumentCore wordDocument,
              Element currentBlock, Range textRange, int currentTableLevel,
@@ -254,7 +369,7 @@ public class WordToTextConverter extends AbstractWordConverter
          textDocumentFacade.body.appendChild( sectionElement );
      }
  
-    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+    protected void processTable( HWPFDocumentCore wordDocument, Element flow,
              Table table )
      {
          final int tableRows = table.numRows();
@@ -275,8 +390,8 @@ public class WordToTextConverter extends AbstractWordConverter
                      tableCellElement.appendChild( textDocumentFacade
                              .createText( "\t" ) );
  
-                processParagraphes( hwpfDocument, tableCellElement, tableCell,
-                        table.getTableLevel() );
+                processCharacters( wordDocument, table.getTableLevel(),
+                        tableCell, tableCellElement );
                  tableRowElement.appendChild( tableCellElement );
              }
  
@@ -285,4 +400,9 @@ public class WordToTextConverter extends AbstractWordConverter
          }
      }
  
+    /**
+     * Sets the flag reported by {@link #isOutputSummaryInformation()}.
+     *
+     * @param outputDocumentInformation
+     *            new value of the flag
+     */
+    public void setOutputSummaryInformation( boolean outputDocumentInformation )
+    {
+        this.outputSummaryInformation = outputDocumentInformation;
+    }
+
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java

index 123f501781e4a63b59c7c90fc4529ebfc239456c..b5331e8be11c208281059777c9343764f24dfa57 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@@ -19,6 +19,10 @@ package org.apache.poi.hwpf.extractor;
  
  import java.io.IOException;
  import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
  
  import org.apache.poi.POIOLE2TextExtractor;
  import org.apache.poi.hwpf.HWPFOldDocument;
@@ -47,16 +51,32 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
                 this( new POIFSFileSystem(is) );
         }
  
-       /**
-        * Create a new Word Extractor
-        * @param fs POIFSFileSystem containing the word file
-        */
-       public Word6Extractor(POIFSFileSystem fs) throws IOException {
-               this(fs.getRoot(), fs);
-       }
-       public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-           this(new HWPFOldDocument(dir,fs));
-       }
+    /**
+     * Creates a new Word extractor that reads from the root directory of the
+     * given file system.
+     *
+     * @param fs
+     *            POIFSFileSystem containing the word file
+     * @throws IOException
+     *             propagated from the constructor this one delegates to
+     */
+    public Word6Extractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( fs.getRoot() );
+    }
+
+    /**
+     * Creates a new Word extractor for the given directory. The file system
+     * argument is ignored; only the directory is passed on.
+     *
+     * @param dir
+     *            directory holding the word file
+     * @param fs
+     *            not used
+     * @throws IOException
+     *             propagated from the constructor this one delegates to
+     * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    /**
+     * Creates a new Word extractor for the given directory, which is loaded
+     * as a {@link HWPFOldDocument}.
+     *
+     * @param dir
+     *            directory holding the word file
+     * @throws IOException
+     *             propagated from the {@link HWPFOldDocument} constructor
+     */
+    public Word6Extractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFOldDocument( dir ) );
+    }
  
         /**
          * Create a new Word Extractor
@@ -71,6 +91,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
       * Get the text from the word file, as an array with one String
       *  per paragraph
       */
+       @Deprecated
         public String[] getParagraphText() {
             String[] ret;
  
@@ -95,13 +116,25 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
             return ret;
         }
  
-    public String getText() {
-        StringBuffer text = new StringBuffer();
-        
-        for(String t : getParagraphText()) {
-            text.append(t);
+    public String getText()
+    {
+        try
+        {
+            WordToTextConverter wordToTextConverter = new WordToTextConverter();
+            wordToTextConverter.processDocument( doc );
+            return wordToTextConverter.getText();
          }
+        catch ( Exception exc )
+        {
+            // fall-back
+            StringBuffer text = new StringBuffer();
  
-        return text.toString();
+            for ( String t : getParagraphText() )
+            {
+                text.append( t );
+            }
+
+            return text.toString();
+        }
      }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

index 464b11a406907aa0883d3584cfadad8d43275fba..8438df4f0402f7741e885efb247824782298645a 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -20,9 +20,12 @@ package org.apache.poi.hwpf.extractor;
  import java.io.FileInputStream;
  import java.io.IOException;
  import java.io.InputStream;
+import java.io.StringWriter;
  import java.util.ArrayList;
  import java.util.Arrays;
  
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+
  import org.apache.poi.POIOLE2TextExtractor;
  import org.apache.poi.hwpf.HWPFDocument;
  import org.apache.poi.hwpf.usermodel.HeaderStories;
@@ -33,231 +36,300 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  
  /**
   * Class to extract the text from a Word Document.
- *
- * You should use either getParagraphText() or getText() unless
- *  you have a strong reason otherwise.
- *
+ * 
+ * You should use either getParagraphText() or getText() unless you have a
+ * strong reason otherwise.
+ * 
   * @author Nick Burch
   */
-public final class WordExtractor extends POIOLE2TextExtractor {
-       private POIFSFileSystem fs;
-       private HWPFDocument doc;
-
-       /**
-        * Create a new Word Extractor
-        * @param is InputStream containing the word file
-        */
-       public WordExtractor(InputStream is) throws IOException {
-               this( HWPFDocument.verifyAndBuildPOIFS(is) );
-       }
-
-       /**
-        * Create a new Word Extractor
-        * @param fs POIFSFileSystem containing the word file
-        */
-       public WordExtractor(POIFSFileSystem fs) throws IOException {
-               this(new HWPFDocument(fs));
-               this.fs = fs;
-       }
-       public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-               this(new HWPFDocument(dir, fs));
-               this.fs = fs;
-       }
-
-       /**
-        * Create a new Word Extractor
-        * @param doc The HWPFDocument to extract from
-        */
-       public WordExtractor(HWPFDocument doc) {
-               super(doc);
-               this.doc = doc;
-       }
-
-       /**
-        * Command line extractor, so people will stop moaning that
-        *  they can't just run this.
-        */
-       public static void main(String[] args) throws IOException {
-               if(args.length == 0) {
-                       System.err.println("Use:");
-                       System.err.println("   java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
-                       System.exit(1);
-               }
-
-               // Process the first argument as a file
-               FileInputStream fin = new FileInputStream(args[0]);
-               WordExtractor extractor = new WordExtractor(fin);
-               System.out.println(extractor.getText());
-       }
-
-       /**
-        * Get the text from the word file, as an array with one String
-        *  per paragraph
-        */
-        public String[] getParagraphText() {
-                String[] ret;
-
-                // Extract using the model code
-                try {
-                        Range r = doc.getRange();
-
-                        ret = getParagraphText(r);
-                } catch (Exception e) {
-                        // Something's up with turning the text pieces into paragraphs
-                        // Fall back to ripping out the text pieces
-                        ret = new String[1];
-                        ret[0] = getTextFromPieces();
-                }
+public final class WordExtractor extends POIOLE2TextExtractor
+{
+    private HWPFDocument doc;
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param is
+     *            InputStream containing the word file
+     */
+    public WordExtractor( InputStream is ) throws IOException
+    {
+        this( HWPFDocument.verifyAndBuildPOIFS( is ) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
+     */
+    public WordExtractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( new HWPFDocument( fs ) );
+    }
+
+    /**
+     * @deprecated Use {@link #WordExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    public WordExtractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    public WordExtractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFDocument( dir ) );
+    }
+
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param doc
+     *            The HWPFDocument to extract from
+     */
+    public WordExtractor( HWPFDocument doc )
+    {
+        super( doc );
+        this.doc = doc;
+    }
+
+    /**
+     * Command line extractor, so people will stop moaning that they can't just
+     * run this.
+     */
+    public static void main( String[] args ) throws IOException
+    {
+        if ( args.length == 0 )
+        {
+            System.err.println( "Use:" );
+            System.err
+                    .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
+            System.exit( 1 );
+        }
  
-                return ret;
+        // Process the first argument as a file
+        FileInputStream fin = new FileInputStream( args[0] );
+        WordExtractor extractor = new WordExtractor( fin );
+        System.out.println( extractor.getText() );
+    }
+
+    /**
+     * Get the text from the word file, as an array with one String per
+     * paragraph
+     */
+    public String[] getParagraphText()
+    {
+        String[] ret;
+
+        // Extract using the model code
+        try
+        {
+            Range r = doc.getRange();
+
+            ret = getParagraphText( r );
+        }
+        catch ( Exception e )
+        {
+            // Something's up with turning the text pieces into paragraphs
+            // Fall back to ripping out the text pieces
+            ret = new String[1];
+            ret[0] = getTextFromPieces();
          }
  
-        public String[] getFootnoteText() {
-                Range r = doc.getFootnoteRange();
+        return ret;
+    }
  
-                return getParagraphText(r);
-        }
+    public String[] getFootnoteText()
+    {
+        Range r = doc.getFootnoteRange();
  
-        public String[] getMainTextboxText() {
-                Range r = doc.getMainTextboxRange();
+        return getParagraphText( r );
+    }
  
-                return getParagraphText(r);
-        }
+    public String[] getMainTextboxText()
+    {
+        Range r = doc.getMainTextboxRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getEndnoteText()
+    {
+        Range r = doc.getEndnoteRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getCommentsText()
+    {
+        Range r = doc.getCommentsRange();
+
+        return getParagraphText( r );
+    }
  
-        public String[] getEndnoteText() {
-                Range r = doc.getEndnoteRange();
+    protected static String[] getParagraphText( Range r )
+    {
+        String[] ret;
+        ret = new String[r.numParagraphs()];
+        for ( int i = 0; i < ret.length; i++ )
+        {
+            Paragraph p = r.getParagraph( i );
+            ret[i] = p.text();
  
-                return getParagraphText(r);
+            // Fix the line ending
+            if ( ret[i].endsWith( "\r" ) )
+            {
+                ret[i] = ret[i] + "\n";
+            }
+        }
+        return ret;
+    }
+
+    /**
+     * Add the header/footer text, if it's not empty
+     */
+    private void appendHeaderFooter( String text, StringBuffer out )
+    {
+        if ( text == null || text.length() == 0 )
+            return;
+
+        text = text.replace( '\r', '\n' );
+        if ( !text.endsWith( "\n" ) )
+        {
+            out.append( text );
+            out.append( '\n' );
+            return;
+        }
+        if ( text.endsWith( "\n\n" ) )
+        {
+            out.append( text.substring( 0, text.length() - 1 ) );
+            return;
+        }
+        out.append( text );
+        return;
+    }
+
+    /**
+     * Grab the text from the headers
+     */
+    @Deprecated
+    public String getHeaderText()
+    {
+        HeaderStories hs = new HeaderStories( doc );
+
+        StringBuffer ret = new StringBuffer();
+        if ( hs.getFirstHeader() != null )
+        {
+            appendHeaderFooter( hs.getFirstHeader(), ret );
+        }
+        if ( hs.getEvenHeader() != null )
+        {
+            appendHeaderFooter( hs.getEvenHeader(), ret );
+        }
+        if ( hs.getOddHeader() != null )
+        {
+            appendHeaderFooter( hs.getOddHeader(), ret );
          }
  
-        public String[] getCommentsText() {
-                Range r = doc.getCommentsRange();
+        return ret.toString();
+    }
+
+    /**
+     * Grab the text from the footers
+     */
+    @Deprecated
+    public String getFooterText()
+    {
+        HeaderStories hs = new HeaderStories( doc );
+
+        StringBuffer ret = new StringBuffer();
+        if ( hs.getFirstFooter() != null )
+        {
+            appendHeaderFooter( hs.getFirstFooter(), ret );
+        }
+        if ( hs.getEvenFooter() != null )
+        {
+            appendHeaderFooter( hs.getEvenFooter(), ret );
+        }
+        if ( hs.getOddFooter() != null )
+        {
+            appendHeaderFooter( hs.getOddFooter(), ret );
+        }
  
-                return getParagraphText(r);
+        return ret.toString();
+    }
+
+    /**
+     * Grab the text out of the text pieces. Might also include various bits of
+     * crud, but will work in cases where the text piece -> paragraph mapping is
+     * broken. Fast too.
+     */
+    public String getTextFromPieces()
+    {
+        String text = doc.getDocumentText();
+
+        // Fix line endings (Note - won't get all of them
+        text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
+        text = text.replaceAll( "\r\r", "\r\n\r\n" );
+
+        if ( text.endsWith( "\r" ) )
+        {
+            text += "\n";
          }
  
-        protected static String[] getParagraphText(Range r) {
-                String[] ret;
-                ret = new String[r.numParagraphs()];
-                for (int i = 0; i < ret.length; i++) {
-                        Paragraph p = r.getParagraph(i);
-                        ret[i] = p.text();
-
-                        // Fix the line ending
-                        if (ret[i].endsWith("\r")) {
-                                ret[i] = ret[i] + "\n";
-                        }
+        return text;
+    }
+
+    /**
+     * Grab the text, based on the WordToTextConverter. Shouldn't include any
+     * crud, but slower than getTextFromPieces().
+     */
+    public String getText()
+    {
+        try
+        {
+            final StringWriter stringWriter = new StringWriter();
+            @SuppressWarnings( "unused" )
+            WordToTextConverter wordToTextConverter = new WordToTextConverter()
+            {
+                {
+                    HeaderStories hs = new HeaderStories( doc );
+
+                    if ( hs.getFirstHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstHeaderSubrange() );
+                    if ( hs.getEvenHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenHeaderSubrange() );
+                    if ( hs.getOddHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getOddHeaderSubrange() );
+
+                    processDocument( doc );
+                    processDocumentPart( doc, doc.getMainTextboxRange() );
+
+                    if ( hs.getFirstFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstFooterSubrange() );
+                    if ( hs.getEvenFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenFooterSubrange() );
+                    if ( hs.getOddFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getOddFooterSubrange() );
+
+                    stringWriter.append( getText() );
                  }
-                return ret;
+            };
+            return stringWriter.toString();
          }
-
-        /**
-        * Add the header/footer text, if it's not empty
-        */
-       private void appendHeaderFooter(String text, StringBuffer out) {
-               if(text == null || text.length() == 0)
-                       return;
-
-               text = text.replace('\r', '\n');
-               if(! text.endsWith("\n")) {
-                       out.append(text);
-                       out.append('\n');
-                       return;
-               }
-               if(text.endsWith("\n\n")) {
-                       out.append(text.substring(0, text.length()-1));
-                       return;
-               }
-               out.append(text);
-               return;
-       }
-       /**
-        * Grab the text from the headers
-        */
-       public String getHeaderText() {
-               HeaderStories hs = new HeaderStories(doc);
-
-               StringBuffer ret = new StringBuffer();
-               if(hs.getFirstHeader() != null) {
-                       appendHeaderFooter(hs.getFirstHeader(), ret);
-               }
-               if(hs.getEvenHeader() != null) {
-                       appendHeaderFooter(hs.getEvenHeader(), ret);
-               }
-               if(hs.getOddHeader() != null) {
-                       appendHeaderFooter(hs.getOddHeader(), ret);
-               }
-
-               return ret.toString();
-       }
-       /**
-        * Grab the text from the footers
-        */
-       public String getFooterText() {
-               HeaderStories hs = new HeaderStories(doc);
-
-               StringBuffer ret = new StringBuffer();
-               if(hs.getFirstFooter() != null) {
-                       appendHeaderFooter(hs.getFirstFooter(), ret);
-               }
-               if(hs.getEvenFooter() != null) {
-                       appendHeaderFooter(hs.getEvenFooter(), ret);
-               }
-               if(hs.getOddFooter() != null) {
-                       appendHeaderFooter(hs.getOddFooter(), ret);
-               }
-
-               return ret.toString();
-       }
-
-       /**
-        * Grab the text out of the text pieces. Might also include various
-        *  bits of crud, but will work in cases where the text piece -> paragraph
-        *  mapping is broken. Fast too.
-        */
-       public String getTextFromPieces() {
-       String text = doc.getDocumentText();
-
-       // Fix line endings (Note - won't get all of them
-       text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
-       text = text.replaceAll("\r\r", "\r\n\r\n");
-
-       if(text.endsWith("\r")) {
-               text += "\n";
-       }
-
-       return text;
-       }
-
-       /**
-        * Grab the text, based on the paragraphs. Shouldn't include any crud,
-        *  but slightly slower than getTextFromPieces().
-        */
-       public String getText() {
-          StringBuffer ret = new StringBuffer();
-
-          ret.append(getHeaderText());
-
-          ArrayList<String> text = new ArrayList<String>();
-          text.addAll(Arrays.asList(getParagraphText()));
-          text.addAll(Arrays.asList(getMainTextboxText()));
-          text.addAll(Arrays.asList(getFootnoteText()));
-          text.addAll(Arrays.asList(getEndnoteText()));
-
-          for(String p : text) {
-             ret.append(p);
-          }
-
-          ret.append(getFooterText());
-
-          return ret.toString();
-       }
-
-       /**
-        * Removes any fields (eg macros, page markers etc)
-        *  from the string.
-        */
-       public static String stripFields(String text) {
-               return Range.stripFields(text);
-       }
+        catch ( Exception exc )
+        {
+            throw new RuntimeException( exc );
+        }
+    }
+
+    /**
+     * Removes any fields (eg macros, page markers etc) from the string.
+     */
+    public static String stripFields( String text )
+    {
+        return Range.stripFields( text );
+    }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java

index 72b5220ddea9f028cd620610dc18b0fbe490b6f2..f50abd5b4f46df833b80a3cd3fa37b359cf9fd41 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
@@ -17,17 +17,23 @@ public interface Field
       */
      int getFieldStartOffset();
  
+    CharacterRun getMarkEndCharacterRun( Range parent );
+
      /**
       * @return character position of end field mark
       */
      int getMarkEndOffset();
  
+    CharacterRun getMarkSeparatorCharacterRun( Range parent );
+
      /**
       * @return character position of separator field mark (if present,
       *         {@link NullPointerException} otherwise)
       */
      int getMarkSeparatorOffset();
  
+    CharacterRun getMarkStartCharacterRun( Range parent );
+
      /**
       * @return character position of start field mark
       */
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java

index 4832b23f96eb15405bdc0aa70d232277b590e8c3..37131c3e1529be3fd1118b8fe6954b4378e166ec 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
@@ -112,6 +112,12 @@ class FieldImpl implements Field
          return startPlex.getFcStart();
      }
  
+    public CharacterRun getMarkEndCharacterRun( Range parent )
+    {
+        return new Range( getMarkEndOffset(), getMarkEndOffset() + 1, parent )
+                .getCharacterRun( 0 );
+    }
+
      /**
       * @return character position of end field mark
       */
@@ -120,6 +126,15 @@ class FieldImpl implements Field
          return endPlex.getFcStart();
      }
  
+    public CharacterRun getMarkSeparatorCharacterRun( Range parent )
+    {
+        if ( !hasSeparator() )
+            return null;
+
+        return new Range( getMarkSeparatorOffset(),
+                getMarkSeparatorOffset() + 1, parent ).getCharacterRun( 0 );
+    }
+
      /**
       * @return character position of separator field mark (if present,
       *         {@link NullPointerException} otherwise)
@@ -129,6 +144,12 @@ class FieldImpl implements Field
          return separatorPlex.getFcStart();
      }
  
+    public CharacterRun getMarkStartCharacterRun( Range parent )
+    {
+        return new Range( getMarkStartOffset(), getMarkStartOffset() + 1,
+                parent ).getCharacterRun( 0 );
+    }
+
      /**
       * @return character position of start field mark
       */
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java

index fed191c7d4975287a6715865ec812940b14195d5..5b8c80501c2ec29d757a6a00aae0d517e26f7ac6 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
@@ -82,35 +82,96 @@ public final class HeaderStories {
                  fib.getPlcfHddSize(), 0 );
      }
  
-       public String getFootnoteSeparator() {
-               return getAt(0);
-       }
-       public String getFootnoteContSeparator() {
-               return getAt(1);
-       }
-       public String getFootnoteContNote() {
-               return getAt(2);
-       }
-       public String getEndnoteSeparator() {
-               return getAt(3);
-       }
-       public String getEndnoteContSeparator() {
-               return getAt(4);
-       }
-       public String getEndnoteContNote() {
-               return getAt(5);
-       }
+    @Deprecated
+    public String getFootnoteSeparator()
+    {
+        return getAt( 0 );
+    }
+
+    @Deprecated
+    public String getFootnoteContSeparator()
+    {
+        return getAt( 1 );
+    }
+
+    @Deprecated
+    public String getFootnoteContNote()
+    {
+        return getAt( 2 );
+    }
+
+    @Deprecated
+    public String getEndnoteSeparator()
+    {
+        return getAt( 3 );
+    }
  
+    @Deprecated
+    public String getEndnoteContSeparator()
+    {
+        return getAt( 4 );
+    }
+
+    @Deprecated
+    public String getEndnoteContNote()
+    {
+        return getAt( 5 );
+    }
+
+    public Range getFootnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 0 );
+    }
+
+    public Range getFootnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 1 );
+    }
+
+    public Range getFootnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 2 );
+    }
+
+    public Range getEndnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 3 );
+    }
+
+    public Range getEndnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 4 );
+    }
  
+    public Range getEndnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 5 );
+    }
+
+       @Deprecated
         public String getEvenHeader() {
                 return getAt(6+0);
         }
+    @Deprecated
         public String getOddHeader() {
                 return getAt(6+1);
         }
+    @Deprecated
         public String getFirstHeader() {
                 return getAt(6+4);
         }
+       
+
+    public Range getEvenHeaderSubrange() {
+        return getSubrangeAt(6+0);
+    }
+    public Range getOddHeaderSubrange() {
+        return getSubrangeAt(6+1);
+    }
+    public Range getFirstHeaderSubrange() {
+        return getSubrangeAt(6+4);
+    }
+    
         /**
          * Returns the correct, defined header for the given
          *  one based page
@@ -135,16 +196,39 @@ public final class HeaderStories {
                 return getOddHeader();
         }
  
+       @Deprecated
+    public String getEvenFooter()
+    {
+        return getAt( 6 + 2 );
+    }
+
+    @Deprecated
+    public String getOddFooter()
+    {
+        return getAt( 6 + 3 );
+    }
+
+    @Deprecated
+    public String getFirstFooter()
+    {
+        return getAt( 6 + 5 );
+    }
+
+    public Range getEvenFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 2 );
+    }
+
+    public Range getOddFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 3 );
+    }
+
+    public Range getFirstFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 5 );
+    }
  
-       public String getEvenFooter() {
-               return getAt(6+2);
-       }
-       public String getOddFooter() {
-               return getAt(6+3);
-       }
-       public String getFirstFooter() {
-               return getAt(6+5);
-       }
         /**
          * Returns the correct, defined footer for the given
          *  one based page
@@ -174,6 +258,7 @@ public final class HeaderStories {
          * Get the string that's pointed to by the
          *  given plcfHdd index
          */
+    @Deprecated
         private String getAt(int plcfHddIndex) {
                 if(plcfHdd == null) return null;
  
@@ -209,6 +294,32 @@ public final class HeaderStories {
                 return text;
         }
  
+    private Range getSubrangeAt( int plcfHddIndex )
+    {
+        if ( plcfHdd == null )
+            return null;
+
+        GenericPropertyNode prop = plcfHdd.getProperty( plcfHddIndex );
+        if ( prop.getStart() == prop.getEnd() )
+        {
+            // Empty story
+            return null;
+        }
+        if ( prop.getEnd() < prop.getStart() )
+        {
+            // Broken properties?
+            return null;
+        }
+
+        final int headersLength = headerStories.getEndOffset()
+                - headerStories.getStartOffset();
+        int start = Math.min( prop.getStart(), headersLength );
+        int end = Math.min( prop.getEnd(), headersLength );
+
+        return new Range( headerStories.getStartOffset() + start,
+                headerStories.getStartOffset() + end, headerStories );
+    }
+
         public Range getRange() {
                 return headerStories;
         }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java

new file mode 100644 (file)

index 0000000..2f71f39
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java
@@ -0,0 +1,34 @@
+package org.apache.poi.hwpf.usermodel;
+
+import java.io.FileNotFoundException;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.util.Internal;
+
+@Internal
+public class ObjectPoolImpl implements ObjectsPool
+{
+    private DirectoryEntry _objectPool;
+
+    public ObjectPoolImpl( DirectoryEntry _objectPool )
+    {
+        super();
+        this._objectPool = _objectPool;
+    }
+
+    public Entry getObjectById( String objId )
+    {
+        if ( _objectPool == null )
+            return null;
+
+        try
+        {
+            return _objectPool.getEntry( objId );
+        }
+        catch ( FileNotFoundException exc )
+        {
+            return null;
+        }
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java

new file mode 100644 (file)

index 0000000..fa91d37
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java
@@ -0,0 +1,8 @@
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.poifs.filesystem.Entry;
+
+public interface ObjectsPool
+{
+    public Entry getObjectById( String objId );
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java

index 3f23eea0c2d82d5e5357d839c76d0876b9c37a8e..c99c4eb28d78a35e8488a37d632d6478b76c0ce2 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@@ -36,6 +36,8 @@ import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
  import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
  import org.apache.poi.hwpf.sprm.SprmBuffer;
  import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
  
  /**
   * This class is the central class of the HWPF object model. All properties that
@@ -52,6 +54,8 @@ import org.apache.poi.util.LittleEndian;
   */
  public class Range { // TODO -instantiable superclass
  
+    private POILogger logger = POILogFactory.getLogger( Range.class );
+    
         public static final int TYPE_PARAGRAPH = 0;
         public static final int TYPE_CHARACTER = 1;
         public static final int TYPE_SECTION = 2;
@@ -888,9 +892,12 @@ public class Range { // TODO -instantiable superclass
          initAll();
          if ( tableEndInclusive >= this._parEnd )
          {
-            throw new ArrayIndexOutOfBoundsException(
-                    "The table's bounds fall outside of this Range" );
+            logger.log( POILogger.WARN, "The table's bounds ", "["
+                    + this._parStart + "; " + tableEndInclusive + ")",
+                    " fall outside of this Range paragraphs numbers ", "["
+                            + this._parStart + "; " + this._parEnd + ")" );
          }
+
          if ( tableEndInclusive < 0 )
          {
              throw new ArrayIndexOutOfBoundsException(
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java

new file mode 100644 (file)

index 0000000..ca1d809
--- /dev/null
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java
@@ -0,0 +1,22 @@
+package org.apache.poi.hwpf.converter;
+
+import junit.framework.TestCase;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+public class TestWordToTextConverter extends TestCase
+{
+
+    /**
+     * [FAILING] Bug 47731 - Word Extractor considers text copied from some
+     * website as an embedded object
+     */
+    public void testBug47731() throws Exception
+    {
+        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
+        String foundText = WordToTextConverter.getText( doc );
+
+        assertTrue( foundText
+                .contains( "Soak the rice in water for three to four hours" ) );
+    }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java

index cbc7db1d142f722358ebc2c99161440a8b9227bf..48c20dfbe39983ec9454661bd3d4f58828993071 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@@ -33,6 +33,16 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
   * @author Nick Burch (nick at torchbox dot com)
   */
  public final class TestWordExtractor extends TestCase {
+
+    public static void assertEquals( String expected, String actual )
+    {
+        String newExpected = expected.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        String newActual = actual.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        TestCase.assertEquals( newExpected, newActual );
+    }
+
         private String[] p_text1 = new String[] {
                         "This is a simple word document\r\n",
                         "\r\n",
@@ -107,12 +117,14 @@ public final class TestWordExtractor extends TestCase {
         public void testGetText() {
                 assertEquals(p_text1_block, extractor.getText());
  
-               // For the 2nd, should give similar answers for
-               //  the two methods, differing only in line endings
-               assertEquals(
-                     extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), 
-                     extractor2.getText().replaceAll("[\\r\\n]", ""));
-       }
+        // For the 2nd, should give similar answers for
+        // the two methods, differing only in line endings
+
+        // nope, they must have different results, because of garbage
+        // assertEquals(
+        // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
+        // extractor2.getText().replaceAll("[\\r\\n]", ""));
+    }
  
         /**
          * Test textPieces based extraction
@@ -330,7 +342,7 @@ public final class TestWordExtractor extends TestCase {
         
         // Open directly 
         for(DirectoryNode dir : files) {
-          WordExtractor extractor = new WordExtractor(dir, null);
+          WordExtractor extractor = new WordExtractor(dir);
            assertEquals(p_text1_block, extractor.getText());
         }
  
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java

index 0ecfbf4a1f4a5dd0b3510c2eb310b73a87c12a69..01810569a5ec7b99c469243f3f6011dca14ec50d 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
@@ -43,6 +43,15 @@ import org.apache.poi.util.IOUtils;
  public class TestBugs extends TestCase
  {
  
+    public static void assertEquals( String expected, String actual )
+    {
+        String newExpected = expected.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        String newActual = actual.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        TestCase.assertEquals( newExpected, newActual );
+    }
+
      private static void assertTableStructures( Range expected, Range actual )
      {
          assertEquals( expected.numParagraphs(), actual.numParagraphs() );
author	Sergey Vladimirov <sergey@apache.org>
	Tue, 9 Aug 2011 12:38:52 +0000 (12:38 +0000)
committer	Sergey Vladimirov <sergey@apache.org>
	Tue, 9 Aug 2011 12:38:52 +0000 (12:38 +0000)
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/java/org/apache/poi/POIOLE2TextExtractor.java		patch \| blob \| history
src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java		patch \| blob \| history
src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java		patch \| blob \| history
src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java		patch \| blob \| history
src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java		patch \| blob \| history
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java		patch \| blob \| history