]> source.dussan.org Git - poi.git/commitdiff
Most of support suggested by Phil Varner on the list - ExtractorFactory can now be...
authorNick Burch <nick@apache.org>
Mon, 25 Jan 2010 19:02:13 +0000 (19:02 +0000)
committerNick Burch <nick@apache.org>
Mon, 25 Jan 2010 19:02:13 +0000 (19:02 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@902927 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java
src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java

index ddd44109b67c8ad4933037ba2f71003de3390e46..7842c6136f2e4ace54da8ab9592ad269175d31cd 100644 (file)
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis</action>
            <action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
            <action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>
            <action dev="POI-DEVELOPERS" type="fix">48572 - always copy all declared inner classes and interfaces when generating poi-ooxml-schemas</action>
index a11aee96efbae828daa1436225e5cb5504bdb7b9..ede5612cba1d5aacd5f3e3a861083369d9e95a80 100644 (file)
@@ -22,6 +22,7 @@ import java.io.IOException;
 
 import org.apache.poi.hssf.eventusermodel.HSSFUserException;
 import org.apache.poi.hssf.record.*;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -51,11 +52,33 @@ public class HSSFEventFactory {
         * @param fs  a POIFS filesystem containing your workbook
         */
        public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException {
-               InputStream in = fs.createDocumentInputStream("Workbook");
-
-               processEvents(req, in);
+          processWorkbookEvents(req, fs.getRoot());
        }
 
+   /**
+    * Processes a file into essentially record events.
+    *
+    * @param req an Instance of HSSFRequest which has your registered listeners
+    * @param dir a POIFS directory node containing your workbook
+    */
+   public void processWorkbookEvents(HSSFRequest req, DirectoryNode dir) throws IOException {
+      InputStream in = dir.createDocumentInputStream("Workbook");
+
+      processEvents(req, in);
+   }
+
+   /**
+    * Processes a file into essentially record events.
+    *
+    * @param req an Instance of HSSFRequest which has your registered listeners
+    * @param fs  a POIFS filesystem containing your workbook
+    * @return    numeric user-specified result code.
+    */
+   public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
+      throws IOException, HSSFUserException {
+      return abortableProcessWorkbookEvents(req, fs.getRoot());
+   }
+
        /**
         * Processes a file into essentially record events.
         *
@@ -63,9 +86,9 @@ public class HSSFEventFactory {
         * @param fs  a POIFS filesystem containing your workbook
         * @return    numeric user-specified result code.
         */
-       public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
+       public short abortableProcessWorkbookEvents(HSSFRequest req, DirectoryNode dir)
                throws IOException, HSSFUserException {
-               InputStream in = fs.createDocumentInputStream("Workbook");
+               InputStream in = dir.createDocumentInputStream("Workbook");
                return abortableProcessEvents(req, in);
        }
 
index 7b28ed55e164eed876017c23b55cb69a0e674986..15560e19c99cafd6a9dde26e9329ef1d87d58879 100644 (file)
@@ -46,6 +46,7 @@ import org.apache.poi.hssf.record.SSTRecord;
 import org.apache.poi.hssf.record.StringRecord;
 import org.apache.poi.hssf.usermodel.HSSFDateUtil;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
@@ -65,15 +66,28 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  * http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</link>
  */
 public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
+   private DirectoryNode _dir;
        private POIFSFileSystem _fs;
        boolean _includeSheetNames = true;
        boolean _formulasNotResults = false;
 
-       public EventBasedExcelExtractor(POIFSFileSystem fs) {
+       public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
                super(null);
+               _dir = dir;
                _fs = fs;
        }
+   public EventBasedExcelExtractor(POIFSFileSystem fs) {
+      this(fs.getRoot(), fs);
+   }
 
+   /**
+    * Return the underlying POIFS FileSystem of
+    *  this document.
+    */
+   public POIFSFileSystem getFileSystem() {
+      return _fs;
+   }
+   
        /**
         * Would return the document information metadata for the document,
         *  if we supported it
@@ -134,7 +148,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
                HSSFRequest request = new HSSFRequest();
                request.addListenerForAllRecords(ft);
 
-               factory.processWorkbookEvents(request, _fs);
+               factory.processWorkbookEvents(request, _dir);
 
                return tl;
        }
index 24a2632be682630d1df38089494df588e3b39a33..ed7f22ac8dc4845580fa0d6e4de515caec21e515 100644 (file)
@@ -36,6 +36,7 @@ import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hsmf.MAPIMessage;
 import org.apache.poi.hsmf.datatypes.AttachmentChunks;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
+import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -63,6 +64,59 @@ public class ExtractorFactory {
        public static final String CORE_DOCUMENT_REL =
                "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
        
+       
+       /** Should this thread prefer event based over usermodel based extractors? */
+       private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
+      protected Boolean initialValue() { return Boolean.FALSE; }
+       };
+       /** Should all threads prefer event based over usermodel based extractors? */
+       private static Boolean allPreferEventExtractors;
+       
+   /** 
+    * Should this thread prefer event based over usermodel based extractors?
+    * (usermodel extractors tend to be more accurate, but use more memory) 
+    * Default is false. 
+    */
+       public static boolean getThreadPrefersEventExtractors() {
+          return threadPreferEventExtractors.get();
+       }
+   /** 
+    * Should all threads prefer event based over usermodel based extractors? 
+    * (usermodel extractors tend to be more accurate, but use more memory) 
+    * Default is to use the thread level setting, which defaults to false. 
+    */
+       public static Boolean getAllThreadsPreferEventExtractors() {
+          return allPreferEventExtractors;
+       }
+       
+   /** 
+    * Should this thread prefer event based over usermodel based extractors?
+    * Will only be used if the All Threads setting is null. 
+    */
+   public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
+      threadPreferEventExtractors.set(preferEventExtractors);
+   }
+   /** 
+    * Should all threads prefer event based over usermodel based extractors?
+    * If set, will take preference over the Thread level setting. 
+    */
+   public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
+      allPreferEventExtractors = preferEventExtractors;
+   }
+       
+   
+   /**
+    * Should this thread use event based extractors if available?
+    * Checks the all-threads one first, then thread specific.
+    */
+   protected static boolean getPreferEventExtractor() {
+      if(allPreferEventExtractors != null) {
+         return allPreferEventExtractors;
+      }
+      return threadPreferEventExtractors.get();
+   }
+   
+       
        public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
                InputStream inp = new PushbackInputStream( 
                        new FileInputStream(f), 8);
@@ -106,7 +160,12 @@ public class ExtractorFactory {
             corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) ||
             corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
             corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
-            return new XSSFExcelExtractor(pkg);
+           if(getPreferEventExtractor()) {
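+              // TODO - no event based extractor is wired in here yet, so this returns the same usermodel extractor as the else branch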
+              return new XSSFExcelExtractor(pkg);
+           } else {
+              return new XSSFExcelExtractor(pkg);
+           }
         }
 
         if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) ||
@@ -148,7 +207,11 @@ public class ExtractorFactory {
                        Entry entry = entries.next();
                        
                        if(entry.getName().equals("Workbook")) {
-                               return new ExcelExtractor(poifsDir, fs);
+                          if(getPreferEventExtractor()) {
+               return new EventBasedExcelExtractor(poifsDir, fs);
+                          } else {
+                             return new ExcelExtractor(poifsDir, fs);
+                          }
                        }
                        if(entry.getName().equals("WordDocument")) {
                                return new WordExtractor(poifsDir, fs);
index c127427a23a5150716d529a19c334e64eea265a8..81f55cc9f7e96b7069419209a4e654997b20b757 100644 (file)
@@ -27,6 +27,7 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
+import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -390,6 +391,72 @@ public class TestExtractorFactory extends TestCase {
                        // Good
                }
        }
+       
+       public void testPreferEventBased() throws Exception {
+          assertEquals(false, ExtractorFactory.getPreferEventExtractor());
+          assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors());
+          assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
+          
+          ExtractorFactory.setThreadPrefersEventExtractors(true);
+          
+      assertEquals(true, ExtractorFactory.getPreferEventExtractor());
+      assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
+      assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
+      
+      ExtractorFactory.setAllThreadsPreferEventExtractors(false);
+      
+      assertEquals(false, ExtractorFactory.getPreferEventExtractor());
+      assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
+      assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
+      
+      ExtractorFactory.setAllThreadsPreferEventExtractors(null);
+      
+      assertEquals(true, ExtractorFactory.getPreferEventExtractor());
+      assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
+      assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
+      
+      
+      // Check we get the right extractors now
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
+            instanceof EventBasedExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
+      );
+      
+      assertTrue(
+            ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
+            instanceof XSSFExcelExtractor // TODO
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
+      );
+      
+      
+      // Put back to normal
+      ExtractorFactory.setThreadPrefersEventExtractors(false);
+      assertEquals(false, ExtractorFactory.getPreferEventExtractor());
+      assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors());
+      assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
+      
+      // And back
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
+            instanceof ExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
+      );
+      
+      assertTrue(
+            ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
+            instanceof XSSFExcelExtractor
+      );
+      assertTrue(
+            ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
+      );
+       }
 
    /**
     * Test embeded docs text extraction. For now, only