<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
+ <action dev="POI-DEVELOPERS" type="add">ExtractorFactory can now be told to prefer Event Based extractors (currently Excel only) on a per-thread or overall basis</action>
<action dev="POI-DEVELOPERS" type="fix">48544 - avoid failures in XLSX2CSV when shared string table is missing</action>
<action dev="POI-DEVELOPERS" type="fix">48571 - properly close all IO streams created in OPCPackage</action>
<action dev="POI-DEVELOPERS" type="fix">48572 - always copy all declared inner classes and interfaces when generating poi-ooxml-schemas</action>
import org.apache.poi.hssf.eventusermodel.HSSFUserException;
import org.apache.poi.hssf.record.*;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* @param fs a POIFS filesystem containing your workbook
*/
public void processWorkbookEvents(HSSFRequest req, POIFSFileSystem fs) throws IOException {
- InputStream in = fs.createDocumentInputStream("Workbook");
-
- processEvents(req, in);
+ processWorkbookEvents(req, fs.getRoot());
}
+ /**
+ * Processes a file into essentially record events.
+ *
+ * @param req an Instance of HSSFRequest which has your registered listeners
+ * @param dir a POIFS directory node containing your workbook, i.e. the
+ *  directory holding the "Workbook" document stream
+ */
+ public void processWorkbookEvents(HSSFRequest req, DirectoryNode dir) throws IOException {
+ InputStream in = dir.createDocumentInputStream("Workbook");
+
+ processEvents(req, in);
+ }
+
+ /**
+ * Processes a file into essentially record events.
+ * Delegates to the DirectoryNode based variant, using the root
+ *  directory of the supplied filesystem.
+ *
+ * @param req an Instance of HSSFRequest which has your registered listeners
+ * @param fs a POIFS filesystem containing your workbook
+ * @return numeric user-specified result code.
+ */
+ public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
+ throws IOException, HSSFUserException {
+ return abortableProcessWorkbookEvents(req, fs.getRoot());
+ }
+
/**
* Processes a file into essentially record events.
*
* @param dir a POIFS directory node containing your workbook
* @return numeric user-specified result code.
*/
- public short abortableProcessWorkbookEvents(HSSFRequest req, POIFSFileSystem fs)
+ public short abortableProcessWorkbookEvents(HSSFRequest req, DirectoryNode dir)
throws IOException, HSSFUserException {
- InputStream in = fs.createDocumentInputStream("Workbook");
+ InputStream in = dir.createDocumentInputStream("Workbook");
return abortableProcessEvents(req, in);
}
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</link>
*/
public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
+ private DirectoryNode _dir;
private POIFSFileSystem _fs;
boolean _includeSheetNames = true;
boolean _formulasNotResults = false;
- public EventBasedExcelExtractor(POIFSFileSystem fs) {
+ public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
super(null);
+ _dir = dir;
_fs = fs;
}
+ public EventBasedExcelExtractor(POIFSFileSystem fs) {
+ this(fs.getRoot(), fs);
+ }
+ /**
+ * Return the underlying POIFS FileSystem of
+ * this document.
+ */
+ public POIFSFileSystem getFileSystem() {
+ return _fs;
+ }
+
/**
* Would return the document information metadata for the document,
* if we supported it
HSSFRequest request = new HSSFRequest();
request.addListenerForAllRecords(ft);
- factory.processWorkbookEvents(request, _fs);
+ factory.processWorkbookEvents(request, _dir);
return tl;
}
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
+import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
public static final String CORE_DOCUMENT_REL =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
+
+ /** Should this thread prefer event based over usermodel based extractors? */
+ private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
+ protected Boolean initialValue() { return Boolean.FALSE; }
+ };
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * Null means no overall preference has been set. Declared volatile as it
+ *  is written and read from different threads without any locking.
+ */
+ private static volatile Boolean allPreferEventExtractors;
+
+ /**
+ * Should this thread prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is false.
+ */
+ public static boolean getThreadPrefersEventExtractors() {
+ return threadPreferEventExtractors.get();
+ }
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is to use the thread level setting, which defaults to false.
+ */
+ public static Boolean getAllThreadsPreferEventExtractors() {
+ return allPreferEventExtractors;
+ }
+
+ /**
+ * Should this thread prefer event based over usermodel based extractors?
+ * Will only be used if the All Threads setting is null.
+ */
+ public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
+ threadPreferEventExtractors.set(preferEventExtractors);
+ }
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * If set, will take precedence over the Thread level setting.
+ */
+ public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
+ allPreferEventExtractors = preferEventExtractors;
+ }
+
+
+ /**
+ * Should this thread use event based extractors if available?
+ * Checks the all-threads one first, then thread specific.
+ */
+ protected static boolean getPreferEventExtractor() {
+     // Read the shared setting once, so that another thread resetting it to
+     //  null between the check and the return can't trigger a
+     //  NullPointerException when the value is unboxed
+     Boolean allThreads = allPreferEventExtractors;
+     if(allThreads != null) {
+         return allThreads;
+     }
+     return threadPreferEventExtractors.get();
+ }
+
+
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
InputStream inp = new PushbackInputStream(
new FileInputStream(f), 8);
corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) ||
corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) ||
corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) {
- return new XSSFExcelExtractor(pkg);
+ if(getPreferEventExtractor()) {
+ // TODO - no event based extractor is used for XSSF yet, so fall back to the usermodel one
+ return new XSSFExcelExtractor(pkg);
+ } else {
+ return new XSSFExcelExtractor(pkg);
+ }
}
if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) ||
Entry entry = entries.next();
if(entry.getName().equals("Workbook")) {
- return new ExcelExtractor(poifsDir, fs);
+ if(getPreferEventExtractor()) {
+ return new EventBasedExcelExtractor(poifsDir, fs);
+ } else {
+ return new ExcelExtractor(poifsDir, fs);
+ }
}
if(entry.getName().equals("WordDocument")) {
return new WordExtractor(poifsDir, fs);
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
+import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
// Good
}
}
+
+ /**
+ * Checks the thread level and all-threads level "prefer event based
+ *  extractors" settings, and that the factory then hands back the
+ *  matching kind of extractor.
+ */
+ public void testPreferEventBased() throws Exception {
+     assertEquals(false, ExtractorFactory.getPreferEventExtractor());
+     assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors());
+     assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
+
+     // The preferences are static / per-thread state shared with the other
+     //  tests, so always restore them, even if an assertion below fails
+     try {
+         ExtractorFactory.setThreadPrefersEventExtractors(true);
+
+         assertEquals(true, ExtractorFactory.getPreferEventExtractor());
+         assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
+         assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
+
+         ExtractorFactory.setAllThreadsPreferEventExtractors(false);
+
+         assertEquals(false, ExtractorFactory.getPreferEventExtractor());
+         assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
+         assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
+
+         ExtractorFactory.setAllThreadsPreferEventExtractors(null);
+
+         assertEquals(true, ExtractorFactory.getPreferEventExtractor());
+         assertEquals(true, ExtractorFactory.getThreadPrefersEventExtractors());
+         assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
+
+
+         // Check we get the right extractors now
+         assertTrue(
+               ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
+               instanceof EventBasedExcelExtractor
+         );
+         assertTrue(
+               ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
+         );
+
+         assertTrue(
+               ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
+               instanceof XSSFExcelExtractor // TODO
+         );
+         assertTrue(
+               ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
+         );
+     } finally {
+         // Put back to normal
+         ExtractorFactory.setAllThreadsPreferEventExtractors(null);
+         ExtractorFactory.setThreadPrefersEventExtractors(false);
+     }
+
+     assertEquals(false, ExtractorFactory.getPreferEventExtractor());
+     assertEquals(false, ExtractorFactory.getThreadPrefersEventExtractors());
+     assertEquals(null, ExtractorFactory.getAllThreadsPreferEventExtractors());
+
+     // And back
+     assertTrue(
+           ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
+           instanceof ExcelExtractor
+     );
+     assertTrue(
+           ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
+     );
+
+     assertTrue(
+           ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()))
+           instanceof XSSFExcelExtractor
+     );
+     assertTrue(
+           ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString())).getText().length() > 200
+     );
+ }
/**
* Test embedded docs text extraction. For now, only