From f37c8f303a0b8fbf77d4121bbe8c46a771833afd Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Fri, 8 Jan 2010 16:44:08 +0000 Subject: [PATCH] Add embeded (attachment) support to the outlook text extractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 3 +- .../poi/extractor/ExtractorFactory.java | 54 +++++++++++++++---- .../poi/extractor/TestExtractorFactory.java | 21 +++++++- .../hsmf/extractor/OutlookTextExtactor.java | 7 +++ 4 files changed, 74 insertions(+), 11 deletions(-) diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 15b3750815..83042ae6b0 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,7 +34,8 @@ - Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files + Support attachments as embeded documents within the new OutlookTextExtractor + Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files Some improvements to HSMF parsing of .msg files Initialise the link type of HSSFHyperLink, so that getType() on it works 48425 - improved performance of DateUtil.isCellDateFormatted() diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 7657635e75..55d8499f5e 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -16,6 +16,7 @@ ==================================================================== */ package org.apache.poi.extractor; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hsmf.MAPIMessage; +import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; @@ -139,9 +142,14 @@ public class ExtractorFactory { if(entry.getName().equals("VisioDocument")) { return new VisioTextExtractor(poifsDir, fs); } - if(entry.getName().equals("__substg1.0_1000001E") || + if( + entry.getName().equals("__substg1.0_1000001E") || + entry.getName().equals("__substg1.0_1000001F") || entry.getName().equals("__substg1.0_0047001E") || - entry.getName().equals("__substg1.0_0037001E")) { + entry.getName().equals("__substg1.0_0047001F") || + entry.getName().equals("__substg1.0_0037001E") || + entry.getName().equals("__substg1.0_0037001F") + ) { return new OutlookTextExtactor(poifsDir, fs); } } @@ -157,8 +165,12 @@ public class ExtractorFactory { * {@link POITextExtractor} for each embeded file. */ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { - // Find all the embeded directories + // All the embded directories we spotted ArrayList dirs = new ArrayList(); + // For anything else not directly held in as a POIFS directory + ArrayList nonPOIFS = new ArrayList(); + + // Find all the embeded directories POIFSFileSystem fs = ext.getFileSystem(); if(fs == null) { throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); @@ -189,20 +201,44 @@ public class ExtractorFactory { } else if(ext instanceof PowerPointExtractor) { // Tricky, not stored directly in poifs // TODO + } else if(ext instanceof OutlookTextExtactor) { + // Stored in the Attachment blocks + MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); + for(AttachmentChunks attachment : msg.getAttachmentFiles()) { + if(attachment.attachData != null) { + byte[] data = attachment.attachData.getValue(); + nonPOIFS.add( new ByteArrayInputStream(data) ); + } + } } // Create the extractors - if(dirs == null || dirs.size() == 0) { + if( + (dirs == null || dirs.size() == 0) && + (nonPOIFS == null || nonPOIFS.size() == 0) + ){ return new POITextExtractor[0]; } - POITextExtractor[] te = new POITextExtractor[dirs.size()]; - for(int i=0; i e = new ArrayList(); + for(int i=0; i 20); + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + } + assertEquals(0, numPpt); + assertEquals(0, numXls); + assertEquals(1, numWord); // TODO - PowerPoint // TODO - Visio - // TODO - Outlook } } diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java index 54a2ddda67..a6ada5bb95 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java @@ -44,6 +44,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor { this(new MAPIMessage(inp)); } + /** + * Returns the underlying MAPI message + */ + public MAPIMessage getMAPIMessage() { + return (MAPIMessage)document; + } + /** * Outputs something a little like a RFC822 email */ -- 2.39.5