diff options
author | Nick Burch <nick@apache.org> | 2010-01-08 16:44:08 +0000 |
---|---|---|
committer | Nick Burch <nick@apache.org> | 2010-01-08 16:44:08 +0000 |
commit | f37c8f303a0b8fbf77d4121bbe8c46a771833afd (patch) | |
tree | 24a5eb9a49dafc3aca245f575f58671be6cc7c9a /src | |
parent | 07551a092544bb9f654227ff942e5605363defaa (diff) | |
download | poi-f37c8f303a0b8fbf77d4121bbe8c46a771833afd.tar.gz poi-f37c8f303a0b8fbf77d4121bbe8c46a771833afd.zip |
Add embedded (attachment) support to the Outlook text extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
4 files changed, 74 insertions, 11 deletions
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 15b3750815..83042ae6b0 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,7 +34,8 @@ <changes> <release version="3.7-SNAPSHOT" date="2010-??-??"> - <action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action> + <action dev="POI-DEVELOPERS" type="add">Support attachments as embeded documents within the new OutlookTextExtractor</action> + <action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action> <action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action> <action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action> <action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action> diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 7657635e75..55d8499f5e 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -16,6 +16,7 @@ ==================================================================== */ package org.apache.poi.extractor; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hsmf.MAPIMessage; +import org.apache.poi.hsmf.datatypes.AttachmentChunks; import 
org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; @@ -139,9 +142,14 @@ public class ExtractorFactory { if(entry.getName().equals("VisioDocument")) { return new VisioTextExtractor(poifsDir, fs); } - if(entry.getName().equals("__substg1.0_1000001E") || + if( + entry.getName().equals("__substg1.0_1000001E") || + entry.getName().equals("__substg1.0_1000001F") || entry.getName().equals("__substg1.0_0047001E") || - entry.getName().equals("__substg1.0_0037001E")) { + entry.getName().equals("__substg1.0_0047001F") || + entry.getName().equals("__substg1.0_0037001E") || + entry.getName().equals("__substg1.0_0037001F") + ) { return new OutlookTextExtactor(poifsDir, fs); } } @@ -157,8 +165,12 @@ public class ExtractorFactory { * {@link POITextExtractor} for each embeded file. */ public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException { - // Find all the embeded directories + // All the embded directories we spotted ArrayList<Entry> dirs = new ArrayList<Entry>(); + // For anything else not directly held in as a POIFS directory + ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>(); + + // Find all the embeded directories POIFSFileSystem fs = ext.getFileSystem(); if(fs == null) { throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); @@ -189,20 +201,44 @@ public class ExtractorFactory { } else if(ext instanceof PowerPointExtractor) { // Tricky, not stored directly in poifs // TODO + } else if(ext instanceof OutlookTextExtactor) { + // Stored in the Attachment blocks + MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); + for(AttachmentChunks attachment : msg.getAttachmentFiles()) { + if(attachment.attachData != null) { + byte[] data = attachment.attachData.getValue(); + nonPOIFS.add( new ByteArrayInputStream(data) ); + } + } } // Create the extractors - if(dirs == null || 
dirs.size() == 0) { + if( + (dirs == null || dirs.size() == 0) && + (nonPOIFS == null || nonPOIFS.size() == 0) + ){ return new POITextExtractor[0]; } - POITextExtractor[] te = new POITextExtractor[dirs.size()]; - for(int i=0; i<te.length; i++) { - te[i] = createExtractor( + ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>(); + for(int i=0; i<dirs.size(); i++) { + e.add( createExtractor( (DirectoryNode)dirs.get(i), ext.getFileSystem() - ); + ) ); + } + for(int i=0; i<nonPOIFS.size(); i++) { + try { + e.add( createExtractor(nonPOIFS.get(i)) ); + } catch(IllegalArgumentException ie) { + // Ignore, just means it didn't contain + // a format we support as yet + } catch(XmlException xe) { + throw new IOException(xe.getMessage()); + } catch(OpenXML4JException oe) { + throw new IOException(oe.getMessage()); + } } - return te; + return e.toArray(new POITextExtractor[e.size()]); } /** diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index f9b88794bd..d327c55828 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -59,6 +59,8 @@ public class TestExtractorFactory extends TestCase { private File pptx; private File msg; + private File msgEmb; + private File vsd; protected void setUp() throws Exception { @@ -86,6 +88,7 @@ public class TestExtractorFactory extends TestCase { POIDataSamples olTests = POIDataSamples.getHSMFInstance(); msg = olTests.getFile("quick.msg"); + msgEmb = olTests.getFile("attachment_test_msg.msg"); } public void testFile() throws Exception { @@ -404,9 +407,25 @@ public class TestExtractorFactory extends TestCase { assertEquals(1, numPpt); assertEquals(2, numXls); assertEquals(1, numWord); + + // Outlook + ext = (OutlookTextExtactor) + ExtractorFactory.createExtractor(msgEmb); + embeds = 
ExtractorFactory.getEmbededDocsTextExtractors(ext); + + numWord = 0; numXls = 0; numPpt = 0; + assertEquals(1, embeds.length); + for(int i=0; i<embeds.length; i++) { + assertTrue(embeds[i].getText().length() > 20); + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + } + assertEquals(0, numPpt); + assertEquals(0, numXls); + assertEquals(1, numWord); // TODO - PowerPoint // TODO - Visio - // TODO - Outlook } } diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java index 54a2ddda67..a6ada5bb95 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java @@ -45,6 +45,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor { } /** + * Returns the underlying MAPI message + */ + public MAPIMessage getMAPIMessage() { + return (MAPIMessage)document; + } + + /** * Outputs something a little like a RFC822 email */ public String getText() { |