aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorNick Burch <nick@apache.org>2010-01-08 16:44:08 +0000
committerNick Burch <nick@apache.org>2010-01-08 16:44:08 +0000
commitf37c8f303a0b8fbf77d4121bbe8c46a771833afd (patch)
tree24a5eb9a49dafc3aca245f575f58671be6cc7c9a /src
parent07551a092544bb9f654227ff942e5605363defaa (diff)
downloadpoi-f37c8f303a0b8fbf77d4121bbe8c46a771833afd.tar.gz
poi-f37c8f303a0b8fbf77d4121bbe8c46a771833afd.zip
Add embedded (attachment) support to the Outlook text extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897258 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
-rw-r--r--src/documentation/content/xdocs/status.xml3
-rw-r--r--src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java54
-rw-r--r--src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java21
-rw-r--r--src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java7
4 files changed, 74 insertions, 11 deletions
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml
index 15b3750815..83042ae6b0 100644
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,7 +34,8 @@
<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
- <action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
+ <action dev="POI-DEVELOPERS" type="add">Support attachments as embedded documents within the new OutlookTextExtractor</action>
+ <action dev="POI-DEVELOPERS" type="add">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>
diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
index 7657635e75..55d8499f5e 100644
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -16,6 +16,7 @@
==================================================================== */
package org.apache.poi.extractor;
+import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@@ -31,6 +32,8 @@ import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
@@ -139,9 +142,14 @@ public class ExtractorFactory {
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs);
}
- if(entry.getName().equals("__substg1.0_1000001E") ||
+ if(
+ entry.getName().equals("__substg1.0_1000001E") ||
+ entry.getName().equals("__substg1.0_1000001F") ||
entry.getName().equals("__substg1.0_0047001E") ||
- entry.getName().equals("__substg1.0_0037001E")) {
+ entry.getName().equals("__substg1.0_0047001F") ||
+ entry.getName().equals("__substg1.0_0037001E") ||
+ entry.getName().equals("__substg1.0_0037001F")
+ ) {
return new OutlookTextExtactor(poifsDir, fs);
}
}
@@ -157,8 +165,12 @@ public class ExtractorFactory {
* {@link POITextExtractor} for each embeded file.
*/
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
- // Find all the embeded directories
+ // All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();
+ // For anything else that is not held directly as a POIFS directory
+ ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+
+ // Find all the embedded directories
POIFSFileSystem fs = ext.getFileSystem();
if(fs == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
@@ -189,20 +201,44 @@ public class ExtractorFactory {
} else if(ext instanceof PowerPointExtractor) {
// Tricky, not stored directly in poifs
// TODO
+ } else if(ext instanceof OutlookTextExtactor) {
+ // Stored in the Attachment blocks
+ MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
+ for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ if(attachment.attachData != null) {
+ byte[] data = attachment.attachData.getValue();
+ nonPOIFS.add( new ByteArrayInputStream(data) );
+ }
+ }
}
// Create the extractors
- if(dirs == null || dirs.size() == 0) {
+ if(
+ (dirs == null || dirs.size() == 0) &&
+ (nonPOIFS == null || nonPOIFS.size() == 0)
+ ){
return new POITextExtractor[0];
}
- POITextExtractor[] te = new POITextExtractor[dirs.size()];
- for(int i=0; i<te.length; i++) {
- te[i] = createExtractor(
+ ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
+ for(int i=0; i<dirs.size(); i++) {
+ e.add( createExtractor(
(DirectoryNode)dirs.get(i), ext.getFileSystem()
- );
+ ) );
+ }
+ for(int i=0; i<nonPOIFS.size(); i++) {
+ try {
+ e.add( createExtractor(nonPOIFS.get(i)) );
+ } catch(IllegalArgumentException ie) {
+ // Ignore, just means it didn't contain
+ // a format we support as yet
+ } catch(XmlException xe) {
+ throw new IOException(xe.getMessage());
+ } catch(OpenXML4JException oe) {
+ throw new IOException(oe.getMessage());
+ }
}
- return te;
+ return e.toArray(new POITextExtractor[e.size()]);
}
/**
diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
index f9b88794bd..d327c55828 100644
--- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
+++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
@@ -59,6 +59,8 @@ public class TestExtractorFactory extends TestCase {
private File pptx;
private File msg;
+ private File msgEmb;
+
private File vsd;
protected void setUp() throws Exception {
@@ -86,6 +88,7 @@ public class TestExtractorFactory extends TestCase {
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = olTests.getFile("quick.msg");
+ msgEmb = olTests.getFile("attachment_test_msg.msg");
}
public void testFile() throws Exception {
@@ -404,9 +407,25 @@ public class TestExtractorFactory extends TestCase {
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
+
+ // Outlook
+ ext = (OutlookTextExtactor)
+ ExtractorFactory.createExtractor(msgEmb);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ numWord = 0; numXls = 0; numPpt = 0;
+ assertEquals(1, embeds.length);
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+ if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+ else if(embeds[i] instanceof ExcelExtractor) numXls++;
+ else if(embeds[i] instanceof WordExtractor) numWord++;
+ }
+ assertEquals(0, numPpt);
+ assertEquals(0, numXls);
+ assertEquals(1, numWord);
// TODO - PowerPoint
// TODO - Visio
- // TODO - Outlook
}
}
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
index 54a2ddda67..a6ada5bb95 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
@@ -45,6 +45,13 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
}
/**
+ * Returns the underlying MAPI message, i.e. this extractor's document cast to MAPIMessage
+ */
+ public MAPIMessage getMAPIMessage() {
+ return (MAPIMessage)document;
+ }
+
+ /**
* Outputs something a little like a RFC822 email
*/
public String getText() {