==================================================================== */
package org.apache.poi.extractor;
+import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs);
}
- if(entry.getName().equals("__substg1.0_1000001E") ||
+ if(
+ entry.getName().equals("__substg1.0_1000001E") ||
+ entry.getName().equals("__substg1.0_1000001F") ||
entry.getName().equals("__substg1.0_0047001E") ||
- entry.getName().equals("__substg1.0_0037001E")) {
+ entry.getName().equals("__substg1.0_0047001F") ||
+ entry.getName().equals("__substg1.0_0037001E") ||
+ entry.getName().equals("__substg1.0_0037001F")
+ ) {
return new OutlookTextExtactor(poifsDir, fs);
}
}
* {@link POITextExtractor} for each embeded file.
*/
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
- // Find all the embeded directories
+ // All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();
+ // Anything else that is not held directly as a POIFS directory
+ ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+
+ // Find all the embedded directories
POIFSFileSystem fs = ext.getFileSystem();
if(fs == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
} else if(ext instanceof PowerPointExtractor) {
// Tricky, not stored directly in poifs
// TODO
+ } else if(ext instanceof OutlookTextExtactor) {
+ // Stored in the Attachment blocks
+ MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage();
+ for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ if(attachment.attachData != null) { // attachments without a data chunk are skipped
+ byte[] data = attachment.attachData.getValue();
+ nonPOIFS.add( new ByteArrayInputStream(data) ); // wrap the raw attachment bytes for later format detection
+ }
+ }
}
// Create the extractors
- if(dirs == null || dirs.size() == 0) {
+ if(
+ (dirs == null || dirs.size() == 0) &&
+ (nonPOIFS == null || nonPOIFS.size() == 0)
+ ){
return new POITextExtractor[0];
}
- POITextExtractor[] te = new POITextExtractor[dirs.size()];
- for(int i=0; i<te.length; i++) {
- te[i] = createExtractor(
+ ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
+ for(int i=0; i<dirs.size(); i++) {
+ e.add( createExtractor(
(DirectoryNode)dirs.get(i), ext.getFileSystem()
- );
+ ) );
+ }
+ for(int i=0; i<nonPOIFS.size(); i++) {
+ try {
+ e.add( createExtractor(nonPOIFS.get(i)) );
+ } catch(IllegalArgumentException ie) {
+ // Ignore, it just means the stream didn't contain
+ // a format we support as yet; that attachment is left out of the result
+ } catch(XmlException xe) {
+ throw new IOException(xe.getMessage()); // NOTE(review): only the message is kept; xe is not chained as the cause
+ } catch(OpenXML4JException oe) {
+ throw new IOException(oe.getMessage()); // NOTE(review): only the message is kept; oe is not chained as the cause
+ }
}
- return te;
+ return e.toArray(new POITextExtractor[e.size()]);
}
/**
private File pptx;
private File msg;
+ private File msgEmb;
+
private File vsd;
protected void setUp() throws Exception {
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = olTests.getFile("quick.msg");
+ msgEmb = olTests.getFile("attachment_test_msg.msg");
}
public void testFile() throws Exception {
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
+
+ // Outlook
+ ext = (OutlookTextExtactor)
+ ExtractorFactory.createExtractor(msgEmb);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ numWord = 0; numXls = 0; numPpt = 0;
+ assertEquals(1, embeds.length);
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+ if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+ else if(embeds[i] instanceof ExcelExtractor) numXls++;
+ else if(embeds[i] instanceof WordExtractor) numWord++;
+ }
+ assertEquals(0, numPpt);
+ assertEquals(0, numXls);
+ assertEquals(1, numWord);
// TODO - PowerPoint
// TODO - Visio
- // TODO - Outlook
}
}