import static org.junit.Assert.assertNotNull;
+import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
+import java.util.Arrays;
import org.apache.poi.hmef.HMEFMessage;
import org.apache.poi.hmef.attribute.MAPIAttribute;
import org.apache.poi.hmef.attribute.MAPIStringAttribute;
+import org.apache.poi.hmef.attribute.TNEFAttribute;
+import org.apache.poi.hmef.attribute.TNEFProperty;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.poifs.filesystem.FileMagic;
+import org.apache.poi.util.LittleEndian;
import org.junit.Test;
public class HMEFFileHandler extends AbstractFileHandler {
+    /**
+     * Delegates to the superclass implementation only when {@link FileMagic}
+     * identifies the file as OLE2; for any other file type nothing is done.
+     *
+     * @param file the file to examine
+     * @throws Exception if detecting the file type or the superclass call fails
+     */
+    @Override
+    public void handleExtracting(File file) throws Exception {
+        if (FileMagic.valueOf(file) != FileMagic.OLE2) {
+            return;
+        }
+        super.handleExtracting(file);
+    }
+
/**
 * Parses the given stream as an {@link HMEFMessage} and asserts that the
 * message has both a body and a subject.
 *
 * @param stream the data passed to the {@link HMEFMessage} constructor
 * @param path not referenced by this method
 * @throws Exception if the message cannot be read or its body cannot be decoded
 */
@Override
public void handleFile(InputStream stream, String path) throws Exception {
HMEFMessage msg = new HMEFMessage(stream);
-
+
// list all properties
// NOTE(review): the assertions added below no longer include props in their messages - confirm it is still needed
StringBuilder props = new StringBuilder();
for(MAPIAttribute att : msg.getMessageMAPIAttributes()) {
props.append(att.getType()).append(": ").append(MAPIStringAttribute.getAsString( att)).append("\n");
}
-
+
// some test-files have no plain body; for the subjects listed below the BODY_HTML attribute is checked instead of getBody()
- if(!msg.getSubject().equals("Testing TNEF Message") && !msg.getSubject().equals("TNEF test message with attachments")) {
- assertNotNull("Had: " + msg.getBody() + ", " + msg.getSubject() + ", " + msg.getAttachments() + ": " + props,
- msg.getBody());
+ String[] HTML_BODY = {
+ "Testing TNEF Message", "TNEF test message with attachments", "Test"
+ };
+ String bodyStr;
+ if(Arrays.asList(HTML_BODY).contains(msg.getSubject())) {
+ MAPIAttribute bodyHtml = msg.getMessageMAPIAttribute(MAPIProperty.BODY_HTML);
+ assertNotNull(bodyHtml);
+ bodyStr = new String(bodyHtml.getData(), getEncoding(msg)); // decode the raw attribute bytes with the charset name derived from the message
+ } else {
+ bodyStr = msg.getBody();
}
- assertNotNull("Had: " + msg.getBody() + ", " + msg.getSubject() + ", " + msg.getAttachments() + ": " + props,
- msg.getSubject());
+ assertNotNull("Body is not set", bodyStr);
+ assertNotNull("Subject is not set", msg.getSubject());
}
-
+
// a test-case to test this locally without executing the full TestAllFiles
@Test
public void test() throws Exception {
handleFile(stream, path);
}
}
+
+    /**
+     * Determines the charset name used to decode byte data of the given message.
+     *
+     * <p>The code page is read from the {@code ID_OEMCODEPAGE} TNEF attribute when the
+     * message has one, otherwise from the {@code INTERNET_CPID} MAPI attribute; when
+     * neither is present, code page 1252 is used.</p>
+     *
+     * @param tnefDat the message whose code page attributes are inspected
+     * @return the charset name for the code page; code pages without an explicit mapping
+     *         are returned as {@code "cp" + codePage}, which the running JVM may not support
+     */
+    private String getEncoding(HMEFMessage tnefDat) {
+        TNEFAttribute oemCP = tnefDat.getMessageAttribute(TNEFProperty.ID_OEMCODEPAGE);
+        MAPIAttribute cpId = tnefDat.getMessageMAPIAttribute(MAPIProperty.INTERNET_CPID);
+        int codePage = 1252;
+        if (oemCP != null) {
+            codePage = LittleEndian.getInt(oemCP.getData());
+        } else if (cpId != null) {
+            codePage = LittleEndian.getInt(cpId.getData());
+        }
+        switch (codePage) {
+            // see http://en.wikipedia.org/wiki/Code_page for more
+            case 1252: return "Windows-1252";
+            case 20127: return "US-ASCII";
+            // Windows code page 65001 is UTF-8; "cp65001" is not a charset name Java knows
+            case 65001: return "UTF-8";
+            default: return "cp"+codePage;
+        }
+    }
+
}
/**
* Figures out the correct POITextExtractor for your supplied
* document, and returns it.
- *
+ *
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
* not present on the runtime classpath</p>
* <p>Note 2 - rather than using this, for most cases you would be better
@SuppressWarnings("WeakerAccess")
public final class ExtractorFactory {
private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class);
-
+
public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
} catch (NotOLE2FileException ne) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
- throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
+ throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne);
} catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR
// ensure file-handle release
IOUtils.closeQuietly(fs);
InputStream is = FileMagic.prepareToCheckMagic(inp);
FileMagic fm = FileMagic.valueOf(is);
-
+
switch (fm) {
case OLE2:
POIFSFileSystem fs = new POIFSFileSystem(is);
- boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY);
+ boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY);
return isEncrypted ? createEncryptedOOXMLExtractor(fs) : createExtractor(fs);
case OOXML:
return createExtractor(OPCPackage.open(is));
*
* @param pkg An {@link OPCPackage}.
* @return A {@link POIXMLTextExtractor} for the given file.
- * @throws IOException If an error occurs while reading the file
- * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
+ * @throws IOException If an error occurs while reading the file
+ * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
* @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
// Check for the normal Office core document
PackageRelationshipCollection core;
core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
-
+
// If nothing was found, try some of the other OOXML-based core types
if (core.size() == 0) {
// Could it be an OOXML-Strict one?
if (core.size() == 1)
return new XDGFVisioExtractor(pkg);
}
-
+
// Should just be a single core document, complain if not
if (core.size() != 1) {
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
}
-
+
// Grab the core document part, and try to identify from that
final PackagePart corePart = pkg.getPart(core.getRelationship(0));
final String contentType = corePart.getContentType();
-
+
// Is it XSSF?
for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
if ( rel.getContentType().equals( contentType ) ) {
return new XSSFExcelExtractor(pkg);
}
}
-
+
// Is it XWPF?
for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
if ( rel.getContentType().equals( contentType ) ) {
return new XWPFWordExtractor(pkg);
}
}
-
+
// Is it XSLF?
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
if ( rel.getContentType().equals( contentType ) ) {
return new SlideShowExtractor<>(new XMLSlideShow(pkg));
}
}
-
- // special handling for SlideShow-Theme-files,
+
+ // special handling for SlideShow-Theme-files,
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
return new SlideShowExtractor<>(new XMLSlideShow(pkg));
}
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) {
throw new IllegalStateException("Not yet supported");
}
-
+
private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs)
throws IOException {
String pass = Biff8EncryptionKey.getCurrentUserPassword();
if (pass == null) {
pass = Decryptor.DEFAULT_PASSWORD;
}
-
+
EncryptionInfo ei = new EncryptionInfo(fs);
Decryptor dec = ei.getDecryptor();
InputStream is = null;