<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
- <action dev="POI-DEVELOPERS" type="fix">Add a text extractor to HSMF for simpler extraction of text from .msg files</action>
+ <action dev="POI-DEVELOPERS" type="fix">Add a text extractor (OutlookTextExtractor) to HSMF for simpler extraction of text from .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF parsing of .msg files</action>
<action dev="POI-DEVELOPERS" type="fix">Initialise the link type of HSSFHyperLink, so that getType() on it works</action>
<action dev="POI-DEVELOPERS" type="fix">48425 - improved performance of DateUtil.isCellDateFormatted() </action>
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
-import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
+import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
if(entry.getName().equals("__substg1.0_1000001E") ||
entry.getName().equals("__substg1.0_0047001E") ||
entry.getName().equals("__substg1.0_0037001E")) {
- return new HSMFTextExtactor(poifsDir, fs);
+ return new OutlookTextExtactor(poifsDir, fs);
}
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
import org.apache.poi.POIDataSamples;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
-import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
+import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(msg)
- instanceof HSMFTextExtactor
+ instanceof OutlookTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(msg).getText().length() > 50
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(msg))
- instanceof HSMFTextExtactor
+ instanceof OutlookTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
- instanceof HSMFTextExtactor
+ instanceof OutlookTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hsmf.extractor;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.SimpleDateFormat;
-
-import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.hsmf.MAPIMessage;
-import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
-import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-
-public class HSMFTextExtactor extends POIOLE2TextExtractor {
- public HSMFTextExtactor(MAPIMessage msg) {
- super(msg);
- }
- public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
- this(new MAPIMessage(poifsDir, fs));
- }
- public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
- this(new MAPIMessage(fs));
- }
- public HSMFTextExtactor(InputStream inp) throws IOException {
- this(new MAPIMessage(inp));
- }
-
- /**
- * Outputs something a little like a RFC822 email
- */
- public String getText() {
- MAPIMessage msg = (MAPIMessage)document;
- StringBuffer s = new StringBuffer();
-
- try {
- s.append("From: " + msg.getDisplayFrom() + "\n");
- } catch(ChunkNotFoundException e) {}
- try {
- s.append("To: " + msg.getDisplayTo() + "\n");
- } catch(ChunkNotFoundException e) {}
- try {
- if(msg.getDisplayCC().length() > 0)
- s.append("CC: " + msg.getDisplayCC() + "\n");
- } catch(ChunkNotFoundException e) {}
- try {
- if(msg.getDisplayBCC().length() > 0)
- s.append("BCC: " + msg.getDisplayBCC() + "\n");
- } catch(ChunkNotFoundException e) {}
- try {
- SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
- s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
- } catch(ChunkNotFoundException e) {}
- try {
- s.append("Subject: " + msg.getSubject() + "\n");
- } catch(ChunkNotFoundException e) {}
- try {
- s.append("\n" + msg.getTextBody() + "\n");
- } catch(ChunkNotFoundException e) {}
-
- return s.toString();
- }
-
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hsmf.extractor;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.SimpleDateFormat;
+
+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * A text extractor for HSMF (Outlook) .msg files.
+ * Outputs in a format somewhat like a plain text email.
+ *
+ * Every constructor ends up passing a {@link MAPIMessage} to the
+ * superclass constructor; {@link #getText()} later casts the
+ * <code>document</code> field back to that type (presumably the field
+ * the superclass stores the message in - confirm against
+ * POIOLE2TextExtractor).
+ */
+public class OutlookTextExtactor extends POIOLE2TextExtractor {
+ public OutlookTextExtactor(MAPIMessage msg) { // wraps a message the caller has already built
+ super(msg);
+ }
+ public OutlookTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException { // builds the message from a directory node plus its filesystem
+ this(new MAPIMessage(poifsDir, fs));
+ }
+ public OutlookTextExtactor(POIFSFileSystem fs) throws IOException { // builds the message from a whole filesystem
+ this(new MAPIMessage(fs));
+ }
+ public OutlookTextExtactor(InputStream inp) throws IOException { // builds the message straight from a stream
+ this(new MAPIMessage(inp));
+ }
+
+ /**
+ * Outputs something a little like a RFC822 email
+ *
+ * Header lines are written in the order From, To, CC, BCC, Date,
+ * Subject, each ended by a newline, followed by an empty line and
+ * then the text body. CC and BCC are written only when their value
+ * is non-empty. Each piece is fetched in its own try block and a
+ * ChunkNotFoundException is ignored, so a message that lacks a chunk
+ * loses just that line instead of failing the whole extraction.
+ *
+ * NOTE(review): the date formatter is created without an explicit
+ * Locale or time zone, so the Date line presumably follows the JVM
+ * defaults - confirm whether that is intended.
+ *
+ * @return the assembled text; pieces whose chunk is missing are left out
+ */
+ public String getText() {
+ MAPIMessage msg = (MAPIMessage)document; // every constructor supplies a MAPIMessage
+ StringBuffer s = new StringBuffer();
+
+ try {
+ s.append("From: " + msg.getDisplayFrom() + "\n");
+ } catch(ChunkNotFoundException e) {} // no sender chunk: leave the From line out
+ try {
+ s.append("To: " + msg.getDisplayTo() + "\n");
+ } catch(ChunkNotFoundException e) {} // no recipient chunk: leave the To line out
+ try {
+ if(msg.getDisplayCC().length() > 0) // an empty CC value produces no line at all
+ s.append("CC: " + msg.getDisplayCC() + "\n");
+ } catch(ChunkNotFoundException e) {} // no CC chunk: leave the CC line out
+ try {
+ if(msg.getDisplayBCC().length() > 0) // an empty BCC value produces no line at all
+ s.append("BCC: " + msg.getDisplayBCC() + "\n");
+ } catch(ChunkNotFoundException e) {} // no BCC chunk: leave the BCC line out
+ try {
+ SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss"); // short day name, day, short month, year, 24h time
+ s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
+ } catch(ChunkNotFoundException e) {} // no date chunk: leave the Date line out
+ try {
+ s.append("Subject: " + msg.getSubject() + "\n");
+ } catch(ChunkNotFoundException e) {} // no subject chunk: leave the Subject line out
+ try {
+ s.append("\n" + msg.getTextBody() + "\n"); // the leading newline separates headers from body
+ } catch(ChunkNotFoundException e) {} // no body chunk: output ends after the headers
+
+ return s.toString();
+ }
+}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-
-package org.apache.poi.hsmf.extractor;
-
-import java.io.FileInputStream;
-import java.io.IOException;
-
-import junit.framework.TestCase;
-
-import org.apache.poi.POIDataSamples;
-import org.apache.poi.hsmf.MAPIMessage;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-
-/**
- * Tests to verify that the text extractor works
- */
-public final class TestHSMFTextExtractor extends TestCase {
- private POIDataSamples samples;
-
- public TestHSMFTextExtractor() throws IOException {
- samples = POIDataSamples.getHSMFInstance();
- }
-
- private void assertContains(String haystack, String needle) {
- if(haystack.indexOf(needle) > -1) {
- return;
- }
- fail("'" + needle + "' wasn't found in '" + haystack + "'");
- }
-
- public void testQuick() throws Exception {
- POIFSFileSystem simple = new POIFSFileSystem(
- new FileInputStream(samples.getFile("quick.msg"))
- );
- MAPIMessage msg = new MAPIMessage(simple);
-
- HSMFTextExtactor ext = new HSMFTextExtactor(msg);
- String text = ext.getText();
-
- assertContains(text, "From: Kevin Roast\n");
- assertContains(text, "To: Kevin Roast\n");
- assertEquals(-1, text.indexOf("CC:"));
- assertEquals(-1, text.indexOf("BCC:"));
- assertContains(text, "Subject: Test the content transformer\n");
- assertContains(text, "Date: Thu, 14 Jun 2007 09:42:55\n");
- assertContains(text, "The quick brown fox jumps over the lazy dog");
- }
-
- public void testSimple() throws Exception {
- MAPIMessage msg = new MAPIMessage(new POIFSFileSystem(
- new FileInputStream(samples.getFile("simple_test_msg.msg"))
- ));
-
- HSMFTextExtactor ext = new HSMFTextExtactor(msg);
- String text = ext.getText();
-
- assertContains(text, "From: Travis Ferguson\n");
- assertContains(text, "To: travis@overwrittenstack.com\n");
- assertEquals(-1, text.indexOf("CC:"));
- assertEquals(-1, text.indexOf("BCC:"));
- assertContains(text, "Subject: test message\n");
- assertEquals(-1, text.indexOf("Date:"));
- assertContains(text, "This is a test message.");
- }
-
- public void testConstructors() throws Exception {
- String inp = (new HSMFTextExtactor(new FileInputStream(
- samples.getFile("simple_test_msg.msg")
- )).getText());
- String poifs = (new HSMFTextExtactor(new POIFSFileSystem(new FileInputStream(
- samples.getFile("simple_test_msg.msg")
- ))).getText());
- String mapi = (new HSMFTextExtactor(new MAPIMessage(new FileInputStream(
- samples.getFile("simple_test_msg.msg")
- ))).getText());
-
- assertEquals(inp, poifs);
- assertEquals(inp, mapi);
- }
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hsmf.extractor;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Tests to verify that the text extractor works
+ *
+ * Runs OutlookTextExtactor over the sample files quick.msg and
+ * simple_test_msg.msg and checks which header lines and body text
+ * show up in the extracted output.
+ *
+ * NOTE(review): none of the FileInputStream objects opened in these
+ * tests is closed in the code shown here.
+ */
+public final class TestOutlookTextExtractor extends TestCase {
+ private POIDataSamples samples; // source of the sample .msg files
+
+ public TestOutlookTextExtractor() throws IOException {
+ samples = POIDataSamples.getHSMFInstance();
+ }
+
+ private void assertContains(String haystack, String needle) { // fails, quoting both strings, unless needle occurs in haystack
+ if(haystack.indexOf(needle) > -1) {
+ return;
+ }
+ fail("'" + needle + "' wasn't found in '" + haystack + "'");
+ }
+
+ public void testQuick() throws Exception { // quick.msg: all headers except CC and BCC, plus the body
+ POIFSFileSystem simple = new POIFSFileSystem(
+ new FileInputStream(samples.getFile("quick.msg"))
+ );
+ MAPIMessage msg = new MAPIMessage(simple);
+
+ OutlookTextExtactor ext = new OutlookTextExtactor(msg);
+ String text = ext.getText();
+
+ assertContains(text, "From: Kevin Roast\n");
+ assertContains(text, "To: Kevin Roast\n");
+ assertEquals(-1, text.indexOf("CC:")); // no CC line expected for this message
+ assertEquals(-1, text.indexOf("BCC:")); // already implied by the CC: check above, since BCC: contains CC:
+ assertContains(text, "Subject: Test the content transformer\n");
+ assertContains(text, "Date: Thu, 14 Jun 2007 09:42:55\n"); // NOTE(review): presumably depends on the JVM default locale and time zone - confirm
+ assertContains(text, "The quick brown fox jumps over the lazy dog");
+ }
+
+ public void testSimple() throws Exception { // simple_test_msg.msg: same checks, but no Date line is expected
+ MAPIMessage msg = new MAPIMessage(new POIFSFileSystem(
+ new FileInputStream(samples.getFile("simple_test_msg.msg"))
+ ));
+
+ OutlookTextExtactor ext = new OutlookTextExtactor(msg);
+ String text = ext.getText();
+
+ assertContains(text, "From: Travis Ferguson\n");
+ assertContains(text, "To: travis@overwrittenstack.com\n");
+ assertEquals(-1, text.indexOf("CC:")); // no CC line expected for this message
+ assertEquals(-1, text.indexOf("BCC:")); // already implied by the CC: check above, since BCC: contains CC:
+ assertContains(text, "Subject: test message\n");
+ assertEquals(-1, text.indexOf("Date:")); // the extractor must omit the Date line for this file
+ assertContains(text, "This is a test message.");
+ }
+
+ public void testConstructors() throws Exception { // stream, filesystem and message constructors must give identical text
+ String inp = (new OutlookTextExtactor(new FileInputStream(
+ samples.getFile("simple_test_msg.msg")
+ )).getText());
+ String poifs = (new OutlookTextExtactor(new POIFSFileSystem(new FileInputStream(
+ samples.getFile("simple_test_msg.msg")
+ ))).getText());
+ String mapi = (new OutlookTextExtactor(new MAPIMessage(new FileInputStream(
+ samples.getFile("simple_test_msg.msg")
+ ))).getText());
+
+ assertEquals(inp, poifs);
+ assertEquals(inp, mapi);
+ }
+}