about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Nick Burch <nick@apache.org> 2010-01-08 16:14:27 +0000
committer Nick Burch <nick@apache.org> 2010-01-08 16:14:27 +0000
commit f7ccc5d5f5a870fa9a7c38c418edfe2a8ba107dd (patch)
tree d4125ea1446b7462bac047b0bb755fa2933bc2a8 /src
parent c01272208eec15837daaf699d9546be97ebd2403 (diff)
download poi-f7ccc5d5f5a870fa9a7c38c418edfe2a8ba107dd.tar.gz
poi-f7ccc5d5f5a870fa9a7c38c418edfe2a8ba107dd.zip
Wire up the new HSMFTextExtactor to the ExtractorFactory
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897246 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
-rw-r--r-- src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java | 6
-rw-r--r-- src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java | 377
-rw-r--r-- src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java | 18
-rw-r--r-- src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java | 4
4 files changed, 229 insertions, 176 deletions
diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
index 34e4829de5..28af05b3e2 100644
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -31,6 +31,7 @@ import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -138,6 +139,11 @@ public class ExtractorFactory {
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(poifsDir, fs);
}
+ if(entry.getName().equals("__substg1.0_1000001E") ||
+ entry.getName().equals("__substg1.0_0047001E") ||
+ entry.getName().equals("__substg1.0_0037001E")) {
+ return new HSMFTextExtactor(poifsDir, fs);
+ }
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
index 7ef1f002c3..8518b6eab1 100644
--- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
+++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
@@ -25,6 +25,7 @@ import org.apache.poi.POITextExtractor;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.extractor.HSMFTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -42,132 +43,145 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
*/
public class TestExtractorFactory extends TestCase {
- private File txt;
-
- private File xls;
- private File xlsx;
- private File xltx;
- private File xlsEmb;
-
- private File doc;
- private File docx;
- private File dotx;
- private File docEmb;
-
- private File ppt;
- private File pptx;
-
- private File vsd;
+ private File txt;
- protected void setUp() throws Exception {
- super.setUp();
-
- POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
- xls = ssTests.getFile("SampleSS.xls");
- xlsx = ssTests.getFile("SampleSS.xlsx");
- xltx = ssTests.getFile("test.xltx");
- xlsEmb = ssTests.getFile("excel_with_embeded.xls");
-
- POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
- doc = wpTests.getFile("SampleDoc.doc");
- docx = wpTests.getFile("SampleDoc.docx");
- dotx = wpTests.getFile("test.dotx");
- docEmb = wpTests.getFile("word_with_embeded.doc");
-
- POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
- ppt = slTests.getFile("SampleShow.ppt");
- pptx = slTests.getFile("SampleShow.pptx");
- txt = slTests.getFile("SampleShow.txt");
-
- POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
- vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
- }
+ private File xls;
+ private File xlsx;
+ private File xltx;
+ private File xlsEmb;
- public void testFile() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(xls)
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(xls).getText().length() > 200
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(xlsx)
- instanceof XSSFExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(xlsx).getText().length() > 200
- );
+ private File doc;
+ private File docx;
+ private File dotx;
+ private File docEmb;
- assertTrue(
- ExtractorFactory.createExtractor(xltx)
- instanceof XSSFExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(xltx).getText().contains("test")
- );
+ private File ppt;
+ private File pptx;
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(doc)
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(doc).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(docx)
- instanceof XWPFWordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(docx).getText().length() > 120
- );
+ private File msg;
+ private File vsd;
- assertTrue(
- ExtractorFactory.createExtractor(dotx)
- instanceof XWPFWordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(dotx).getText().contains("Test")
- );
+ protected void setUp() throws Exception {
+ super.setUp();
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(ppt)
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(ppt).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(pptx)
- instanceof XSLFPowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(pptx).getText().length() > 120
- );
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(vsd)
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(vsd).getText().length() > 50
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(txt);
- fail();
- } catch(IllegalArgumentException e) {
- // Good
- }
+ POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
+ xls = ssTests.getFile("SampleSS.xls");
+ xlsx = ssTests.getFile("SampleSS.xlsx");
+ xltx = ssTests.getFile("test.xltx");
+ xlsEmb = ssTests.getFile("excel_with_embeded.xls");
+
+ POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
+ doc = wpTests.getFile("SampleDoc.doc");
+ docx = wpTests.getFile("SampleDoc.docx");
+ dotx = wpTests.getFile("test.dotx");
+ docEmb = wpTests.getFile("word_with_embeded.doc");
+
+ POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
+ ppt = slTests.getFile("SampleShow.ppt");
+ pptx = slTests.getFile("SampleShow.pptx");
+ txt = slTests.getFile("SampleShow.txt");
+
+ POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
+ vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd");
+
+ POIDataSamples olTests = POIDataSamples.getHSMFInstance();
+ msg = olTests.getFile("quick.msg");
+ }
+
+ public void testFile() throws Exception {
+ // Excel
+ assertTrue(
+ ExtractorFactory.createExtractor(xls)
+ instanceof ExcelExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(xls).getText().length() > 200
+ );
+
+ assertTrue(
+ ExtractorFactory.createExtractor(xlsx)
+ instanceof XSSFExcelExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(xlsx).getText().length() > 200
+ );
+
+ assertTrue(
+ ExtractorFactory.createExtractor(xltx)
+ instanceof XSSFExcelExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(xltx).getText().contains("test")
+ );
+
+
+ // Word
+ assertTrue(
+ ExtractorFactory.createExtractor(doc)
+ instanceof WordExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(doc).getText().length() > 120
+ );
+
+ assertTrue(
+ ExtractorFactory.createExtractor(docx)
+ instanceof XWPFWordExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(docx).getText().length() > 120
+ );
+
+ assertTrue(
+ ExtractorFactory.createExtractor(dotx)
+ instanceof XWPFWordExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(dotx).getText().contains("Test")
+ );
+
+ // PowerPoint
+ assertTrue(
+ ExtractorFactory.createExtractor(ppt)
+ instanceof PowerPointExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(ppt).getText().length() > 120
+ );
+
+ assertTrue(
+ ExtractorFactory.createExtractor(pptx)
+ instanceof XSLFPowerPointExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(pptx).getText().length() > 120
+ );
+
+ // Visio
+ assertTrue(
+ ExtractorFactory.createExtractor(vsd)
+ instanceof VisioTextExtractor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(vsd).getText().length() > 50
+ );
+
+ // Outlook msg
+ assertTrue(
+ ExtractorFactory.createExtractor(msg)
+ instanceof HSMFTextExtactor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(msg).getText().length() > 50
+ );
+
+ // Text
+ try {
+ ExtractorFactory.createExtractor(txt);
+ fail();
+ } catch(IllegalArgumentException e) {
+ // Good
+ }
}
public void testInputStream() throws Exception {
@@ -231,6 +245,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
);
+ // Outlook msg
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(msg))
+ instanceof HSMFTextExtactor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
+ );
+
// Text
try {
ExtractorFactory.createExtractor(new FileInputStream(txt));
@@ -277,6 +300,15 @@ public class TestExtractorFactory extends TestCase {
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
);
+ // Outlook msg
+ assertTrue(
+ ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
+ instanceof HSMFTextExtactor
+ );
+ assertTrue(
+ ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
+ );
+
// Text
try {
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
@@ -323,57 +355,58 @@ public class TestExtractorFactory extends TestCase {
}
}
- /**
- * Test embeded docs text extraction. For now, only
- * does poifs embeded, but will do ooxml ones
- * at some point.
- */
- public void testEmbeded() throws Exception {
- POIOLE2TextExtractor ext;
- POITextExtractor[] embeds;
-
- // No embedings
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xls);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
- assertEquals(0, embeds.length);
-
- // Excel
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xlsEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- assertEquals(6, embeds.length);
- int numWord = 0, numXls = 0, numPpt = 0;
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
-
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- }
- assertEquals(2, numPpt);
- assertEquals(2, numXls);
- assertEquals(2, numWord);
-
- // Word
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0;
- assertEquals(4, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- }
- assertEquals(1, numPpt);
- assertEquals(2, numXls);
- assertEquals(1, numWord);
+ /**
+ * Test embeded docs text extraction. For now, only
+ * does poifs embeded, but will do ooxml ones
+ * at some point.
+ */
+ public void testEmbeded() throws Exception {
+ POIOLE2TextExtractor ext;
+ POITextExtractor[] embeds;
- // TODO - PowerPoint
- // TODO - Visio
- }
+ // No embedings
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(xls);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+ assertEquals(0, embeds.length);
+
+ // Excel
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(xlsEmb);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ assertEquals(6, embeds.length);
+ int numWord = 0, numXls = 0, numPpt = 0;
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+
+ if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+ else if(embeds[i] instanceof ExcelExtractor) numXls++;
+ else if(embeds[i] instanceof WordExtractor) numWord++;
+ }
+ assertEquals(2, numPpt);
+ assertEquals(2, numXls);
+ assertEquals(2, numWord);
+
+ // Word
+ ext = (POIOLE2TextExtractor)
+ ExtractorFactory.createExtractor(docEmb);
+ embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+ numWord = 0; numXls = 0; numPpt = 0;
+ assertEquals(4, embeds.length);
+ for(int i=0; i<embeds.length; i++) {
+ assertTrue(embeds[i].getText().length() > 20);
+ if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+ else if(embeds[i] instanceof ExcelExtractor) numXls++;
+ else if(embeds[i] instanceof WordExtractor) numWord++;
+ }
+ assertEquals(1, numPpt);
+ assertEquals(2, numXls);
+ assertEquals(1, numWord);
+
+ // TODO - PowerPoint
+ // TODO - Visio
+ // TODO - Outlook
+ }
}
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
index 7e693b0c8d..ea7335c5b1 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
@@ -34,6 +34,7 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks;
import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.hsmf.parsers.POIFSChunkParser;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@@ -78,15 +79,24 @@ public class MAPIMessage extends POIDocument {
this(new POIFSFileSystem(in));
}
/**
- * Constructor for reading MSG Files from an input stream.
+ * Constructor for reading MSG Files from a POIFS filesystem
* @param in
* @throws IOException
*/
public MAPIMessage(POIFSFileSystem fs) throws IOException {
- super(fs);
-
+ this(fs.getRoot(), fs);
+ }
+ /**
+ * Constructor for reading MSG Files from a certain
+ * point within a POIFS filesystem
+ * @param in
+ * @throws IOException
+ */
+ public MAPIMessage(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+ super(poifsDir, fs);
+
// Grab all the chunks
- ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs);
+ ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir);
// Grab interesting bits
ArrayList<AttachmentChunks> attachments = new ArrayList<AttachmentChunks>();
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java
index 63bbeb3518..fd72feaa56 100644
--- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java
@@ -23,12 +23,16 @@ import java.text.SimpleDateFormat;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class HSMFTextExtactor extends POIOLE2TextExtractor {
public HSMFTextExtactor(MAPIMessage msg) {
super(msg);
}
+ public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
+ this(new MAPIMessage(poifsDir, fs));
+ }
public HSMFTextExtactor(POIFSFileSystem fs) throws IOException {
this(new MAPIMessage(fs));
}