From: Nick Burch Date: Fri, 8 Jan 2010 16:14:27 +0000 (+0000) Subject: Wire up the new HSMFTextExtactor to the ExtractorFactory X-Git-Tag: REL_3_7_BETA1~153 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f7ccc5d5f5a870fa9a7c38c418edfe2a8ba107dd;p=poi.git Wire up the new HSMFTextExtactor to the ExtractorFactory git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897246 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 34e4829de5..28af05b3e2 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -31,6 +31,7 @@ import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hsmf.extractor.HSMFTextExtactor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; @@ -138,6 +139,11 @@ public class ExtractorFactory { if(entry.getName().equals("VisioDocument")) { return new VisioTextExtractor(poifsDir, fs); } + if(entry.getName().equals("__substg1.0_1000001E") || + entry.getName().equals("__substg1.0_0047001E") || + entry.getName().equals("__substg1.0_0037001E")) { + return new HSMFTextExtactor(poifsDir, fs); + } } throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); } diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index 7ef1f002c3..8518b6eab1 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -25,6 +25,7 @@ import 
org.apache.poi.POITextExtractor; import org.apache.poi.POIDataSamples; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; +import org.apache.poi.hsmf.extractor.HSMFTextExtactor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; @@ -42,132 +43,145 @@ import org.apache.poi.openxml4j.opc.OPCPackage; */ public class TestExtractorFactory extends TestCase { - private File txt; - - private File xls; - private File xlsx; - private File xltx; - private File xlsEmb; - - private File doc; - private File docx; - private File dotx; - private File docEmb; - - private File ppt; - private File pptx; - - private File vsd; + private File txt; - protected void setUp() throws Exception { - super.setUp(); - - POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); - xls = ssTests.getFile("SampleSS.xls"); - xlsx = ssTests.getFile("SampleSS.xlsx"); - xltx = ssTests.getFile("test.xltx"); - xlsEmb = ssTests.getFile("excel_with_embeded.xls"); - - POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); - doc = wpTests.getFile("SampleDoc.doc"); - docx = wpTests.getFile("SampleDoc.docx"); - dotx = wpTests.getFile("test.dotx"); - docEmb = wpTests.getFile("word_with_embeded.doc"); - - POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); - ppt = slTests.getFile("SampleShow.ppt"); - pptx = slTests.getFile("SampleShow.pptx"); - txt = slTests.getFile("SampleShow.txt"); - - POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); - vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd"); - } + private File xls; + private File xlsx; + private File xltx; + private File xlsEmb; - public void testFile() throws Exception { - // Excel - assertTrue( - ExtractorFactory.createExtractor(xls) - instanceof ExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(xls).getText().length() > 200 - 
); - - assertTrue( - ExtractorFactory.createExtractor(xlsx) - instanceof XSSFExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(xlsx).getText().length() > 200 - ); + private File doc; + private File docx; + private File dotx; + private File docEmb; - assertTrue( - ExtractorFactory.createExtractor(xltx) - instanceof XSSFExcelExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(xltx).getText().contains("test") - ); + private File ppt; + private File pptx; - - // Word - assertTrue( - ExtractorFactory.createExtractor(doc) - instanceof WordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(doc).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(docx) - instanceof XWPFWordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(docx).getText().length() > 120 - ); + private File msg; + private File vsd; - assertTrue( - ExtractorFactory.createExtractor(dotx) - instanceof XWPFWordExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(dotx).getText().contains("Test") - ); + protected void setUp() throws Exception { + super.setUp(); - // PowerPoint - assertTrue( - ExtractorFactory.createExtractor(ppt) - instanceof PowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(ppt).getText().length() > 120 - ); - - assertTrue( - ExtractorFactory.createExtractor(pptx) - instanceof XSLFPowerPointExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(pptx).getText().length() > 120 - ); - - // Visio - assertTrue( - ExtractorFactory.createExtractor(vsd) - instanceof VisioTextExtractor - ); - assertTrue( - ExtractorFactory.createExtractor(vsd).getText().length() > 50 - ); - - // Text - try { - ExtractorFactory.createExtractor(txt); - fail(); - } catch(IllegalArgumentException e) { - // Good - } + POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); + xls = ssTests.getFile("SampleSS.xls"); + xlsx = ssTests.getFile("SampleSS.xlsx"); + xltx = 
ssTests.getFile("test.xltx"); + xlsEmb = ssTests.getFile("excel_with_embeded.xls"); + + POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); + doc = wpTests.getFile("SampleDoc.doc"); + docx = wpTests.getFile("SampleDoc.docx"); + dotx = wpTests.getFile("test.dotx"); + docEmb = wpTests.getFile("word_with_embeded.doc"); + + POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); + ppt = slTests.getFile("SampleShow.ppt"); + pptx = slTests.getFile("SampleShow.pptx"); + txt = slTests.getFile("SampleShow.txt"); + + POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); + vsd = dgTests.getFile("Test_Visio-Some_Random_Text.vsd"); + + POIDataSamples olTests = POIDataSamples.getHSMFInstance(); + msg = olTests.getFile("quick.msg"); + } + + public void testFile() throws Exception { + // Excel + assertTrue( + ExtractorFactory.createExtractor(xls) + instanceof ExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(xls).getText().length() > 200 + ); + + assertTrue( + ExtractorFactory.createExtractor(xlsx) + instanceof XSSFExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(xlsx).getText().length() > 200 + ); + + assertTrue( + ExtractorFactory.createExtractor(xltx) + instanceof XSSFExcelExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(xltx).getText().contains("test") + ); + + + // Word + assertTrue( + ExtractorFactory.createExtractor(doc) + instanceof WordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(doc).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(docx) + instanceof XWPFWordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(docx).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(dotx) + instanceof XWPFWordExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(dotx).getText().contains("Test") + ); + + // PowerPoint + assertTrue( + ExtractorFactory.createExtractor(ppt) + instanceof PowerPointExtractor + ); 
+ assertTrue( + ExtractorFactory.createExtractor(ppt).getText().length() > 120 + ); + + assertTrue( + ExtractorFactory.createExtractor(pptx) + instanceof XSLFPowerPointExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(pptx).getText().length() > 120 + ); + + // Visio + assertTrue( + ExtractorFactory.createExtractor(vsd) + instanceof VisioTextExtractor + ); + assertTrue( + ExtractorFactory.createExtractor(vsd).getText().length() > 50 + ); + + // Outlook msg + assertTrue( + ExtractorFactory.createExtractor(msg) + instanceof HSMFTextExtactor + ); + assertTrue( + ExtractorFactory.createExtractor(msg).getText().length() > 50 + ); + + // Text + try { + ExtractorFactory.createExtractor(txt); + fail(); + } catch(IllegalArgumentException e) { + // Good + } } public void testInputStream() throws Exception { @@ -231,6 +245,15 @@ public class TestExtractorFactory extends TestCase { ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50 ); + // Outlook msg + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(msg)) + instanceof HSMFTextExtactor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50 + ); + // Text try { ExtractorFactory.createExtractor(new FileInputStream(txt)); @@ -277,6 +300,15 @@ public class TestExtractorFactory extends TestCase { ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50 ); + // Outlook msg + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))) + instanceof HSMFTextExtactor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50 + ); + // Text try { ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt))); @@ -323,57 +355,58 @@ public class TestExtractorFactory extends TestCase { } } - /** - * Test embeded docs text extraction. 
For now, only - * does poifs embeded, but will do ooxml ones - * at some point. - */ - public void testEmbeded() throws Exception { - POIOLE2TextExtractor ext; - POITextExtractor[] embeds; - - // No embedings - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(xls); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - assertEquals(0, embeds.length); - - // Excel - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(xlsEmb); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - assertEquals(6, embeds.length); - int numWord = 0, numXls = 0, numPpt = 0; - for(int i=0; i 20); - - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - } - assertEquals(2, numPpt); - assertEquals(2, numXls); - assertEquals(2, numWord); - - // Word - ext = (POIOLE2TextExtractor) - ExtractorFactory.createExtractor(docEmb); - embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); - - numWord = 0; numXls = 0; numPpt = 0; - assertEquals(4, embeds.length); - for(int i=0; i 20); - if(embeds[i] instanceof PowerPointExtractor) numPpt++; - else if(embeds[i] instanceof ExcelExtractor) numXls++; - else if(embeds[i] instanceof WordExtractor) numWord++; - } - assertEquals(1, numPpt); - assertEquals(2, numXls); - assertEquals(1, numWord); + /** + * Test embeded docs text extraction. For now, only + * does poifs embeded, but will do ooxml ones + * at some point. 
+ */ + public void testEmbeded() throws Exception { + POIOLE2TextExtractor ext; + POITextExtractor[] embeds; - // TODO - PowerPoint - // TODO - Visio - } + // No embedings + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(xls); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + assertEquals(0, embeds.length); + + // Excel + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(xlsEmb); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + assertEquals(6, embeds.length); + int numWord = 0, numXls = 0, numPpt = 0; + for(int i=0; i<embeds.length; i++) { + assertTrue(embeds[i].getText().length() > 20); + + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + } + assertEquals(2, numPpt); + assertEquals(2, numXls); + assertEquals(2, numWord); + + // Word + ext = (POIOLE2TextExtractor) + ExtractorFactory.createExtractor(docEmb); + embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext); + + numWord = 0; numXls = 0; numPpt = 0; + assertEquals(4, embeds.length); + for(int i=0; i<embeds.length; i++) { + assertTrue(embeds[i].getText().length() > 20); + if(embeds[i] instanceof PowerPointExtractor) numPpt++; + else if(embeds[i] instanceof ExcelExtractor) numXls++; + else if(embeds[i] instanceof WordExtractor) numWord++; + } + assertEquals(1, numPpt); + assertEquals(2, numXls); + assertEquals(1, numWord); + + // TODO - PowerPoint + // TODO - Visio + // TODO - Outlook + } } diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java index 7e693b0c8d..ea7335c5b1 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java @@ -34,6 +34,7 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks; import org.apache.poi.hsmf.datatypes.StringChunk; import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; import org.apache.poi.hsmf.parsers.POIFSChunkParser; +import 
org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -78,15 +79,24 @@ public class MAPIMessage extends POIDocument { this(new POIFSFileSystem(in)); } /** - * Constructor for reading MSG Files from an input stream. + * Constructor for reading MSG Files from a POIFS filesystem * @param in * @throws IOException */ public MAPIMessage(POIFSFileSystem fs) throws IOException { - super(fs); - + this(fs.getRoot(), fs); + } + /** + * Constructor for reading MSG Files from a certain + * point within a POIFS filesystem + * @param in + * @throws IOException + */ + public MAPIMessage(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException { + super(poifsDir, fs); + // Grab all the chunks - ChunkGroup[] chunkGroups = POIFSChunkParser.parse(fs); + ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir); // Grab interesting bits ArrayList attachments = new ArrayList(); diff --git a/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java b/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java index 63bbeb3518..fd72feaa56 100644 --- a/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java +++ b/src/scratchpad/src/org/apache/poi/hsmf/extractor/HSMFTextExtactor.java @@ -23,12 +23,16 @@ import java.text.SimpleDateFormat; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; +import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; public class HSMFTextExtactor extends POIOLE2TextExtractor { public HSMFTextExtactor(MAPIMessage msg) { super(msg); } + public HSMFTextExtactor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException { + this(new MAPIMessage(poifsDir, fs)); + } public HSMFTextExtactor(POIFSFileSystem fs) throws IOException { this(new MAPIMessage(fs)); }