123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.extractor;
-
- import static org.junit.Assert.assertEquals;
- import static org.junit.Assert.assertFalse;
- import static org.junit.Assert.assertNotNull;
- import static org.junit.Assert.assertNull;
- import static org.junit.Assert.assertTrue;
- import static org.junit.Assert.fail;
-
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
-
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.POIOLE2TextExtractor;
- import org.apache.poi.POITextExtractor;
- import org.apache.poi.POIXMLException;
- import org.apache.poi.POIXMLTextExtractor;
- import org.apache.poi.hdgf.extractor.VisioTextExtractor;
- import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
- import org.apache.poi.hslf.extractor.PowerPointExtractor;
- import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
- import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
- import org.apache.poi.hssf.extractor.ExcelExtractor;
- import org.apache.poi.hwpf.extractor.Word6Extractor;
- import org.apache.poi.hwpf.extractor.WordExtractor;
- import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.openxml4j.opc.PackageAccess;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
- import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
- import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
- import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
- import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
- import org.junit.BeforeClass;
- import org.junit.Test;
-
- /**
- * Test that the extractor factory plays nicely
- */
- public class TestExtractorFactory {
- private static File txt;
-
- private static File xls;
- private static File xlsx;
- private static File xlsxStrict;
- private static File xltx;
- private static File xlsEmb;
-
- private static File doc;
- private static File doc6;
- private static File doc95;
- private static File docx;
- private static File dotx;
- private static File docEmb;
- private static File docEmbOOXML;
-
- private static File ppt;
- private static File pptx;
-
- private static File msg;
- private static File msgEmb;
- private static File msgEmbMsg;
-
- private static File vsd;
- private static File vsdx;
-
- private static File pub;
-
- private static File getFileAndCheck(POIDataSamples samples, String name) {
- File file = samples.getFile(name);
-
- assertNotNull("Did not get a file for " + name, file);
- assertTrue("Did not get a type file for " + name, file.isFile());
- assertTrue("File did not exist: " + name, file.exists());
-
- return file;
- }
-
- @BeforeClass
- public static void setUp() throws Exception {
-
- POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
- xls = getFileAndCheck(ssTests, "SampleSS.xls");
- xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
- xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
- xltx = getFileAndCheck(ssTests, "test.xltx");
- xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
-
- POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
- doc = getFileAndCheck(wpTests, "SampleDoc.doc");
- doc6 = getFileAndCheck(wpTests, "Word6.doc");
- doc95 = getFileAndCheck(wpTests, "Word95.doc");
- docx = getFileAndCheck(wpTests, "SampleDoc.docx");
- dotx = getFileAndCheck(wpTests, "test.dotx");
- docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
- docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
-
- POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
- ppt = getFileAndCheck(slTests, "SampleShow.ppt");
- pptx = getFileAndCheck(slTests, "SampleShow.pptx");
- txt = getFileAndCheck(slTests, "SampleShow.txt");
-
- POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
- vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
- vsdx = getFileAndCheck(dgTests, "test.vsdx");
-
- POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
- pub = getFileAndCheck(pubTests, "Simple.pub");
-
- POIDataSamples olTests = POIDataSamples.getHSMFInstance();
- msg = getFileAndCheck(olTests, "quick.msg");
- msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
- msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
- }
-
- @Test
- public void testFile() throws Exception {
- // Excel
- POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
- assertNotNull("Had empty extractor for " + xls, xlsExtractor);
- assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
- xlsExtractor
- instanceof ExcelExtractor
- );
- assertTrue(
- xlsExtractor.getText().length() > 200
- );
- xlsExtractor.close();
-
- POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
- assertTrue(
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xlsx);
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xltx);
- assertTrue(
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xltx);
- assertTrue(
- extractor.getText().contains("test")
- );
- extractor.close();
-
- // TODO Support OOXML-Strict, see bug #57699
- try {
- extractor = ExtractorFactory.createExtractor(xlsxStrict);
- fail("OOXML-Strict isn't yet supported");
- } catch (POIXMLException e) {
- // Expected, for now
- }
- // extractor = ExtractorFactory.createExtractor(xlsxStrict);
- // assertTrue(
- // extractor
- // instanceof XSSFExcelExtractor
- // );
- // extractor.close();
- //
- // extractor = ExtractorFactory.createExtractor(xlsxStrict);
- // assertTrue(
- // extractor.getText().contains("test")
- // );
- // extractor.close();
-
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(doc)
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(doc).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(doc6)
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(doc6).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(doc95)
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(doc95).getText().length() > 120
- );
-
- extractor = ExtractorFactory.createExtractor(docx);
- assertTrue(
- extractor instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(docx);
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(dotx);
- assertTrue(
- extractor instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(dotx);
- assertTrue(
- extractor.getText().contains("Test")
- );
- extractor.close();
-
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(ppt)
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(ppt).getText().length() > 120
- );
-
- extractor = ExtractorFactory.createExtractor(pptx);
- assertTrue(
- extractor
- instanceof XSLFPowerPointExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(pptx);
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // Visio - binary
- assertTrue(
- ExtractorFactory.createExtractor(vsd)
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(vsd).getText().length() > 50
- );
- // Visio - vsdx
- assertTrue(
- ExtractorFactory.createExtractor(vsdx)
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(vsdx).getText().length() > 20
- );
-
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(pub)
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(pub).getText().length() > 50
- );
-
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(msg)
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(msg).getText().length() > 50
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(txt);
- fail();
- } catch(IllegalArgumentException e) {
- // Good
- }
- }
-
- @Test
- public void testInputStream() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(xls))
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(xls)).getText().length() > 200
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(xlsx))
- instanceof XSSFExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(xlsx)).getText().length() > 200
- );
- // TODO Support OOXML-Strict, see bug #57699
- // assertTrue(
- // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
- // instanceof XSSFExcelExtractor
- // );
- // assertTrue(
- // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
- // );
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(doc))
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(doc6))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(doc6)).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(doc95))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(doc95)).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(docx))
- instanceof XWPFWordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(docx)).getText().length() > 120
- );
-
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(ppt))
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(ppt)).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(pptx))
- instanceof XSLFPowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(pptx)).getText().length() > 120
- );
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(vsd))
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
- );
- // Visio - vsdx
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(vsdx))
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20
- );
-
-
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(pub))
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50
- );
-
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(msg))
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
- );
-
- // Text
- try {
- FileInputStream stream = new FileInputStream(txt);
- try {
- ExtractorFactory.createExtractor(stream);
- fail();
- } finally {
- stream.close();
- }
- } catch(IllegalArgumentException e) {
- // Good
- }
- }
-
- @Test
- public void testPOIFS() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
- );
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
- );
-
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
- );
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
- );
-
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
- );
-
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
- fail();
- } catch(IOException e) {
- // Good
- }
- }
-
- @Test
- public void testPackage() throws Exception {
- // Excel
- POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
- assertTrue(extractor.getText().length() > 200);
- extractor.close();
-
- // Word
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
- assertTrue(
- extractor
- instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // PowerPoint
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
- assertTrue(
- extractor
- instanceof XSLFPowerPointExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
- fail();
- } catch(InvalidOperationException e) {
- // Good
- }
- }
-
- @Test
- public void testPreferEventBased() throws Exception {
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setThreadPrefersEventExtractors(true);
-
- assertTrue(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setAllThreadsPreferEventExtractors(false);
-
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setAllThreadsPreferEventExtractors(null);
-
- assertTrue(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
-
- // Check we get the right extractors now
- POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor
- instanceof EventBasedExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
-
- // Put back to normal
- ExtractorFactory.setThreadPrefersEventExtractors(false);
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- // And back
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor
- instanceof ExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
- }
-
- /**
- * Test embeded docs text extraction. For now, only
- * does poifs embeded, but will do ooxml ones
- * at some point.
- */
- @Test
- public void testEmbeded() throws Exception {
- POIOLE2TextExtractor ext;
- POITextExtractor[] embeds;
-
- // No embedings
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xls);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
- assertEquals(0, embeds.length);
-
- // Excel
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xlsEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- assertEquals(6, embeds.length);
- int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
-
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(2, numPpt);
- assertEquals(2, numXls);
- assertEquals(2, numWord);
- assertEquals(0, numMsg);
-
- // Word
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(4, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(1, numPpt);
- assertEquals(2, numXls);
- assertEquals(1, numWord);
- assertEquals(0, numMsg);
-
- // Word which contains an OOXML file
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmbOOXML);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
- assertEquals(3, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
- else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
- }
- assertEquals(1, numPpt);
- assertEquals(1, numXls);
- assertEquals(0, numWord);
- assertEquals(1, numWordX);
- assertEquals(0, numMsg);
-
- // Outlook
- ext = (OutlookTextExtactor)
- ExtractorFactory.createExtractor(msgEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(1, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(0, numPpt);
- assertEquals(0, numXls);
- assertEquals(1, numWord);
- assertEquals(0, numMsg);
-
- // Outlook with another outlook file in it
- ext = (OutlookTextExtactor)
- ExtractorFactory.createExtractor(msgEmbMsg);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(1, embeds.length);
- for(int i=0; i<embeds.length; i++) {
- assertTrue(embeds[i].getText().length() > 20);
- if(embeds[i] instanceof PowerPointExtractor) numPpt++;
- else if(embeds[i] instanceof ExcelExtractor) numXls++;
- else if(embeds[i] instanceof WordExtractor) numWord++;
- else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(0, numPpt);
- assertEquals(0, numXls);
- assertEquals(0, numWord);
- assertEquals(1, numMsg);
-
-
- // TODO - PowerPoint
- // TODO - Publisher
- // TODO - Visio
- }
- }
|