1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.extractor;
-
- import static org.apache.poi.POITestCase.assertContains;
- import static org.junit.Assert.assertEquals;
- import static org.junit.Assert.assertFalse;
- import static org.junit.Assert.assertNotNull;
- import static org.junit.Assert.assertNull;
- import static org.junit.Assert.assertTrue;
- import static org.junit.Assert.fail;
-
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
-
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.POIOLE2TextExtractor;
- import org.apache.poi.POITextExtractor;
- import org.apache.poi.POIXMLException;
- import org.apache.poi.POIXMLTextExtractor;
- import org.apache.poi.UnsupportedFileFormatException;
- import org.apache.poi.hdgf.extractor.VisioTextExtractor;
- import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
- import org.apache.poi.hslf.extractor.PowerPointExtractor;
- import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
- import org.apache.poi.hssf.HSSFTestDataSamples;
- import org.apache.poi.hssf.OldExcelFormatException;
- import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
- import org.apache.poi.hssf.extractor.ExcelExtractor;
- import org.apache.poi.hwpf.extractor.Word6Extractor;
- import org.apache.poi.hwpf.extractor.WordExtractor;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.openxml4j.opc.PackageAccess;
- import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.IOUtils;
- import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
- import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
- import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
- import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
- import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
- import org.junit.BeforeClass;
- import org.junit.Test;
-
- /**
- * Test that the extractor factory plays nicely
- */
- public class TestExtractorFactory {
- private static File txt;
-
- private static File xls;
- private static File xlsx;
- private static File xlsxStrict;
- private static File xltx;
- private static File xlsEmb;
- private static File xlsb;
-
- private static File doc;
- private static File doc6;
- private static File doc95;
- private static File docx;
- private static File dotx;
- private static File docEmb;
- private static File docEmbOOXML;
-
- private static File ppt;
- private static File pptx;
-
- private static File msg;
- private static File msgEmb;
- private static File msgEmbMsg;
-
- private static File vsd;
- private static File vsdx;
-
- private static File pub;
-
- private static File getFileAndCheck(POIDataSamples samples, String name) {
- File file = samples.getFile(name);
-
- assertNotNull("Did not get a file for " + name, file);
- assertTrue("Did not get a type file for " + name, file.isFile());
- assertTrue("File did not exist: " + name, file.exists());
-
- return file;
- }
-
- @BeforeClass
- public static void setUp() throws Exception {
-
- POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
- xls = getFileAndCheck(ssTests, "SampleSS.xls");
- xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
- xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
- xltx = getFileAndCheck(ssTests, "test.xltx");
- xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
- xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
-
- POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
- doc = getFileAndCheck(wpTests, "SampleDoc.doc");
- doc6 = getFileAndCheck(wpTests, "Word6.doc");
- doc95 = getFileAndCheck(wpTests, "Word95.doc");
- docx = getFileAndCheck(wpTests, "SampleDoc.docx");
- dotx = getFileAndCheck(wpTests, "test.dotx");
- docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
- docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
-
- POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
- ppt = getFileAndCheck(slTests, "SampleShow.ppt");
- pptx = getFileAndCheck(slTests, "SampleShow.pptx");
- txt = getFileAndCheck(slTests, "SampleShow.txt");
-
- POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
- vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
- vsdx = getFileAndCheck(dgTests, "test.vsdx");
-
- POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
- pub = getFileAndCheck(pubTests, "Simple.pub");
-
- POIDataSamples olTests = POIDataSamples.getHSMFInstance();
- msg = getFileAndCheck(olTests, "quick.msg");
- msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
- msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
- }
-
- @Test
- public void testFile() throws Exception {
- // Excel
- POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
- assertNotNull("Had empty extractor for " + xls, xlsExtractor);
- assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
- xlsExtractor
- instanceof ExcelExtractor
- );
- assertTrue(
- xlsExtractor.getText().length() > 200
- );
- xlsExtractor.close();
-
- POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xlsx);
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xltx);
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xlsb);
- assertTrue(
- extractor.getText().contains("test")
- );
- extractor.close();
-
-
- extractor = ExtractorFactory.createExtractor(xltx);
- assertTrue(
- extractor.getText().contains("test")
- );
- extractor.close();
-
- // TODO Support OOXML-Strict, see bug #57699
- try {
- /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
- fail("OOXML-Strict isn't yet supported");
- } catch (POIXMLException e) {
- // Expected, for now
- }
- // extractor = ExtractorFactory.createExtractor(xlsxStrict);
- // assertTrue(
- // extractor
- // instanceof XSSFExcelExtractor
- // );
- // extractor.close();
- //
- // extractor = ExtractorFactory.createExtractor(xlsxStrict);
- // assertTrue(
- // extractor.getText().contains("test")
- // );
- // extractor.close();
-
-
- // Word
- extractor = ExtractorFactory.createExtractor(doc);
- assertTrue(
- extractor
- instanceof WordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(doc6);
- assertTrue(
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(doc95);
- assertTrue(
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(docx);
- assertTrue(
- extractor instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(docx);
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(dotx);
- assertTrue(
- extractor instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(dotx);
- assertTrue(
- extractor.getText().contains("Test")
- );
- extractor.close();
-
- // PowerPoint (PPT)
- extractor = ExtractorFactory.createExtractor(ppt);
- assertTrue(
- extractor
- instanceof PowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // PowerPoint (PPTX)
- extractor = ExtractorFactory.createExtractor(pptx);
- assertTrue(
- extractor
- instanceof XSLFPowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // Visio - binary
- extractor = ExtractorFactory.createExtractor(vsd);
- assertTrue(
- extractor
- instanceof VisioTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Visio - vsdx
- extractor = ExtractorFactory.createExtractor(vsdx);
- assertTrue(
- extractor
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- // Publisher
- extractor = ExtractorFactory.createExtractor(pub);
- assertTrue(
- extractor
- instanceof PublisherTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Outlook msg
- extractor = ExtractorFactory.createExtractor(msg);
- assertTrue(
- extractor
- instanceof OutlookTextExtactor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Text
- try {
- ExtractorFactory.createExtractor(txt);
- fail();
- } catch(IllegalArgumentException e) {
- // Good
- }
- }
-
- @Test
- public void testInputStream() throws Exception {
- // Excel
- POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
- assertTrue(
- extractor
- instanceof ExcelExtractor
- );
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- assertTrue(
- extractor.getText().length() > 200
- );
- // TODO Support OOXML-Strict, see bug #57699
- // assertTrue(
- // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
- // instanceof XSSFExcelExtractor
- // );
- // assertTrue(
- // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
- // );
- extractor.close();
-
- // Word
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof WordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
- assertTrue(
- extractor
- instanceof XWPFWordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // PowerPoint
- extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
- assertTrue(
- extractor
- instanceof PowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
- assertTrue(
- extractor
- instanceof XSLFPowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // Visio
- extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
- assertTrue(
- extractor
- instanceof VisioTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Visio - vsdx
- extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
- assertTrue(
- extractor
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- // Publisher
- extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
- assertTrue(
- extractor
- instanceof PublisherTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Outlook msg
- extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
- assertTrue(
- extractor
- instanceof OutlookTextExtactor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Text
- try {
- FileInputStream stream = new FileInputStream(txt);
- try {
- ExtractorFactory.createExtractor(stream);
- fail();
- } finally {
- IOUtils.closeQuietly(stream);
- }
- } catch(IllegalArgumentException e) {
- // Good
- }
- }
-
- @Test
- public void testPOIFS() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
- );
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
- );
-
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
- );
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
- );
-
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
- );
-
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
- fail();
- } catch(IOException e) {
- // Good
- }
- }
-
-
- @Test
- public void testOPOIFS() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
- );
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
- );
-
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
- );
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
- );
-
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
- );
-
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
- fail();
- } catch(IOException e) {
- // Good
- }
- }
-
- @Test
- public void testPackage() throws Exception {
- // Excel
- POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(extractor instanceof XSSFExcelExtractor);
- extractor.close();
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
- assertTrue(extractor.getText().length() > 200);
- extractor.close();
-
- // Word
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
- assertTrue(extractor instanceof XWPFWordExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
- assertTrue(extractor.getText().length() > 120);
- extractor.close();
-
- // PowerPoint
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
- assertTrue(extractor instanceof XSLFPowerPointExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
- assertTrue(extractor.getText().length() > 120);
- extractor.close();
-
- // Visio
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
- assertTrue(extractor instanceof XDGFVisioExtractor);
- assertTrue(extractor.getText().length() > 20);
- extractor.close();
-
- // Text
- try {
- ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
- fail("TestExtractorFactory.testPackage() failed on " + txt);
- } catch(UnsupportedFileFormatException e) {
- // Good
- } catch (Exception e) {
- System.out.println("TestExtractorFactory.testPackage() failed on " + txt);
- throw e;
- }
- }
-
- @Test
- public void testPreferEventBased() throws Exception {
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setThreadPrefersEventExtractors(true);
-
- assertTrue(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setAllThreadsPreferEventExtractors(false);
-
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setAllThreadsPreferEventExtractors(null);
-
- assertTrue(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
-
- // Check we get the right extractors now
- POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor
- instanceof EventBasedExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
-
- // Put back to normal
- ExtractorFactory.setThreadPrefersEventExtractors(false);
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- // And back
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor
- instanceof ExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
- }
-
- /**
- * Test embeded docs text extraction. For now, only
- * does poifs embeded, but will do ooxml ones
- * at some point.
- */
- @Test
- public void testEmbeded() throws Exception {
- POIOLE2TextExtractor ext;
- POITextExtractor[] embeds;
-
- // No embedings
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xls);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
- assertEquals(0, embeds.length);
- ext.close();
-
- // Excel
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xlsEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- assertEquals(6, embeds.length);
- int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
-
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(2, numPpt);
- assertEquals(2, numXls);
- assertEquals(2, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Word
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(4, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(1, numPpt);
- assertEquals(2, numXls);
- assertEquals(1, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Word which contains an OOXML file
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmbOOXML);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
- assertEquals(3, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- else if (embed instanceof XWPFWordExtractor) numWordX++;
- }
- assertEquals(1, numPpt);
- assertEquals(1, numXls);
- assertEquals(0, numWord);
- assertEquals(1, numWordX);
- assertEquals(0, numMsg);
- ext.close();
-
- // Outlook
- ext = (OutlookTextExtactor)
- ExtractorFactory.createExtractor(msgEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(1, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(0, numPpt);
- assertEquals(0, numXls);
- assertEquals(1, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Outlook with another outlook file in it
- ext = (OutlookTextExtactor)
- ExtractorFactory.createExtractor(msgEmbMsg);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(1, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(0, numPpt);
- assertEquals(0, numXls);
- assertEquals(0, numWord);
- assertEquals(1, numMsg);
- ext.close();
-
- // TODO - PowerPoint
- // TODO - Publisher
- // TODO - Visio
- }
-
- private static final String[] EXPECTED_FAILURES = new String[] {
- // password protected files
- "spreadsheet/password.xls",
- "spreadsheet/protected_passtika.xlsx",
- "spreadsheet/51832.xls",
- "document/PasswordProtected.doc",
- "slideshow/Password_Protected-hello.ppt",
- "slideshow/Password_Protected-56-hello.ppt",
- "slideshow/Password_Protected-np-hello.ppt",
- "slideshow/cryptoapi-proc2356.ppt",
- //"document/bug53475-password-is-pass.docx",
- //"document/bug53475-password-is-solrcell.docx",
- "spreadsheet/xor-encryption-abc.xls",
- "spreadsheet/35897-type4.xls",
- //"poifs/protect.xlsx",
- //"poifs/protected_sha512.xlsx",
- //"poifs/extenxls_pwd123.xlsx",
- //"poifs/protected_agile.docx",
- "spreadsheet/58616.xlsx",
-
- // TODO: fails XMLExportTest, is this ok?
- "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
- "spreadsheet/55864.xlsx",
- "spreadsheet/57890.xlsx",
-
- // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
- "spreadsheet/44958.xls",
- "spreadsheet/44958_1.xls",
- "spreadsheet/testArraysAndTables.xls",
-
- // TODO: good to ignore?
- "spreadsheet/sample-beta.xlsx",
-
- // This is actually a spreadsheet!
- "hpsf/TestRobert_Flaherty.doc",
-
- // some files that are broken, eg Word 95, ...
- "spreadsheet/43493.xls",
- "spreadsheet/46904.xls",
- "document/Bug50955.doc",
- "slideshow/PPT95.ppt",
- "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
- "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
- "openxml4j/invalid.xlsx",
- "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
- "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
- "spreadsheet/Simple.xlsb",
- "poifs/unknown_properties.msg", // POIFS properties corrupted
- "poifs/only-zero-byte-streams.ole2", // No actual contents
- "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
- "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
-
- // old Excel files, which we only support simple text extraction of
- "spreadsheet/testEXCEL_2.xls",
- "spreadsheet/testEXCEL_3.xls",
- "spreadsheet/testEXCEL_4.xls",
- "spreadsheet/testEXCEL_5.xls",
- "spreadsheet/testEXCEL_95.xls",
-
- // OOXML Strict is not yet supported, see bug #57699
- "spreadsheet/SampleSS.strict.xlsx",
- "spreadsheet/SimpleStrict.xlsx",
- "spreadsheet/sample.strict.xlsx",
-
- // non-TNEF files
- "ddf/Container.dat",
- "ddf/47143.dat",
-
- // sheet cloning errors
- "spreadsheet/47813.xlsx",
- "spreadsheet/56450.xls",
- "spreadsheet/57231_MixedGasReport.xls",
- "spreadsheet/OddStyleRecord.xls",
- "spreadsheet/WithChartSheet.xlsx",
- "spreadsheet/chart_sheet.xlsx",
- };
-
- @Test
- public void testFileLeak() throws Exception {
- // run a number of files that might fail in order to catch
- // leaked file resources when using file-leak-detector while
- // running the test
-
- for(String file : EXPECTED_FAILURES) {
- try {
- ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
- } catch (Exception e) {
- // catch all exceptions here as we are only interested in file-handle leaks
- }
- }
- }
-
- /**
- * #59074 - Excel 95 files should give a helpful message, not just
- * "No supported documents found in the OLE2 stream"
- */
- @Test
- public void bug59074() throws Exception {
- try {
- ExtractorFactory.createExtractor(
- POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
- fail("Old excel formats not supported via ExtractorFactory");
- } catch (OldExcelFormatException e) {
- // expected here
- }
- }
-
- @Test
- public void testGetEmbeddedFromXMLExtractor() {
- try {
- // currently not implemented
- ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
- fail("Unsupported currently");
- } catch (IllegalStateException e) {
- // expected here
- }
- }
-
- // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
- // When this happens, change this from @Test(expected=...) to @Test
- // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
- @Test(expected=AssertionError.class)
- public void test45565() throws Exception {
- POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"));
- try {
- String text = extractor.getText();
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
- } finally {
- extractor.close();
- }
- }
- }
|