12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035 |
/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
- package org.apache.poi.extractor;
-
- import static org.apache.poi.POITestCase.assertContains;
- import static org.junit.Assert.assertEquals;
- import static org.junit.Assert.assertFalse;
- import static org.junit.Assert.assertNotNull;
- import static org.junit.Assert.assertNull;
- import static org.junit.Assert.assertTrue;
- import static org.junit.Assert.fail;
-
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
-
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.POIOLE2TextExtractor;
- import org.apache.poi.POITextExtractor;
- import org.apache.poi.POIXMLException;
- import org.apache.poi.POIXMLTextExtractor;
- import org.apache.poi.UnsupportedFileFormatException;
- import org.apache.poi.hdgf.extractor.VisioTextExtractor;
- import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
- import org.apache.poi.hslf.extractor.PowerPointExtractor;
- import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
- import org.apache.poi.hssf.HSSFTestDataSamples;
- import org.apache.poi.hssf.OldExcelFormatException;
- import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
- import org.apache.poi.hssf.extractor.ExcelExtractor;
- import org.apache.poi.hwpf.extractor.Word6Extractor;
- import org.apache.poi.hwpf.extractor.WordExtractor;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.openxml4j.opc.PackageAccess;
- import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.IOUtils;
- import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
- import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
- import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
- import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
- import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
- import org.junit.BeforeClass;
- import org.junit.Test;
-
- /**
- * Test that the extractor factory plays nicely
- */
- public class TestExtractorFactory {
- private static File txt;
-
- private static File xls;
- private static File xlsx;
- private static File xlsxStrict;
- private static File xltx;
- private static File xlsEmb;
-
- private static File doc;
- private static File doc6;
- private static File doc95;
- private static File docx;
- private static File dotx;
- private static File docEmb;
- private static File docEmbOOXML;
-
- private static File ppt;
- private static File pptx;
-
- private static File msg;
- private static File msgEmb;
- private static File msgEmbMsg;
-
- private static File vsd;
- private static File vsdx;
-
- private static File pub;
-
- private static File getFileAndCheck(POIDataSamples samples, String name) {
- File file = samples.getFile(name);
-
- assertNotNull("Did not get a file for " + name, file);
- assertTrue("Did not get a type file for " + name, file.isFile());
- assertTrue("File did not exist: " + name, file.exists());
-
- return file;
- }
-
- @BeforeClass
- public static void setUp() throws Exception {
-
- POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
- xls = getFileAndCheck(ssTests, "SampleSS.xls");
- xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
- xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
- xltx = getFileAndCheck(ssTests, "test.xltx");
- xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
-
- POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
- doc = getFileAndCheck(wpTests, "SampleDoc.doc");
- doc6 = getFileAndCheck(wpTests, "Word6.doc");
- doc95 = getFileAndCheck(wpTests, "Word95.doc");
- docx = getFileAndCheck(wpTests, "SampleDoc.docx");
- dotx = getFileAndCheck(wpTests, "test.dotx");
- docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
- docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
-
- POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
- ppt = getFileAndCheck(slTests, "SampleShow.ppt");
- pptx = getFileAndCheck(slTests, "SampleShow.pptx");
- txt = getFileAndCheck(slTests, "SampleShow.txt");
-
- POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
- vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
- vsdx = getFileAndCheck(dgTests, "test.vsdx");
-
- POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
- pub = getFileAndCheck(pubTests, "Simple.pub");
-
- POIDataSamples olTests = POIDataSamples.getHSMFInstance();
- msg = getFileAndCheck(olTests, "quick.msg");
- msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
- msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
- }
-
- @Test
- public void testFile() throws Exception {
- // Excel
- POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
- assertNotNull("Had empty extractor for " + xls, xlsExtractor);
- assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
- xlsExtractor
- instanceof ExcelExtractor
- );
- assertTrue(
- xlsExtractor.getText().length() > 200
- );
- xlsExtractor.close();
-
- POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xlsx);
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xltx);
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(xltx);
- assertTrue(
- extractor.getText().contains("test")
- );
- extractor.close();
-
- // TODO Support OOXML-Strict, see bug #57699
- try {
- /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
- fail("OOXML-Strict isn't yet supported");
- } catch (POIXMLException e) {
- // Expected, for now
- }
- // extractor = ExtractorFactory.createExtractor(xlsxStrict);
- // assertTrue(
- // extractor
- // instanceof XSSFExcelExtractor
- // );
- // extractor.close();
- //
- // extractor = ExtractorFactory.createExtractor(xlsxStrict);
- // assertTrue(
- // extractor.getText().contains("test")
- // );
- // extractor.close();
-
-
- // Word
- extractor = ExtractorFactory.createExtractor(doc);
- assertTrue(
- extractor
- instanceof WordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(doc6);
- assertTrue(
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(doc95);
- assertTrue(
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(docx);
- assertTrue(
- extractor instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(docx);
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(dotx);
- assertTrue(
- extractor instanceof XWPFWordExtractor
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(dotx);
- assertTrue(
- extractor.getText().contains("Test")
- );
- extractor.close();
-
- // PowerPoint (PPT)
- extractor = ExtractorFactory.createExtractor(ppt);
- assertTrue(
- extractor
- instanceof PowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // PowerPoint (PPTX)
- extractor = ExtractorFactory.createExtractor(pptx);
- assertTrue(
- extractor
- instanceof XSLFPowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // Visio - binary
- extractor = ExtractorFactory.createExtractor(vsd);
- assertTrue(
- extractor
- instanceof VisioTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Visio - vsdx
- extractor = ExtractorFactory.createExtractor(vsdx);
- assertTrue(
- extractor
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- // Publisher
- extractor = ExtractorFactory.createExtractor(pub);
- assertTrue(
- extractor
- instanceof PublisherTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Outlook msg
- extractor = ExtractorFactory.createExtractor(msg);
- assertTrue(
- extractor
- instanceof OutlookTextExtactor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Text
- try {
- ExtractorFactory.createExtractor(txt);
- fail();
- } catch(IllegalArgumentException e) {
- // Good
- }
- }
-
- @Test
- public void testInputStream() throws Exception {
- // Excel
- POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
- assertTrue(
- extractor
- instanceof ExcelExtractor
- );
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof XSSFExcelExtractor
- );
- assertTrue(
- extractor.getText().length() > 200
- );
- // TODO Support OOXML-Strict, see bug #57699
- // assertTrue(
- // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
- // instanceof XSSFExcelExtractor
- // );
- // assertTrue(
- // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
- // );
- extractor.close();
-
- // Word
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof WordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
- assertTrue(
- extractor.getClass().getName(),
- extractor
- instanceof Word6Extractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
- assertTrue(
- extractor
- instanceof XWPFWordExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // PowerPoint
- extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
- assertTrue(
- extractor
- instanceof PowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
- assertTrue(
- extractor
- instanceof XSLFPowerPointExtractor
- );
- assertTrue(
- extractor.getText().length() > 120
- );
- extractor.close();
-
- // Visio
- extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
- assertTrue(
- extractor
- instanceof VisioTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Visio - vsdx
- extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
- assertTrue(
- extractor
- instanceof XDGFVisioExtractor
- );
- assertTrue(
- extractor.getText().length() > 20
- );
- extractor.close();
-
- // Publisher
- extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
- assertTrue(
- extractor
- instanceof PublisherTextExtractor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Outlook msg
- extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
- assertTrue(
- extractor
- instanceof OutlookTextExtactor
- );
- assertTrue(
- extractor.getText().length() > 50
- );
- extractor.close();
-
- // Text
- try {
- FileInputStream stream = new FileInputStream(txt);
- try {
- ExtractorFactory.createExtractor(stream);
- fail();
- } finally {
- IOUtils.closeQuietly(stream);
- }
- } catch(IllegalArgumentException e) {
- // Good
- }
- }
-
- @Test
- public void testPOIFS() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
- );
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
- );
-
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
- );
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
- );
-
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
- );
-
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
- fail();
- } catch(IOException e) {
- // Good
- }
- }
-
-
- @Test
- public void testOPOIFS() throws Exception {
- // Excel
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
- instanceof ExcelExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
- );
-
- // Word
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
- instanceof WordExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
- );
-
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
- instanceof Word6Extractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
- );
-
- // PowerPoint
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
- instanceof PowerPointExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
- );
-
- // Visio
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
- instanceof VisioTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
- );
-
- // Publisher
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
- instanceof PublisherTextExtractor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
- );
-
- // Outlook msg
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
- instanceof OutlookTextExtactor
- );
- assertTrue(
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
- );
-
- // Text
- try {
- ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
- fail();
- } catch(IOException e) {
- // Good
- }
- }
-
- @Test
- public void testPackage() throws Exception {
- // Excel
- POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(extractor instanceof XSSFExcelExtractor);
- extractor.close();
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
- assertTrue(extractor.getText().length() > 200);
- extractor.close();
-
- // Word
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
- assertTrue(extractor instanceof XWPFWordExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
- assertTrue(extractor.getText().length() > 120);
- extractor.close();
-
- // PowerPoint
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
- assertTrue(extractor instanceof XSLFPowerPointExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
- assertTrue(extractor.getText().length() > 120);
- extractor.close();
-
- // Visio
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
- assertTrue(extractor instanceof XDGFVisioExtractor);
- assertTrue(extractor.getText().length() > 20);
- extractor.close();
-
- // Text
- try {
- ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
- fail("TestExtractorFactory.testPackage() failed on " + txt.toString());
- } catch(UnsupportedFileFormatException e) {
- // Good
- } catch (Exception e) {
- System.out.println("TestExtractorFactory.testPackage() failed on " + txt.toString());
- throw e;
- }
- }
-
- @Test
- public void testPreferEventBased() throws Exception {
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setThreadPrefersEventExtractors(true);
-
- assertTrue(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setAllThreadsPreferEventExtractors(false);
-
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setAllThreadsPreferEventExtractors(null);
-
- assertTrue(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
-
- // Check we get the right extractors now
- POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor
- instanceof EventBasedExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
-
- // Put back to normal
- ExtractorFactory.setThreadPrefersEventExtractors(false);
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- // And back
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor
- instanceof ExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
-
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
- assertTrue(
- extractor
- instanceof XSSFExcelExtractor
- );
- extractor.close();
- extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
- assertTrue(
- extractor.getText().length() > 200
- );
- extractor.close();
- }
-
- /**
- * Test embeded docs text extraction. For now, only
- * does poifs embeded, but will do ooxml ones
- * at some point.
- */
- @Test
- public void testEmbeded() throws Exception {
- POIOLE2TextExtractor ext;
- POITextExtractor[] embeds;
-
- // No embedings
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xls);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
- assertEquals(0, embeds.length);
- ext.close();
-
- // Excel
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(xlsEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- assertEquals(6, embeds.length);
- int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
-
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(2, numPpt);
- assertEquals(2, numXls);
- assertEquals(2, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Word
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(4, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(1, numPpt);
- assertEquals(2, numXls);
- assertEquals(1, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Word which contains an OOXML file
- ext = (POIOLE2TextExtractor)
- ExtractorFactory.createExtractor(docEmbOOXML);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
- assertEquals(3, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- else if (embed instanceof XWPFWordExtractor) numWordX++;
- }
- assertEquals(1, numPpt);
- assertEquals(1, numXls);
- assertEquals(0, numWord);
- assertEquals(1, numWordX);
- assertEquals(0, numMsg);
- ext.close();
-
- // Outlook
- ext = (OutlookTextExtactor)
- ExtractorFactory.createExtractor(msgEmb);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(1, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(0, numPpt);
- assertEquals(0, numXls);
- assertEquals(1, numWord);
- assertEquals(0, numMsg);
- ext.close();
-
- // Outlook with another outlook file in it
- ext = (OutlookTextExtactor)
- ExtractorFactory.createExtractor(msgEmbMsg);
- embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
-
- numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
- assertEquals(1, embeds.length);
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- if (embed instanceof PowerPointExtractor) numPpt++;
- else if (embed instanceof ExcelExtractor) numXls++;
- else if (embed instanceof WordExtractor) numWord++;
- else if (embed instanceof OutlookTextExtactor) numMsg++;
- }
- assertEquals(0, numPpt);
- assertEquals(0, numXls);
- assertEquals(0, numWord);
- assertEquals(1, numMsg);
- ext.close();
-
- // TODO - PowerPoint
- // TODO - Publisher
- // TODO - Visio
- }
-
- private static final String[] EXPECTED_FAILURES = new String[] {
- // password protected files
- "spreadsheet/password.xls",
- "spreadsheet/protected_passtika.xlsx",
- "spreadsheet/51832.xls",
- "document/PasswordProtected.doc",
- "slideshow/Password_Protected-hello.ppt",
- "slideshow/Password_Protected-56-hello.ppt",
- "slideshow/Password_Protected-np-hello.ppt",
- "slideshow/cryptoapi-proc2356.ppt",
- //"document/bug53475-password-is-pass.docx",
- //"document/bug53475-password-is-solrcell.docx",
- "spreadsheet/xor-encryption-abc.xls",
- "spreadsheet/35897-type4.xls",
- //"poifs/protect.xlsx",
- //"poifs/protected_sha512.xlsx",
- //"poifs/extenxls_pwd123.xlsx",
- //"poifs/protected_agile.docx",
- "spreadsheet/58616.xlsx",
-
- // TODO: fails XMLExportTest, is this ok?
- "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
- "spreadsheet/55864.xlsx",
- "spreadsheet/57890.xlsx",
-
- // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
- "spreadsheet/44958.xls",
- "spreadsheet/44958_1.xls",
- "spreadsheet/testArraysAndTables.xls",
-
- // TODO: good to ignore?
- "spreadsheet/sample-beta.xlsx",
-
- // This is actually a spreadsheet!
- "hpsf/TestRobert_Flaherty.doc",
-
- // some files that are broken, eg Word 95, ...
- "spreadsheet/43493.xls",
- "spreadsheet/46904.xls",
- "document/Bug50955.doc",
- "slideshow/PPT95.ppt",
- "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
- "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
- "openxml4j/invalid.xlsx",
- "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
- "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
- "spreadsheet/Simple.xlsb",
- "poifs/unknown_properties.msg", // POIFS properties corrupted
- "poifs/only-zero-byte-streams.ole2", // No actual contents
- "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
- "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
-
- // old Excel files, which we only support simple text extraction of
- "spreadsheet/testEXCEL_2.xls",
- "spreadsheet/testEXCEL_3.xls",
- "spreadsheet/testEXCEL_4.xls",
- "spreadsheet/testEXCEL_5.xls",
- "spreadsheet/testEXCEL_95.xls",
-
- // OOXML Strict is not yet supported, see bug #57699
- "spreadsheet/SampleSS.strict.xlsx",
- "spreadsheet/SimpleStrict.xlsx",
- "spreadsheet/sample.strict.xlsx",
-
- // non-TNEF files
- "ddf/Container.dat",
- "ddf/47143.dat",
-
- // sheet cloning errors
- "spreadsheet/47813.xlsx",
- "spreadsheet/56450.xls",
- "spreadsheet/57231_MixedGasReport.xls",
- "spreadsheet/OddStyleRecord.xls",
- "spreadsheet/WithChartSheet.xlsx",
- "spreadsheet/chart_sheet.xlsx",
- };
-
- @Test
- public void testFileLeak() throws Exception {
- // run a number of files that might fail in order to catch
- // leaked file resources when using file-leak-detector while
- // running the test
-
- for(String file : EXPECTED_FAILURES) {
- try {
- ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
- } catch (Exception e) {
- // catch all exceptions here as we are only interested in file-handle leaks
- }
- }
- }
-
- /**
- * #59074 - Excel 95 files should give a helpful message, not just
- * "No supported documents found in the OLE2 stream"
- */
- @Test
- public void bug59074() throws Exception {
- try {
- ExtractorFactory.createExtractor(
- POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
- fail("Old excel formats not supported via ExtractorFactory");
- } catch (OldExcelFormatException e) {
- // expected here
- }
- }
-
- @Test
- public void testGetEmbeddedFromXMLExtractor() {
- try {
- // currently not implemented
- ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
- fail("Unsupported currently");
- } catch (IllegalStateException e) {
- // expected here
- }
- }
-
- // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
- // When this happens, change this from @Test(expected=...) to @Test
- @Test(expected=AssertionError.class)
- public void test45565() throws Exception {
- POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"));
- String text = extractor.getText();
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
- extractor.close();
- }
- }
|