123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.extractor.ooxml;
-
- import static org.apache.poi.POITestCase.assertContains;
- import static org.apache.poi.extractor.ExtractorFactory.createExtractor;
- import static org.junit.jupiter.api.Assertions.assertEquals;
- import static org.junit.jupiter.api.Assertions.assertFalse;
- import static org.junit.jupiter.api.Assertions.assertNotNull;
- import static org.junit.jupiter.api.Assertions.assertNull;
- import static org.junit.jupiter.api.Assertions.assertThrows;
- import static org.junit.jupiter.api.Assertions.assertTrue;
-
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.util.Locale;
- import java.util.stream.Stream;
-
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.extractor.ExtractorFactory;
- import org.apache.poi.extractor.POIOLE2TextExtractor;
- import org.apache.poi.extractor.POITextExtractor;
- import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
- import org.apache.poi.hssf.extractor.ExcelExtractor;
- import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
- import org.apache.poi.openxml4j.exceptions.NotOfficeXmlFileException;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.openxml4j.opc.PackageAccess;
- import org.apache.poi.poifs.filesystem.FileMagic;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
- import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
- import org.junit.jupiter.api.Test;
- import org.junit.jupiter.params.ParameterizedTest;
- import org.junit.jupiter.params.provider.Arguments;
- import org.junit.jupiter.params.provider.MethodSource;
- import org.junit.jupiter.params.provider.ValueSource;
-
- /**
- * Test that the extractor factory plays nicely
- */
- class TestExtractorFactory {
-
- private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
- private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
- private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
- @SuppressWarnings("unused")
- private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
- private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
- private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
- private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
-
- private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
- private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
- private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
- private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
- private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
- private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
- private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
- private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
-
- private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
- private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
- private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
- private static final File ppt97 = getFileAndCheck(slTests, "bug56240.ppt");
- private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
-
- private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
- private static final File msg = getFileAndCheck(olTests, "quick.msg");
- private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
- private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
-
- private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
- private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
- private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
-
- private static final POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
- private static final File pub = getFileAndCheck(pubTests, "Simple.pub");
-
- private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory();
-
- private static File getFileAndCheck(POIDataSamples samples, String name) {
- File file = samples.getFile(name);
-
- assertNotNull(file, "Did not get a file for " + name);
- assertTrue(file.isFile(), "Did not get a type file for " + name);
- assertTrue(file.exists(), "File did not exist: " + name);
-
- return file;
- }
-
- public static Stream<Arguments> testOOXMLData() {
- return Stream.of(
- Arguments.of("Excel - xlsx", xlsx, "XSSFExcelExtractor", 200),
- Arguments.of("Excel - xltx", xltx, "XSSFExcelExtractor", -1),
- Arguments.of("Excel - xlsb", xlsb, "XSSFBEventBasedExcelExtractor", -1),
- Arguments.of("Word - docx", docx, "XWPFWordExtractor", 120),
- Arguments.of("Word - dotx", dotx, "XWPFWordExtractor", -1),
- Arguments.of("PowerPoint - pptx", pptx, "XSLFExtractor", 120),
- Arguments.of("Visio - vsdx", vsdx, "XDGFVisioExtractor", 20)
- );
- }
-
- public static Stream<Arguments> testScratchData() {
- return Stream.of(
- Arguments.of("Excel", xls, "ExcelExtractor", 200),
- Arguments.of("Word", doc, "WordExtractor", 120),
- Arguments.of("Word 6", doc6, "Word6Extractor", 20),
- Arguments.of("Word 95", doc95, "Word6Extractor", 120),
- Arguments.of("PowerPoint", ppt, "SlideShowExtractor", 120),
- Arguments.of("PowerPoint 97 Dual", ppt97, "SlideShowExtractor", 120),
- Arguments.of("Visio", vsd, "VisioTextExtractor", 50),
- Arguments.of("Publisher", pub, "PublisherTextExtractor", 50),
- Arguments.of("Outlook msg", msg, "OutlookTextExtractor", 50)
- );
- }
-
- public static Stream<Arguments> testFileData() {
- return Stream.concat(testOOXMLData(), testScratchData());
- // TODO Support OOXML-Strict / xlsxStrict, see bug #57699
- }
-
-
- @ParameterizedTest
- @MethodSource("testFileData")
- void testFile(String testcase, File file, String extractor, int count) throws Exception {
- try (POITextExtractor ext = createExtractor(file)) {
- assertNotNull(ext);
- testExtractor(ext, testcase, extractor, count);
- }
- }
-
- @ParameterizedTest
- @MethodSource("testScratchData")
- void testPOIFS(String testcase, File testFile, String extractor, int count) throws Exception {
- // test processing of InputStream
- try (FileInputStream fis = new FileInputStream(testFile);
- POIFSFileSystem poifs = new POIFSFileSystem(fis);
- POITextExtractor ext = createExtractor(poifs)) {
- assertNotNull(ext);
- testExtractor(ext, testcase, extractor, count);
- }
- }
-
- @ParameterizedTest
- @MethodSource("testFileData")
- void testOOXML(String testcase, File testFile, String extractor, int count) throws Exception {
- // test processing of InputStream
- try (FileInputStream fis = new FileInputStream(testFile);
- POITextExtractor ext = createExtractor(fis)) {
- assertNotNull(ext);
- testExtractor(ext, testcase, extractor, count);
- }
- }
-
- @ParameterizedTest
- @MethodSource("testOOXMLData")
- void testPackage(String testcase, File testFile, String extractor, int count) throws Exception {
- try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
- final POITextExtractor ext = xmlFactory.create(pkg)) {
- assertNotNull(ext);
- testExtractor(ext, testcase, extractor, count);
- pkg.revert();
- }
- }
-
- @Test
- void testFileInvalid() {
- IOException ex = assertThrows(IOException.class, () -> createExtractor(txt));
- assertEquals("Can't create extractor - unsupported file type: UNKNOWN", ex.getMessage());
- }
-
- @Test
- void testInputStreamInvalid() throws IOException {
- try (FileInputStream fis = new FileInputStream(txt)) {
- IOException ex = assertThrows(IOException.class, () -> createExtractor(fis));
- assertTrue(ex.getMessage().contains(FileMagic.UNKNOWN.name()));
- }
- }
-
- @Test
- void testPOIFSInvalid() {
- // Not really an Extractor test, but we'll leave it to test POIFS reaction anyway ...
- IOException ex = assertThrows(IOException.class, () -> new POIFSFileSystem(txt));
- assertTrue(ex.getMessage().contains("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0"));
- }
-
- private void testExtractor(final POITextExtractor ext, final String testcase, final String extrClass, final Integer minLength) {
- assertEquals(extrClass, ext.getClass().getSimpleName(), "invalid extractor for " + testcase);
- final String actual = ext.getText();
- if (minLength == -1) {
- assertContains(actual.toLowerCase(Locale.ROOT), "test");
- } else {
- assertTrue(actual.length() > minLength, "extracted content too short for " + testcase);
- }
- }
- @Test
- void testPackageInvalid() {
- // Text
- assertThrows(NotOfficeXmlFileException.class, () -> OPCPackage.open(txt, PackageAccess.READ));
- }
-
- @Test
- void testPreferEventBased() throws Exception {
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setThreadPrefersEventExtractors(true);
-
- assertTrue(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setAllThreadsPreferEventExtractors(false);
-
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- ExtractorFactory.setAllThreadsPreferEventExtractors(null);
-
- assertTrue(ExtractorFactory.getPreferEventExtractor());
- assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- try {
- // Check we get the right extractors now
- try (POITextExtractor extractor = createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
- assertTrue(extractor instanceof EventBasedExcelExtractor);
- assertTrue(extractor.getText().length() > 200);
- }
- try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
- assertNotNull(extractor);
- assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
- assertTrue(extractor.getText().length() > 200);
- }
- } finally {
- // Put back to normal
- ExtractorFactory.setThreadPrefersEventExtractors(false);
- }
-
- assertFalse(ExtractorFactory.getPreferEventExtractor());
- assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
- assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
-
- // And back
- try (POITextExtractor extractor = createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
- assertTrue(extractor instanceof ExcelExtractor);
- assertTrue(extractor.getText().length() > 200);
- }
-
- try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
- assertTrue(extractor instanceof XSSFExcelExtractor);
- }
-
- try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) {
- assertNotNull(extractor);
- assertTrue(extractor.getText().length() > 200);
- }
- }
-
- public static Stream<Arguments> testEmbeddedData() {
- return Stream.of(
- Arguments.of("No embeddings", xls, "0-0-0-0-0-0"),
- Arguments.of("Excel", xlsEmb, "6-2-2-2-0-0"),
- Arguments.of("Word", docEmb, "4-1-2-1-0-0"),
- Arguments.of("Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1"),
- Arguments.of("Outlook", msgEmb, "1-1-0-0-0-0"),
- Arguments.of("Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0")
- // TODO - PowerPoint
- // TODO - Publisher
- // TODO - Visio
- );
- }
-
- /**
- * Test embedded docs text extraction. For now, only
- * does poifs embedded, but will do ooxml ones
- * at some point.
- */
- @ParameterizedTest
- @MethodSource("testEmbeddedData")
- void testEmbedded(String format, File file, String expected) throws Exception {
- int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
-
- try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor) createExtractor(file)) {
- final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
-
- for (POITextExtractor embed : embeds) {
- assertTrue(embed.getText().length() > 20);
- switch (embed.getClass().getSimpleName()) {
- case "SlideShowExtractor":
- numPpt++;
- break;
- case "ExcelExtractor":
- numXls++;
- break;
- case "WordExtractor":
- numWord++;
- break;
- case "OutlookTextExtractor":
- numMsg++;
- break;
- case "XWPFWordExtractor":
- numWordX++;
- break;
- }
- }
-
- final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
- assertEquals(expected, actual, "invalid number of embeddings - "+format);
- }
-
-
- }
-
- @ParameterizedTest
- @ValueSource(strings = {
- // password protected files
- "spreadsheet/password.xls",
- "spreadsheet/protected_passtika.xlsx",
- "spreadsheet/51832.xls",
- "document/PasswordProtected.doc",
- "slideshow/Password_Protected-hello.ppt",
- "slideshow/Password_Protected-56-hello.ppt",
- "slideshow/Password_Protected-np-hello.ppt",
- "slideshow/cryptoapi-proc2356.ppt",
- //"document/bug53475-password-is-pass.docx",
- //"document/bug53475-password-is-solrcell.docx",
- "spreadsheet/xor-encryption-abc.xls",
- "spreadsheet/35897-type4.xls",
- //"poifs/protect.xlsx",
- //"poifs/protected_sha512.xlsx",
- //"poifs/extenxls_pwd123.xlsx",
- //"poifs/protected_agile.docx",
- "spreadsheet/58616.xlsx",
-
- // TODO: fails XMLExportTest, is this ok?
- "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
- "spreadsheet/55864.xlsx",
- "spreadsheet/57890.xlsx",
-
- // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
- "spreadsheet/44958.xls",
- "spreadsheet/44958_1.xls",
- "spreadsheet/testArraysAndTables.xls",
-
- // TODO: good to ignore?
- "spreadsheet/sample-beta.xlsx",
-
- // This is actually a spreadsheet!
- "hpsf/TestRobert_Flaherty.doc",
-
- // some files that are broken, eg Word 95, ...
- "spreadsheet/43493.xls",
- "spreadsheet/46904.xls",
- "document/Bug50955.doc",
- "slideshow/PPT95.ppt",
- "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
- "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
- "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
- "openxml4j/invalid.xlsx",
- "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
- "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
- "spreadsheet/Simple.xlsb",
- "poifs/unknown_properties.msg", // POIFS properties corrupted
- "poifs/only-zero-byte-streams.ole2", // No actual contents
- "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
- "spreadsheet/poc-xmlbomb-empty.xlsx", // contains xml-entity-expansion
- "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
-
- // old Excel files, which we only support simple text extraction of
- "spreadsheet/testEXCEL_2.xls",
- "spreadsheet/testEXCEL_3.xls",
- "spreadsheet/testEXCEL_4.xls",
- "spreadsheet/testEXCEL_5.xls",
- "spreadsheet/testEXCEL_95.xls",
-
- // OOXML Strict is not yet supported, see bug #57699
- "spreadsheet/SampleSS.strict.xlsx",
- "spreadsheet/SimpleStrict.xlsx",
- "spreadsheet/sample.strict.xlsx",
-
- // non-TNEF files
- "ddf/Container.dat",
- "ddf/47143.dat",
-
- // sheet cloning errors
- "spreadsheet/47813.xlsx",
- "spreadsheet/56450.xls",
- "spreadsheet/57231_MixedGasReport.xls",
- "spreadsheet/OddStyleRecord.xls",
- "spreadsheet/WithChartSheet.xlsx",
- "spreadsheet/chart_sheet.xlsx"
- })
- void testFileLeak(String file) {
- // run a number of files that might fail in order to catch
- // leaked file resources when using file-leak-detector while
- // running the test
- assertThrows(Exception.class, () -> ex(file));
- }
-
- /**
- * #59074 - Excel 95 files should give a helpful message, not just
- * "No supported documents found in the OLE2 stream"
- */
- @Test
- void bug59074() throws Exception {
- try (POITextExtractor extractor = ex("59074.xls")) {
- String text = extractor.getText();
- assertContains(text, "Exotic warrant");
- }
- }
-
- @Test
- void testGetEmbeddedFromXMLExtractor() {
- // currently not implemented
- assertThrows(IllegalStateException.class, () -> ExtractorFactory.getEmbeddedDocsTextExtractors(null));
- }
-
- // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
- // When this happens, change this from @Test(expected=...) to @Test
- // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
- @Test
- void test45565() throws Exception {
- try (POITextExtractor extractor = ex("45565.xls")) {
- String text = extractor.getText();
- assertThrows(AssertionError.class, () -> {
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
- });
- }
- }
-
- private static POITextExtractor ex(String filename) throws IOException {
- return createExtractor(ssTests.getFile(filename));
- }
- }
|