/* ==================================================================== Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==================================================================== */ package org.apache.poi.extractor.ooxml; import static org.apache.poi.POITestCase.assertContains; import static org.apache.poi.extractor.ExtractorFactory.createExtractor; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.util.Locale; import java.util.stream.Stream; import org.apache.poi.POIDataSamples; import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory; import org.apache.poi.openxml4j.exceptions.NotOfficeXmlFileException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.poifs.filesystem.FileMagic; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; /** * Test that the extractor factory plays nicely */ class TestExtractorFactory { private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance(); private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls"); private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx"); @SuppressWarnings("unused") private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx"); private static final File xltx = getFileAndCheck(ssTests, "test.xltx"); private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls"); private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb"); private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc"); private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc"); private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc"); private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx"); private static final File dotx = getFileAndCheck(wpTests, "test.dotx"); private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc"); private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc"); private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt"); private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx"); private static final File ppt97 = getFileAndCheck(slTests, "bug56240.ppt"); private static final File txt = getFileAndCheck(slTests, "SampleShow.txt"); private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance(); private static final File msg = getFileAndCheck(olTests, "quick.msg"); private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg"); private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg"); private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance(); private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd"); private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx"); private static final POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); private static final File pub = getFileAndCheck(pubTests, "Simple.pub"); private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory(); private static File getFileAndCheck(POIDataSamples samples, String name) { File file = samples.getFile(name); assertNotNull(file, "Did not get a file for " + name); assertTrue(file.isFile(), "Did not get a type file for " + name); assertTrue(file.exists(), "File did not exist: " + name); return file; } public static Stream testOOXMLData() { return Stream.of( Arguments.of("Excel - xlsx", xlsx, "XSSFExcelExtractor", 200), Arguments.of("Excel - xltx", xltx, "XSSFExcelExtractor", -1), Arguments.of("Excel - xlsb", xlsb, "XSSFBEventBasedExcelExtractor", -1), Arguments.of("Word - docx", docx, "XWPFWordExtractor", 120), Arguments.of("Word - dotx", dotx, "XWPFWordExtractor", -1), Arguments.of("PowerPoint - pptx", pptx, "XSLFExtractor", 120), Arguments.of("Visio - vsdx", vsdx, "XDGFVisioExtractor", 20) ); } public static Stream testScratchData() { return Stream.of( Arguments.of("Excel", xls, "ExcelExtractor", 200), Arguments.of("Word", doc, "WordExtractor", 120), Arguments.of("Word 6", doc6, "Word6Extractor", 20), Arguments.of("Word 95", doc95, "Word6Extractor", 120), Arguments.of("PowerPoint", ppt, "SlideShowExtractor", 120), Arguments.of("PowerPoint 97 Dual", ppt97, "SlideShowExtractor", 120), Arguments.of("Visio", vsd, "VisioTextExtractor", 50), Arguments.of("Publisher", pub, "PublisherTextExtractor", 50), Arguments.of("Outlook msg", msg, "OutlookTextExtractor", 50) ); } public static Stream testFileData() { return Stream.concat(testOOXMLData(), testScratchData()); // TODO Support OOXML-Strict / xlsxStrict, see bug #57699 } @ParameterizedTest @MethodSource("testFileData") void testFile(String testcase, File file, String extractor, int count) throws Exception { try (POITextExtractor ext = createExtractor(file)) { assertNotNull(ext); testExtractor(ext, testcase, extractor, count); } } @ParameterizedTest @MethodSource("testScratchData") void testPOIFS(String testcase, File testFile, String extractor, int count) throws Exception { // test processing of InputStream try (FileInputStream fis = new FileInputStream(testFile); POIFSFileSystem poifs = new POIFSFileSystem(fis); POITextExtractor ext = createExtractor(poifs)) { assertNotNull(ext); testExtractor(ext, testcase, extractor, count); } } @ParameterizedTest @MethodSource("testFileData") void testOOXML(String testcase, File testFile, String extractor, int count) throws Exception { // test processing of InputStream try (FileInputStream fis = new FileInputStream(testFile); POITextExtractor ext = createExtractor(fis)) { assertNotNull(ext); testExtractor(ext, testcase, extractor, count); } } @ParameterizedTest @MethodSource("testOOXMLData") void testPackage(String testcase, File testFile, String extractor, int count) throws Exception { try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ); final POITextExtractor ext = xmlFactory.create(pkg)) { assertNotNull(ext); testExtractor(ext, testcase, extractor, count); pkg.revert(); } } @Test void testFileInvalid() { IOException ex = assertThrows(IOException.class, () -> createExtractor(txt)); assertEquals("Can't create extractor - unsupported file type: UNKNOWN", ex.getMessage()); } @Test void testInputStreamInvalid() throws IOException { try (FileInputStream fis = new FileInputStream(txt)) { IOException ex = assertThrows(IOException.class, () -> createExtractor(fis)); assertTrue(ex.getMessage().contains(FileMagic.UNKNOWN.name())); } } @Test void testPOIFSInvalid() { // Not really an Extractor test, but we'll leave it to test POIFS reaction anyway ... IOException ex = assertThrows(IOException.class, () -> new POIFSFileSystem(txt)); assertTrue(ex.getMessage().contains("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0")); } private void testExtractor(final POITextExtractor ext, final String testcase, final String extrClass, final Integer minLength) { assertEquals(extrClass, ext.getClass().getSimpleName(), "invalid extractor for " + testcase); final String actual = ext.getText(); if (minLength == -1) { assertContains(actual.toLowerCase(Locale.ROOT), "test"); } else { assertTrue(actual.length() > minLength, "extracted content too short for " + testcase); } } @Test void testPackageInvalid() { // Text assertThrows(NotOfficeXmlFileException.class, () -> OPCPackage.open(txt, PackageAccess.READ)); } @Test void testPreferEventBased() throws Exception { assertFalse(ExtractorFactory.getPreferEventExtractor()); assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); ExtractorFactory.setThreadPrefersEventExtractors(true); assertTrue(ExtractorFactory.getPreferEventExtractor()); assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); ExtractorFactory.setAllThreadsPreferEventExtractors(false); assertFalse(ExtractorFactory.getPreferEventExtractor()); assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors()); ExtractorFactory.setAllThreadsPreferEventExtractors(null); assertTrue(ExtractorFactory.getPreferEventExtractor()); assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); try { // Check we get the right extractors now try (POITextExtractor extractor = createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { assertTrue(extractor instanceof EventBasedExcelExtractor); assertTrue(extractor.getText().length() > 200); } try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) { assertNotNull(extractor); assertTrue(extractor instanceof XSSFEventBasedExcelExtractor); assertTrue(extractor.getText().length() > 200); } } finally { // Put back to normal ExtractorFactory.setThreadPrefersEventExtractors(false); } assertFalse(ExtractorFactory.getPreferEventExtractor()); assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); // And back try (POITextExtractor extractor = createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) { assertTrue(extractor instanceof ExcelExtractor); assertTrue(extractor.getText().length() > 200); } try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) { assertTrue(extractor instanceof XSSFExcelExtractor); } try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) { assertNotNull(extractor); assertTrue(extractor.getText().length() > 200); } } public static Stream testEmbeddedData() { return Stream.of( Arguments.of("No embeddings", xls, "0-0-0-0-0-0"), Arguments.of("Excel", xlsEmb, "6-2-2-2-0-0"), Arguments.of("Word", docEmb, "4-1-2-1-0-0"), Arguments.of("Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1"), Arguments.of("Outlook", msgEmb, "1-1-0-0-0-0"), Arguments.of("Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0") // TODO - PowerPoint // TODO - Publisher // TODO - Visio ); } /** * Test embedded docs text extraction. For now, only * does poifs embedded, but will do ooxml ones * at some point. */ @ParameterizedTest @MethodSource("testEmbeddedData") void testEmbedded(String format, File file, String expected) throws Exception { int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0; try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor) createExtractor(file)) { final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext); for (POITextExtractor embed : embeds) { assertTrue(embed.getText().length() > 20); switch (embed.getClass().getSimpleName()) { case "SlideShowExtractor": numPpt++; break; case "ExcelExtractor": numXls++; break; case "WordExtractor": numWord++; break; case "OutlookTextExtractor": numMsg++; break; case "XWPFWordExtractor": numWordX++; break; } } final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX; assertEquals(expected, actual, "invalid number of embeddings - "+format); } } @ParameterizedTest @ValueSource(strings = { // password protected files "spreadsheet/password.xls", "spreadsheet/protected_passtika.xlsx", "spreadsheet/51832.xls", "document/PasswordProtected.doc", "slideshow/Password_Protected-hello.ppt", "slideshow/Password_Protected-56-hello.ppt", "slideshow/Password_Protected-np-hello.ppt", "slideshow/cryptoapi-proc2356.ppt", //"document/bug53475-password-is-pass.docx", //"document/bug53475-password-is-solrcell.docx", "spreadsheet/xor-encryption-abc.xls", "spreadsheet/35897-type4.xls", //"poifs/protect.xlsx", //"poifs/protected_sha512.xlsx", //"poifs/extenxls_pwd123.xlsx", //"poifs/protected_agile.docx", "spreadsheet/58616.xlsx", // TODO: fails XMLExportTest, is this ok? "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx", "spreadsheet/55864.xlsx", "spreadsheet/57890.xlsx", // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()! "spreadsheet/44958.xls", "spreadsheet/44958_1.xls", "spreadsheet/testArraysAndTables.xls", // TODO: good to ignore? "spreadsheet/sample-beta.xlsx", // This is actually a spreadsheet! "hpsf/TestRobert_Flaherty.doc", // some files that are broken, eg Word 95, ... "spreadsheet/43493.xls", "spreadsheet/46904.xls", "document/Bug50955.doc", "slideshow/PPT95.ppt", "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx", "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx", "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx", "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx", "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx", "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx", "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx", "openxml4j/invalid.xlsx", "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764() "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764() "spreadsheet/Simple.xlsb", "poifs/unknown_properties.msg", // POIFS properties corrupted "poifs/only-zero-byte-streams.ole2", // No actual contents "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion "spreadsheet/poc-xmlbomb-empty.xlsx", // contains xml-entity-expansion "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion // old Excel files, which we only support simple text extraction of "spreadsheet/testEXCEL_2.xls", "spreadsheet/testEXCEL_3.xls", "spreadsheet/testEXCEL_4.xls", "spreadsheet/testEXCEL_5.xls", "spreadsheet/testEXCEL_95.xls", // OOXML Strict is not yet supported, see bug #57699 "spreadsheet/SampleSS.strict.xlsx", "spreadsheet/SimpleStrict.xlsx", "spreadsheet/sample.strict.xlsx", // non-TNEF files "ddf/Container.dat", "ddf/47143.dat", // sheet cloning errors "spreadsheet/47813.xlsx", "spreadsheet/56450.xls", "spreadsheet/57231_MixedGasReport.xls", "spreadsheet/OddStyleRecord.xls", "spreadsheet/WithChartSheet.xlsx", "spreadsheet/chart_sheet.xlsx" }) void testFileLeak(String file) { // run a number of files that might fail in order to catch // leaked file resources when using file-leak-detector while // running the test assertThrows(Exception.class, () -> ex(file)); } /** * #59074 - Excel 95 files should give a helpful message, not just * "No supported documents found in the OLE2 stream" */ @Test void bug59074() throws Exception { try (POITextExtractor extractor = ex("59074.xls")) { String text = extractor.getText(); assertContains(text, "Exotic warrant"); } } @Test void testGetEmbeddedFromXMLExtractor() { // currently not implemented assertThrows(IllegalStateException.class, () -> ExtractorFactory.getEmbeddedDocsTextExtractors(null)); } // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed. // When this happens, change this from @Test(expected=...) to @Test // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor @Test void test45565() throws Exception { try (POITextExtractor extractor = ex("45565.xls")) { String text = extractor.getText(); assertThrows(AssertionError.class, () -> { assertContains(text, "testdoc"); assertContains(text, "test phrase"); }); } } private static POITextExtractor ex(String filename) throws IOException { return createExtractor(ssTests.getFile(filename)); } }