You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

TestExtractorFactory.java 20KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor.ooxml;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.apache.poi.extractor.ExtractorFactory.createExtractor;
  18. import static org.junit.jupiter.api.Assertions.assertEquals;
  19. import static org.junit.jupiter.api.Assertions.assertFalse;
  20. import static org.junit.jupiter.api.Assertions.assertNotNull;
  21. import static org.junit.jupiter.api.Assertions.assertNull;
  22. import static org.junit.jupiter.api.Assertions.assertThrows;
  23. import static org.junit.jupiter.api.Assertions.assertTrue;
  24. import java.io.File;
  25. import java.io.FileInputStream;
  26. import java.io.IOException;
  27. import java.util.Locale;
  28. import java.util.stream.Stream;
  29. import org.apache.poi.POIDataSamples;
  30. import org.apache.poi.extractor.ExtractorFactory;
  31. import org.apache.poi.extractor.POIOLE2TextExtractor;
  32. import org.apache.poi.extractor.POITextExtractor;
  33. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  34. import org.apache.poi.hssf.extractor.ExcelExtractor;
  35. import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
  36. import org.apache.poi.openxml4j.exceptions.NotOfficeXmlFileException;
  37. import org.apache.poi.openxml4j.opc.OPCPackage;
  38. import org.apache.poi.openxml4j.opc.PackageAccess;
  39. import org.apache.poi.poifs.filesystem.FileMagic;
  40. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  41. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  42. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  43. import org.junit.jupiter.api.Test;
  44. import org.junit.jupiter.params.ParameterizedTest;
  45. import org.junit.jupiter.params.provider.Arguments;
  46. import org.junit.jupiter.params.provider.MethodSource;
  47. import org.junit.jupiter.params.provider.ValueSource;
  48. /**
  49. * Test that the extractor factory plays nicely
  50. */
  51. class TestExtractorFactory {
  52. private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
  53. private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
  54. private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
  55. @SuppressWarnings("unused")
  56. private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
  57. private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
  58. private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
  59. private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
  60. private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
  61. private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
  62. private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
  63. private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
  64. private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
  65. private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
  66. private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
  67. private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
  68. private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  69. private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
  70. private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
  71. private static final File ppt97 = getFileAndCheck(slTests, "bug56240.ppt");
  72. private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
  73. private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
  74. private static final File msg = getFileAndCheck(olTests, "quick.msg");
  75. private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
  76. private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
  77. private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
  78. private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
  79. private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
  80. private static final POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
  81. private static final File pub = getFileAndCheck(pubTests, "Simple.pub");
  82. private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory();
  83. private static File getFileAndCheck(POIDataSamples samples, String name) {
  84. File file = samples.getFile(name);
  85. assertNotNull(file, "Did not get a file for " + name);
  86. assertTrue(file.isFile(), "Did not get a type file for " + name);
  87. assertTrue(file.exists(), "File did not exist: " + name);
  88. return file;
  89. }
  90. public static Stream<Arguments> testOOXMLData() {
  91. return Stream.of(
  92. Arguments.of("Excel - xlsx", xlsx, "XSSFExcelExtractor", 200),
  93. Arguments.of("Excel - xltx", xltx, "XSSFExcelExtractor", -1),
  94. Arguments.of("Excel - xlsb", xlsb, "XSSFBEventBasedExcelExtractor", -1),
  95. Arguments.of("Word - docx", docx, "XWPFWordExtractor", 120),
  96. Arguments.of("Word - dotx", dotx, "XWPFWordExtractor", -1),
  97. Arguments.of("PowerPoint - pptx", pptx, "XSLFExtractor", 120),
  98. Arguments.of("Visio - vsdx", vsdx, "XDGFVisioExtractor", 20)
  99. );
  100. }
  101. public static Stream<Arguments> testScratchData() {
  102. return Stream.of(
  103. Arguments.of("Excel", xls, "ExcelExtractor", 200),
  104. Arguments.of("Word", doc, "WordExtractor", 120),
  105. Arguments.of("Word 6", doc6, "Word6Extractor", 20),
  106. Arguments.of("Word 95", doc95, "Word6Extractor", 120),
  107. Arguments.of("PowerPoint", ppt, "SlideShowExtractor", 120),
  108. Arguments.of("PowerPoint 97 Dual", ppt97, "SlideShowExtractor", 120),
  109. Arguments.of("Visio", vsd, "VisioTextExtractor", 50),
  110. Arguments.of("Publisher", pub, "PublisherTextExtractor", 50),
  111. Arguments.of("Outlook msg", msg, "OutlookTextExtractor", 50)
  112. );
  113. }
  114. public static Stream<Arguments> testFileData() {
  115. return Stream.concat(testOOXMLData(), testScratchData());
  116. // TODO Support OOXML-Strict / xlsxStrict, see bug #57699
  117. }
  118. @ParameterizedTest
  119. @MethodSource("testFileData")
  120. void testFile(String testcase, File file, String extractor, int count) throws Exception {
  121. try (POITextExtractor ext = createExtractor(file)) {
  122. assertNotNull(ext);
  123. testExtractor(ext, testcase, extractor, count);
  124. }
  125. }
  126. @ParameterizedTest
  127. @MethodSource("testScratchData")
  128. void testPOIFS(String testcase, File testFile, String extractor, int count) throws Exception {
  129. // test processing of InputStream
  130. try (FileInputStream fis = new FileInputStream(testFile);
  131. POIFSFileSystem poifs = new POIFSFileSystem(fis);
  132. POITextExtractor ext = createExtractor(poifs)) {
  133. assertNotNull(ext);
  134. testExtractor(ext, testcase, extractor, count);
  135. }
  136. }
  137. @ParameterizedTest
  138. @MethodSource("testFileData")
  139. void testOOXML(String testcase, File testFile, String extractor, int count) throws Exception {
  140. // test processing of InputStream
  141. try (FileInputStream fis = new FileInputStream(testFile);
  142. POITextExtractor ext = createExtractor(fis)) {
  143. assertNotNull(ext);
  144. testExtractor(ext, testcase, extractor, count);
  145. }
  146. }
  147. @ParameterizedTest
  148. @MethodSource("testOOXMLData")
  149. void testPackage(String testcase, File testFile, String extractor, int count) throws Exception {
  150. try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
  151. final POITextExtractor ext = xmlFactory.create(pkg)) {
  152. assertNotNull(ext);
  153. testExtractor(ext, testcase, extractor, count);
  154. pkg.revert();
  155. }
  156. }
  157. @Test
  158. void testFileInvalid() {
  159. IOException ex = assertThrows(IOException.class, () -> createExtractor(txt));
  160. assertEquals("Can't create extractor - unsupported file type: UNKNOWN", ex.getMessage());
  161. }
  162. @Test
  163. void testInputStreamInvalid() throws IOException {
  164. try (FileInputStream fis = new FileInputStream(txt)) {
  165. IOException ex = assertThrows(IOException.class, () -> createExtractor(fis));
  166. assertTrue(ex.getMessage().contains(FileMagic.UNKNOWN.name()));
  167. }
  168. }
  169. @Test
  170. void testPOIFSInvalid() {
  171. // Not really an Extractor test, but we'll leave it to test POIFS reaction anyway ...
  172. IOException ex = assertThrows(IOException.class, () -> new POIFSFileSystem(txt));
  173. assertTrue(ex.getMessage().contains("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0"));
  174. }
  175. private void testExtractor(final POITextExtractor ext, final String testcase, final String extrClass, final Integer minLength) {
  176. assertEquals(extrClass, ext.getClass().getSimpleName(), "invalid extractor for " + testcase);
  177. final String actual = ext.getText();
  178. if (minLength == -1) {
  179. assertContains(actual.toLowerCase(Locale.ROOT), "test");
  180. } else {
  181. assertTrue(actual.length() > minLength, "extracted content too short for " + testcase);
  182. }
  183. }
  184. @Test
  185. void testPackageInvalid() {
  186. // Text
  187. assertThrows(NotOfficeXmlFileException.class, () -> OPCPackage.open(txt, PackageAccess.READ));
  188. }
  189. @Test
  190. void testPreferEventBased() throws Exception {
  191. assertFalse(ExtractorFactory.getPreferEventExtractor());
  192. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  193. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  194. ExtractorFactory.setThreadPrefersEventExtractors(true);
  195. assertTrue(ExtractorFactory.getPreferEventExtractor());
  196. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  197. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  198. ExtractorFactory.setAllThreadsPreferEventExtractors(false);
  199. assertFalse(ExtractorFactory.getPreferEventExtractor());
  200. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  201. assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
  202. ExtractorFactory.setAllThreadsPreferEventExtractors(null);
  203. assertTrue(ExtractorFactory.getPreferEventExtractor());
  204. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  205. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  206. try {
  207. // Check we get the right extractors now
  208. try (POITextExtractor extractor = createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
  209. assertTrue(extractor instanceof EventBasedExcelExtractor);
  210. assertTrue(extractor.getText().length() > 200);
  211. }
  212. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
  213. assertNotNull(extractor);
  214. assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
  215. assertTrue(extractor.getText().length() > 200);
  216. }
  217. } finally {
  218. // Put back to normal
  219. ExtractorFactory.setThreadPrefersEventExtractors(false);
  220. }
  221. assertFalse(ExtractorFactory.getPreferEventExtractor());
  222. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  223. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  224. // And back
  225. try (POITextExtractor extractor = createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
  226. assertTrue(extractor instanceof ExcelExtractor);
  227. assertTrue(extractor.getText().length() > 200);
  228. }
  229. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
  230. assertTrue(extractor instanceof XSSFExcelExtractor);
  231. }
  232. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) {
  233. assertNotNull(extractor);
  234. assertTrue(extractor.getText().length() > 200);
  235. }
  236. }
  237. public static Stream<Arguments> testEmbeddedData() {
  238. return Stream.of(
  239. Arguments.of("No embeddings", xls, "0-0-0-0-0-0"),
  240. Arguments.of("Excel", xlsEmb, "6-2-2-2-0-0"),
  241. Arguments.of("Word", docEmb, "4-1-2-1-0-0"),
  242. Arguments.of("Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1"),
  243. Arguments.of("Outlook", msgEmb, "1-1-0-0-0-0"),
  244. Arguments.of("Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0")
  245. // TODO - PowerPoint
  246. // TODO - Publisher
  247. // TODO - Visio
  248. );
  249. }
  250. /**
  251. * Test embedded docs text extraction. For now, only
  252. * does poifs embedded, but will do ooxml ones
  253. * at some point.
  254. */
  255. @ParameterizedTest
  256. @MethodSource("testEmbeddedData")
  257. void testEmbedded(String format, File file, String expected) throws Exception {
  258. int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
  259. try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor) createExtractor(file)) {
  260. final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
  261. for (POITextExtractor embed : embeds) {
  262. assertTrue(embed.getText().length() > 20);
  263. switch (embed.getClass().getSimpleName()) {
  264. case "SlideShowExtractor":
  265. numPpt++;
  266. break;
  267. case "ExcelExtractor":
  268. numXls++;
  269. break;
  270. case "WordExtractor":
  271. numWord++;
  272. break;
  273. case "OutlookTextExtractor":
  274. numMsg++;
  275. break;
  276. case "XWPFWordExtractor":
  277. numWordX++;
  278. break;
  279. }
  280. }
  281. final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
  282. assertEquals(expected, actual, "invalid number of embeddings - "+format);
  283. }
  284. }
  285. @ParameterizedTest
  286. @ValueSource(strings = {
  287. // password protected files
  288. "spreadsheet/password.xls",
  289. "spreadsheet/protected_passtika.xlsx",
  290. "spreadsheet/51832.xls",
  291. "document/PasswordProtected.doc",
  292. "slideshow/Password_Protected-hello.ppt",
  293. "slideshow/Password_Protected-56-hello.ppt",
  294. "slideshow/Password_Protected-np-hello.ppt",
  295. "slideshow/cryptoapi-proc2356.ppt",
  296. //"document/bug53475-password-is-pass.docx",
  297. //"document/bug53475-password-is-solrcell.docx",
  298. "spreadsheet/xor-encryption-abc.xls",
  299. "spreadsheet/35897-type4.xls",
  300. //"poifs/protect.xlsx",
  301. //"poifs/protected_sha512.xlsx",
  302. //"poifs/extenxls_pwd123.xlsx",
  303. //"poifs/protected_agile.docx",
  304. "spreadsheet/58616.xlsx",
  305. // TODO: fails XMLExportTest, is this ok?
  306. "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
  307. "spreadsheet/55864.xlsx",
  308. "spreadsheet/57890.xlsx",
  309. // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
  310. "spreadsheet/44958.xls",
  311. "spreadsheet/44958_1.xls",
  312. "spreadsheet/testArraysAndTables.xls",
  313. // TODO: good to ignore?
  314. "spreadsheet/sample-beta.xlsx",
  315. // This is actually a spreadsheet!
  316. "hpsf/TestRobert_Flaherty.doc",
  317. // some files that are broken, eg Word 95, ...
  318. "spreadsheet/43493.xls",
  319. "spreadsheet/46904.xls",
  320. "document/Bug50955.doc",
  321. "slideshow/PPT95.ppt",
  322. "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
  323. "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
  324. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
  325. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
  326. "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
  327. "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
  328. "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
  329. "openxml4j/invalid.xlsx",
  330. "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
  331. "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
  332. "spreadsheet/Simple.xlsb",
  333. "poifs/unknown_properties.msg", // POIFS properties corrupted
  334. "poifs/only-zero-byte-streams.ole2", // No actual contents
  335. "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
  336. "spreadsheet/poc-xmlbomb-empty.xlsx", // contains xml-entity-expansion
  337. "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
  338. // old Excel files, which we only support simple text extraction of
  339. "spreadsheet/testEXCEL_2.xls",
  340. "spreadsheet/testEXCEL_3.xls",
  341. "spreadsheet/testEXCEL_4.xls",
  342. "spreadsheet/testEXCEL_5.xls",
  343. "spreadsheet/testEXCEL_95.xls",
  344. // OOXML Strict is not yet supported, see bug #57699
  345. "spreadsheet/SampleSS.strict.xlsx",
  346. "spreadsheet/SimpleStrict.xlsx",
  347. "spreadsheet/sample.strict.xlsx",
  348. // non-TNEF files
  349. "ddf/Container.dat",
  350. "ddf/47143.dat",
  351. // sheet cloning errors
  352. "spreadsheet/47813.xlsx",
  353. "spreadsheet/56450.xls",
  354. "spreadsheet/57231_MixedGasReport.xls",
  355. "spreadsheet/OddStyleRecord.xls",
  356. "spreadsheet/WithChartSheet.xlsx",
  357. "spreadsheet/chart_sheet.xlsx"
  358. })
  359. void testFileLeak(String file) {
  360. // run a number of files that might fail in order to catch
  361. // leaked file resources when using file-leak-detector while
  362. // running the test
  363. assertThrows(Exception.class, () -> ex(file));
  364. }
  365. /**
  366. * #59074 - Excel 95 files should give a helpful message, not just
  367. * "No supported documents found in the OLE2 stream"
  368. */
  369. @Test
  370. void bug59074() throws Exception {
  371. try (POITextExtractor extractor = ex("59074.xls")) {
  372. String text = extractor.getText();
  373. assertContains(text, "Exotic warrant");
  374. }
  375. }
  376. @Test
  377. void testGetEmbeddedFromXMLExtractor() {
  378. // currently not implemented
  379. assertThrows(IllegalStateException.class, () -> ExtractorFactory.getEmbeddedDocsTextExtractors(null));
  380. }
  381. // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
  382. // When this happens, change this from @Test(expected=...) to @Test
  383. // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
  384. @Test
  385. void test45565() throws Exception {
  386. try (POITextExtractor extractor = ex("45565.xls")) {
  387. String text = extractor.getText();
  388. assertThrows(AssertionError.class, () -> {
  389. assertContains(text, "testdoc");
  390. assertContains(text, "test phrase");
  391. });
  392. }
  393. }
  394. private static POITextExtractor ex(String filename) throws IOException {
  395. return createExtractor(ssTests.getFile(filename));
  396. }
  397. }