Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

TestExtractorFactory.java 21KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor.ooxml;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.apache.poi.extractor.ExtractorFactory.createExtractor;
  18. import static org.junit.jupiter.api.Assertions.assertEquals;
  19. import static org.junit.jupiter.api.Assertions.assertFalse;
  20. import static org.junit.jupiter.api.Assertions.assertInstanceOf;
  21. import static org.junit.jupiter.api.Assertions.assertNotNull;
  22. import static org.junit.jupiter.api.Assertions.assertNull;
  23. import static org.junit.jupiter.api.Assertions.assertThrows;
  24. import static org.junit.jupiter.api.Assertions.assertTrue;
  25. import java.io.File;
  26. import java.io.FileInputStream;
  27. import java.io.IOException;
  28. import java.util.Locale;
  29. import java.util.stream.Stream;
  30. import org.apache.poi.POIDataSamples;
  31. import org.apache.poi.extractor.ExtractorFactory;
  32. import org.apache.poi.extractor.POIOLE2TextExtractor;
  33. import org.apache.poi.extractor.POITextExtractor;
  34. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  35. import org.apache.poi.hssf.extractor.ExcelExtractor;
  36. import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
  37. import org.apache.poi.openxml4j.exceptions.NotOfficeXmlFileException;
  38. import org.apache.poi.openxml4j.opc.OPCPackage;
  39. import org.apache.poi.openxml4j.opc.PackageAccess;
  40. import org.apache.poi.poifs.filesystem.FileMagic;
  41. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  42. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  43. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  44. import org.junit.jupiter.api.Tag;
  45. import org.junit.jupiter.api.Test;
  46. import org.junit.jupiter.params.ParameterizedTest;
  47. import org.junit.jupiter.params.provider.Arguments;
  48. import org.junit.jupiter.params.provider.MethodSource;
  49. import org.junit.jupiter.params.provider.ValueSource;
  50. /**
  51. * Test that the extractor factory plays nicely
  52. */
  53. @Tag("scratchpad.ignore")
  54. class TestExtractorFactory {
  55. private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
  56. private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
  57. private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
  58. @SuppressWarnings("unused")
  59. private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
  60. private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
  61. private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
  62. private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
  63. private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
  64. private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
  65. private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
  66. private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
  67. private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
  68. private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
  69. private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
  70. private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
  71. private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  72. private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
  73. private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
  74. private static final File ppt97 = getFileAndCheck(slTests, "bug56240.ppt");
  75. private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
  76. private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
  77. private static final File msg = getFileAndCheck(olTests, "quick.msg");
  78. private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
  79. private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
  80. private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
  81. private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
  82. private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
  83. private static final POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
  84. private static final File pub = getFileAndCheck(pubTests, "Simple.pub");
  85. private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory();
  86. private static File getFileAndCheck(POIDataSamples samples, String name) {
  87. File file = samples.getFile(name);
  88. assertNotNull(file, "Did not get a file for " + name);
  89. assertTrue(file.isFile(), "Did not get a type file for " + name);
  90. assertTrue(file.exists(), "File did not exist: " + name);
  91. return file;
  92. }
  93. public static Stream<Arguments> testOOXMLData() {
  94. return Stream.of(
  95. Arguments.of("Excel - xlsx", xlsx, "XSSFExcelExtractor", 200),
  96. Arguments.of("Excel - xltx", xltx, "XSSFExcelExtractor", -1),
  97. Arguments.of("Excel - xlsb", xlsb, "XSSFBEventBasedExcelExtractor", -1),
  98. Arguments.of("Word - docx", docx, "XWPFWordExtractor", 120),
  99. Arguments.of("Word - dotx", dotx, "XWPFWordExtractor", -1),
  100. Arguments.of("PowerPoint - pptx", pptx, "XSLFExtractor", 120),
  101. Arguments.of("Visio - vsdx", vsdx, "XDGFVisioExtractor", 20)
  102. );
  103. }
  104. public static Stream<Arguments> testScratchData() {
  105. return Stream.of(
  106. Arguments.of("Excel", xls, "ExcelExtractor", 200),
  107. Arguments.of("Word", doc, "WordExtractor", 120),
  108. Arguments.of("Word 6", doc6, "Word6Extractor", 20),
  109. Arguments.of("Word 95", doc95, "Word6Extractor", 120),
  110. Arguments.of("PowerPoint", ppt, "SlideShowExtractor", 120),
  111. Arguments.of("PowerPoint 97 Dual", ppt97, "SlideShowExtractor", 120),
  112. Arguments.of("Visio", vsd, "VisioTextExtractor", 50),
  113. Arguments.of("Publisher", pub, "PublisherTextExtractor", 50),
  114. Arguments.of("Outlook msg", msg, "OutlookTextExtractor", 50)
  115. );
  116. }
  117. public static Stream<Arguments> testFileData() {
  118. return Stream.concat(testOOXMLData(), testScratchData());
  119. // TODO Support OOXML-Strict / xlsxStrict, see bug #57699
  120. }
  121. @ParameterizedTest
  122. @MethodSource("testFileData")
  123. void testFile(String testcase, File file, String extractor, int count) throws Exception {
  124. try (POITextExtractor ext = createExtractor(file)) {
  125. assertNotNull(ext);
  126. testExtractor(ext, testcase, extractor, count);
  127. }
  128. }
  129. @ParameterizedTest
  130. @MethodSource("testScratchData")
  131. void testPOIFS(String testcase, File testFile, String extractor, int count) throws Exception {
  132. // test processing of InputStream
  133. try (FileInputStream fis = new FileInputStream(testFile);
  134. POIFSFileSystem poifs = new POIFSFileSystem(fis);
  135. POITextExtractor ext = createExtractor(poifs)) {
  136. assertNotNull(ext);
  137. testExtractor(ext, testcase, extractor, count);
  138. }
  139. }
  140. @ParameterizedTest
  141. @MethodSource("testFileData")
  142. void testOOXML(String testcase, File testFile, String extractor, int count) throws Exception {
  143. // test processing of InputStream
  144. try (FileInputStream fis = new FileInputStream(testFile);
  145. POITextExtractor ext = createExtractor(fis)) {
  146. assertNotNull(ext);
  147. testExtractor(ext, testcase, extractor, count);
  148. }
  149. }
  150. @ParameterizedTest
  151. @MethodSource("testOOXMLData")
  152. void testPackage(String testcase, File testFile, String extractor, int count) throws Exception {
  153. try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
  154. final POITextExtractor ext = xmlFactory.create(pkg)) {
  155. assertNotNull(ext);
  156. testExtractor(ext, testcase, extractor, count);
  157. pkg.revert();
  158. } catch (Exception e) {
  159. throw new Exception("While handling " + testcase + " - " + testFile + " - " + extractor);
  160. }
  161. }
  162. @Test
  163. void testFileInvalid() {
  164. //noinspection resource
  165. IOException ex = assertThrows(IOException.class, () -> createExtractor(txt));
  166. assertEquals("Can't create extractor - unsupported file type: UNKNOWN", ex.getMessage());
  167. }
  168. @Test
  169. void testInputStreamInvalid() throws IOException {
  170. try (FileInputStream fis = new FileInputStream(txt)) {
  171. IOException ex = assertThrows(IOException.class, () -> createExtractor(fis));
  172. assertTrue(ex.getMessage().contains(FileMagic.UNKNOWN.name()));
  173. }
  174. }
  175. @Test
  176. void testPOIFSInvalid() {
  177. // Not really an Extractor test, but we'll leave it to test POIFS reaction anyway ...
  178. //noinspection resource
  179. IOException ex = assertThrows(IOException.class, () -> new POIFSFileSystem(txt));
  180. assertTrue(ex.getMessage().contains("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0"));
  181. }
  182. @Test
  183. void testPackageInvalid() {
  184. // Text
  185. //noinspection resource
  186. assertThrows(NotOfficeXmlFileException.class, () -> OPCPackage.open(txt, PackageAccess.READ));
  187. }
  188. @Test
  189. void testPreferEventBased() throws Exception {
  190. assertFalse(ExtractorFactory.getPreferEventExtractor());
  191. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  192. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  193. ExtractorFactory.setThreadPrefersEventExtractors(true);
  194. assertTrue(ExtractorFactory.getPreferEventExtractor());
  195. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  196. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  197. ExtractorFactory.setAllThreadsPreferEventExtractors(false);
  198. assertFalse(ExtractorFactory.getPreferEventExtractor());
  199. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  200. assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
  201. ExtractorFactory.setAllThreadsPreferEventExtractors(null);
  202. assertTrue(ExtractorFactory.getPreferEventExtractor());
  203. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  204. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  205. try {
  206. // Check we get the right extractors now
  207. try (POITextExtractor extractor = createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
  208. assertInstanceOf(EventBasedExcelExtractor.class, extractor);
  209. assertTrue(extractor.getText().length() > 200);
  210. }
  211. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
  212. assertNotNull(extractor);
  213. assertInstanceOf(XSSFEventBasedExcelExtractor.class, extractor);
  214. assertTrue(extractor.getText().length() > 200);
  215. }
  216. } finally {
  217. // Put back to normal
  218. ExtractorFactory.setThreadPrefersEventExtractors(false);
  219. }
  220. assertFalse(ExtractorFactory.getPreferEventExtractor());
  221. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  222. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  223. // And back
  224. try (POITextExtractor extractor = createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
  225. assertInstanceOf(ExcelExtractor.class, extractor);
  226. assertTrue(extractor.getText().length() > 200);
  227. }
  228. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
  229. assertInstanceOf(XSSFExcelExtractor.class, extractor);
  230. }
  231. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) {
  232. assertNotNull(extractor);
  233. assertTrue(extractor.getText().length() > 200);
  234. }
  235. }
  236. public static Stream<Arguments> testEmbeddedData() {
  237. return Stream.of(
  238. Arguments.of("No embeddings", xls, "0-0-0-0-0-0"),
  239. Arguments.of("Excel", xlsEmb, "6-2-2-2-0-0"),
  240. Arguments.of("Word", docEmb, "4-1-2-1-0-0"),
  241. Arguments.of("Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1"),
  242. Arguments.of("Outlook", msgEmb, "1-1-0-0-0-0"),
  243. Arguments.of("Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0")
  244. // TODO - PowerPoint
  245. // TODO - Publisher
  246. // TODO - Visio
  247. );
  248. }
  249. /**
  250. * Test embedded docs text extraction. For now, only
  251. * does poifs embedded, but will do ooxml ones
  252. * at some point.
  253. */
  254. @ParameterizedTest
  255. @MethodSource("testEmbeddedData")
  256. void testEmbedded(String format, File file, String expected) throws Exception {
  257. int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
  258. try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor) createExtractor(file)) {
  259. final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
  260. for (POITextExtractor embed : embeds) {
  261. assertTrue(embed.getText().length() > 20);
  262. switch (embed.getClass().getSimpleName()) {
  263. case "SlideShowExtractor":
  264. numPpt++;
  265. break;
  266. case "ExcelExtractor":
  267. numXls++;
  268. break;
  269. case "WordExtractor":
  270. numWord++;
  271. break;
  272. case "OutlookTextExtractor":
  273. numMsg++;
  274. break;
  275. case "XWPFWordExtractor":
  276. numWordX++;
  277. break;
  278. }
  279. }
  280. final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
  281. assertEquals(expected, actual, "invalid number of embeddings - "+format);
  282. }
  283. }
  284. @ParameterizedTest
  285. @ValueSource(strings = {
  286. // password protected files
  287. "spreadsheet/password.xls",
  288. "spreadsheet/protected_passtika.xlsx",
  289. "spreadsheet/51832.xls",
  290. "document/PasswordProtected.doc",
  291. "slideshow/Password_Protected-hello.ppt",
  292. "slideshow/Password_Protected-56-hello.ppt",
  293. "slideshow/Password_Protected-np-hello.ppt",
  294. "slideshow/cryptoapi-proc2356.ppt",
  295. //"document/bug53475-password-is-pass.docx",
  296. //"document/bug53475-password-is-solrcell.docx",
  297. "spreadsheet/xor-encryption-abc.xls",
  298. "spreadsheet/35897-type4.xls",
  299. //"poifs/protect.xlsx",
  300. //"poifs/protected_sha512.xlsx",
  301. //"poifs/extenxls_pwd123.xlsx",
  302. //"poifs/protected_agile.docx",
  303. "spreadsheet/58616.xlsx",
  304. // TODO: fails XMLExportTest, is this ok?
  305. "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
  306. "spreadsheet/55864.xlsx",
  307. "spreadsheet/57890.xlsx",
  308. // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
  309. "spreadsheet/44958.xls",
  310. "spreadsheet/44958_1.xls",
  311. "spreadsheet/testArraysAndTables.xls",
  312. // TODO: good to ignore?
  313. "spreadsheet/sample-beta.xlsx",
  314. // This is actually a spreadsheet!
  315. "hpsf/TestRobert_Flaherty.doc",
  316. // some files that are broken, eg Word 95, ...
  317. "spreadsheet/43493.xls",
  318. "spreadsheet/46904.xls",
  319. "document/Bug50955.doc",
  320. "slideshow/PPT95.ppt",
  321. "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
  322. "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
  323. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
  324. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
  325. "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
  326. "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
  327. "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
  328. "openxml4j/invalid.xlsx",
  329. "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
  330. "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
  331. "spreadsheet/Simple.xlsb",
  332. "poifs/unknown_properties.msg", // POIFS properties corrupted
  333. "poifs/only-zero-byte-streams.ole2", // No actual contents
  334. "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
  335. "spreadsheet/poc-xmlbomb-empty.xlsx", // contains xml-entity-expansion
  336. "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
  337. // old Excel files, which we only support simple text extraction of
  338. "spreadsheet/testEXCEL_2.xls",
  339. "spreadsheet/testEXCEL_3.xls",
  340. "spreadsheet/testEXCEL_4.xls",
  341. "spreadsheet/testEXCEL_5.xls",
  342. "spreadsheet/testEXCEL_95.xls",
  343. // OOXML Strict is not yet supported, see bug #57699
  344. "spreadsheet/SampleSS.strict.xlsx",
  345. "spreadsheet/SimpleStrict.xlsx",
  346. "spreadsheet/sample.strict.xlsx",
  347. // non-TNEF files
  348. "ddf/Container.dat",
  349. "ddf/47143.dat",
  350. // sheet cloning errors
  351. "spreadsheet/47813.xlsx",
  352. "spreadsheet/56450.xls",
  353. "spreadsheet/57231_MixedGasReport.xls",
  354. "spreadsheet/OddStyleRecord.xls",
  355. "spreadsheet/WithChartSheet.xlsx",
  356. "spreadsheet/chart_sheet.xlsx"
  357. })
  358. void testFileLeak(String file) {
  359. // run a number of files that might fail in order to catch
  360. // leaked file resources when using file-leak-detector while
  361. // running the test
  362. //noinspection resource
  363. assertThrows(Exception.class, () -> ex(file));
  364. }
  365. /**
  366. * #59074 - Excel 95 files should give a helpful message, not just
  367. * "No supported documents found in the OLE2 stream"
  368. */
  369. @Test
  370. void bug59074() throws Exception {
  371. try (POITextExtractor extractor = ex("59074.xls")) {
  372. String text = extractor.getText();
  373. assertContains(text, "Exotic warrant");
  374. }
  375. }
  376. @Test
  377. void testGetEmbeddedFromXMLExtractor() {
  378. // currently not implemented
  379. assertThrows(IllegalStateException.class, () -> ExtractorFactory.getEmbeddedDocsTextExtractors(null));
  380. }
  381. @Test
  382. void test66365() throws Exception {
  383. try (POITextExtractor extractor = ex("66365.xlsx")) {
  384. String text = extractor.getText();
  385. assertContains(text, "Alice\tAlice");
  386. assertContains(text, "Bob\tBob");
  387. }
  388. }
  389. // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
  390. // When this happens, change this from @Test(expected=...) to @Test
  391. // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
  392. @Test
  393. void test45565() throws Exception {
  394. try (POITextExtractor extractor = ex("45565.xls")) {
  395. String text = extractor.getText();
  396. assertThrows(AssertionError.class, () -> {
  397. assertContains(text, "testdoc");
  398. assertContains(text, "test phrase");
  399. });
  400. }
  401. }
  402. private void testExtractor(final POITextExtractor ext, final String testcase, final String extrClass, final Integer minLength) {
  403. assertEquals(extrClass, ext.getClass().getSimpleName(), "invalid extractor for " + testcase);
  404. final String actual = ext.getText();
  405. if (minLength == -1) {
  406. assertContains(actual.toLowerCase(Locale.ROOT), "test");
  407. } else {
  408. assertTrue(actual.length() > minLength, "extracted content too short for " + testcase);
  409. }
  410. }
  411. private static POITextExtractor ex(String filename) throws IOException {
  412. return createExtractor(ssTests.getFile(filename));
  413. }
  414. }