You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

TestExtractorFactory.java 21KB


  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor.ooxml;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.junit.Assert.assertEquals;
  18. import static org.junit.Assert.assertFalse;
  19. import static org.junit.Assert.assertNotNull;
  20. import static org.junit.Assert.assertNull;
  21. import static org.junit.Assert.assertTrue;
  22. import static org.junit.Assert.fail;
  23. import java.io.File;
  24. import java.io.FileInputStream;
  25. import java.io.IOException;
  26. import java.util.Locale;
  27. import org.apache.poi.POIDataSamples;
  28. import org.apache.poi.UnsupportedFileFormatException;
  29. import org.apache.poi.extractor.ExtractorFactory;
  30. import org.apache.poi.extractor.POIOLE2TextExtractor;
  31. import org.apache.poi.extractor.POITextExtractor;
  32. import org.apache.poi.hssf.HSSFTestDataSamples;
  33. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  34. import org.apache.poi.hssf.extractor.ExcelExtractor;
  35. import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
  36. import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  37. import org.apache.poi.openxml4j.opc.OPCPackage;
  38. import org.apache.poi.openxml4j.opc.PackageAccess;
  39. import org.apache.poi.poifs.filesystem.FileMagic;
  40. import org.apache.poi.poifs.filesystem.NotOLE2FileException;
  41. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  42. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  43. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  44. import org.apache.xmlbeans.XmlException;
  45. import org.junit.Rule;
  46. import org.junit.Test;
  47. import org.junit.rules.ExpectedException;
  48. /**
  49. * Test that the extractor factory plays nicely
  50. */
  51. public class TestExtractorFactory {
  52. private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
  53. private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
  54. private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
  55. @SuppressWarnings("unused")
  56. private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
  57. private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
  58. private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
  59. private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
  60. private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
  61. private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
  62. private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
  63. private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
  64. private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
  65. private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
  66. private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
  67. private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
  68. private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  69. private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
  70. private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
  71. private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
  72. private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
  73. private static final File msg = getFileAndCheck(olTests, "quick.msg");
  74. private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
  75. private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
  76. private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
  77. private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
  78. private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
  79. private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
  80. private static File pub = getFileAndCheck(pubTests, "Simple.pub");
  81. private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory();
  82. private static File getFileAndCheck(POIDataSamples samples, String name) {
  83. File file = samples.getFile(name);
  84. assertNotNull("Did not get a file for " + name, file);
  85. assertTrue("Did not get a type file for " + name, file.isFile());
  86. assertTrue("File did not exist: " + name, file.exists());
  87. return file;
  88. }
  89. private static final Object[] TEST_SET = {
  90. "Excel", xls, "ExcelExtractor", 200,
  91. "Excel - xlsx", xlsx, "XSSFExcelExtractor", 200,
  92. "Excel - xltx", xltx, "XSSFExcelExtractor", -1,
  93. "Excel - xlsb", xlsb, "XSSFBEventBasedExcelExtractor", -1,
  94. "Word", doc, "WordExtractor", 120,
  95. "Word - docx", docx, "XWPFWordExtractor", 120,
  96. "Word - dotx", dotx, "XWPFWordExtractor", -1,
  97. "Word 6", doc6, "Word6Extractor", 20,
  98. "Word 95", doc95, "Word6Extractor", 120,
  99. "PowerPoint", ppt, "SlideShowExtractor", 120,
  100. "PowerPoint - pptx", pptx, "XSLFExtractor", 120,
  101. "Visio", vsd, "VisioTextExtractor", 50,
  102. "Visio - vsdx", vsdx, "XDGFVisioExtractor", 20,
  103. "Publisher", pub, "PublisherTextExtractor", 50,
  104. "Outlook msg", msg, "OutlookTextExtractor", 50,
  105. // TODO Support OOXML-Strict, see bug #57699
  106. // xlsxStrict
  107. };
  108. @FunctionalInterface
  109. interface FunctionEx<T, R> {
  110. R apply(T t) throws IOException, OpenXML4JException, XmlException;
  111. }
  112. @Rule
  113. public ExpectedException thrown = ExpectedException.none();
  114. @Test
  115. public void testFile() throws Exception {
  116. for (int i = 0; i < TEST_SET.length; i += 4) {
  117. try (POITextExtractor ext = ExtractorFactory.createExtractor((File) TEST_SET[i + 1])) {
  118. testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
  119. }
  120. }
  121. }
  122. @Test
  123. public void testFileInvalid() throws Exception {
  124. thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN");
  125. thrown.expect(IOException.class);
  126. // Text
  127. ExtractorFactory.createExtractor(txt);
  128. }
  129. @Test
  130. public void testInputStream() throws Exception {
  131. testStream(ExtractorFactory::createExtractor, true);
  132. }
  133. @Test
  134. public void testInputStreamInvalid() throws Exception {
  135. thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN");
  136. thrown.expect(IOException.class);
  137. testInvalid(ExtractorFactory::createExtractor);
  138. }
  139. @Test
  140. public void testPOIFS() throws Exception {
  141. testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
  142. }
  143. @Test
  144. public void testPOIFSInvalid() throws Exception {
  145. thrown.expectMessage("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0");
  146. thrown.expect(NotOLE2FileException.class);
  147. testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
  148. }
  149. private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML)
  150. throws IOException, OpenXML4JException, XmlException {
  151. for (int i = 0; i < TEST_SET.length; i += 4) {
  152. File testFile = (File) TEST_SET[i + 1];
  153. if (!loadOOXML && (testFile.getName().endsWith("x") || testFile.getName().endsWith("xlsb"))) {
  154. continue;
  155. }
  156. try (FileInputStream fis = new FileInputStream(testFile);
  157. POITextExtractor ext = poifsIS.apply(fis)) {
  158. testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
  159. } catch (IllegalArgumentException e) {
  160. fail("failed to process "+testFile);
  161. }
  162. }
  163. }
  164. private void testExtractor(final POITextExtractor ext, final String testcase, final String extrClass, final Integer minLength) {
  165. assertEquals("invalid extractor for " + testcase, extrClass, ext.getClass().getSimpleName());
  166. final String actual = ext.getText();
  167. if (minLength == -1) {
  168. assertContains(actual.toLowerCase(Locale.ROOT), "test");
  169. } else {
  170. assertTrue("extracted content too short for " + testcase, actual.length() > minLength);
  171. }
  172. }
  173. private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException {
  174. // Text
  175. try (FileInputStream fis = new FileInputStream(txt);
  176. POITextExtractor ignored = poifs.apply(fis)) {
  177. fail("extracting from invalid package");
  178. } catch (IllegalArgumentException e) {
  179. assertTrue("Had: " + e, e.getMessage().contains(FileMagic.UNKNOWN.name()));
  180. throw e;
  181. }
  182. }
  183. @Test
  184. public void testPackage() throws Exception {
  185. for (int i = 0; i < TEST_SET.length; i += 4) {
  186. final File testFile = (File) TEST_SET[i + 1];
  187. if (!testFile.getName().endsWith("x")) {
  188. continue;
  189. }
  190. try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
  191. final POITextExtractor ext = xmlFactory.create(pkg)) {
  192. testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
  193. pkg.revert();
  194. }
  195. }
  196. }
  197. @Test(expected = UnsupportedFileFormatException.class)
  198. public void testPackageInvalid() throws Exception {
  199. // Text
  200. try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
  201. final POITextExtractor ignored = xmlFactory.create(pkg)) {
  202. fail("extracting from invalid package");
  203. }
  204. }
  205. @Test
  206. public void testPreferEventBased() throws Exception {
  207. assertFalse(ExtractorFactory.getPreferEventExtractor());
  208. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  209. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  210. ExtractorFactory.setThreadPrefersEventExtractors(true);
  211. assertTrue(ExtractorFactory.getPreferEventExtractor());
  212. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  213. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  214. ExtractorFactory.setAllThreadsPreferEventExtractors(false);
  215. assertFalse(ExtractorFactory.getPreferEventExtractor());
  216. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  217. assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
  218. ExtractorFactory.setAllThreadsPreferEventExtractors(null);
  219. assertTrue(ExtractorFactory.getPreferEventExtractor());
  220. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  221. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  222. try {
  223. // Check we get the right extractors now
  224. try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
  225. assertTrue(extractor instanceof EventBasedExcelExtractor);
  226. }
  227. try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
  228. assertTrue(extractor.getText().length() > 200);
  229. }
  230. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
  231. assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
  232. }
  233. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
  234. assertTrue(extractor.getText().length() > 200);
  235. }
  236. } finally {
  237. // Put back to normal
  238. ExtractorFactory.setThreadPrefersEventExtractors(false);
  239. }
  240. assertFalse(ExtractorFactory.getPreferEventExtractor());
  241. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  242. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  243. // And back
  244. try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
  245. assertTrue(extractor instanceof ExcelExtractor);
  246. }
  247. try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
  248. assertTrue(extractor.getText().length() > 200);
  249. }
  250. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
  251. assertTrue(extractor instanceof XSSFExcelExtractor);
  252. }
  253. try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) {
  254. assertTrue(extractor.getText().length() > 200);
  255. }
  256. }
  257. /**
  258. * Test embedded docs text extraction. For now, only
  259. * does poifs embedded, but will do ooxml ones
  260. * at some point.
  261. */
  262. @Test
  263. public void testEmbedded() throws Exception {
  264. final Object[] testObj = {
  265. "No embeddings", xls, "0-0-0-0-0-0",
  266. "Excel", xlsEmb, "6-2-2-2-0-0",
  267. "Word", docEmb, "4-1-2-1-0-0",
  268. "Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1",
  269. "Outlook", msgEmb, "1-1-0-0-0-0",
  270. "Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0",
  271. };
  272. for (int i=0; i<testObj.length; i+=3) {
  273. try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor)ExtractorFactory.createExtractor((File)testObj[i+1])) {
  274. final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
  275. int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
  276. for (POITextExtractor embed : embeds) {
  277. assertTrue(embed.getText().length() > 20);
  278. switch (embed.getClass().getSimpleName()) {
  279. case "SlideShowExtractor":
  280. numPpt++;
  281. break;
  282. case "ExcelExtractor":
  283. numXls++;
  284. break;
  285. case "WordExtractor":
  286. numWord++;
  287. break;
  288. case "OutlookTextExtractor":
  289. numMsg++;
  290. break;
  291. case "XWPFWordExtractor":
  292. numWordX++;
  293. break;
  294. }
  295. }
  296. final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
  297. final String expected = (String)testObj[i+2];
  298. assertEquals("invalid number of embeddings - "+testObj[i], expected, actual);
  299. }
  300. }
  301. // TODO - PowerPoint
  302. // TODO - Publisher
  303. // TODO - Visio
  304. }
  305. private static final String[] EXPECTED_FAILURES = {
  306. // password protected files
  307. "spreadsheet/password.xls",
  308. "spreadsheet/protected_passtika.xlsx",
  309. "spreadsheet/51832.xls",
  310. "document/PasswordProtected.doc",
  311. "slideshow/Password_Protected-hello.ppt",
  312. "slideshow/Password_Protected-56-hello.ppt",
  313. "slideshow/Password_Protected-np-hello.ppt",
  314. "slideshow/cryptoapi-proc2356.ppt",
  315. //"document/bug53475-password-is-pass.docx",
  316. //"document/bug53475-password-is-solrcell.docx",
  317. "spreadsheet/xor-encryption-abc.xls",
  318. "spreadsheet/35897-type4.xls",
  319. //"poifs/protect.xlsx",
  320. //"poifs/protected_sha512.xlsx",
  321. //"poifs/extenxls_pwd123.xlsx",
  322. //"poifs/protected_agile.docx",
  323. "spreadsheet/58616.xlsx",
  324. // TODO: fails XMLExportTest, is this ok?
  325. "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
  326. "spreadsheet/55864.xlsx",
  327. "spreadsheet/57890.xlsx",
  328. // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
  329. "spreadsheet/44958.xls",
  330. "spreadsheet/44958_1.xls",
  331. "spreadsheet/testArraysAndTables.xls",
  332. // TODO: good to ignore?
  333. "spreadsheet/sample-beta.xlsx",
  334. // This is actually a spreadsheet!
  335. "hpsf/TestRobert_Flaherty.doc",
  336. // some files that are broken, eg Word 95, ...
  337. "spreadsheet/43493.xls",
  338. "spreadsheet/46904.xls",
  339. "document/Bug50955.doc",
  340. "slideshow/PPT95.ppt",
  341. "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
  342. "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
  343. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
  344. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
  345. "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
  346. "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
  347. "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
  348. "openxml4j/invalid.xlsx",
  349. "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
  350. "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
  351. "spreadsheet/Simple.xlsb",
  352. "poifs/unknown_properties.msg", // POIFS properties corrupted
  353. "poifs/only-zero-byte-streams.ole2", // No actual contents
  354. "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
  355. "spreadsheet/poc-xmlbomb-empty.xlsx", // contains xml-entity-expansion
  356. "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
  357. // old Excel files, which we only support simple text extraction of
  358. "spreadsheet/testEXCEL_2.xls",
  359. "spreadsheet/testEXCEL_3.xls",
  360. "spreadsheet/testEXCEL_4.xls",
  361. "spreadsheet/testEXCEL_5.xls",
  362. "spreadsheet/testEXCEL_95.xls",
  363. // OOXML Strict is not yet supported, see bug #57699
  364. "spreadsheet/SampleSS.strict.xlsx",
  365. "spreadsheet/SimpleStrict.xlsx",
  366. "spreadsheet/sample.strict.xlsx",
  367. // non-TNEF files
  368. "ddf/Container.dat",
  369. "ddf/47143.dat",
  370. // sheet cloning errors
  371. "spreadsheet/47813.xlsx",
  372. "spreadsheet/56450.xls",
  373. "spreadsheet/57231_MixedGasReport.xls",
  374. "spreadsheet/OddStyleRecord.xls",
  375. "spreadsheet/WithChartSheet.xlsx",
  376. "spreadsheet/chart_sheet.xlsx",
  377. };
  378. @Test
  379. public void testFileLeak() {
  380. // run a number of files that might fail in order to catch
  381. // leaked file resources when using file-leak-detector while
  382. // running the test
  383. for(String file : EXPECTED_FAILURES) {
  384. try {
  385. ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
  386. } catch (Exception e) {
  387. // catch all exceptions here as we are only interested in file-handle leaks
  388. }
  389. }
  390. }
  391. /**
  392. * #59074 - Excel 95 files should give a helpful message, not just
  393. * "No supported documents found in the OLE2 stream"
  394. */
  395. public void bug59074() throws Exception {
  396. try (POITextExtractor extractor = ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"))) {
  397. String text = extractor.getText();
  398. assertContains(text, "testdoc");
  399. }
  400. }
  401. @Test(expected = IllegalStateException.class)
  402. public void testGetEmbeddedFromXMLExtractor() throws IOException {
  403. // currently not implemented
  404. ExtractorFactory.getEmbeddedDocsTextExtractors(null);
  405. }
  406. // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
  407. // When this happens, change this from @Test(expected=...) to @Test
  408. // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
  409. @Test(expected=AssertionError.class)
  410. public void test45565() throws Exception {
  411. try (POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"))) {
  412. String text = extractor.getText();
  413. assertContains(text, "testdoc");
  414. assertContains(text, "test phrase");
  415. }
  416. }
  417. }