You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TestExtractorFactory.java 37KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.junit.Assert.assertEquals;
  18. import static org.junit.Assert.assertFalse;
  19. import static org.junit.Assert.assertNotNull;
  20. import static org.junit.Assert.assertNull;
  21. import static org.junit.Assert.assertTrue;
  22. import static org.junit.Assert.fail;
  23. import java.io.File;
  24. import java.io.FileInputStream;
  25. import java.io.IOException;
  26. import org.apache.poi.POIDataSamples;
  27. import org.apache.poi.POIOLE2TextExtractor;
  28. import org.apache.poi.POITextExtractor;
  29. import org.apache.poi.POIXMLException;
  30. import org.apache.poi.POIXMLTextExtractor;
  31. import org.apache.poi.UnsupportedFileFormatException;
  32. import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  33. import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
  34. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  35. import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
  36. import org.apache.poi.hssf.HSSFTestDataSamples;
  37. import org.apache.poi.hssf.OldExcelFormatException;
  38. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  39. import org.apache.poi.hssf.extractor.ExcelExtractor;
  40. import org.apache.poi.hwpf.extractor.Word6Extractor;
  41. import org.apache.poi.hwpf.extractor.WordExtractor;
  42. import org.apache.poi.openxml4j.opc.OPCPackage;
  43. import org.apache.poi.openxml4j.opc.PackageAccess;
  44. import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
  45. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  46. import org.apache.poi.util.IOUtils;
  47. import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
  48. import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
  49. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  50. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  51. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  52. import org.junit.BeforeClass;
  53. import org.junit.Test;
  54. /**
  55. * Test that the extractor factory plays nicely
  56. */
  57. public class TestExtractorFactory {
  58. private static File txt;
  59. private static File xls;
  60. private static File xlsx;
  61. private static File xlsxStrict;
  62. private static File xltx;
  63. private static File xlsEmb;
  64. private static File xlsb;
  65. private static File doc;
  66. private static File doc6;
  67. private static File doc95;
  68. private static File docx;
  69. private static File dotx;
  70. private static File docEmb;
  71. private static File docEmbOOXML;
  72. private static File ppt;
  73. private static File pptx;
  74. private static File msg;
  75. private static File msgEmb;
  76. private static File msgEmbMsg;
  77. private static File vsd;
  78. private static File vsdx;
  79. private static File pub;
  80. private static File getFileAndCheck(POIDataSamples samples, String name) {
  81. File file = samples.getFile(name);
  82. assertNotNull("Did not get a file for " + name, file);
  83. assertTrue("Did not get a type file for " + name, file.isFile());
  84. assertTrue("File did not exist: " + name, file.exists());
  85. return file;
  86. }
  87. @BeforeClass
  88. public static void setUp() throws Exception {
  89. POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
  90. xls = getFileAndCheck(ssTests, "SampleSS.xls");
  91. xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
  92. xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
  93. xltx = getFileAndCheck(ssTests, "test.xltx");
  94. xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
  95. xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
  96. POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
  97. doc = getFileAndCheck(wpTests, "SampleDoc.doc");
  98. doc6 = getFileAndCheck(wpTests, "Word6.doc");
  99. doc95 = getFileAndCheck(wpTests, "Word95.doc");
  100. docx = getFileAndCheck(wpTests, "SampleDoc.docx");
  101. dotx = getFileAndCheck(wpTests, "test.dotx");
  102. docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
  103. docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
  104. POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  105. ppt = getFileAndCheck(slTests, "SampleShow.ppt");
  106. pptx = getFileAndCheck(slTests, "SampleShow.pptx");
  107. txt = getFileAndCheck(slTests, "SampleShow.txt");
  108. POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
  109. vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
  110. vsdx = getFileAndCheck(dgTests, "test.vsdx");
  111. POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
  112. pub = getFileAndCheck(pubTests, "Simple.pub");
  113. POIDataSamples olTests = POIDataSamples.getHSMFInstance();
  114. msg = getFileAndCheck(olTests, "quick.msg");
  115. msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
  116. msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
  117. }
  118. @Test
  119. public void testFile() throws Exception {
  120. // Excel
  121. POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
  122. assertNotNull("Had empty extractor for " + xls, xlsExtractor);
  123. assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
  124. xlsExtractor
  125. instanceof ExcelExtractor
  126. );
  127. assertTrue(
  128. xlsExtractor.getText().length() > 200
  129. );
  130. xlsExtractor.close();
  131. POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
  132. assertTrue(
  133. extractor.getClass().getName(),
  134. extractor
  135. instanceof XSSFExcelExtractor
  136. );
  137. extractor.close();
  138. extractor = ExtractorFactory.createExtractor(xlsx);
  139. assertTrue(
  140. extractor.getText().length() > 200
  141. );
  142. extractor.close();
  143. extractor = ExtractorFactory.createExtractor(xltx);
  144. assertTrue(
  145. extractor.getClass().getName(),
  146. extractor
  147. instanceof XSSFExcelExtractor
  148. );
  149. extractor.close();
  150. extractor = ExtractorFactory.createExtractor(xlsb);
  151. assertTrue(
  152. extractor.getText().contains("test")
  153. );
  154. extractor.close();
  155. extractor = ExtractorFactory.createExtractor(xltx);
  156. assertTrue(
  157. extractor.getText().contains("test")
  158. );
  159. extractor.close();
  160. // TODO Support OOXML-Strict, see bug #57699
  161. try {
  162. /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
  163. fail("OOXML-Strict isn't yet supported");
  164. } catch (POIXMLException e) {
  165. // Expected, for now
  166. }
  167. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  168. // assertTrue(
  169. // extractor
  170. // instanceof XSSFExcelExtractor
  171. // );
  172. // extractor.close();
  173. //
  174. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  175. // assertTrue(
  176. // extractor.getText().contains("test")
  177. // );
  178. // extractor.close();
  179. // Word
  180. extractor = ExtractorFactory.createExtractor(doc);
  181. assertTrue(
  182. extractor
  183. instanceof WordExtractor
  184. );
  185. assertTrue(
  186. extractor.getText().length() > 120
  187. );
  188. extractor.close();
  189. extractor = ExtractorFactory.createExtractor(doc6);
  190. assertTrue(
  191. extractor
  192. instanceof Word6Extractor
  193. );
  194. assertTrue(
  195. extractor.getText().length() > 20
  196. );
  197. extractor.close();
  198. extractor = ExtractorFactory.createExtractor(doc95);
  199. assertTrue(
  200. extractor
  201. instanceof Word6Extractor
  202. );
  203. assertTrue(
  204. extractor.getText().length() > 120
  205. );
  206. extractor.close();
  207. extractor = ExtractorFactory.createExtractor(docx);
  208. assertTrue(
  209. extractor instanceof XWPFWordExtractor
  210. );
  211. extractor.close();
  212. extractor = ExtractorFactory.createExtractor(docx);
  213. assertTrue(
  214. extractor.getText().length() > 120
  215. );
  216. extractor.close();
  217. extractor = ExtractorFactory.createExtractor(dotx);
  218. assertTrue(
  219. extractor instanceof XWPFWordExtractor
  220. );
  221. extractor.close();
  222. extractor = ExtractorFactory.createExtractor(dotx);
  223. assertTrue(
  224. extractor.getText().contains("Test")
  225. );
  226. extractor.close();
  227. // PowerPoint (PPT)
  228. extractor = ExtractorFactory.createExtractor(ppt);
  229. assertTrue(
  230. extractor
  231. instanceof PowerPointExtractor
  232. );
  233. assertTrue(
  234. extractor.getText().length() > 120
  235. );
  236. extractor.close();
  237. // PowerPoint (PPTX)
  238. extractor = ExtractorFactory.createExtractor(pptx);
  239. assertTrue(
  240. extractor
  241. instanceof XSLFPowerPointExtractor
  242. );
  243. assertTrue(
  244. extractor.getText().length() > 120
  245. );
  246. extractor.close();
  247. // Visio - binary
  248. extractor = ExtractorFactory.createExtractor(vsd);
  249. assertTrue(
  250. extractor
  251. instanceof VisioTextExtractor
  252. );
  253. assertTrue(
  254. extractor.getText().length() > 50
  255. );
  256. extractor.close();
  257. // Visio - vsdx
  258. extractor = ExtractorFactory.createExtractor(vsdx);
  259. assertTrue(
  260. extractor
  261. instanceof XDGFVisioExtractor
  262. );
  263. assertTrue(
  264. extractor.getText().length() > 20
  265. );
  266. extractor.close();
  267. // Publisher
  268. extractor = ExtractorFactory.createExtractor(pub);
  269. assertTrue(
  270. extractor
  271. instanceof PublisherTextExtractor
  272. );
  273. assertTrue(
  274. extractor.getText().length() > 50
  275. );
  276. extractor.close();
  277. // Outlook msg
  278. extractor = ExtractorFactory.createExtractor(msg);
  279. assertTrue(
  280. extractor
  281. instanceof OutlookTextExtactor
  282. );
  283. assertTrue(
  284. extractor.getText().length() > 50
  285. );
  286. extractor.close();
  287. // Text
  288. try {
  289. ExtractorFactory.createExtractor(txt);
  290. fail();
  291. } catch(IllegalArgumentException e) {
  292. // Good
  293. }
  294. }
  295. @Test
  296. public void testInputStream() throws Exception {
  297. // Excel
  298. POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
  299. assertTrue(
  300. extractor
  301. instanceof ExcelExtractor
  302. );
  303. assertTrue(
  304. extractor.getText().length() > 200
  305. );
  306. extractor.close();
  307. extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
  308. assertTrue(
  309. extractor.getClass().getName(),
  310. extractor
  311. instanceof XSSFExcelExtractor
  312. );
  313. assertTrue(
  314. extractor.getText().length() > 200
  315. );
  316. // TODO Support OOXML-Strict, see bug #57699
  317. // assertTrue(
  318. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
  319. // instanceof XSSFExcelExtractor
  320. // );
  321. // assertTrue(
  322. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
  323. // );
  324. extractor.close();
  325. // Word
  326. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
  327. assertTrue(
  328. extractor.getClass().getName(),
  329. extractor
  330. instanceof WordExtractor
  331. );
  332. assertTrue(
  333. extractor.getText().length() > 120
  334. );
  335. extractor.close();
  336. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
  337. assertTrue(
  338. extractor.getClass().getName(),
  339. extractor
  340. instanceof Word6Extractor
  341. );
  342. assertTrue(
  343. extractor.getText().length() > 20
  344. );
  345. extractor.close();
  346. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
  347. assertTrue(
  348. extractor.getClass().getName(),
  349. extractor
  350. instanceof Word6Extractor
  351. );
  352. assertTrue(
  353. extractor.getText().length() > 120
  354. );
  355. extractor.close();
  356. extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
  357. assertTrue(
  358. extractor
  359. instanceof XWPFWordExtractor
  360. );
  361. assertTrue(
  362. extractor.getText().length() > 120
  363. );
  364. extractor.close();
  365. // PowerPoint
  366. extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
  367. assertTrue(
  368. extractor
  369. instanceof PowerPointExtractor
  370. );
  371. assertTrue(
  372. extractor.getText().length() > 120
  373. );
  374. extractor.close();
  375. extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
  376. assertTrue(
  377. extractor
  378. instanceof XSLFPowerPointExtractor
  379. );
  380. assertTrue(
  381. extractor.getText().length() > 120
  382. );
  383. extractor.close();
  384. // Visio
  385. extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
  386. assertTrue(
  387. extractor
  388. instanceof VisioTextExtractor
  389. );
  390. assertTrue(
  391. extractor.getText().length() > 50
  392. );
  393. extractor.close();
  394. // Visio - vsdx
  395. extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
  396. assertTrue(
  397. extractor
  398. instanceof XDGFVisioExtractor
  399. );
  400. assertTrue(
  401. extractor.getText().length() > 20
  402. );
  403. extractor.close();
  404. // Publisher
  405. extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
  406. assertTrue(
  407. extractor
  408. instanceof PublisherTextExtractor
  409. );
  410. assertTrue(
  411. extractor.getText().length() > 50
  412. );
  413. extractor.close();
  414. // Outlook msg
  415. extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
  416. assertTrue(
  417. extractor
  418. instanceof OutlookTextExtactor
  419. );
  420. assertTrue(
  421. extractor.getText().length() > 50
  422. );
  423. extractor.close();
  424. // Text
  425. try {
  426. FileInputStream stream = new FileInputStream(txt);
  427. try {
  428. ExtractorFactory.createExtractor(stream);
  429. fail();
  430. } finally {
  431. IOUtils.closeQuietly(stream);
  432. }
  433. } catch(IllegalArgumentException e) {
  434. // Good
  435. }
  436. }
  437. @Test
  438. public void testPOIFS() throws Exception {
  439. // Excel
  440. assertTrue(
  441. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
  442. instanceof ExcelExtractor
  443. );
  444. assertTrue(
  445. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  446. );
  447. // Word
  448. assertTrue(
  449. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
  450. instanceof WordExtractor
  451. );
  452. assertTrue(
  453. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  454. );
  455. assertTrue(
  456. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
  457. instanceof Word6Extractor
  458. );
  459. assertTrue(
  460. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  461. );
  462. assertTrue(
  463. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
  464. instanceof Word6Extractor
  465. );
  466. assertTrue(
  467. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  468. );
  469. // PowerPoint
  470. assertTrue(
  471. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
  472. instanceof PowerPointExtractor
  473. );
  474. assertTrue(
  475. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  476. );
  477. // Visio
  478. assertTrue(
  479. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
  480. instanceof VisioTextExtractor
  481. );
  482. assertTrue(
  483. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  484. );
  485. // Publisher
  486. assertTrue(
  487. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
  488. instanceof PublisherTextExtractor
  489. );
  490. assertTrue(
  491. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  492. );
  493. // Outlook msg
  494. assertTrue(
  495. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
  496. instanceof OutlookTextExtactor
  497. );
  498. assertTrue(
  499. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  500. );
  501. // Text
  502. try {
  503. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
  504. fail();
  505. } catch(IOException e) {
  506. // Good
  507. }
  508. }
  509. @Test
  510. public void testOPOIFS() throws Exception {
  511. // Excel
  512. assertTrue(
  513. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
  514. instanceof ExcelExtractor
  515. );
  516. assertTrue(
  517. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  518. );
  519. // Word
  520. assertTrue(
  521. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
  522. instanceof WordExtractor
  523. );
  524. assertTrue(
  525. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  526. );
  527. assertTrue(
  528. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
  529. instanceof Word6Extractor
  530. );
  531. assertTrue(
  532. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  533. );
  534. assertTrue(
  535. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
  536. instanceof Word6Extractor
  537. );
  538. assertTrue(
  539. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  540. );
  541. // PowerPoint
  542. assertTrue(
  543. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
  544. instanceof PowerPointExtractor
  545. );
  546. assertTrue(
  547. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  548. );
  549. // Visio
  550. assertTrue(
  551. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
  552. instanceof VisioTextExtractor
  553. );
  554. assertTrue(
  555. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  556. );
  557. // Publisher
  558. assertTrue(
  559. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
  560. instanceof PublisherTextExtractor
  561. );
  562. assertTrue(
  563. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  564. );
  565. // Outlook msg
  566. assertTrue(
  567. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
  568. instanceof OutlookTextExtactor
  569. );
  570. assertTrue(
  571. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  572. );
  573. // Text
  574. try {
  575. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
  576. fail();
  577. } catch(IOException e) {
  578. // Good
  579. }
  580. }
  581. @Test
  582. public void testPackage() throws Exception {
  583. // Excel
  584. POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  585. assertTrue(extractor instanceof XSSFExcelExtractor);
  586. extractor.close();
  587. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  588. assertTrue(extractor.getText().length() > 200);
  589. extractor.close();
  590. // Word
  591. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  592. assertTrue(extractor instanceof XWPFWordExtractor);
  593. extractor.close();
  594. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  595. assertTrue(extractor.getText().length() > 120);
  596. extractor.close();
  597. // PowerPoint
  598. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  599. assertTrue(extractor instanceof XSLFPowerPointExtractor);
  600. extractor.close();
  601. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  602. assertTrue(extractor.getText().length() > 120);
  603. extractor.close();
  604. // Visio
  605. extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
  606. assertTrue(extractor instanceof XDGFVisioExtractor);
  607. assertTrue(extractor.getText().length() > 20);
  608. extractor.close();
  609. // Text
  610. try {
  611. ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
  612. fail("TestExtractorFactory.testPackage() failed on " + txt);
  613. } catch(UnsupportedFileFormatException e) {
  614. // Good
  615. } catch (Exception e) {
  616. System.out.println("TestExtractorFactory.testPackage() failed on " + txt);
  617. throw e;
  618. }
  619. }
  620. @Test
  621. public void testPreferEventBased() throws Exception {
  622. assertFalse(ExtractorFactory.getPreferEventExtractor());
  623. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  624. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  625. ExtractorFactory.setThreadPrefersEventExtractors(true);
  626. assertTrue(ExtractorFactory.getPreferEventExtractor());
  627. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  628. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  629. ExtractorFactory.setAllThreadsPreferEventExtractors(false);
  630. assertFalse(ExtractorFactory.getPreferEventExtractor());
  631. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  632. assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
  633. ExtractorFactory.setAllThreadsPreferEventExtractors(null);
  634. assertTrue(ExtractorFactory.getPreferEventExtractor());
  635. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  636. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  637. // Check we get the right extractors now
  638. POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  639. assertTrue(
  640. extractor
  641. instanceof EventBasedExcelExtractor
  642. );
  643. extractor.close();
  644. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  645. assertTrue(
  646. extractor.getText().length() > 200
  647. );
  648. extractor.close();
  649. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  650. assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
  651. extractor.close();
  652. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  653. assertTrue(
  654. extractor.getText().length() > 200
  655. );
  656. extractor.close();
  657. // Put back to normal
  658. ExtractorFactory.setThreadPrefersEventExtractors(false);
  659. assertFalse(ExtractorFactory.getPreferEventExtractor());
  660. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  661. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  662. // And back
  663. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  664. assertTrue(
  665. extractor
  666. instanceof ExcelExtractor
  667. );
  668. extractor.close();
  669. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  670. assertTrue(
  671. extractor.getText().length() > 200
  672. );
  673. extractor.close();
  674. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  675. assertTrue(
  676. extractor
  677. instanceof XSSFExcelExtractor
  678. );
  679. extractor.close();
  680. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  681. assertTrue(
  682. extractor.getText().length() > 200
  683. );
  684. extractor.close();
  685. }
  686. /**
  687. * Test embeded docs text extraction. For now, only
  688. * does poifs embeded, but will do ooxml ones
  689. * at some point.
  690. */
  691. @Test
  692. public void testEmbeded() throws Exception {
  693. POIOLE2TextExtractor ext;
  694. POITextExtractor[] embeds;
  695. // No embedings
  696. ext = (POIOLE2TextExtractor)
  697. ExtractorFactory.createExtractor(xls);
  698. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  699. assertEquals(0, embeds.length);
  700. ext.close();
  701. // Excel
  702. ext = (POIOLE2TextExtractor)
  703. ExtractorFactory.createExtractor(xlsEmb);
  704. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  705. assertEquals(6, embeds.length);
  706. int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
  707. for (POITextExtractor embed : embeds) {
  708. assertTrue(embed.getText().length() > 20);
  709. if (embed instanceof PowerPointExtractor) numPpt++;
  710. else if (embed instanceof ExcelExtractor) numXls++;
  711. else if (embed instanceof WordExtractor) numWord++;
  712. else if (embed instanceof OutlookTextExtactor) numMsg++;
  713. }
  714. assertEquals(2, numPpt);
  715. assertEquals(2, numXls);
  716. assertEquals(2, numWord);
  717. assertEquals(0, numMsg);
  718. ext.close();
  719. // Word
  720. ext = (POIOLE2TextExtractor)
  721. ExtractorFactory.createExtractor(docEmb);
  722. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  723. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  724. assertEquals(4, embeds.length);
  725. for (POITextExtractor embed : embeds) {
  726. assertTrue(embed.getText().length() > 20);
  727. if (embed instanceof PowerPointExtractor) numPpt++;
  728. else if (embed instanceof ExcelExtractor) numXls++;
  729. else if (embed instanceof WordExtractor) numWord++;
  730. else if (embed instanceof OutlookTextExtactor) numMsg++;
  731. }
  732. assertEquals(1, numPpt);
  733. assertEquals(2, numXls);
  734. assertEquals(1, numWord);
  735. assertEquals(0, numMsg);
  736. ext.close();
  737. // Word which contains an OOXML file
  738. ext = (POIOLE2TextExtractor)
  739. ExtractorFactory.createExtractor(docEmbOOXML);
  740. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  741. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
  742. assertEquals(3, embeds.length);
  743. for (POITextExtractor embed : embeds) {
  744. assertTrue(embed.getText().length() > 20);
  745. if (embed instanceof PowerPointExtractor) numPpt++;
  746. else if (embed instanceof ExcelExtractor) numXls++;
  747. else if (embed instanceof WordExtractor) numWord++;
  748. else if (embed instanceof OutlookTextExtactor) numMsg++;
  749. else if (embed instanceof XWPFWordExtractor) numWordX++;
  750. }
  751. assertEquals(1, numPpt);
  752. assertEquals(1, numXls);
  753. assertEquals(0, numWord);
  754. assertEquals(1, numWordX);
  755. assertEquals(0, numMsg);
  756. ext.close();
  757. // Outlook
  758. ext = (OutlookTextExtactor)
  759. ExtractorFactory.createExtractor(msgEmb);
  760. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  761. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  762. assertEquals(1, embeds.length);
  763. for (POITextExtractor embed : embeds) {
  764. assertTrue(embed.getText().length() > 20);
  765. if (embed instanceof PowerPointExtractor) numPpt++;
  766. else if (embed instanceof ExcelExtractor) numXls++;
  767. else if (embed instanceof WordExtractor) numWord++;
  768. else if (embed instanceof OutlookTextExtactor) numMsg++;
  769. }
  770. assertEquals(0, numPpt);
  771. assertEquals(0, numXls);
  772. assertEquals(1, numWord);
  773. assertEquals(0, numMsg);
  774. ext.close();
  775. // Outlook with another outlook file in it
  776. ext = (OutlookTextExtactor)
  777. ExtractorFactory.createExtractor(msgEmbMsg);
  778. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  779. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  780. assertEquals(1, embeds.length);
  781. for (POITextExtractor embed : embeds) {
  782. assertTrue(embed.getText().length() > 20);
  783. if (embed instanceof PowerPointExtractor) numPpt++;
  784. else if (embed instanceof ExcelExtractor) numXls++;
  785. else if (embed instanceof WordExtractor) numWord++;
  786. else if (embed instanceof OutlookTextExtactor) numMsg++;
  787. }
  788. assertEquals(0, numPpt);
  789. assertEquals(0, numXls);
  790. assertEquals(0, numWord);
  791. assertEquals(1, numMsg);
  792. ext.close();
  793. // TODO - PowerPoint
  794. // TODO - Publisher
  795. // TODO - Visio
  796. }
  797. private static final String[] EXPECTED_FAILURES = new String[] {
  798. // password protected files
  799. "spreadsheet/password.xls",
  800. "spreadsheet/protected_passtika.xlsx",
  801. "spreadsheet/51832.xls",
  802. "document/PasswordProtected.doc",
  803. "slideshow/Password_Protected-hello.ppt",
  804. "slideshow/Password_Protected-56-hello.ppt",
  805. "slideshow/Password_Protected-np-hello.ppt",
  806. "slideshow/cryptoapi-proc2356.ppt",
  807. //"document/bug53475-password-is-pass.docx",
  808. //"document/bug53475-password-is-solrcell.docx",
  809. "spreadsheet/xor-encryption-abc.xls",
  810. "spreadsheet/35897-type4.xls",
  811. //"poifs/protect.xlsx",
  812. //"poifs/protected_sha512.xlsx",
  813. //"poifs/extenxls_pwd123.xlsx",
  814. //"poifs/protected_agile.docx",
  815. "spreadsheet/58616.xlsx",
  816. // TODO: fails XMLExportTest, is this ok?
  817. "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
  818. "spreadsheet/55864.xlsx",
  819. "spreadsheet/57890.xlsx",
  820. // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
  821. "spreadsheet/44958.xls",
  822. "spreadsheet/44958_1.xls",
  823. "spreadsheet/testArraysAndTables.xls",
  824. // TODO: good to ignore?
  825. "spreadsheet/sample-beta.xlsx",
  826. // This is actually a spreadsheet!
  827. "hpsf/TestRobert_Flaherty.doc",
  828. // some files that are broken, eg Word 95, ...
  829. "spreadsheet/43493.xls",
  830. "spreadsheet/46904.xls",
  831. "document/Bug50955.doc",
  832. "slideshow/PPT95.ppt",
  833. "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
  834. "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
  835. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
  836. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
  837. "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
  838. "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
  839. "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
  840. "openxml4j/invalid.xlsx",
  841. "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
  842. "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
  843. "spreadsheet/Simple.xlsb",
  844. "poifs/unknown_properties.msg", // POIFS properties corrupted
  845. "poifs/only-zero-byte-streams.ole2", // No actual contents
  846. "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
  847. "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
  848. // old Excel files, which we only support simple text extraction of
  849. "spreadsheet/testEXCEL_2.xls",
  850. "spreadsheet/testEXCEL_3.xls",
  851. "spreadsheet/testEXCEL_4.xls",
  852. "spreadsheet/testEXCEL_5.xls",
  853. "spreadsheet/testEXCEL_95.xls",
  854. // OOXML Strict is not yet supported, see bug #57699
  855. "spreadsheet/SampleSS.strict.xlsx",
  856. "spreadsheet/SimpleStrict.xlsx",
  857. "spreadsheet/sample.strict.xlsx",
  858. // non-TNEF files
  859. "ddf/Container.dat",
  860. "ddf/47143.dat",
  861. // sheet cloning errors
  862. "spreadsheet/47813.xlsx",
  863. "spreadsheet/56450.xls",
  864. "spreadsheet/57231_MixedGasReport.xls",
  865. "spreadsheet/OddStyleRecord.xls",
  866. "spreadsheet/WithChartSheet.xlsx",
  867. "spreadsheet/chart_sheet.xlsx",
  868. };
  869. @Test
  870. public void testFileLeak() throws Exception {
  871. // run a number of files that might fail in order to catch
  872. // leaked file resources when using file-leak-detector while
  873. // running the test
  874. for(String file : EXPECTED_FAILURES) {
  875. try {
  876. ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
  877. } catch (Exception e) {
  878. // catch all exceptions here as we are only interested in file-handle leaks
  879. }
  880. }
  881. }
  882. /**
  883. * #59074 - Excel 95 files should give a helpful message, not just
  884. * "No supported documents found in the OLE2 stream"
  885. */
  886. @Test
  887. public void bug59074() throws Exception {
  888. try {
  889. ExtractorFactory.createExtractor(
  890. POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
  891. fail("Old excel formats not supported via ExtractorFactory");
  892. } catch (OldExcelFormatException e) {
  893. // expected here
  894. }
  895. }
  896. @Test
  897. public void testGetEmbeddedFromXMLExtractor() {
  898. try {
  899. // currently not implemented
  900. ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
  901. fail("Unsupported currently");
  902. } catch (IllegalStateException e) {
  903. // expected here
  904. }
  905. }
  906. // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
  907. // When this happens, change this from @Test(expected=...) to @Test
  908. // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
  909. @Test(expected=AssertionError.class)
  910. public void test45565() throws Exception {
  911. POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"));
  912. try {
  913. String text = extractor.getText();
  914. assertContains(text, "testdoc");
  915. assertContains(text, "test phrase");
  916. } finally {
  917. extractor.close();
  918. }
  919. }
  920. }