You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TestExtractorFactory.java 37KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.junit.Assert.assertEquals;
  18. import static org.junit.Assert.assertFalse;
  19. import static org.junit.Assert.assertNotNull;
  20. import static org.junit.Assert.assertNull;
  21. import static org.junit.Assert.assertTrue;
  22. import static org.junit.Assert.fail;
  23. import java.io.File;
  24. import java.io.FileInputStream;
  25. import java.io.IOException;
  26. import org.apache.poi.POIDataSamples;
  27. import org.apache.poi.POIOLE2TextExtractor;
  28. import org.apache.poi.POITextExtractor;
  29. import org.apache.poi.POIXMLException;
  30. import org.apache.poi.POIXMLTextExtractor;
  31. import org.apache.poi.UnsupportedFileFormatException;
  32. import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  33. import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
  34. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  35. import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
  36. import org.apache.poi.hssf.HSSFTestDataSamples;
  37. import org.apache.poi.hssf.OldExcelFormatException;
  38. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  39. import org.apache.poi.hssf.extractor.ExcelExtractor;
  40. import org.apache.poi.hwpf.extractor.Word6Extractor;
  41. import org.apache.poi.hwpf.extractor.WordExtractor;
  42. import org.apache.poi.openxml4j.opc.OPCPackage;
  43. import org.apache.poi.openxml4j.opc.PackageAccess;
  44. import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
  45. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  46. import org.apache.poi.util.IOUtils;
  47. import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
  48. import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
  49. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  50. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  51. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  52. import org.junit.BeforeClass;
  53. import org.junit.Test;
  54. /**
  55. * Test that the extractor factory plays nicely
  56. */
  57. public class TestExtractorFactory {
  58. private static File txt;
  59. private static File xls;
  60. private static File xlsx;
  61. private static File xlsxStrict;
  62. private static File xltx;
  63. private static File xlsEmb;
  64. private static File xlsb;
  65. private static File doc;
  66. private static File doc6;
  67. private static File doc95;
  68. private static File docx;
  69. private static File dotx;
  70. private static File docEmb;
  71. private static File docEmbOOXML;
  72. private static File ppt;
  73. private static File pptx;
  74. private static File msg;
  75. private static File msgEmb;
  76. private static File msgEmbMsg;
  77. private static File vsd;
  78. private static File vsdx;
  79. private static File pub;
  80. private static File getFileAndCheck(POIDataSamples samples, String name) {
  81. File file = samples.getFile(name);
  82. assertNotNull("Did not get a file for " + name, file);
  83. assertTrue("Did not get a type file for " + name, file.isFile());
  84. assertTrue("File did not exist: " + name, file.exists());
  85. return file;
  86. }
  87. @BeforeClass
  88. public static void setUp() throws Exception {
  89. POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
  90. xls = getFileAndCheck(ssTests, "SampleSS.xls");
  91. xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
  92. xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
  93. xltx = getFileAndCheck(ssTests, "test.xltx");
  94. xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
  95. xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
  96. POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
  97. doc = getFileAndCheck(wpTests, "SampleDoc.doc");
  98. doc6 = getFileAndCheck(wpTests, "Word6.doc");
  99. doc95 = getFileAndCheck(wpTests, "Word95.doc");
  100. docx = getFileAndCheck(wpTests, "SampleDoc.docx");
  101. dotx = getFileAndCheck(wpTests, "test.dotx");
  102. docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
  103. docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
  104. POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  105. ppt = getFileAndCheck(slTests, "SampleShow.ppt");
  106. pptx = getFileAndCheck(slTests, "SampleShow.pptx");
  107. txt = getFileAndCheck(slTests, "SampleShow.txt");
  108. POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
  109. vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
  110. vsdx = getFileAndCheck(dgTests, "test.vsdx");
  111. POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
  112. pub = getFileAndCheck(pubTests, "Simple.pub");
  113. POIDataSamples olTests = POIDataSamples.getHSMFInstance();
  114. msg = getFileAndCheck(olTests, "quick.msg");
  115. msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
  116. msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
  117. }
  118. @Test
  119. public void testFile() throws Exception {
  120. // Excel
  121. POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
  122. assertNotNull("Had empty extractor for " + xls, xlsExtractor);
  123. assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
  124. xlsExtractor
  125. instanceof ExcelExtractor
  126. );
  127. assertTrue(
  128. xlsExtractor.getText().length() > 200
  129. );
  130. xlsExtractor.close();
  131. POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
  132. assertTrue(
  133. extractor.getClass().getName(),
  134. extractor
  135. instanceof XSSFExcelExtractor
  136. );
  137. extractor.close();
  138. extractor = ExtractorFactory.createExtractor(xlsx);
  139. assertTrue(
  140. extractor.getText().length() > 200
  141. );
  142. extractor.close();
  143. extractor = ExtractorFactory.createExtractor(xltx);
  144. assertTrue(
  145. extractor.getClass().getName(),
  146. extractor
  147. instanceof XSSFExcelExtractor
  148. );
  149. extractor.close();
  150. extractor = ExtractorFactory.createExtractor(xlsb);
  151. assertContains(extractor.getText(), "test");
  152. extractor.close();
  153. extractor = ExtractorFactory.createExtractor(xltx);
  154. assertContains(extractor.getText(), "test");
  155. extractor.close();
  156. // TODO Support OOXML-Strict, see bug #57699
  157. try {
  158. /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
  159. fail("OOXML-Strict isn't yet supported");
  160. } catch (POIXMLException e) {
  161. // Expected, for now
  162. }
  163. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  164. // assertTrue(
  165. // extractor
  166. // instanceof XSSFExcelExtractor
  167. // );
  168. // extractor.close();
  169. //
  170. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  171. // assertTrue(
  172. // extractor.getText().contains("test")
  173. // );
  174. // extractor.close();
  175. // Word
  176. extractor = ExtractorFactory.createExtractor(doc);
  177. assertTrue(
  178. extractor
  179. instanceof WordExtractor
  180. );
  181. assertTrue(
  182. extractor.getText().length() > 120
  183. );
  184. extractor.close();
  185. extractor = ExtractorFactory.createExtractor(doc6);
  186. assertTrue(
  187. extractor
  188. instanceof Word6Extractor
  189. );
  190. assertTrue(
  191. extractor.getText().length() > 20
  192. );
  193. extractor.close();
  194. extractor = ExtractorFactory.createExtractor(doc95);
  195. assertTrue(
  196. extractor
  197. instanceof Word6Extractor
  198. );
  199. assertTrue(
  200. extractor.getText().length() > 120
  201. );
  202. extractor.close();
  203. extractor = ExtractorFactory.createExtractor(docx);
  204. assertTrue(
  205. extractor instanceof XWPFWordExtractor
  206. );
  207. extractor.close();
  208. extractor = ExtractorFactory.createExtractor(docx);
  209. assertTrue(
  210. extractor.getText().length() > 120
  211. );
  212. extractor.close();
  213. extractor = ExtractorFactory.createExtractor(dotx);
  214. assertTrue(
  215. extractor instanceof XWPFWordExtractor
  216. );
  217. extractor.close();
  218. extractor = ExtractorFactory.createExtractor(dotx);
  219. assertContains(extractor.getText(), "Test");
  220. extractor.close();
  221. // PowerPoint (PPT)
  222. extractor = ExtractorFactory.createExtractor(ppt);
  223. assertTrue(
  224. extractor
  225. instanceof PowerPointExtractor
  226. );
  227. assertTrue(
  228. extractor.getText().length() > 120
  229. );
  230. extractor.close();
  231. // PowerPoint (PPTX)
  232. extractor = ExtractorFactory.createExtractor(pptx);
  233. assertTrue(
  234. extractor
  235. instanceof XSLFPowerPointExtractor
  236. );
  237. assertTrue(
  238. extractor.getText().length() > 120
  239. );
  240. extractor.close();
  241. // Visio - binary
  242. extractor = ExtractorFactory.createExtractor(vsd);
  243. assertTrue(
  244. extractor
  245. instanceof VisioTextExtractor
  246. );
  247. assertTrue(
  248. extractor.getText().length() > 50
  249. );
  250. extractor.close();
  251. // Visio - vsdx
  252. extractor = ExtractorFactory.createExtractor(vsdx);
  253. assertTrue(
  254. extractor
  255. instanceof XDGFVisioExtractor
  256. );
  257. assertTrue(
  258. extractor.getText().length() > 20
  259. );
  260. extractor.close();
  261. // Publisher
  262. extractor = ExtractorFactory.createExtractor(pub);
  263. assertTrue(
  264. extractor
  265. instanceof PublisherTextExtractor
  266. );
  267. assertTrue(
  268. extractor.getText().length() > 50
  269. );
  270. extractor.close();
  271. // Outlook msg
  272. extractor = ExtractorFactory.createExtractor(msg);
  273. assertTrue(
  274. extractor
  275. instanceof OutlookTextExtactor
  276. );
  277. assertTrue(
  278. extractor.getText().length() > 50
  279. );
  280. extractor.close();
  281. // Text
  282. try {
  283. ExtractorFactory.createExtractor(txt);
  284. fail();
  285. } catch(IllegalArgumentException e) {
  286. // Good
  287. }
  288. }
  289. @Test
  290. public void testInputStream() throws Exception {
  291. // Excel
  292. POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
  293. assertTrue(
  294. extractor
  295. instanceof ExcelExtractor
  296. );
  297. assertTrue(
  298. extractor.getText().length() > 200
  299. );
  300. extractor.close();
  301. extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
  302. assertTrue(
  303. extractor.getClass().getName(),
  304. extractor
  305. instanceof XSSFExcelExtractor
  306. );
  307. assertTrue(
  308. extractor.getText().length() > 200
  309. );
  310. // TODO Support OOXML-Strict, see bug #57699
  311. // assertTrue(
  312. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
  313. // instanceof XSSFExcelExtractor
  314. // );
  315. // assertTrue(
  316. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
  317. // );
  318. extractor.close();
  319. // Word
  320. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
  321. assertTrue(
  322. extractor.getClass().getName(),
  323. extractor
  324. instanceof WordExtractor
  325. );
  326. assertTrue(
  327. extractor.getText().length() > 120
  328. );
  329. extractor.close();
  330. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
  331. assertTrue(
  332. extractor.getClass().getName(),
  333. extractor
  334. instanceof Word6Extractor
  335. );
  336. assertTrue(
  337. extractor.getText().length() > 20
  338. );
  339. extractor.close();
  340. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
  341. assertTrue(
  342. extractor.getClass().getName(),
  343. extractor
  344. instanceof Word6Extractor
  345. );
  346. assertTrue(
  347. extractor.getText().length() > 120
  348. );
  349. extractor.close();
  350. extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
  351. assertTrue(
  352. extractor
  353. instanceof XWPFWordExtractor
  354. );
  355. assertTrue(
  356. extractor.getText().length() > 120
  357. );
  358. extractor.close();
  359. // PowerPoint
  360. extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
  361. assertTrue(
  362. extractor
  363. instanceof PowerPointExtractor
  364. );
  365. assertTrue(
  366. extractor.getText().length() > 120
  367. );
  368. extractor.close();
  369. extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
  370. assertTrue(
  371. extractor
  372. instanceof XSLFPowerPointExtractor
  373. );
  374. assertTrue(
  375. extractor.getText().length() > 120
  376. );
  377. extractor.close();
  378. // Visio
  379. extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
  380. assertTrue(
  381. extractor
  382. instanceof VisioTextExtractor
  383. );
  384. assertTrue(
  385. extractor.getText().length() > 50
  386. );
  387. extractor.close();
  388. // Visio - vsdx
  389. extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
  390. assertTrue(
  391. extractor
  392. instanceof XDGFVisioExtractor
  393. );
  394. assertTrue(
  395. extractor.getText().length() > 20
  396. );
  397. extractor.close();
  398. // Publisher
  399. extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
  400. assertTrue(
  401. extractor
  402. instanceof PublisherTextExtractor
  403. );
  404. assertTrue(
  405. extractor.getText().length() > 50
  406. );
  407. extractor.close();
  408. // Outlook msg
  409. extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
  410. assertTrue(
  411. extractor
  412. instanceof OutlookTextExtactor
  413. );
  414. assertTrue(
  415. extractor.getText().length() > 50
  416. );
  417. extractor.close();
  418. // Text
  419. try {
  420. FileInputStream stream = new FileInputStream(txt);
  421. try {
  422. ExtractorFactory.createExtractor(stream);
  423. fail();
  424. } finally {
  425. IOUtils.closeQuietly(stream);
  426. }
  427. } catch(IllegalArgumentException e) {
  428. // Good
  429. }
  430. }
  431. @Test
  432. public void testPOIFS() throws Exception {
  433. // Excel
  434. assertTrue(
  435. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
  436. instanceof ExcelExtractor
  437. );
  438. assertTrue(
  439. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  440. );
  441. // Word
  442. assertTrue(
  443. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
  444. instanceof WordExtractor
  445. );
  446. assertTrue(
  447. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  448. );
  449. assertTrue(
  450. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
  451. instanceof Word6Extractor
  452. );
  453. assertTrue(
  454. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  455. );
  456. assertTrue(
  457. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
  458. instanceof Word6Extractor
  459. );
  460. assertTrue(
  461. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  462. );
  463. // PowerPoint
  464. assertTrue(
  465. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
  466. instanceof PowerPointExtractor
  467. );
  468. assertTrue(
  469. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  470. );
  471. // Visio
  472. assertTrue(
  473. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
  474. instanceof VisioTextExtractor
  475. );
  476. assertTrue(
  477. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  478. );
  479. // Publisher
  480. assertTrue(
  481. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
  482. instanceof PublisherTextExtractor
  483. );
  484. assertTrue(
  485. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  486. );
  487. // Outlook msg
  488. assertTrue(
  489. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
  490. instanceof OutlookTextExtactor
  491. );
  492. assertTrue(
  493. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  494. );
  495. // Text
  496. try {
  497. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
  498. fail();
  499. } catch(IOException e) {
  500. // Good
  501. }
  502. }
  503. @Test
  504. public void testOPOIFS() throws Exception {
  505. // Excel
  506. assertTrue(
  507. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
  508. instanceof ExcelExtractor
  509. );
  510. assertTrue(
  511. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  512. );
  513. // Word
  514. assertTrue(
  515. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
  516. instanceof WordExtractor
  517. );
  518. assertTrue(
  519. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  520. );
  521. assertTrue(
  522. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
  523. instanceof Word6Extractor
  524. );
  525. assertTrue(
  526. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  527. );
  528. assertTrue(
  529. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
  530. instanceof Word6Extractor
  531. );
  532. assertTrue(
  533. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  534. );
  535. // PowerPoint
  536. assertTrue(
  537. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
  538. instanceof PowerPointExtractor
  539. );
  540. assertTrue(
  541. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  542. );
  543. // Visio
  544. assertTrue(
  545. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
  546. instanceof VisioTextExtractor
  547. );
  548. assertTrue(
  549. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  550. );
  551. // Publisher
  552. assertTrue(
  553. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
  554. instanceof PublisherTextExtractor
  555. );
  556. assertTrue(
  557. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  558. );
  559. // Outlook msg
  560. assertTrue(
  561. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
  562. instanceof OutlookTextExtactor
  563. );
  564. assertTrue(
  565. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  566. );
  567. // Text
  568. try {
  569. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
  570. fail();
  571. } catch(IOException e) {
  572. // Good
  573. }
  574. }
  575. @Test
  576. public void testPackage() throws Exception {
  577. // Excel
  578. POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  579. assertTrue(extractor instanceof XSSFExcelExtractor);
  580. extractor.close();
  581. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  582. assertTrue(extractor.getText().length() > 200);
  583. extractor.close();
  584. // Word
  585. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  586. assertTrue(extractor instanceof XWPFWordExtractor);
  587. extractor.close();
  588. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  589. assertTrue(extractor.getText().length() > 120);
  590. extractor.close();
  591. // PowerPoint
  592. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  593. assertTrue(extractor instanceof XSLFPowerPointExtractor);
  594. extractor.close();
  595. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  596. assertTrue(extractor.getText().length() > 120);
  597. extractor.close();
  598. // Visio
  599. extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
  600. assertTrue(extractor instanceof XDGFVisioExtractor);
  601. assertTrue(extractor.getText().length() > 20);
  602. extractor.close();
  603. // Text
  604. try {
  605. ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
  606. fail("TestExtractorFactory.testPackage() failed on " + txt);
  607. } catch(UnsupportedFileFormatException e) {
  608. // Good
  609. } catch (Exception e) {
  610. System.out.println("TestExtractorFactory.testPackage() failed on " + txt);
  611. throw e;
  612. }
  613. }
  614. @Test
  615. public void testPreferEventBased() throws Exception {
  616. assertFalse(ExtractorFactory.getPreferEventExtractor());
  617. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  618. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  619. ExtractorFactory.setThreadPrefersEventExtractors(true);
  620. assertTrue(ExtractorFactory.getPreferEventExtractor());
  621. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  622. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  623. ExtractorFactory.setAllThreadsPreferEventExtractors(false);
  624. assertFalse(ExtractorFactory.getPreferEventExtractor());
  625. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  626. assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
  627. ExtractorFactory.setAllThreadsPreferEventExtractors(null);
  628. assertTrue(ExtractorFactory.getPreferEventExtractor());
  629. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  630. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  631. // Check we get the right extractors now
  632. POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  633. assertTrue(
  634. extractor
  635. instanceof EventBasedExcelExtractor
  636. );
  637. extractor.close();
  638. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  639. assertTrue(
  640. extractor.getText().length() > 200
  641. );
  642. extractor.close();
  643. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  644. assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
  645. extractor.close();
  646. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  647. assertTrue(
  648. extractor.getText().length() > 200
  649. );
  650. extractor.close();
  651. // Put back to normal
  652. ExtractorFactory.setThreadPrefersEventExtractors(false);
  653. assertFalse(ExtractorFactory.getPreferEventExtractor());
  654. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  655. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  656. // And back
  657. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  658. assertTrue(
  659. extractor
  660. instanceof ExcelExtractor
  661. );
  662. extractor.close();
  663. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  664. assertTrue(
  665. extractor.getText().length() > 200
  666. );
  667. extractor.close();
  668. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  669. assertTrue(
  670. extractor
  671. instanceof XSSFExcelExtractor
  672. );
  673. extractor.close();
  674. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  675. assertTrue(
  676. extractor.getText().length() > 200
  677. );
  678. extractor.close();
  679. }
  680. /**
  681. * Test embeded docs text extraction. For now, only
  682. * does poifs embeded, but will do ooxml ones
  683. * at some point.
  684. */
  685. @Test
  686. public void testEmbeded() throws Exception {
  687. POIOLE2TextExtractor ext;
  688. POITextExtractor[] embeds;
  689. // No embedings
  690. ext = (POIOLE2TextExtractor)
  691. ExtractorFactory.createExtractor(xls);
  692. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  693. assertEquals(0, embeds.length);
  694. ext.close();
  695. // Excel
  696. ext = (POIOLE2TextExtractor)
  697. ExtractorFactory.createExtractor(xlsEmb);
  698. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  699. assertEquals(6, embeds.length);
  700. int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
  701. for (POITextExtractor embed : embeds) {
  702. assertTrue(embed.getText().length() > 20);
  703. if (embed instanceof PowerPointExtractor) numPpt++;
  704. else if (embed instanceof ExcelExtractor) numXls++;
  705. else if (embed instanceof WordExtractor) numWord++;
  706. else if (embed instanceof OutlookTextExtactor) numMsg++;
  707. }
  708. assertEquals(2, numPpt);
  709. assertEquals(2, numXls);
  710. assertEquals(2, numWord);
  711. assertEquals(0, numMsg);
  712. ext.close();
  713. // Word
  714. ext = (POIOLE2TextExtractor)
  715. ExtractorFactory.createExtractor(docEmb);
  716. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  717. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  718. assertEquals(4, embeds.length);
  719. for (POITextExtractor embed : embeds) {
  720. assertTrue(embed.getText().length() > 20);
  721. if (embed instanceof PowerPointExtractor) numPpt++;
  722. else if (embed instanceof ExcelExtractor) numXls++;
  723. else if (embed instanceof WordExtractor) numWord++;
  724. else if (embed instanceof OutlookTextExtactor) numMsg++;
  725. }
  726. assertEquals(1, numPpt);
  727. assertEquals(2, numXls);
  728. assertEquals(1, numWord);
  729. assertEquals(0, numMsg);
  730. ext.close();
  731. // Word which contains an OOXML file
  732. ext = (POIOLE2TextExtractor)
  733. ExtractorFactory.createExtractor(docEmbOOXML);
  734. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  735. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
  736. assertEquals(3, embeds.length);
  737. for (POITextExtractor embed : embeds) {
  738. assertTrue(embed.getText().length() > 20);
  739. if (embed instanceof PowerPointExtractor) numPpt++;
  740. else if (embed instanceof ExcelExtractor) numXls++;
  741. else if (embed instanceof WordExtractor) numWord++;
  742. else if (embed instanceof OutlookTextExtactor) numMsg++;
  743. else if (embed instanceof XWPFWordExtractor) numWordX++;
  744. }
  745. assertEquals(1, numPpt);
  746. assertEquals(1, numXls);
  747. assertEquals(0, numWord);
  748. assertEquals(1, numWordX);
  749. assertEquals(0, numMsg);
  750. ext.close();
  751. // Outlook
  752. ext = (OutlookTextExtactor)
  753. ExtractorFactory.createExtractor(msgEmb);
  754. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  755. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  756. assertEquals(1, embeds.length);
  757. for (POITextExtractor embed : embeds) {
  758. assertTrue(embed.getText().length() > 20);
  759. if (embed instanceof PowerPointExtractor) numPpt++;
  760. else if (embed instanceof ExcelExtractor) numXls++;
  761. else if (embed instanceof WordExtractor) numWord++;
  762. else if (embed instanceof OutlookTextExtactor) numMsg++;
  763. }
  764. assertEquals(0, numPpt);
  765. assertEquals(0, numXls);
  766. assertEquals(1, numWord);
  767. assertEquals(0, numMsg);
  768. ext.close();
  769. // Outlook with another outlook file in it
  770. ext = (OutlookTextExtactor)
  771. ExtractorFactory.createExtractor(msgEmbMsg);
  772. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  773. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  774. assertEquals(1, embeds.length);
  775. for (POITextExtractor embed : embeds) {
  776. assertTrue(embed.getText().length() > 20);
  777. if (embed instanceof PowerPointExtractor) numPpt++;
  778. else if (embed instanceof ExcelExtractor) numXls++;
  779. else if (embed instanceof WordExtractor) numWord++;
  780. else if (embed instanceof OutlookTextExtactor) numMsg++;
  781. }
  782. assertEquals(0, numPpt);
  783. assertEquals(0, numXls);
  784. assertEquals(0, numWord);
  785. assertEquals(1, numMsg);
  786. ext.close();
  787. // TODO - PowerPoint
  788. // TODO - Publisher
  789. // TODO - Visio
  790. }
  791. private static final String[] EXPECTED_FAILURES = new String[] {
  792. // password protected files
  793. "spreadsheet/password.xls",
  794. "spreadsheet/protected_passtika.xlsx",
  795. "spreadsheet/51832.xls",
  796. "document/PasswordProtected.doc",
  797. "slideshow/Password_Protected-hello.ppt",
  798. "slideshow/Password_Protected-56-hello.ppt",
  799. "slideshow/Password_Protected-np-hello.ppt",
  800. "slideshow/cryptoapi-proc2356.ppt",
  801. //"document/bug53475-password-is-pass.docx",
  802. //"document/bug53475-password-is-solrcell.docx",
  803. "spreadsheet/xor-encryption-abc.xls",
  804. "spreadsheet/35897-type4.xls",
  805. //"poifs/protect.xlsx",
  806. //"poifs/protected_sha512.xlsx",
  807. //"poifs/extenxls_pwd123.xlsx",
  808. //"poifs/protected_agile.docx",
  809. "spreadsheet/58616.xlsx",
  810. // TODO: fails XMLExportTest, is this ok?
  811. "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
  812. "spreadsheet/55864.xlsx",
  813. "spreadsheet/57890.xlsx",
  814. // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
  815. "spreadsheet/44958.xls",
  816. "spreadsheet/44958_1.xls",
  817. "spreadsheet/testArraysAndTables.xls",
  818. // TODO: good to ignore?
  819. "spreadsheet/sample-beta.xlsx",
  820. // This is actually a spreadsheet!
  821. "hpsf/TestRobert_Flaherty.doc",
  822. // some files that are broken, eg Word 95, ...
  823. "spreadsheet/43493.xls",
  824. "spreadsheet/46904.xls",
  825. "document/Bug50955.doc",
  826. "slideshow/PPT95.ppt",
  827. "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
  828. "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
  829. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
  830. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
  831. "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
  832. "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
  833. "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
  834. "openxml4j/invalid.xlsx",
  835. "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
  836. "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
  837. "spreadsheet/Simple.xlsb",
  838. "poifs/unknown_properties.msg", // POIFS properties corrupted
  839. "poifs/only-zero-byte-streams.ole2", // No actual contents
  840. "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
  841. "spreadsheet/poc-xmlbomb-empty.xlsx", // contains xml-entity-expansion
  842. "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
  843. // old Excel files, which we only support simple text extraction of
  844. "spreadsheet/testEXCEL_2.xls",
  845. "spreadsheet/testEXCEL_3.xls",
  846. "spreadsheet/testEXCEL_4.xls",
  847. "spreadsheet/testEXCEL_5.xls",
  848. "spreadsheet/testEXCEL_95.xls",
  849. // OOXML Strict is not yet supported, see bug #57699
  850. "spreadsheet/SampleSS.strict.xlsx",
  851. "spreadsheet/SimpleStrict.xlsx",
  852. "spreadsheet/sample.strict.xlsx",
  853. // non-TNEF files
  854. "ddf/Container.dat",
  855. "ddf/47143.dat",
  856. // sheet cloning errors
  857. "spreadsheet/47813.xlsx",
  858. "spreadsheet/56450.xls",
  859. "spreadsheet/57231_MixedGasReport.xls",
  860. "spreadsheet/OddStyleRecord.xls",
  861. "spreadsheet/WithChartSheet.xlsx",
  862. "spreadsheet/chart_sheet.xlsx",
  863. };
  864. @Test
  865. public void testFileLeak() throws Exception {
  866. // run a number of files that might fail in order to catch
  867. // leaked file resources when using file-leak-detector while
  868. // running the test
  869. for(String file : EXPECTED_FAILURES) {
  870. try {
  871. ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
  872. } catch (Exception e) {
  873. // catch all exceptions here as we are only interested in file-handle leaks
  874. }
  875. }
  876. }
  877. /**
  878. * #59074 - Excel 95 files should give a helpful message, not just
  879. * "No supported documents found in the OLE2 stream"
  880. */
  881. @Test
  882. public void bug59074() throws Exception {
  883. try {
  884. ExtractorFactory.createExtractor(
  885. POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
  886. fail("Old excel formats not supported via ExtractorFactory");
  887. } catch (OldExcelFormatException e) {
  888. // expected here
  889. }
  890. }
  891. @Test
  892. public void testGetEmbeddedFromXMLExtractor() {
  893. try {
  894. // currently not implemented
  895. ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
  896. fail("Unsupported currently");
  897. } catch (IllegalStateException e) {
  898. // expected here
  899. }
  900. }
  901. // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
  902. // When this happens, change this from @Test(expected=...) to @Test
  903. // bug 45565: text within TextBoxes is extracted by ExcelExtractor and WordExtractor
  904. @Test(expected=AssertionError.class)
  905. public void test45565() throws Exception {
  906. POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"));
  907. try {
  908. String text = extractor.getText();
  909. assertContains(text, "testdoc");
  910. assertContains(text, "test phrase");
  911. } finally {
  912. extractor.close();
  913. }
  914. }
  915. }