You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TestExtractorFactory.java 36KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor;
  16. import static org.junit.Assert.assertEquals;
  17. import static org.junit.Assert.assertFalse;
  18. import static org.junit.Assert.assertNotNull;
  19. import static org.junit.Assert.assertNull;
  20. import static org.junit.Assert.assertTrue;
  21. import static org.junit.Assert.fail;
  22. import java.io.File;
  23. import java.io.FileInputStream;
  24. import java.io.IOException;
  25. import org.apache.poi.POIDataSamples;
  26. import org.apache.poi.POIOLE2TextExtractor;
  27. import org.apache.poi.POITextExtractor;
  28. import org.apache.poi.POIXMLException;
  29. import org.apache.poi.POIXMLTextExtractor;
  30. import org.apache.poi.UnsupportedFileFormatException;
  31. import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  32. import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
  33. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  34. import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
  35. import org.apache.poi.hssf.OldExcelFormatException;
  36. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  37. import org.apache.poi.hssf.extractor.ExcelExtractor;
  38. import org.apache.poi.hwpf.extractor.Word6Extractor;
  39. import org.apache.poi.hwpf.extractor.WordExtractor;
  40. import org.apache.poi.openxml4j.opc.OPCPackage;
  41. import org.apache.poi.openxml4j.opc.PackageAccess;
  42. import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
  43. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  44. import org.apache.poi.util.IOUtils;
  45. import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
  46. import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
  47. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  48. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  49. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  50. import org.junit.BeforeClass;
  51. import org.junit.Test;
  52. /**
  53. * Test that the extractor factory plays nicely
  54. */
  55. public class TestExtractorFactory {
  56. private static File txt; // SampleShow.txt (slideshow samples); the tests expect the factory to reject it
  57. private static File xls; // SampleSS.xls
  58. private static File xlsx; // SampleSS.xlsx
  59. private static File xlsxStrict; // SampleSS.strict.xlsx (OOXML-Strict, see bug #57699)
  60. private static File xltx; // test.xltx
  61. private static File xlsEmb; // excel_with_embeded.xls
  62. private static File doc; // SampleDoc.doc
  63. private static File doc6; // Word6.doc
  64. private static File doc95; // Word95.doc
  65. private static File docx; // SampleDoc.docx
  66. private static File dotx; // test.dotx
  67. private static File docEmb; // word_with_embeded.doc
  68. private static File docEmbOOXML; // word_with_embeded_ooxml.doc
  69. private static File ppt; // SampleShow.ppt
  70. private static File pptx; // SampleShow.pptx
  71. private static File msg; // quick.msg
  72. private static File msgEmb; // attachment_test_msg.msg
  73. private static File msgEmbMsg; // attachment_msg_pdf.msg
  74. private static File vsd; // Test_Visio-Some_Random_Text.vsd
  75. private static File vsdx; // test.vsdx
  76. private static File pub; // Simple.pub
  77. private static File getFileAndCheck(POIDataSamples samples, String name) {
  78. File file = samples.getFile(name);
  79. assertNotNull("Did not get a file for " + name, file);
  80. assertTrue("Did not get a type file for " + name, file.isFile());
  81. assertTrue("File did not exist: " + name, file.exists());
  82. return file;
  83. }
  84. @BeforeClass
  85. public static void setUp() throws Exception {
  86. POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
  87. xls = getFileAndCheck(ssTests, "SampleSS.xls");
  88. xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
  89. xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
  90. xltx = getFileAndCheck(ssTests, "test.xltx");
  91. xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
  92. POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
  93. doc = getFileAndCheck(wpTests, "SampleDoc.doc");
  94. doc6 = getFileAndCheck(wpTests, "Word6.doc");
  95. doc95 = getFileAndCheck(wpTests, "Word95.doc");
  96. docx = getFileAndCheck(wpTests, "SampleDoc.docx");
  97. dotx = getFileAndCheck(wpTests, "test.dotx");
  98. docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
  99. docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
  100. POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  101. ppt = getFileAndCheck(slTests, "SampleShow.ppt");
  102. pptx = getFileAndCheck(slTests, "SampleShow.pptx");
  103. txt = getFileAndCheck(slTests, "SampleShow.txt");
  104. POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
  105. vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
  106. vsdx = getFileAndCheck(dgTests, "test.vsdx");
  107. POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
  108. pub = getFileAndCheck(pubTests, "Simple.pub");
  109. POIDataSamples olTests = POIDataSamples.getHSMFInstance();
  110. msg = getFileAndCheck(olTests, "quick.msg");
  111. msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
  112. msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
  113. }
  114. @Test
  115. public void testFile() throws Exception {
  116. // Excel
  117. POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
  118. assertNotNull("Had empty extractor for " + xls, xlsExtractor);
  119. assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
  120. xlsExtractor
  121. instanceof ExcelExtractor
  122. );
  123. assertTrue(
  124. xlsExtractor.getText().length() > 200
  125. );
  126. xlsExtractor.close();
  127. POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
  128. assertTrue(
  129. extractor.getClass().getName(),
  130. extractor
  131. instanceof XSSFExcelExtractor
  132. );
  133. extractor.close();
  134. extractor = ExtractorFactory.createExtractor(xlsx);
  135. assertTrue(
  136. extractor.getText().length() > 200
  137. );
  138. extractor.close();
  139. extractor = ExtractorFactory.createExtractor(xltx);
  140. assertTrue(
  141. extractor.getClass().getName(),
  142. extractor
  143. instanceof XSSFExcelExtractor
  144. );
  145. extractor.close();
  146. extractor = ExtractorFactory.createExtractor(xltx);
  147. assertTrue(
  148. extractor.getText().contains("test")
  149. );
  150. extractor.close();
  151. // TODO Support OOXML-Strict, see bug #57699
  152. try {
  153. /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
  154. fail("OOXML-Strict isn't yet supported");
  155. } catch (POIXMLException e) {
  156. // Expected, for now
  157. }
  158. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  159. // assertTrue(
  160. // extractor
  161. // instanceof XSSFExcelExtractor
  162. // );
  163. // extractor.close();
  164. //
  165. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  166. // assertTrue(
  167. // extractor.getText().contains("test")
  168. // );
  169. // extractor.close();
  170. // Word
  171. extractor = ExtractorFactory.createExtractor(doc);
  172. assertTrue(
  173. extractor
  174. instanceof WordExtractor
  175. );
  176. assertTrue(
  177. extractor.getText().length() > 120
  178. );
  179. extractor.close();
  180. extractor = ExtractorFactory.createExtractor(doc6);
  181. assertTrue(
  182. extractor
  183. instanceof Word6Extractor
  184. );
  185. assertTrue(
  186. extractor.getText().length() > 20
  187. );
  188. extractor.close();
  189. extractor = ExtractorFactory.createExtractor(doc95);
  190. assertTrue(
  191. extractor
  192. instanceof Word6Extractor
  193. );
  194. assertTrue(
  195. extractor.getText().length() > 120
  196. );
  197. extractor.close();
  198. extractor = ExtractorFactory.createExtractor(docx);
  199. assertTrue(
  200. extractor instanceof XWPFWordExtractor
  201. );
  202. extractor.close();
  203. extractor = ExtractorFactory.createExtractor(docx);
  204. assertTrue(
  205. extractor.getText().length() > 120
  206. );
  207. extractor.close();
  208. extractor = ExtractorFactory.createExtractor(dotx);
  209. assertTrue(
  210. extractor instanceof XWPFWordExtractor
  211. );
  212. extractor.close();
  213. extractor = ExtractorFactory.createExtractor(dotx);
  214. assertTrue(
  215. extractor.getText().contains("Test")
  216. );
  217. extractor.close();
  218. // PowerPoint (PPT)
  219. extractor = ExtractorFactory.createExtractor(ppt);
  220. assertTrue(
  221. extractor
  222. instanceof PowerPointExtractor
  223. );
  224. assertTrue(
  225. extractor.getText().length() > 120
  226. );
  227. extractor.close();
  228. // PowerPoint (PPTX)
  229. extractor = ExtractorFactory.createExtractor(pptx);
  230. assertTrue(
  231. extractor
  232. instanceof XSLFPowerPointExtractor
  233. );
  234. assertTrue(
  235. extractor.getText().length() > 120
  236. );
  237. extractor.close();
  238. // Visio - binary
  239. extractor = ExtractorFactory.createExtractor(vsd);
  240. assertTrue(
  241. extractor
  242. instanceof VisioTextExtractor
  243. );
  244. assertTrue(
  245. extractor.getText().length() > 50
  246. );
  247. extractor.close();
  248. // Visio - vsdx
  249. extractor = ExtractorFactory.createExtractor(vsdx);
  250. assertTrue(
  251. extractor
  252. instanceof XDGFVisioExtractor
  253. );
  254. assertTrue(
  255. extractor.getText().length() > 20
  256. );
  257. extractor.close();
  258. // Publisher
  259. extractor = ExtractorFactory.createExtractor(pub);
  260. assertTrue(
  261. extractor
  262. instanceof PublisherTextExtractor
  263. );
  264. assertTrue(
  265. extractor.getText().length() > 50
  266. );
  267. extractor.close();
  268. // Outlook msg
  269. extractor = ExtractorFactory.createExtractor(msg);
  270. assertTrue(
  271. extractor
  272. instanceof OutlookTextExtactor
  273. );
  274. assertTrue(
  275. extractor.getText().length() > 50
  276. );
  277. extractor.close();
  278. // Text
  279. try {
  280. ExtractorFactory.createExtractor(txt);
  281. fail();
  282. } catch(IllegalArgumentException e) {
  283. // Good
  284. }
  285. }
  286. @Test
  287. public void testInputStream() throws Exception {
  288. // Excel
  289. POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
  290. assertTrue(
  291. extractor
  292. instanceof ExcelExtractor
  293. );
  294. assertTrue(
  295. extractor.getText().length() > 200
  296. );
  297. extractor.close();
  298. extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
  299. assertTrue(
  300. extractor.getClass().getName(),
  301. extractor
  302. instanceof XSSFExcelExtractor
  303. );
  304. assertTrue(
  305. extractor.getText().length() > 200
  306. );
  307. // TODO Support OOXML-Strict, see bug #57699
  308. // assertTrue(
  309. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
  310. // instanceof XSSFExcelExtractor
  311. // );
  312. // assertTrue(
  313. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
  314. // );
  315. extractor.close();
  316. // Word
  317. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
  318. assertTrue(
  319. extractor.getClass().getName(),
  320. extractor
  321. instanceof WordExtractor
  322. );
  323. assertTrue(
  324. extractor.getText().length() > 120
  325. );
  326. extractor.close();
  327. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
  328. assertTrue(
  329. extractor.getClass().getName(),
  330. extractor
  331. instanceof Word6Extractor
  332. );
  333. assertTrue(
  334. extractor.getText().length() > 20
  335. );
  336. extractor.close();
  337. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
  338. assertTrue(
  339. extractor.getClass().getName(),
  340. extractor
  341. instanceof Word6Extractor
  342. );
  343. assertTrue(
  344. extractor.getText().length() > 120
  345. );
  346. extractor.close();
  347. extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
  348. assertTrue(
  349. extractor
  350. instanceof XWPFWordExtractor
  351. );
  352. assertTrue(
  353. extractor.getText().length() > 120
  354. );
  355. extractor.close();
  356. // PowerPoint
  357. extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
  358. assertTrue(
  359. extractor
  360. instanceof PowerPointExtractor
  361. );
  362. assertTrue(
  363. extractor.getText().length() > 120
  364. );
  365. extractor.close();
  366. extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
  367. assertTrue(
  368. extractor
  369. instanceof XSLFPowerPointExtractor
  370. );
  371. assertTrue(
  372. extractor.getText().length() > 120
  373. );
  374. extractor.close();
  375. // Visio
  376. extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
  377. assertTrue(
  378. extractor
  379. instanceof VisioTextExtractor
  380. );
  381. assertTrue(
  382. extractor.getText().length() > 50
  383. );
  384. extractor.close();
  385. // Visio - vsdx
  386. extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
  387. assertTrue(
  388. extractor
  389. instanceof XDGFVisioExtractor
  390. );
  391. assertTrue(
  392. extractor.getText().length() > 20
  393. );
  394. extractor.close();
  395. // Publisher
  396. extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
  397. assertTrue(
  398. extractor
  399. instanceof PublisherTextExtractor
  400. );
  401. assertTrue(
  402. extractor.getText().length() > 50
  403. );
  404. extractor.close();
  405. // Outlook msg
  406. extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
  407. assertTrue(
  408. extractor
  409. instanceof OutlookTextExtactor
  410. );
  411. assertTrue(
  412. extractor.getText().length() > 50
  413. );
  414. extractor.close();
  415. // Text
  416. try {
  417. FileInputStream stream = new FileInputStream(txt);
  418. try {
  419. ExtractorFactory.createExtractor(stream);
  420. fail();
  421. } finally {
  422. IOUtils.closeQuietly(stream);
  423. }
  424. } catch(IllegalArgumentException e) {
  425. // Good
  426. }
  427. }
  428. @Test
  429. public void testPOIFS() throws Exception {
  430. // Excel
  431. assertTrue(
  432. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
  433. instanceof ExcelExtractor
  434. );
  435. assertTrue(
  436. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  437. );
  438. // Word
  439. assertTrue(
  440. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
  441. instanceof WordExtractor
  442. );
  443. assertTrue(
  444. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  445. );
  446. assertTrue(
  447. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
  448. instanceof Word6Extractor
  449. );
  450. assertTrue(
  451. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  452. );
  453. assertTrue(
  454. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
  455. instanceof Word6Extractor
  456. );
  457. assertTrue(
  458. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  459. );
  460. // PowerPoint
  461. assertTrue(
  462. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
  463. instanceof PowerPointExtractor
  464. );
  465. assertTrue(
  466. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  467. );
  468. // Visio
  469. assertTrue(
  470. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
  471. instanceof VisioTextExtractor
  472. );
  473. assertTrue(
  474. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  475. );
  476. // Publisher
  477. assertTrue(
  478. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
  479. instanceof PublisherTextExtractor
  480. );
  481. assertTrue(
  482. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  483. );
  484. // Outlook msg
  485. assertTrue(
  486. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
  487. instanceof OutlookTextExtactor
  488. );
  489. assertTrue(
  490. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  491. );
  492. // Text
  493. try {
  494. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
  495. fail();
  496. } catch(IOException e) {
  497. // Good
  498. }
  499. }
  500. @Test
  501. public void testOPOIFS() throws Exception {
  502. // Excel
  503. assertTrue(
  504. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
  505. instanceof ExcelExtractor
  506. );
  507. assertTrue(
  508. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  509. );
  510. // Word
  511. assertTrue(
  512. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
  513. instanceof WordExtractor
  514. );
  515. assertTrue(
  516. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  517. );
  518. assertTrue(
  519. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
  520. instanceof Word6Extractor
  521. );
  522. assertTrue(
  523. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  524. );
  525. assertTrue(
  526. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
  527. instanceof Word6Extractor
  528. );
  529. assertTrue(
  530. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  531. );
  532. // PowerPoint
  533. assertTrue(
  534. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
  535. instanceof PowerPointExtractor
  536. );
  537. assertTrue(
  538. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  539. );
  540. // Visio
  541. assertTrue(
  542. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
  543. instanceof VisioTextExtractor
  544. );
  545. assertTrue(
  546. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  547. );
  548. // Publisher
  549. assertTrue(
  550. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
  551. instanceof PublisherTextExtractor
  552. );
  553. assertTrue(
  554. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  555. );
  556. // Outlook msg
  557. assertTrue(
  558. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
  559. instanceof OutlookTextExtactor
  560. );
  561. assertTrue(
  562. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  563. );
  564. // Text
  565. try {
  566. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
  567. fail();
  568. } catch(IOException e) {
  569. // Good
  570. }
  571. }
  572. @Test
  573. public void testPackage() throws Exception {
  574. // Excel
  575. POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  576. assertTrue(extractor instanceof XSSFExcelExtractor);
  577. extractor.close();
  578. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  579. assertTrue(extractor.getText().length() > 200);
  580. extractor.close();
  581. // Word
  582. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  583. assertTrue(extractor instanceof XWPFWordExtractor);
  584. extractor.close();
  585. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  586. assertTrue(extractor.getText().length() > 120);
  587. extractor.close();
  588. // PowerPoint
  589. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  590. assertTrue(extractor instanceof XSLFPowerPointExtractor);
  591. extractor.close();
  592. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  593. assertTrue(extractor.getText().length() > 120);
  594. extractor.close();
  595. // Visio
  596. extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
  597. assertTrue(extractor instanceof XDGFVisioExtractor);
  598. assertTrue(extractor.getText().length() > 20);
  599. extractor.close();
  600. // Text
  601. try {
  602. ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
  603. fail("TestExtractorFactory.testPackage() failed on " + txt.toString());
  604. } catch(UnsupportedFileFormatException e) {
  605. // Good
  606. } catch (Exception e) {
  607. System.out.println("TestExtractorFactory.testPackage() failed on " + txt.toString());
  608. throw e;
  609. }
  610. }
  611. @Test
  612. public void testPreferEventBased() throws Exception {
  613. assertFalse(ExtractorFactory.getPreferEventExtractor());
  614. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  615. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  616. ExtractorFactory.setThreadPrefersEventExtractors(true);
  617. assertTrue(ExtractorFactory.getPreferEventExtractor());
  618. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  619. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  620. ExtractorFactory.setAllThreadsPreferEventExtractors(false);
  621. assertFalse(ExtractorFactory.getPreferEventExtractor());
  622. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  623. assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
  624. ExtractorFactory.setAllThreadsPreferEventExtractors(null);
  625. assertTrue(ExtractorFactory.getPreferEventExtractor());
  626. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  627. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  628. // Check we get the right extractors now
  629. POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  630. assertTrue(
  631. extractor
  632. instanceof EventBasedExcelExtractor
  633. );
  634. extractor.close();
  635. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  636. assertTrue(
  637. extractor.getText().length() > 200
  638. );
  639. extractor.close();
  640. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  641. assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
  642. extractor.close();
  643. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  644. assertTrue(
  645. extractor.getText().length() > 200
  646. );
  647. extractor.close();
  648. // Put back to normal
  649. ExtractorFactory.setThreadPrefersEventExtractors(false);
  650. assertFalse(ExtractorFactory.getPreferEventExtractor());
  651. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  652. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  653. // And back
  654. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  655. assertTrue(
  656. extractor
  657. instanceof ExcelExtractor
  658. );
  659. extractor.close();
  660. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  661. assertTrue(
  662. extractor.getText().length() > 200
  663. );
  664. extractor.close();
  665. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  666. assertTrue(
  667. extractor
  668. instanceof XSSFExcelExtractor
  669. );
  670. extractor.close();
  671. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  672. assertTrue(
  673. extractor.getText().length() > 200
  674. );
  675. extractor.close();
  676. }
  677. /**
  678. * Test embeded docs text extraction. For now, only
  679. * does poifs embeded, but will do ooxml ones
  680. * at some point.
  681. */
  682. @Test
  683. public void testEmbeded() throws Exception {
  684. POIOLE2TextExtractor ext;
  685. POITextExtractor[] embeds;
  686. // No embedings
  687. ext = (POIOLE2TextExtractor)
  688. ExtractorFactory.createExtractor(xls);
  689. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  690. assertEquals(0, embeds.length);
  691. ext.close();
  692. // Excel
  693. ext = (POIOLE2TextExtractor)
  694. ExtractorFactory.createExtractor(xlsEmb);
  695. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  696. assertEquals(6, embeds.length);
  697. int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
  698. for (POITextExtractor embed : embeds) {
  699. assertTrue(embed.getText().length() > 20);
  700. if (embed instanceof PowerPointExtractor) numPpt++;
  701. else if (embed instanceof ExcelExtractor) numXls++;
  702. else if (embed instanceof WordExtractor) numWord++;
  703. else if (embed instanceof OutlookTextExtactor) numMsg++;
  704. }
  705. assertEquals(2, numPpt);
  706. assertEquals(2, numXls);
  707. assertEquals(2, numWord);
  708. assertEquals(0, numMsg);
  709. ext.close();
  710. // Word
  711. ext = (POIOLE2TextExtractor)
  712. ExtractorFactory.createExtractor(docEmb);
  713. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  714. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  715. assertEquals(4, embeds.length);
  716. for (POITextExtractor embed : embeds) {
  717. assertTrue(embed.getText().length() > 20);
  718. if (embed instanceof PowerPointExtractor) numPpt++;
  719. else if (embed instanceof ExcelExtractor) numXls++;
  720. else if (embed instanceof WordExtractor) numWord++;
  721. else if (embed instanceof OutlookTextExtactor) numMsg++;
  722. }
  723. assertEquals(1, numPpt);
  724. assertEquals(2, numXls);
  725. assertEquals(1, numWord);
  726. assertEquals(0, numMsg);
  727. ext.close();
  728. // Word which contains an OOXML file
  729. ext = (POIOLE2TextExtractor)
  730. ExtractorFactory.createExtractor(docEmbOOXML);
  731. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  732. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
  733. assertEquals(3, embeds.length);
  734. for (POITextExtractor embed : embeds) {
  735. assertTrue(embed.getText().length() > 20);
  736. if (embed instanceof PowerPointExtractor) numPpt++;
  737. else if (embed instanceof ExcelExtractor) numXls++;
  738. else if (embed instanceof WordExtractor) numWord++;
  739. else if (embed instanceof OutlookTextExtactor) numMsg++;
  740. else if (embed instanceof XWPFWordExtractor) numWordX++;
  741. }
  742. assertEquals(1, numPpt);
  743. assertEquals(1, numXls);
  744. assertEquals(0, numWord);
  745. assertEquals(1, numWordX);
  746. assertEquals(0, numMsg);
  747. ext.close();
  748. // Outlook
  749. ext = (OutlookTextExtactor)
  750. ExtractorFactory.createExtractor(msgEmb);
  751. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  752. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  753. assertEquals(1, embeds.length);
  754. for (POITextExtractor embed : embeds) {
  755. assertTrue(embed.getText().length() > 20);
  756. if (embed instanceof PowerPointExtractor) numPpt++;
  757. else if (embed instanceof ExcelExtractor) numXls++;
  758. else if (embed instanceof WordExtractor) numWord++;
  759. else if (embed instanceof OutlookTextExtactor) numMsg++;
  760. }
  761. assertEquals(0, numPpt);
  762. assertEquals(0, numXls);
  763. assertEquals(1, numWord);
  764. assertEquals(0, numMsg);
  765. ext.close();
  766. // Outlook with another outlook file in it
  767. ext = (OutlookTextExtactor)
  768. ExtractorFactory.createExtractor(msgEmbMsg);
  769. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  770. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  771. assertEquals(1, embeds.length);
  772. for (POITextExtractor embed : embeds) {
  773. assertTrue(embed.getText().length() > 20);
  774. if (embed instanceof PowerPointExtractor) numPpt++;
  775. else if (embed instanceof ExcelExtractor) numXls++;
  776. else if (embed instanceof WordExtractor) numWord++;
  777. else if (embed instanceof OutlookTextExtactor) numMsg++;
  778. }
  779. assertEquals(0, numPpt);
  780. assertEquals(0, numXls);
  781. assertEquals(0, numWord);
  782. assertEquals(1, numMsg);
  783. ext.close();
  784. // TODO - PowerPoint
  785. // TODO - Publisher
  786. // TODO - Visio
  787. }
  788. private static final String[] EXPECTED_FAILURES = new String[] {
  789. // password protected files
  790. "spreadsheet/password.xls",
  791. "spreadsheet/protected_passtika.xlsx",
  792. "spreadsheet/51832.xls",
  793. "document/PasswordProtected.doc",
  794. "slideshow/Password_Protected-hello.ppt",
  795. "slideshow/Password_Protected-56-hello.ppt",
  796. "slideshow/Password_Protected-np-hello.ppt",
  797. "slideshow/cryptoapi-proc2356.ppt",
  798. //"document/bug53475-password-is-pass.docx",
  799. //"document/bug53475-password-is-solrcell.docx",
  800. "spreadsheet/xor-encryption-abc.xls",
  801. "spreadsheet/35897-type4.xls",
  802. //"poifs/protect.xlsx",
  803. //"poifs/protected_sha512.xlsx",
  804. //"poifs/extenxls_pwd123.xlsx",
  805. //"poifs/protected_agile.docx",
  806. "spreadsheet/58616.xlsx",
  807. // TODO: fails XMLExportTest, is this ok?
  808. "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
  809. "spreadsheet/55864.xlsx",
  810. "spreadsheet/57890.xlsx",
  811. // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
  812. "spreadsheet/44958.xls",
  813. "spreadsheet/44958_1.xls",
  814. "spreadsheet/testArraysAndTables.xls",
  815. // TODO: good to ignore?
  816. "spreadsheet/sample-beta.xlsx",
  817. // This is actually a spreadsheet!
  818. "hpsf/TestRobert_Flaherty.doc",
  819. // some files that are broken, eg Word 95, ...
  820. "spreadsheet/43493.xls",
  821. "spreadsheet/46904.xls",
  822. "document/Bug50955.doc",
  823. "slideshow/PPT95.ppt",
  824. "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
  825. "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
  826. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
  827. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
  828. "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
  829. "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
  830. "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
  831. "openxml4j/invalid.xlsx",
  832. "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
  833. "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
  834. "spreadsheet/Simple.xlsb",
  835. "poifs/unknown_properties.msg", // POIFS properties corrupted
  836. "poifs/only-zero-byte-streams.ole2", // No actual contents
  837. "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
  838. "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
  839. // old Excel files, which we only support simple text extraction of
  840. "spreadsheet/testEXCEL_2.xls",
  841. "spreadsheet/testEXCEL_3.xls",
  842. "spreadsheet/testEXCEL_4.xls",
  843. "spreadsheet/testEXCEL_5.xls",
  844. "spreadsheet/testEXCEL_95.xls",
  845. // OOXML Strict is not yet supported, see bug #57699
  846. "spreadsheet/SampleSS.strict.xlsx",
  847. "spreadsheet/SimpleStrict.xlsx",
  848. "spreadsheet/sample.strict.xlsx",
  849. // non-TNEF files
  850. "ddf/Container.dat",
  851. "ddf/47143.dat",
  852. // sheet cloning errors
  853. "spreadsheet/47813.xlsx",
  854. "spreadsheet/56450.xls",
  855. "spreadsheet/57231_MixedGasReport.xls",
  856. "spreadsheet/OddStyleRecord.xls",
  857. "spreadsheet/WithChartSheet.xlsx",
  858. "spreadsheet/chart_sheet.xlsx",
  859. };
  860. @Test
  861. public void testFileLeak() throws Exception {
  862. // run a number of files that might fail in order to catch
  863. // leaked file resources when using file-leak-detector while
  864. // running the test
  865. for(String file : EXPECTED_FAILURES) {
  866. try {
  867. ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
  868. } catch (Exception e) {
  869. // catch all exceptions here as we are only interested in file-handle leaks
  870. }
  871. }
  872. }
  873. /**
  874. * #59074 - Excel 95 files should give a helpful message, not just
  875. * "No supported documents found in the OLE2 stream"
  876. */
  877. @Test
  878. public void bug59074() throws Exception {
  879. try {
  880. ExtractorFactory.createExtractor(
  881. POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
  882. fail("Old excel formats not supported via ExtractorFactory");
  883. } catch (OldExcelFormatException e) {
  884. // expected here
  885. }
  886. }
  887. @Test
  888. public void testGetEmbeddedFromXMLExtractor() {
  889. try {
  890. // currently not implemented
  891. ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
  892. fail("Unsupported currently");
  893. } catch (IllegalStateException e) {
  894. // expected here
  895. }
  896. }
  897. }