You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TestExtractorFactory.java 27KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor;
  16. import static org.junit.Assert.assertEquals;
  17. import static org.junit.Assert.assertFalse;
  18. import static org.junit.Assert.assertNotNull;
  19. import static org.junit.Assert.assertNull;
  20. import static org.junit.Assert.assertTrue;
  21. import static org.junit.Assert.fail;
  22. import java.io.File;
  23. import java.io.FileInputStream;
  24. import java.io.IOException;
  25. import org.apache.poi.POIDataSamples;
  26. import org.apache.poi.POIOLE2TextExtractor;
  27. import org.apache.poi.POITextExtractor;
  28. import org.apache.poi.POIXMLException;
  29. import org.apache.poi.POIXMLTextExtractor;
  30. import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  31. import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
  32. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  33. import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
  34. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  35. import org.apache.poi.hssf.extractor.ExcelExtractor;
  36. import org.apache.poi.hwpf.extractor.Word6Extractor;
  37. import org.apache.poi.hwpf.extractor.WordExtractor;
  38. import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
  39. import org.apache.poi.openxml4j.opc.OPCPackage;
  40. import org.apache.poi.openxml4j.opc.PackageAccess;
  41. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  42. import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
  43. import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
  44. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  45. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  46. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  47. import org.junit.BeforeClass;
  48. import org.junit.Test;
  49. /**
  50. * Test that the extractor factory plays nicely
  51. */
  52. public class TestExtractorFactory {
  53. private static File txt;
  54. private static File xls;
  55. private static File xlsx;
  56. private static File xlsxStrict;
  57. private static File xltx;
  58. private static File xlsEmb;
  59. private static File doc;
  60. private static File doc6;
  61. private static File doc95;
  62. private static File docx;
  63. private static File dotx;
  64. private static File docEmb;
  65. private static File docEmbOOXML;
  66. private static File ppt;
  67. private static File pptx;
  68. private static File msg;
  69. private static File msgEmb;
  70. private static File msgEmbMsg;
  71. private static File vsd;
  72. private static File vsdx;
  73. private static File pub;
  74. private static File getFileAndCheck(POIDataSamples samples, String name) {
  75. File file = samples.getFile(name);
  76. assertNotNull("Did not get a file for " + name, file);
  77. assertTrue("Did not get a type file for " + name, file.isFile());
  78. assertTrue("File did not exist: " + name, file.exists());
  79. return file;
  80. }
  81. @BeforeClass
  82. public static void setUp() throws Exception {
  83. POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
  84. xls = getFileAndCheck(ssTests, "SampleSS.xls");
  85. xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
  86. xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
  87. xltx = getFileAndCheck(ssTests, "test.xltx");
  88. xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
  89. POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
  90. doc = getFileAndCheck(wpTests, "SampleDoc.doc");
  91. doc6 = getFileAndCheck(wpTests, "Word6.doc");
  92. doc95 = getFileAndCheck(wpTests, "Word95.doc");
  93. docx = getFileAndCheck(wpTests, "SampleDoc.docx");
  94. dotx = getFileAndCheck(wpTests, "test.dotx");
  95. docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
  96. docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
  97. POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  98. ppt = getFileAndCheck(slTests, "SampleShow.ppt");
  99. pptx = getFileAndCheck(slTests, "SampleShow.pptx");
  100. txt = getFileAndCheck(slTests, "SampleShow.txt");
  101. POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
  102. vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
  103. vsdx = getFileAndCheck(dgTests, "test.vsdx");
  104. POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
  105. pub = getFileAndCheck(pubTests, "Simple.pub");
  106. POIDataSamples olTests = POIDataSamples.getHSMFInstance();
  107. msg = getFileAndCheck(olTests, "quick.msg");
  108. msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
  109. msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
  110. }
  111. @Test
  112. public void testFile() throws Exception {
  113. // Excel
  114. POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
  115. assertNotNull("Had empty extractor for " + xls, xlsExtractor);
  116. assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
  117. xlsExtractor
  118. instanceof ExcelExtractor
  119. );
  120. assertTrue(
  121. xlsExtractor.getText().length() > 200
  122. );
  123. xlsExtractor.close();
  124. POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
  125. assertTrue(
  126. extractor
  127. instanceof XSSFExcelExtractor
  128. );
  129. extractor.close();
  130. extractor = ExtractorFactory.createExtractor(xlsx);
  131. assertTrue(
  132. extractor.getText().length() > 200
  133. );
  134. extractor.close();
  135. extractor = ExtractorFactory.createExtractor(xltx);
  136. assertTrue(
  137. extractor
  138. instanceof XSSFExcelExtractor
  139. );
  140. extractor.close();
  141. extractor = ExtractorFactory.createExtractor(xltx);
  142. assertTrue(
  143. extractor.getText().contains("test")
  144. );
  145. extractor.close();
  146. // TODO Support OOXML-Strict, see bug #57699
  147. try {
  148. extractor = ExtractorFactory.createExtractor(xlsxStrict);
  149. fail("OOXML-Strict isn't yet supported");
  150. } catch (POIXMLException e) {
  151. // Expected, for now
  152. }
  153. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  154. // assertTrue(
  155. // extractor
  156. // instanceof XSSFExcelExtractor
  157. // );
  158. // extractor.close();
  159. //
  160. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  161. // assertTrue(
  162. // extractor.getText().contains("test")
  163. // );
  164. // extractor.close();
  165. // Word
  166. assertTrue(
  167. ExtractorFactory.createExtractor(doc)
  168. instanceof WordExtractor
  169. );
  170. assertTrue(
  171. ExtractorFactory.createExtractor(doc).getText().length() > 120
  172. );
  173. assertTrue(
  174. ExtractorFactory.createExtractor(doc6)
  175. instanceof Word6Extractor
  176. );
  177. assertTrue(
  178. ExtractorFactory.createExtractor(doc6).getText().length() > 20
  179. );
  180. assertTrue(
  181. ExtractorFactory.createExtractor(doc95)
  182. instanceof Word6Extractor
  183. );
  184. assertTrue(
  185. ExtractorFactory.createExtractor(doc95).getText().length() > 120
  186. );
  187. extractor = ExtractorFactory.createExtractor(docx);
  188. assertTrue(
  189. extractor instanceof XWPFWordExtractor
  190. );
  191. extractor.close();
  192. extractor = ExtractorFactory.createExtractor(docx);
  193. assertTrue(
  194. extractor.getText().length() > 120
  195. );
  196. extractor.close();
  197. extractor = ExtractorFactory.createExtractor(dotx);
  198. assertTrue(
  199. extractor instanceof XWPFWordExtractor
  200. );
  201. extractor.close();
  202. extractor = ExtractorFactory.createExtractor(dotx);
  203. assertTrue(
  204. extractor.getText().contains("Test")
  205. );
  206. extractor.close();
  207. // PowerPoint
  208. assertTrue(
  209. ExtractorFactory.createExtractor(ppt)
  210. instanceof PowerPointExtractor
  211. );
  212. assertTrue(
  213. ExtractorFactory.createExtractor(ppt).getText().length() > 120
  214. );
  215. extractor = ExtractorFactory.createExtractor(pptx);
  216. assertTrue(
  217. extractor
  218. instanceof XSLFPowerPointExtractor
  219. );
  220. extractor.close();
  221. extractor = ExtractorFactory.createExtractor(pptx);
  222. assertTrue(
  223. extractor.getText().length() > 120
  224. );
  225. extractor.close();
  226. // Visio - binary
  227. assertTrue(
  228. ExtractorFactory.createExtractor(vsd)
  229. instanceof VisioTextExtractor
  230. );
  231. assertTrue(
  232. ExtractorFactory.createExtractor(vsd).getText().length() > 50
  233. );
  234. // Visio - vsdx
  235. assertTrue(
  236. ExtractorFactory.createExtractor(vsdx)
  237. instanceof XDGFVisioExtractor
  238. );
  239. assertTrue(
  240. ExtractorFactory.createExtractor(vsdx).getText().length() > 20
  241. );
  242. // Publisher
  243. assertTrue(
  244. ExtractorFactory.createExtractor(pub)
  245. instanceof PublisherTextExtractor
  246. );
  247. assertTrue(
  248. ExtractorFactory.createExtractor(pub).getText().length() > 50
  249. );
  250. // Outlook msg
  251. assertTrue(
  252. ExtractorFactory.createExtractor(msg)
  253. instanceof OutlookTextExtactor
  254. );
  255. assertTrue(
  256. ExtractorFactory.createExtractor(msg).getText().length() > 50
  257. );
  258. // Text
  259. try {
  260. ExtractorFactory.createExtractor(txt);
  261. fail();
  262. } catch(IllegalArgumentException e) {
  263. // Good
  264. }
  265. }
  266. @Test
  267. public void testInputStream() throws Exception {
  268. // Excel
  269. assertTrue(
  270. ExtractorFactory.createExtractor(new FileInputStream(xls))
  271. instanceof ExcelExtractor
  272. );
  273. assertTrue(
  274. ExtractorFactory.createExtractor(new FileInputStream(xls)).getText().length() > 200
  275. );
  276. assertTrue(
  277. ExtractorFactory.createExtractor(new FileInputStream(xlsx))
  278. instanceof XSSFExcelExtractor
  279. );
  280. assertTrue(
  281. ExtractorFactory.createExtractor(new FileInputStream(xlsx)).getText().length() > 200
  282. );
  283. // TODO Support OOXML-Strict, see bug #57699
  284. // assertTrue(
  285. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
  286. // instanceof XSSFExcelExtractor
  287. // );
  288. // assertTrue(
  289. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
  290. // );
  291. // Word
  292. assertTrue(
  293. ExtractorFactory.createExtractor(new FileInputStream(doc))
  294. instanceof WordExtractor
  295. );
  296. assertTrue(
  297. ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120
  298. );
  299. assertTrue(
  300. ExtractorFactory.createExtractor(new FileInputStream(doc6))
  301. instanceof Word6Extractor
  302. );
  303. assertTrue(
  304. ExtractorFactory.createExtractor(new FileInputStream(doc6)).getText().length() > 20
  305. );
  306. assertTrue(
  307. ExtractorFactory.createExtractor(new FileInputStream(doc95))
  308. instanceof Word6Extractor
  309. );
  310. assertTrue(
  311. ExtractorFactory.createExtractor(new FileInputStream(doc95)).getText().length() > 120
  312. );
  313. assertTrue(
  314. ExtractorFactory.createExtractor(new FileInputStream(docx))
  315. instanceof XWPFWordExtractor
  316. );
  317. assertTrue(
  318. ExtractorFactory.createExtractor(new FileInputStream(docx)).getText().length() > 120
  319. );
  320. // PowerPoint
  321. assertTrue(
  322. ExtractorFactory.createExtractor(new FileInputStream(ppt))
  323. instanceof PowerPointExtractor
  324. );
  325. assertTrue(
  326. ExtractorFactory.createExtractor(new FileInputStream(ppt)).getText().length() > 120
  327. );
  328. assertTrue(
  329. ExtractorFactory.createExtractor(new FileInputStream(pptx))
  330. instanceof XSLFPowerPointExtractor
  331. );
  332. assertTrue(
  333. ExtractorFactory.createExtractor(new FileInputStream(pptx)).getText().length() > 120
  334. );
  335. // Visio
  336. assertTrue(
  337. ExtractorFactory.createExtractor(new FileInputStream(vsd))
  338. instanceof VisioTextExtractor
  339. );
  340. assertTrue(
  341. ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
  342. );
  343. // Visio - vsdx
  344. assertTrue(
  345. ExtractorFactory.createExtractor(new FileInputStream(vsdx))
  346. instanceof XDGFVisioExtractor
  347. );
  348. assertTrue(
  349. ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20
  350. );
  351. // Publisher
  352. assertTrue(
  353. ExtractorFactory.createExtractor(new FileInputStream(pub))
  354. instanceof PublisherTextExtractor
  355. );
  356. assertTrue(
  357. ExtractorFactory.createExtractor(new FileInputStream(pub)).getText().length() > 50
  358. );
  359. // Outlook msg
  360. assertTrue(
  361. ExtractorFactory.createExtractor(new FileInputStream(msg))
  362. instanceof OutlookTextExtactor
  363. );
  364. assertTrue(
  365. ExtractorFactory.createExtractor(new FileInputStream(msg)).getText().length() > 50
  366. );
  367. // Text
  368. try {
  369. FileInputStream stream = new FileInputStream(txt);
  370. try {
  371. ExtractorFactory.createExtractor(stream);
  372. fail();
  373. } finally {
  374. stream.close();
  375. }
  376. } catch(IllegalArgumentException e) {
  377. // Good
  378. }
  379. }
  380. @Test
  381. public void testPOIFS() throws Exception {
  382. // Excel
  383. assertTrue(
  384. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
  385. instanceof ExcelExtractor
  386. );
  387. assertTrue(
  388. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  389. );
  390. // Word
  391. assertTrue(
  392. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
  393. instanceof WordExtractor
  394. );
  395. assertTrue(
  396. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  397. );
  398. assertTrue(
  399. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
  400. instanceof Word6Extractor
  401. );
  402. assertTrue(
  403. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  404. );
  405. assertTrue(
  406. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
  407. instanceof Word6Extractor
  408. );
  409. assertTrue(
  410. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  411. );
  412. // PowerPoint
  413. assertTrue(
  414. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
  415. instanceof PowerPointExtractor
  416. );
  417. assertTrue(
  418. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  419. );
  420. // Visio
  421. assertTrue(
  422. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
  423. instanceof VisioTextExtractor
  424. );
  425. assertTrue(
  426. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  427. );
  428. // Publisher
  429. assertTrue(
  430. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
  431. instanceof PublisherTextExtractor
  432. );
  433. assertTrue(
  434. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  435. );
  436. // Outlook msg
  437. assertTrue(
  438. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
  439. instanceof OutlookTextExtactor
  440. );
  441. assertTrue(
  442. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  443. );
  444. // Text
  445. try {
  446. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
  447. fail();
  448. } catch(IOException e) {
  449. // Good
  450. }
  451. }
  452. @Test
  453. public void testPackage() throws Exception {
  454. // Excel
  455. POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  456. assertTrue(
  457. extractor
  458. instanceof XSSFExcelExtractor
  459. );
  460. extractor.close();
  461. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  462. assertTrue(extractor.getText().length() > 200);
  463. extractor.close();
  464. // Word
  465. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  466. assertTrue(
  467. extractor
  468. instanceof XWPFWordExtractor
  469. );
  470. extractor.close();
  471. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  472. assertTrue(
  473. extractor.getText().length() > 120
  474. );
  475. extractor.close();
  476. // PowerPoint
  477. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  478. assertTrue(
  479. extractor
  480. instanceof XSLFPowerPointExtractor
  481. );
  482. extractor.close();
  483. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  484. assertTrue(
  485. extractor.getText().length() > 120
  486. );
  487. extractor.close();
  488. // Visio
  489. assertTrue(
  490. ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
  491. instanceof XDGFVisioExtractor
  492. );
  493. assertTrue(
  494. extractor.getText().length() > 20
  495. );
  496. // Text
  497. try {
  498. ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
  499. fail();
  500. } catch(InvalidOperationException e) {
  501. // Good
  502. }
  503. }
  504. @Test
  505. public void testPreferEventBased() throws Exception {
  506. assertFalse(ExtractorFactory.getPreferEventExtractor());
  507. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  508. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  509. ExtractorFactory.setThreadPrefersEventExtractors(true);
  510. assertTrue(ExtractorFactory.getPreferEventExtractor());
  511. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  512. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  513. ExtractorFactory.setAllThreadsPreferEventExtractors(false);
  514. assertFalse(ExtractorFactory.getPreferEventExtractor());
  515. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  516. assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
  517. ExtractorFactory.setAllThreadsPreferEventExtractors(null);
  518. assertTrue(ExtractorFactory.getPreferEventExtractor());
  519. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  520. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  521. // Check we get the right extractors now
  522. POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  523. assertTrue(
  524. extractor
  525. instanceof EventBasedExcelExtractor
  526. );
  527. extractor.close();
  528. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  529. assertTrue(
  530. extractor.getText().length() > 200
  531. );
  532. extractor.close();
  533. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  534. assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
  535. extractor.close();
  536. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  537. assertTrue(
  538. extractor.getText().length() > 200
  539. );
  540. extractor.close();
  541. // Put back to normal
  542. ExtractorFactory.setThreadPrefersEventExtractors(false);
  543. assertFalse(ExtractorFactory.getPreferEventExtractor());
  544. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  545. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  546. // And back
  547. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  548. assertTrue(
  549. extractor
  550. instanceof ExcelExtractor
  551. );
  552. extractor.close();
  553. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  554. assertTrue(
  555. extractor.getText().length() > 200
  556. );
  557. extractor.close();
  558. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  559. assertTrue(
  560. extractor
  561. instanceof XSSFExcelExtractor
  562. );
  563. extractor.close();
  564. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  565. assertTrue(
  566. extractor.getText().length() > 200
  567. );
  568. extractor.close();
  569. }
  570. /**
  571. * Test embeded docs text extraction. For now, only
  572. * does poifs embeded, but will do ooxml ones
  573. * at some point.
  574. */
  575. @Test
  576. public void testEmbeded() throws Exception {
  577. POIOLE2TextExtractor ext;
  578. POITextExtractor[] embeds;
  579. // No embedings
  580. ext = (POIOLE2TextExtractor)
  581. ExtractorFactory.createExtractor(xls);
  582. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  583. assertEquals(0, embeds.length);
  584. // Excel
  585. ext = (POIOLE2TextExtractor)
  586. ExtractorFactory.createExtractor(xlsEmb);
  587. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  588. assertEquals(6, embeds.length);
  589. int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
  590. for(int i=0; i<embeds.length; i++) {
  591. assertTrue(embeds[i].getText().length() > 20);
  592. if(embeds[i] instanceof PowerPointExtractor) numPpt++;
  593. else if(embeds[i] instanceof ExcelExtractor) numXls++;
  594. else if(embeds[i] instanceof WordExtractor) numWord++;
  595. else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
  596. }
  597. assertEquals(2, numPpt);
  598. assertEquals(2, numXls);
  599. assertEquals(2, numWord);
  600. assertEquals(0, numMsg);
  601. // Word
  602. ext = (POIOLE2TextExtractor)
  603. ExtractorFactory.createExtractor(docEmb);
  604. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  605. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  606. assertEquals(4, embeds.length);
  607. for(int i=0; i<embeds.length; i++) {
  608. assertTrue(embeds[i].getText().length() > 20);
  609. if(embeds[i] instanceof PowerPointExtractor) numPpt++;
  610. else if(embeds[i] instanceof ExcelExtractor) numXls++;
  611. else if(embeds[i] instanceof WordExtractor) numWord++;
  612. else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
  613. }
  614. assertEquals(1, numPpt);
  615. assertEquals(2, numXls);
  616. assertEquals(1, numWord);
  617. assertEquals(0, numMsg);
  618. // Word which contains an OOXML file
  619. ext = (POIOLE2TextExtractor)
  620. ExtractorFactory.createExtractor(docEmbOOXML);
  621. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  622. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
  623. assertEquals(3, embeds.length);
  624. for(int i=0; i<embeds.length; i++) {
  625. assertTrue(embeds[i].getText().length() > 20);
  626. if(embeds[i] instanceof PowerPointExtractor) numPpt++;
  627. else if(embeds[i] instanceof ExcelExtractor) numXls++;
  628. else if(embeds[i] instanceof WordExtractor) numWord++;
  629. else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
  630. else if(embeds[i] instanceof XWPFWordExtractor) numWordX++;
  631. }
  632. assertEquals(1, numPpt);
  633. assertEquals(1, numXls);
  634. assertEquals(0, numWord);
  635. assertEquals(1, numWordX);
  636. assertEquals(0, numMsg);
  637. // Outlook
  638. ext = (OutlookTextExtactor)
  639. ExtractorFactory.createExtractor(msgEmb);
  640. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  641. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  642. assertEquals(1, embeds.length);
  643. for(int i=0; i<embeds.length; i++) {
  644. assertTrue(embeds[i].getText().length() > 20);
  645. if(embeds[i] instanceof PowerPointExtractor) numPpt++;
  646. else if(embeds[i] instanceof ExcelExtractor) numXls++;
  647. else if(embeds[i] instanceof WordExtractor) numWord++;
  648. else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
  649. }
  650. assertEquals(0, numPpt);
  651. assertEquals(0, numXls);
  652. assertEquals(1, numWord);
  653. assertEquals(0, numMsg);
  654. // Outlook with another outlook file in it
  655. ext = (OutlookTextExtactor)
  656. ExtractorFactory.createExtractor(msgEmbMsg);
  657. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  658. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  659. assertEquals(1, embeds.length);
  660. for(int i=0; i<embeds.length; i++) {
  661. assertTrue(embeds[i].getText().length() > 20);
  662. if(embeds[i] instanceof PowerPointExtractor) numPpt++;
  663. else if(embeds[i] instanceof ExcelExtractor) numXls++;
  664. else if(embeds[i] instanceof WordExtractor) numWord++;
  665. else if(embeds[i] instanceof OutlookTextExtactor) numMsg++;
  666. }
  667. assertEquals(0, numPpt);
  668. assertEquals(0, numXls);
  669. assertEquals(0, numWord);
  670. assertEquals(1, numMsg);
  671. // TODO - PowerPoint
  672. // TODO - Publisher
  673. // TODO - Visio
  674. }
  675. }