您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

TestExtractorFactory.java 37KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.junit.Assert.assertEquals;
  18. import static org.junit.Assert.assertFalse;
  19. import static org.junit.Assert.assertNotNull;
  20. import static org.junit.Assert.assertNull;
  21. import static org.junit.Assert.assertTrue;
  22. import static org.junit.Assert.fail;
  23. import java.io.File;
  24. import java.io.FileInputStream;
  25. import java.io.IOException;
  26. import org.apache.poi.POIDataSamples;
  27. import org.apache.poi.POIOLE2TextExtractor;
  28. import org.apache.poi.POITextExtractor;
  29. import org.apache.poi.POIXMLException;
  30. import org.apache.poi.POIXMLTextExtractor;
  31. import org.apache.poi.UnsupportedFileFormatException;
  32. import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  33. import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
  34. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  35. import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
  36. import org.apache.poi.hssf.HSSFTestDataSamples;
  37. import org.apache.poi.hssf.OldExcelFormatException;
  38. import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
  39. import org.apache.poi.hssf.extractor.ExcelExtractor;
  40. import org.apache.poi.hwpf.extractor.Word6Extractor;
  41. import org.apache.poi.hwpf.extractor.WordExtractor;
  42. import org.apache.poi.openxml4j.opc.OPCPackage;
  43. import org.apache.poi.openxml4j.opc.PackageAccess;
  44. import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
  45. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  46. import org.apache.poi.util.IOUtils;
  47. import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
  48. import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
  49. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  50. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  51. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  52. import org.junit.BeforeClass;
  53. import org.junit.Test;
  54. /**
  55. * Test that the extractor factory plays nicely
  56. */
  57. public class TestExtractorFactory {
  58. private static File txt;
  59. private static File xls;
  60. private static File xlsx;
  61. private static File xlsxStrict;
  62. private static File xltx;
  63. private static File xlsEmb;
  64. private static File doc;
  65. private static File doc6;
  66. private static File doc95;
  67. private static File docx;
  68. private static File dotx;
  69. private static File docEmb;
  70. private static File docEmbOOXML;
  71. private static File ppt;
  72. private static File pptx;
  73. private static File msg;
  74. private static File msgEmb;
  75. private static File msgEmbMsg;
  76. private static File vsd;
  77. private static File vsdx;
  78. private static File pub;
  79. private static File getFileAndCheck(POIDataSamples samples, String name) {
  80. File file = samples.getFile(name);
  81. assertNotNull("Did not get a file for " + name, file);
  82. assertTrue("Did not get a type file for " + name, file.isFile());
  83. assertTrue("File did not exist: " + name, file.exists());
  84. return file;
  85. }
  86. @BeforeClass
  87. public static void setUp() throws Exception {
  88. POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
  89. xls = getFileAndCheck(ssTests, "SampleSS.xls");
  90. xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
  91. xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
  92. xltx = getFileAndCheck(ssTests, "test.xltx");
  93. xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
  94. POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
  95. doc = getFileAndCheck(wpTests, "SampleDoc.doc");
  96. doc6 = getFileAndCheck(wpTests, "Word6.doc");
  97. doc95 = getFileAndCheck(wpTests, "Word95.doc");
  98. docx = getFileAndCheck(wpTests, "SampleDoc.docx");
  99. dotx = getFileAndCheck(wpTests, "test.dotx");
  100. docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
  101. docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
  102. POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  103. ppt = getFileAndCheck(slTests, "SampleShow.ppt");
  104. pptx = getFileAndCheck(slTests, "SampleShow.pptx");
  105. txt = getFileAndCheck(slTests, "SampleShow.txt");
  106. POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
  107. vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
  108. vsdx = getFileAndCheck(dgTests, "test.vsdx");
  109. POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
  110. pub = getFileAndCheck(pubTests, "Simple.pub");
  111. POIDataSamples olTests = POIDataSamples.getHSMFInstance();
  112. msg = getFileAndCheck(olTests, "quick.msg");
  113. msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
  114. msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
  115. }
  116. @Test
  117. public void testFile() throws Exception {
  118. // Excel
  119. POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
  120. assertNotNull("Had empty extractor for " + xls, xlsExtractor);
  121. assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
  122. xlsExtractor
  123. instanceof ExcelExtractor
  124. );
  125. assertTrue(
  126. xlsExtractor.getText().length() > 200
  127. );
  128. xlsExtractor.close();
  129. POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
  130. assertTrue(
  131. extractor.getClass().getName(),
  132. extractor
  133. instanceof XSSFExcelExtractor
  134. );
  135. extractor.close();
  136. extractor = ExtractorFactory.createExtractor(xlsx);
  137. assertTrue(
  138. extractor.getText().length() > 200
  139. );
  140. extractor.close();
  141. extractor = ExtractorFactory.createExtractor(xltx);
  142. assertTrue(
  143. extractor.getClass().getName(),
  144. extractor
  145. instanceof XSSFExcelExtractor
  146. );
  147. extractor.close();
  148. extractor = ExtractorFactory.createExtractor(xltx);
  149. assertTrue(
  150. extractor.getText().contains("test")
  151. );
  152. extractor.close();
  153. // TODO Support OOXML-Strict, see bug #57699
  154. try {
  155. /*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
  156. fail("OOXML-Strict isn't yet supported");
  157. } catch (POIXMLException e) {
  158. // Expected, for now
  159. }
  160. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  161. // assertTrue(
  162. // extractor
  163. // instanceof XSSFExcelExtractor
  164. // );
  165. // extractor.close();
  166. //
  167. // extractor = ExtractorFactory.createExtractor(xlsxStrict);
  168. // assertTrue(
  169. // extractor.getText().contains("test")
  170. // );
  171. // extractor.close();
  172. // Word
  173. extractor = ExtractorFactory.createExtractor(doc);
  174. assertTrue(
  175. extractor
  176. instanceof WordExtractor
  177. );
  178. assertTrue(
  179. extractor.getText().length() > 120
  180. );
  181. extractor.close();
  182. extractor = ExtractorFactory.createExtractor(doc6);
  183. assertTrue(
  184. extractor
  185. instanceof Word6Extractor
  186. );
  187. assertTrue(
  188. extractor.getText().length() > 20
  189. );
  190. extractor.close();
  191. extractor = ExtractorFactory.createExtractor(doc95);
  192. assertTrue(
  193. extractor
  194. instanceof Word6Extractor
  195. );
  196. assertTrue(
  197. extractor.getText().length() > 120
  198. );
  199. extractor.close();
  200. extractor = ExtractorFactory.createExtractor(docx);
  201. assertTrue(
  202. extractor instanceof XWPFWordExtractor
  203. );
  204. extractor.close();
  205. extractor = ExtractorFactory.createExtractor(docx);
  206. assertTrue(
  207. extractor.getText().length() > 120
  208. );
  209. extractor.close();
  210. extractor = ExtractorFactory.createExtractor(dotx);
  211. assertTrue(
  212. extractor instanceof XWPFWordExtractor
  213. );
  214. extractor.close();
  215. extractor = ExtractorFactory.createExtractor(dotx);
  216. assertTrue(
  217. extractor.getText().contains("Test")
  218. );
  219. extractor.close();
  220. // PowerPoint (PPT)
  221. extractor = ExtractorFactory.createExtractor(ppt);
  222. assertTrue(
  223. extractor
  224. instanceof PowerPointExtractor
  225. );
  226. assertTrue(
  227. extractor.getText().length() > 120
  228. );
  229. extractor.close();
  230. // PowerPoint (PPTX)
  231. extractor = ExtractorFactory.createExtractor(pptx);
  232. assertTrue(
  233. extractor
  234. instanceof XSLFPowerPointExtractor
  235. );
  236. assertTrue(
  237. extractor.getText().length() > 120
  238. );
  239. extractor.close();
  240. // Visio - binary
  241. extractor = ExtractorFactory.createExtractor(vsd);
  242. assertTrue(
  243. extractor
  244. instanceof VisioTextExtractor
  245. );
  246. assertTrue(
  247. extractor.getText().length() > 50
  248. );
  249. extractor.close();
  250. // Visio - vsdx
  251. extractor = ExtractorFactory.createExtractor(vsdx);
  252. assertTrue(
  253. extractor
  254. instanceof XDGFVisioExtractor
  255. );
  256. assertTrue(
  257. extractor.getText().length() > 20
  258. );
  259. extractor.close();
  260. // Publisher
  261. extractor = ExtractorFactory.createExtractor(pub);
  262. assertTrue(
  263. extractor
  264. instanceof PublisherTextExtractor
  265. );
  266. assertTrue(
  267. extractor.getText().length() > 50
  268. );
  269. extractor.close();
  270. // Outlook msg
  271. extractor = ExtractorFactory.createExtractor(msg);
  272. assertTrue(
  273. extractor
  274. instanceof OutlookTextExtactor
  275. );
  276. assertTrue(
  277. extractor.getText().length() > 50
  278. );
  279. extractor.close();
  280. // Text
  281. try {
  282. ExtractorFactory.createExtractor(txt);
  283. fail();
  284. } catch(IllegalArgumentException e) {
  285. // Good
  286. }
  287. }
  288. @Test
  289. public void testInputStream() throws Exception {
  290. // Excel
  291. POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
  292. assertTrue(
  293. extractor
  294. instanceof ExcelExtractor
  295. );
  296. assertTrue(
  297. extractor.getText().length() > 200
  298. );
  299. extractor.close();
  300. extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
  301. assertTrue(
  302. extractor.getClass().getName(),
  303. extractor
  304. instanceof XSSFExcelExtractor
  305. );
  306. assertTrue(
  307. extractor.getText().length() > 200
  308. );
  309. // TODO Support OOXML-Strict, see bug #57699
  310. // assertTrue(
  311. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
  312. // instanceof XSSFExcelExtractor
  313. // );
  314. // assertTrue(
  315. // ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
  316. // );
  317. extractor.close();
  318. // Word
  319. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
  320. assertTrue(
  321. extractor.getClass().getName(),
  322. extractor
  323. instanceof WordExtractor
  324. );
  325. assertTrue(
  326. extractor.getText().length() > 120
  327. );
  328. extractor.close();
  329. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
  330. assertTrue(
  331. extractor.getClass().getName(),
  332. extractor
  333. instanceof Word6Extractor
  334. );
  335. assertTrue(
  336. extractor.getText().length() > 20
  337. );
  338. extractor.close();
  339. extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
  340. assertTrue(
  341. extractor.getClass().getName(),
  342. extractor
  343. instanceof Word6Extractor
  344. );
  345. assertTrue(
  346. extractor.getText().length() > 120
  347. );
  348. extractor.close();
  349. extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
  350. assertTrue(
  351. extractor
  352. instanceof XWPFWordExtractor
  353. );
  354. assertTrue(
  355. extractor.getText().length() > 120
  356. );
  357. extractor.close();
  358. // PowerPoint
  359. extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
  360. assertTrue(
  361. extractor
  362. instanceof PowerPointExtractor
  363. );
  364. assertTrue(
  365. extractor.getText().length() > 120
  366. );
  367. extractor.close();
  368. extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
  369. assertTrue(
  370. extractor
  371. instanceof XSLFPowerPointExtractor
  372. );
  373. assertTrue(
  374. extractor.getText().length() > 120
  375. );
  376. extractor.close();
  377. // Visio
  378. extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
  379. assertTrue(
  380. extractor
  381. instanceof VisioTextExtractor
  382. );
  383. assertTrue(
  384. extractor.getText().length() > 50
  385. );
  386. extractor.close();
  387. // Visio - vsdx
  388. extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
  389. assertTrue(
  390. extractor
  391. instanceof XDGFVisioExtractor
  392. );
  393. assertTrue(
  394. extractor.getText().length() > 20
  395. );
  396. extractor.close();
  397. // Publisher
  398. extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
  399. assertTrue(
  400. extractor
  401. instanceof PublisherTextExtractor
  402. );
  403. assertTrue(
  404. extractor.getText().length() > 50
  405. );
  406. extractor.close();
  407. // Outlook msg
  408. extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
  409. assertTrue(
  410. extractor
  411. instanceof OutlookTextExtactor
  412. );
  413. assertTrue(
  414. extractor.getText().length() > 50
  415. );
  416. extractor.close();
  417. // Text
  418. try {
  419. FileInputStream stream = new FileInputStream(txt);
  420. try {
  421. ExtractorFactory.createExtractor(stream);
  422. fail();
  423. } finally {
  424. IOUtils.closeQuietly(stream);
  425. }
  426. } catch(IllegalArgumentException e) {
  427. // Good
  428. }
  429. }
  430. @Test
  431. public void testPOIFS() throws Exception {
  432. // Excel
  433. assertTrue(
  434. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
  435. instanceof ExcelExtractor
  436. );
  437. assertTrue(
  438. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  439. );
  440. // Word
  441. assertTrue(
  442. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
  443. instanceof WordExtractor
  444. );
  445. assertTrue(
  446. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  447. );
  448. assertTrue(
  449. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
  450. instanceof Word6Extractor
  451. );
  452. assertTrue(
  453. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  454. );
  455. assertTrue(
  456. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
  457. instanceof Word6Extractor
  458. );
  459. assertTrue(
  460. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  461. );
  462. // PowerPoint
  463. assertTrue(
  464. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
  465. instanceof PowerPointExtractor
  466. );
  467. assertTrue(
  468. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  469. );
  470. // Visio
  471. assertTrue(
  472. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
  473. instanceof VisioTextExtractor
  474. );
  475. assertTrue(
  476. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  477. );
  478. // Publisher
  479. assertTrue(
  480. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
  481. instanceof PublisherTextExtractor
  482. );
  483. assertTrue(
  484. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  485. );
  486. // Outlook msg
  487. assertTrue(
  488. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
  489. instanceof OutlookTextExtactor
  490. );
  491. assertTrue(
  492. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  493. );
  494. // Text
  495. try {
  496. ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
  497. fail();
  498. } catch(IOException e) {
  499. // Good
  500. }
  501. }
  502. @Test
  503. public void testOPOIFS() throws Exception {
  504. // Excel
  505. assertTrue(
  506. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
  507. instanceof ExcelExtractor
  508. );
  509. assertTrue(
  510. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
  511. );
  512. // Word
  513. assertTrue(
  514. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
  515. instanceof WordExtractor
  516. );
  517. assertTrue(
  518. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
  519. );
  520. assertTrue(
  521. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
  522. instanceof Word6Extractor
  523. );
  524. assertTrue(
  525. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
  526. );
  527. assertTrue(
  528. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
  529. instanceof Word6Extractor
  530. );
  531. assertTrue(
  532. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
  533. );
  534. // PowerPoint
  535. assertTrue(
  536. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
  537. instanceof PowerPointExtractor
  538. );
  539. assertTrue(
  540. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
  541. );
  542. // Visio
  543. assertTrue(
  544. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
  545. instanceof VisioTextExtractor
  546. );
  547. assertTrue(
  548. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
  549. );
  550. // Publisher
  551. assertTrue(
  552. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
  553. instanceof PublisherTextExtractor
  554. );
  555. assertTrue(
  556. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
  557. );
  558. // Outlook msg
  559. assertTrue(
  560. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
  561. instanceof OutlookTextExtactor
  562. );
  563. assertTrue(
  564. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
  565. );
  566. // Text
  567. try {
  568. ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
  569. fail();
  570. } catch(IOException e) {
  571. // Good
  572. }
  573. }
  574. @Test
  575. public void testPackage() throws Exception {
  576. // Excel
  577. POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  578. assertTrue(extractor instanceof XSSFExcelExtractor);
  579. extractor.close();
  580. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  581. assertTrue(extractor.getText().length() > 200);
  582. extractor.close();
  583. // Word
  584. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  585. assertTrue(extractor instanceof XWPFWordExtractor);
  586. extractor.close();
  587. extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
  588. assertTrue(extractor.getText().length() > 120);
  589. extractor.close();
  590. // PowerPoint
  591. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  592. assertTrue(extractor instanceof XSLFPowerPointExtractor);
  593. extractor.close();
  594. extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
  595. assertTrue(extractor.getText().length() > 120);
  596. extractor.close();
  597. // Visio
  598. extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
  599. assertTrue(extractor instanceof XDGFVisioExtractor);
  600. assertTrue(extractor.getText().length() > 20);
  601. extractor.close();
  602. // Text
  603. try {
  604. ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
  605. fail("TestExtractorFactory.testPackage() failed on " + txt.toString());
  606. } catch(UnsupportedFileFormatException e) {
  607. // Good
  608. } catch (Exception e) {
  609. System.out.println("TestExtractorFactory.testPackage() failed on " + txt.toString());
  610. throw e;
  611. }
  612. }
  613. @Test
  614. public void testPreferEventBased() throws Exception {
  615. assertFalse(ExtractorFactory.getPreferEventExtractor());
  616. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  617. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  618. ExtractorFactory.setThreadPrefersEventExtractors(true);
  619. assertTrue(ExtractorFactory.getPreferEventExtractor());
  620. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  621. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  622. ExtractorFactory.setAllThreadsPreferEventExtractors(false);
  623. assertFalse(ExtractorFactory.getPreferEventExtractor());
  624. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  625. assertEquals(Boolean.FALSE, ExtractorFactory.getAllThreadsPreferEventExtractors());
  626. ExtractorFactory.setAllThreadsPreferEventExtractors(null);
  627. assertTrue(ExtractorFactory.getPreferEventExtractor());
  628. assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
  629. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  630. // Check we get the right extractors now
  631. POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  632. assertTrue(
  633. extractor
  634. instanceof EventBasedExcelExtractor
  635. );
  636. extractor.close();
  637. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  638. assertTrue(
  639. extractor.getText().length() > 200
  640. );
  641. extractor.close();
  642. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  643. assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
  644. extractor.close();
  645. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  646. assertTrue(
  647. extractor.getText().length() > 200
  648. );
  649. extractor.close();
  650. // Put back to normal
  651. ExtractorFactory.setThreadPrefersEventExtractors(false);
  652. assertFalse(ExtractorFactory.getPreferEventExtractor());
  653. assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
  654. assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
  655. // And back
  656. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  657. assertTrue(
  658. extractor
  659. instanceof ExcelExtractor
  660. );
  661. extractor.close();
  662. extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
  663. assertTrue(
  664. extractor.getText().length() > 200
  665. );
  666. extractor.close();
  667. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
  668. assertTrue(
  669. extractor
  670. instanceof XSSFExcelExtractor
  671. );
  672. extractor.close();
  673. extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
  674. assertTrue(
  675. extractor.getText().length() > 200
  676. );
  677. extractor.close();
  678. }
  679. /**
  680. * Test embeded docs text extraction. For now, only
  681. * does poifs embeded, but will do ooxml ones
  682. * at some point.
  683. */
  684. @Test
  685. public void testEmbeded() throws Exception {
  686. POIOLE2TextExtractor ext;
  687. POITextExtractor[] embeds;
  688. // No embedings
  689. ext = (POIOLE2TextExtractor)
  690. ExtractorFactory.createExtractor(xls);
  691. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  692. assertEquals(0, embeds.length);
  693. ext.close();
  694. // Excel
  695. ext = (POIOLE2TextExtractor)
  696. ExtractorFactory.createExtractor(xlsEmb);
  697. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  698. assertEquals(6, embeds.length);
  699. int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
  700. for (POITextExtractor embed : embeds) {
  701. assertTrue(embed.getText().length() > 20);
  702. if (embed instanceof PowerPointExtractor) numPpt++;
  703. else if (embed instanceof ExcelExtractor) numXls++;
  704. else if (embed instanceof WordExtractor) numWord++;
  705. else if (embed instanceof OutlookTextExtactor) numMsg++;
  706. }
  707. assertEquals(2, numPpt);
  708. assertEquals(2, numXls);
  709. assertEquals(2, numWord);
  710. assertEquals(0, numMsg);
  711. ext.close();
  712. // Word
  713. ext = (POIOLE2TextExtractor)
  714. ExtractorFactory.createExtractor(docEmb);
  715. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  716. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  717. assertEquals(4, embeds.length);
  718. for (POITextExtractor embed : embeds) {
  719. assertTrue(embed.getText().length() > 20);
  720. if (embed instanceof PowerPointExtractor) numPpt++;
  721. else if (embed instanceof ExcelExtractor) numXls++;
  722. else if (embed instanceof WordExtractor) numWord++;
  723. else if (embed instanceof OutlookTextExtactor) numMsg++;
  724. }
  725. assertEquals(1, numPpt);
  726. assertEquals(2, numXls);
  727. assertEquals(1, numWord);
  728. assertEquals(0, numMsg);
  729. ext.close();
  730. // Word which contains an OOXML file
  731. ext = (POIOLE2TextExtractor)
  732. ExtractorFactory.createExtractor(docEmbOOXML);
  733. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  734. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
  735. assertEquals(3, embeds.length);
  736. for (POITextExtractor embed : embeds) {
  737. assertTrue(embed.getText().length() > 20);
  738. if (embed instanceof PowerPointExtractor) numPpt++;
  739. else if (embed instanceof ExcelExtractor) numXls++;
  740. else if (embed instanceof WordExtractor) numWord++;
  741. else if (embed instanceof OutlookTextExtactor) numMsg++;
  742. else if (embed instanceof XWPFWordExtractor) numWordX++;
  743. }
  744. assertEquals(1, numPpt);
  745. assertEquals(1, numXls);
  746. assertEquals(0, numWord);
  747. assertEquals(1, numWordX);
  748. assertEquals(0, numMsg);
  749. ext.close();
  750. // Outlook
  751. ext = (OutlookTextExtactor)
  752. ExtractorFactory.createExtractor(msgEmb);
  753. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  754. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  755. assertEquals(1, embeds.length);
  756. for (POITextExtractor embed : embeds) {
  757. assertTrue(embed.getText().length() > 20);
  758. if (embed instanceof PowerPointExtractor) numPpt++;
  759. else if (embed instanceof ExcelExtractor) numXls++;
  760. else if (embed instanceof WordExtractor) numWord++;
  761. else if (embed instanceof OutlookTextExtactor) numMsg++;
  762. }
  763. assertEquals(0, numPpt);
  764. assertEquals(0, numXls);
  765. assertEquals(1, numWord);
  766. assertEquals(0, numMsg);
  767. ext.close();
  768. // Outlook with another outlook file in it
  769. ext = (OutlookTextExtactor)
  770. ExtractorFactory.createExtractor(msgEmbMsg);
  771. embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
  772. numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
  773. assertEquals(1, embeds.length);
  774. for (POITextExtractor embed : embeds) {
  775. assertTrue(embed.getText().length() > 20);
  776. if (embed instanceof PowerPointExtractor) numPpt++;
  777. else if (embed instanceof ExcelExtractor) numXls++;
  778. else if (embed instanceof WordExtractor) numWord++;
  779. else if (embed instanceof OutlookTextExtactor) numMsg++;
  780. }
  781. assertEquals(0, numPpt);
  782. assertEquals(0, numXls);
  783. assertEquals(0, numWord);
  784. assertEquals(1, numMsg);
  785. ext.close();
  786. // TODO - PowerPoint
  787. // TODO - Publisher
  788. // TODO - Visio
  789. }
  790. private static final String[] EXPECTED_FAILURES = new String[] {
  791. // password protected files
  792. "spreadsheet/password.xls",
  793. "spreadsheet/protected_passtika.xlsx",
  794. "spreadsheet/51832.xls",
  795. "document/PasswordProtected.doc",
  796. "slideshow/Password_Protected-hello.ppt",
  797. "slideshow/Password_Protected-56-hello.ppt",
  798. "slideshow/Password_Protected-np-hello.ppt",
  799. "slideshow/cryptoapi-proc2356.ppt",
  800. //"document/bug53475-password-is-pass.docx",
  801. //"document/bug53475-password-is-solrcell.docx",
  802. "spreadsheet/xor-encryption-abc.xls",
  803. "spreadsheet/35897-type4.xls",
  804. //"poifs/protect.xlsx",
  805. //"poifs/protected_sha512.xlsx",
  806. //"poifs/extenxls_pwd123.xlsx",
  807. //"poifs/protected_agile.docx",
  808. "spreadsheet/58616.xlsx",
  809. // TODO: fails XMLExportTest, is this ok?
  810. "spreadsheet/CustomXMLMapping-singleattributenamespace.xlsx",
  811. "spreadsheet/55864.xlsx",
  812. "spreadsheet/57890.xlsx",
  813. // TODO: these fail now with some NPE/file read error because we now try to compute every value via Cell.toString()!
  814. "spreadsheet/44958.xls",
  815. "spreadsheet/44958_1.xls",
  816. "spreadsheet/testArraysAndTables.xls",
  817. // TODO: good to ignore?
  818. "spreadsheet/sample-beta.xlsx",
  819. // This is actually a spreadsheet!
  820. "hpsf/TestRobert_Flaherty.doc",
  821. // some files that are broken, eg Word 95, ...
  822. "spreadsheet/43493.xls",
  823. "spreadsheet/46904.xls",
  824. "document/Bug50955.doc",
  825. "slideshow/PPT95.ppt",
  826. "openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx",
  827. "openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx",
  828. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_NotPresentFAIL.docx",
  829. "openxml4j/OPCCompliance_CoreProperties_LimitedXSITypeAttribute_PresentWithUnauthorizedValueFAIL.docx",
  830. "openxml4j/OPCCompliance_CoreProperties_OnlyOneCorePropertiesPartFAIL.docx",
  831. "openxml4j/OPCCompliance_CoreProperties_UnauthorizedXMLLangAttributeFAIL.docx",
  832. "openxml4j/OPCCompliance_DerivedPartNameFAIL.docx",
  833. "openxml4j/invalid.xlsx",
  834. "spreadsheet/54764-2.xlsx", // see TestXSSFBugs.bug54764()
  835. "spreadsheet/54764.xlsx", // see TestXSSFBugs.bug54764()
  836. "spreadsheet/Simple.xlsb",
  837. "poifs/unknown_properties.msg", // POIFS properties corrupted
  838. "poifs/only-zero-byte-streams.ole2", // No actual contents
  839. "spreadsheet/poc-xmlbomb.xlsx", // contains xml-entity-expansion
  840. "spreadsheet/poc-shared-strings.xlsx", // contains shared-string-entity-expansion
  841. // old Excel files, which we only support simple text extraction of
  842. "spreadsheet/testEXCEL_2.xls",
  843. "spreadsheet/testEXCEL_3.xls",
  844. "spreadsheet/testEXCEL_4.xls",
  845. "spreadsheet/testEXCEL_5.xls",
  846. "spreadsheet/testEXCEL_95.xls",
  847. // OOXML Strict is not yet supported, see bug #57699
  848. "spreadsheet/SampleSS.strict.xlsx",
  849. "spreadsheet/SimpleStrict.xlsx",
  850. "spreadsheet/sample.strict.xlsx",
  851. // non-TNEF files
  852. "ddf/Container.dat",
  853. "ddf/47143.dat",
  854. // sheet cloning errors
  855. "spreadsheet/47813.xlsx",
  856. "spreadsheet/56450.xls",
  857. "spreadsheet/57231_MixedGasReport.xls",
  858. "spreadsheet/OddStyleRecord.xls",
  859. "spreadsheet/WithChartSheet.xlsx",
  860. "spreadsheet/chart_sheet.xlsx",
  861. };
  862. @Test
  863. public void testFileLeak() throws Exception {
  864. // run a number of files that might fail in order to catch
  865. // leaked file resources when using file-leak-detector while
  866. // running the test
  867. for(String file : EXPECTED_FAILURES) {
  868. try {
  869. ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile(file));
  870. } catch (Exception e) {
  871. // catch all exceptions here as we are only interested in file-handle leaks
  872. }
  873. }
  874. }
  875. /**
  876. * #59074 - Excel 95 files should give a helpful message, not just
  877. * "No supported documents found in the OLE2 stream"
  878. */
  879. @Test
  880. public void bug59074() throws Exception {
  881. try {
  882. ExtractorFactory.createExtractor(
  883. POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
  884. fail("Old excel formats not supported via ExtractorFactory");
  885. } catch (OldExcelFormatException e) {
  886. // expected here
  887. }
  888. }
  889. @Test
  890. public void testGetEmbeddedFromXMLExtractor() {
  891. try {
  892. // currently not implemented
  893. ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor)null);
  894. fail("Unsupported currently");
  895. } catch (IllegalStateException e) {
  896. // expected here
  897. }
  898. }
  899. // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
  900. // When this happens, change this from @Test(expected=...) to @Test
  901. @Test(expected=AssertionError.class)
  902. public void test45565() throws Exception {
  903. POITextExtractor extractor = ExtractorFactory.createExtractor(HSSFTestDataSamples.getSampleFile("45565.xls"));
  904. String text = extractor.getText();
  905. assertContains(text, "testdoc");
  906. assertContains(text, "test phrase");
  907. extractor.close();
  908. }
  909. }