You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TestExtractor.java 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hslf.extractor;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.apache.poi.POITestCase.assertContainsIgnoreCase;
  18. import static org.apache.poi.POITestCase.assertNotContained;
  19. import static org.junit.Assert.assertEquals;
  20. import static org.junit.Assert.assertFalse;
  21. import static org.junit.Assert.assertNotNull;
  22. import static org.junit.Assert.assertTrue;
  23. import java.io.ByteArrayOutputStream;
  24. import java.io.File;
  25. import java.io.FileInputStream;
  26. import java.io.IOException;
  27. import java.io.InputStream;
  28. import java.util.List;
  29. import org.apache.poi.POIDataSamples;
  30. import org.apache.poi.hslf.usermodel.HSLFObjectShape;
  31. import org.apache.poi.hslf.usermodel.HSLFSlideShow;
  32. import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
  33. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  34. import org.apache.poi.hwpf.HWPFDocument;
  35. import org.apache.poi.poifs.filesystem.DirectoryNode;
  36. import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
  37. import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
  38. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  39. import org.apache.poi.util.IOUtils;
  40. import org.junit.Test;
  41. /**
  42. * Tests that the extractor correctly gets the text out of our sample file
  43. */
  44. public final class TestExtractor {
  45. /**
  46. * Extractor primed on the 2 page basic test data
  47. */
  48. private static final String expectText = "This is a test title\nThis is a test subtitle\nThis is on page 1\nThis is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n";
  49. /**
  50. * Extractor primed on the 1 page but text-box'd test data
  51. */
  52. private static final String expectText2 = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";
  53. /**
  54. * Where our embeded files live
  55. */
  56. private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  57. // @Before
  58. // public void setUp() throws Exception {
  59. // ppe = new PowerPointExtractor(slTests.getFile("basic_test_ppt_file.ppt").getCanonicalPath());
  60. // ppe2 = new PowerPointExtractor(slTests.getFile("with_textbox.ppt").getCanonicalPath());
  61. // }
  62. // @After
  63. // public void closeResources() throws Exception {
  64. // ppe2.close();
  65. // ppe.close();
  66. // }
  67. private PowerPointExtractor openExtractor(String fileName) throws IOException {
  68. InputStream is = slTests.openResourceAsStream(fileName);
  69. try {
  70. return new PowerPointExtractor(is);
  71. } finally {
  72. is.close();
  73. }
  74. }
  75. @Test
  76. public void testReadSheetText() throws IOException {
  77. // Basic 2 page example
  78. PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
  79. assertEquals(expectText, ppe.getText());
  80. ppe.close();
  81. // 1 page example with text boxes
  82. PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
  83. assertEquals(expectText2, ppe2.getText());
  84. ppe2.close();
  85. }
  86. @Test
  87. public void testReadNoteText() throws IOException {
  88. // Basic 2 page example
  89. PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
  90. String notesText = ppe.getNotes();
  91. String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
  92. assertEquals(expText, notesText);
  93. ppe.close();
  94. // Other one doesn't have notes
  95. PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
  96. notesText = ppe2.getNotes();
  97. expText = "";
  98. assertEquals(expText, notesText);
  99. ppe2.close();
  100. }
  101. @Test
  102. public void testReadBoth() throws IOException {
  103. String[] slText = new String[]{
  104. "This is a test title\nThis is a test subtitle\nThis is on page 1\n",
  105. "This is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n"
  106. };
  107. String[] ntText = new String[]{
  108. "\nThese are the notes for page 1\n",
  109. "\nThese are the notes on page two, again lacking formatting\n"
  110. };
  111. PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
  112. ppe.setSlidesByDefault(true);
  113. ppe.setNotesByDefault(false);
  114. assertEquals(slText[0] + slText[1], ppe.getText());
  115. ppe.setSlidesByDefault(false);
  116. ppe.setNotesByDefault(true);
  117. assertEquals(ntText[0] + ntText[1], ppe.getText());
  118. ppe.setSlidesByDefault(true);
  119. ppe.setNotesByDefault(true);
  120. assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
  121. ppe.close();
  122. }
  123. /**
  124. * Test that when presented with a PPT file missing the odd
  125. * core record, we can still get the rest of the text out
  126. *
  127. * @throws Exception
  128. */
  129. @Test
  130. public void testMissingCoreRecords() throws IOException {
  131. PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
  132. String text = ppe.getText(true, false);
  133. String nText = ppe.getNotes();
  134. assertNotNull(text);
  135. assertNotNull(nText);
  136. // Notes record were corrupt, so don't expect any
  137. assertEquals(nText.length(), 0);
  138. // Slide records were fine
  139. assertContains(text, "Using Disease Surveillance and Response");
  140. ppe.close();
  141. }
  142. @Test
  143. public void testExtractFromEmbeded() throws IOException {
  144. InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
  145. POIFSFileSystem fs = new POIFSFileSystem(is);
  146. DirectoryNode root = fs.getRoot();
  147. PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
  148. PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
  149. ppe2.close();
  150. ppe1.close();
  151. fs.close();
  152. }
  153. private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
  154. throws IOException {
  155. DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
  156. assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
  157. // Check the first file
  158. HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
  159. PowerPointExtractor ppe = new PowerPointExtractor(ppt);
  160. assertEquals(expected, ppe.getText(true, false));
  161. return ppe;
  162. }
  163. /**
  164. * A powerpoint file with embeded powerpoint files
  165. */
  166. @Test
  167. public void testExtractFromOwnEmbeded() throws IOException {
  168. PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
  169. List<HSLFObjectShape> shapes = ppe.getOLEShapes();
  170. assertEquals("Expected 6 ole shapes", 6, shapes.size());
  171. int num_ppt = 0, num_doc = 0, num_xls = 0;
  172. for (HSLFObjectShape ole : shapes) {
  173. String name = ole.getInstanceName();
  174. InputStream data = ole.getObjectData().getInputStream();
  175. if ("Worksheet".equals(name)) {
  176. HSSFWorkbook wb = new HSSFWorkbook(data);
  177. num_xls++;
  178. wb.close();
  179. } else if ("Document".equals(name)) {
  180. HWPFDocument doc = new HWPFDocument(data);
  181. num_doc++;
  182. doc.close();
  183. } else if ("Presentation".equals(name)) {
  184. num_ppt++;
  185. HSLFSlideShow ppt = new HSLFSlideShow(data);
  186. ppt.close();
  187. }
  188. data.close();
  189. }
  190. assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
  191. assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
  192. assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
  193. ppe.close();
  194. }
  195. /**
  196. * A powerpoint file with embeded powerpoint files
  197. */
  198. @Test
  199. public void test52991() throws IOException {
  200. PowerPointExtractor ppe = openExtractor("badzip.ppt");
  201. for (HSLFObjectShape shape : ppe.getOLEShapes()) {
  202. IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
  203. }
  204. ppe.close();
  205. }
  206. /**
  207. * From bug #45543
  208. */
  209. @Test
  210. public void testWithComments() throws IOException {
  211. PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
  212. String text = ppe1.getText();
  213. assertFalse("Comments not in by default", text.contains("This is a test comment"));
  214. ppe1.setCommentsByDefault(true);
  215. text = ppe1.getText();
  216. assertContains(text, "This is a test comment");
  217. ppe1.close();
  218. // And another file
  219. PowerPointExtractor ppe2 = openExtractor("45543.ppt");
  220. text = ppe2.getText();
  221. assertFalse("Comments not in by default", text.contains("testdoc"));
  222. ppe2.setCommentsByDefault(true);
  223. text = ppe2.getText();
  224. assertContains(text, "testdoc");
  225. ppe2.close();
  226. }
  227. /**
  228. * From bug #45537
  229. */
  230. @Test
  231. public void testHeaderFooter() throws IOException {
  232. String text;
  233. // With a header on the notes
  234. InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
  235. HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
  236. is1.close();
  237. assertNotNull(ppt1.getNotesHeadersFooters());
  238. assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
  239. PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
  240. text = ppe1.getText();
  241. assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
  242. assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
  243. ppe1.setNotesByDefault(true);
  244. text = ppe1.getText();
  245. assertContains(text, "testdoc");
  246. assertContains(text, "test phrase");
  247. ppe1.close();
  248. ppt1.close();
  249. // And with a footer, also on notes
  250. InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
  251. HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
  252. is2.close();
  253. assertNotNull(ppt2.getNotesHeadersFooters());
  254. assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
  255. ppt2.close();
  256. PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
  257. text = ppe2.getText();
  258. assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
  259. assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
  260. ppe2.setNotesByDefault(true);
  261. text = ppe2.getText();
  262. assertContains(text, "testdoc");
  263. assertContains(text, "test phrase");
  264. ppe2.close();
  265. }
  266. @SuppressWarnings("unused")
  267. @Test
  268. public void testSlideMasterText() throws IOException {
  269. String masterTitleText = "This is the Master Title";
  270. String masterRandomText = "This text comes from the Master Slide";
  271. String masterFooterText = "Footer from the master slide";
  272. PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
  273. ppe.setMasterByDefault(true);
  274. String text = ppe.getText();
  275. assertContains(text, masterRandomText);
  276. assertContains(text, masterFooterText);
  277. ppe.close();
  278. }
  279. @Test
  280. public void testMasterText() throws IOException {
  281. PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
  282. // Initially not there
  283. String text = ppe1.getText();
  284. assertFalse(text.contains("Text that I added to the master slide"));
  285. // Enable, shows up
  286. ppe1.setMasterByDefault(true);
  287. text = ppe1.getText();
  288. assertContains(text, "Text that I added to the master slide");
  289. // Make sure placeholder text does not come out
  290. assertNotContained(text, "Click to edit Master");
  291. ppe1.close();
  292. // Now with another file only containing master text
  293. // Will always show up
  294. PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
  295. String masterText = "Footer from the master slide";
  296. text = ppe2.getText();
  297. assertContainsIgnoreCase(text, "master");
  298. assertContains(text, masterText);
  299. ppe2.close();
  300. }
  301. /**
  302. * Bug #54880 Chinese text not extracted properly
  303. */
  304. @Test
  305. public void testChineseText() throws IOException {
  306. PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
  307. String text = ppe.getText();
  308. // Check for the english text line
  309. assertContains(text, "Single byte");
  310. // Check for the english text in the mixed line
  311. assertContains(text, "Mix");
  312. // Check for the chinese text in the mixed line
  313. assertContains(text, "\u8868");
  314. // Check for the chinese only text line
  315. assertContains(text, "\uff8a\uff9d\uff76\uff78");
  316. ppe.close();
  317. }
  318. /**
  319. * Tests that we can work with both {@link POIFSFileSystem}
  320. * and {@link NPOIFSFileSystem}
  321. */
  322. @SuppressWarnings("resource")
  323. @Test
  324. public void testDifferentPOIFS() throws IOException {
  325. // Open the two filesystems
  326. File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
  327. InputStream is1 = new FileInputStream(pptFile);
  328. OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
  329. is1.close();
  330. NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
  331. DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
  332. // Open directly
  333. for (DirectoryNode dir : files) {
  334. PowerPointExtractor extractor = new PowerPointExtractor(dir);
  335. assertEquals(expectText, extractor.getText());
  336. }
  337. // Open via a HSLFSlideShow
  338. for (DirectoryNode dir : files) {
  339. HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
  340. PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
  341. assertEquals(expectText, extractor.getText());
  342. extractor.close();
  343. slideshow.close();
  344. }
  345. npoifs.close();
  346. }
  347. @Test
  348. public void testTable() throws Exception {
  349. PowerPointExtractor ppe1 = openExtractor("54111.ppt");
  350. String text1 = ppe1.getText();
  351. String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
  352. "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
  353. "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
  354. "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
  355. "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
  356. "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
  357. assertContains(text1, target1);
  358. ppe1.close();
  359. PowerPointExtractor ppe2 = openExtractor("54722.ppt");
  360. String text2 = ppe2.getText();
  361. String target2 = "this\tText\tis\twithin\ta\n" +
  362. "table\t1\t2\t3\t4";
  363. assertContains(text2, target2);
  364. ppe2.close();
  365. }
  366. // bug 60003
  367. @Test
  368. public void testExtractMasterSlideFooterText() throws Exception {
  369. PowerPointExtractor ppe = openExtractor("60003.ppt");
  370. ppe.setMasterByDefault(true);
  371. String text = ppe.getText();
  372. assertContains(text, "Prague");
  373. ppe.close();
  374. }
  375. @Test
  376. public void testExtractGroupedShapeText() throws Exception {
  377. try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
  378. final String text = ppe.getText();
  379. //this tests that we're ignoring text shapes at depth=0
  380. //i.e. POI has already included them in the slide's getTextParagraphs()
  381. assertContains(text, "Text box1");
  382. assertEquals(1, countMatches(text,"Text box1"));
  383. //the WordArt and text box count tests will fail
  384. //if this content is available via getTextParagraphs() of the slide in POI
  385. //i.e. when POI is fixed, these tests will fail, and
  386. //we'll have to remove the workaround in HSLFExtractor's extractGroupText(...)
  387. assertEquals(1, countMatches(text,"WordArt1"));
  388. assertEquals(1, countMatches(text,"WordArt2"));
  389. assertEquals(1, countMatches(text,"Ungrouped text box"));//should only be 1
  390. assertContains(text, "Text box2");
  391. assertContains(text, "Text box3");
  392. assertContains(text, "Text box4");
  393. assertContains(text, "Text box5");
  394. //see below -- need to extract hyperlinks
  395. assertContains(text, "tika");
  396. assertContains(text, "MyTitle");
  397. }
  398. }
  399. private static int countMatches(final String base, final String find) {
  400. return base.split(find).length-1;
  401. }
  402. }