You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

TestXSLFPowerPointExtractor.java 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.xslf.extractor;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.apache.poi.POITestCase.assertNotContained;
  18. import static org.apache.poi.POITestCase.assertStartsWith;
  19. import static org.junit.Assert.assertEquals;
  20. import static org.junit.Assert.assertFalse;
  21. import static org.junit.Assert.assertNotNull;
  22. import static org.junit.Assert.assertTrue;
  23. import java.io.File;
  24. import java.io.IOException;
  25. import java.io.InputStream;
  26. import org.apache.poi.POIDataSamples;
  27. import org.apache.poi.extractor.ExtractorFactory;
  28. import org.apache.poi.sl.extractor.SlideShowExtractor;
  29. import org.apache.poi.xslf.usermodel.XMLSlideShow;
  30. import org.apache.poi.xslf.usermodel.XSLFShape;
  31. import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
  32. import org.junit.Test;
  33. /**
  34. * Tests for XSLFPowerPointExtractor
  35. */
  36. public class TestXSLFPowerPointExtractor {
  37. private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  38. /**
  39. * Get text out of the simple file
  40. */
  41. @Test
  42. public void testGetSimpleText() throws IOException {
  43. try (XMLSlideShow xmlA = openPPTX("sample.pptx");
  44. SlideShowExtractor<XSLFShape, XSLFTextParagraph> extractor = new SlideShowExtractor<>(xmlA)) {
  45. extractor.getText();
  46. String text = extractor.getText();
  47. assertTrue(text.length() > 0);
  48. // Check Basics
  49. assertStartsWith(text, "Lorem ipsum dolor sit amet\n");
  50. assertContains(text, "amet\n");
  51. // Our placeholder master text
  52. // This shouldn't show up in the output
  53. // String masterText =
  54. // "Click to edit Master title style\n" +
  55. // "Click to edit Master subtitle style\n" +
  56. // "\n\n\n\n\n\n" +
  57. // "Click to edit Master title style\n" +
  58. // "Click to edit Master text styles\n" +
  59. // "Second level\n" +
  60. // "Third level\n" +
  61. // "Fourth level\n" +
  62. // "Fifth level\n";
  63. // Just slides, no notes
  64. extractor.setSlidesByDefault(true);
  65. extractor.setNotesByDefault(false);
  66. extractor.setMasterByDefault(false);
  67. text = extractor.getText();
  68. String slideText =
  69. "Lorem ipsum dolor sit amet\n" +
  70. "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
  71. "\n" +
  72. "Lorem ipsum dolor sit amet\n" +
  73. "Lorem\n" +
  74. "ipsum\n" +
  75. "dolor\n" +
  76. "sit\n" +
  77. "amet\n" +
  78. "\n";
  79. assertEquals(slideText, text);
  80. // Just notes, no slides
  81. extractor.setSlidesByDefault(false);
  82. extractor.setNotesByDefault(true);
  83. text = extractor.getText();
  84. assertEquals("\n1\n\n2\n", text);
  85. // Both
  86. extractor.setSlidesByDefault(true);
  87. extractor.setNotesByDefault(true);
  88. text = extractor.getText();
  89. String bothText =
  90. "Lorem ipsum dolor sit amet\n" +
  91. "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
  92. "\n\n1\n" +
  93. "Lorem ipsum dolor sit amet\n" +
  94. "Lorem\n" +
  95. "ipsum\n" +
  96. "dolor\n" +
  97. "sit\n" +
  98. "amet\n" +
  99. "\n\n2\n";
  100. assertEquals(bothText, text);
  101. // With Slides and Master Text
  102. extractor.setSlidesByDefault(true);
  103. extractor.setNotesByDefault(false);
  104. extractor.setMasterByDefault(true);
  105. text = extractor.getText();
  106. String smText =
  107. "Lorem ipsum dolor sit amet\n" +
  108. "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
  109. "\n" +
  110. "Lorem ipsum dolor sit amet\n" +
  111. "Lorem\n" +
  112. "ipsum\n" +
  113. "dolor\n" +
  114. "sit\n" +
  115. "amet\n" +
  116. "\n";
  117. assertEquals(smText, text);
  118. // With Slides, Notes and Master Text
  119. extractor.setSlidesByDefault(true);
  120. extractor.setNotesByDefault(true);
  121. extractor.setMasterByDefault(true);
  122. text = extractor.getText();
  123. String snmText =
  124. "Lorem ipsum dolor sit amet\n" +
  125. "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
  126. "\n\n1\n" +
  127. "Lorem ipsum dolor sit amet\n" +
  128. "Lorem\n" +
  129. "ipsum\n" +
  130. "dolor\n" +
  131. "sit\n" +
  132. "amet\n" +
  133. "\n\n2\n";
  134. assertEquals(snmText, text);
  135. // Via set defaults
  136. extractor.setSlidesByDefault(false);
  137. extractor.setNotesByDefault(true);
  138. text = extractor.getText();
  139. assertEquals("\n1\n\n2\n", text);
  140. }
  141. }
  142. @Test
  143. public void testGetComments() throws IOException {
  144. try (XMLSlideShow xml = openPPTX("45545_Comment.pptx");
  145. SlideShowExtractor<XSLFShape, XSLFTextParagraph> extractor = new SlideShowExtractor<>(xml)) {
  146. extractor.setCommentsByDefault(true);
  147. String text = extractor.getText();
  148. assertTrue(text.length() > 0);
  149. // Check comments are there
  150. assertContains(text, "testdoc");
  151. assertContains(text, "test phrase");
  152. // Check the authors came through too
  153. assertContains(text, "XPVMWARE01");
  154. }
  155. }
  156. @Test
  157. public void testGetMasterText() throws Exception {
  158. try (XMLSlideShow xml = openPPTX("WithMaster.pptx");
  159. SlideShowExtractor<XSLFShape, XSLFTextParagraph> extractor = new SlideShowExtractor<>(xml)) {
  160. extractor.setSlidesByDefault(true);
  161. extractor.setNotesByDefault(false);
  162. extractor.setMasterByDefault(true);
  163. String text = extractor.getText();
  164. assertTrue(text.length() > 0);
  165. // Check master text is there
  166. assertContains(text, "Footer from the master slide");
  167. // Theme text shouldn't show up
  168. // String themeText =
  169. // "Theme Master Title\n" +
  170. // "Theme Master first level\n" +
  171. // "And the 2nd level\n" +
  172. // "Our 3rd level goes here\n" +
  173. // "And onto the 4th, such fun....\n" +
  174. // "Finally is the Fifth level\n";
  175. // Check the whole text
  176. String wholeText =
  177. "First page title\n" +
  178. "First page subtitle\n" +
  179. "This text comes from the Master Slide\n" +
  180. "\n" +
  181. "2nd page subtitle\n" +
  182. "Footer from the master slide\n" +
  183. "This text comes from the Master Slide\n";
  184. assertEquals(wholeText, text);
  185. }
  186. }
  187. @Test
  188. public void testTable() throws Exception {
  189. try (XMLSlideShow xml = openPPTX("present1.pptx");
  190. SlideShowExtractor<XSLFShape, XSLFTextParagraph> extractor = new SlideShowExtractor<>(xml)) {
  191. String text = extractor.getText();
  192. assertTrue(text.length() > 0);
  193. // Check comments are there
  194. assertContains(text, "TEST");
  195. }
  196. }
  197. /**
  198. * Test that we can get the text from macro enabled,
  199. * template, theme, slide enabled etc formats, as
  200. * well as from the normal file
  201. */
  202. @Test
  203. public void testDifferentSubformats() throws Exception {
  204. String[] extensions = new String[] {
  205. "pptx", "pptm", "ppsm", "ppsx", "thmx",
  206. // "xps" - Doesn't have a core document
  207. };
  208. for(String extension : extensions) {
  209. String filename = "testPPT." + extension;
  210. try (XMLSlideShow xml = openPPTX(filename);
  211. SlideShowExtractor<XSLFShape, XSLFTextParagraph> extractor = new SlideShowExtractor<>(xml)) {
  212. String text = extractor.getText();
  213. if (extension.equals("thmx")) {
  214. // Theme file doesn't have any textual content
  215. assertEquals(filename, 0, text.length());
  216. continue;
  217. }
  218. assertTrue(filename, text.length() > 0);
  219. assertContains(filename, text, "Attachment Test");
  220. assertContains(filename, text, "This is a test file data with the same content");
  221. assertContains(filename, text, "content parsing");
  222. assertContains(filename, text, "Different words to test against");
  223. assertContains(filename, text, "Mystery");
  224. }
  225. }
  226. }
  227. @Test
  228. public void test45541() throws IOException {
  229. // extract text from a powerpoint that has a header in the notes-element
  230. final File headerFile = slTests.getFile("45541_Header.pptx");
  231. //noinspection rawtypes
  232. try (final SlideShowExtractor extr = (SlideShowExtractor) ExtractorFactory.createExtractor(headerFile)) {
  233. String text = extr.getText();
  234. assertNotNull(text);
  235. assertFalse("Had: " + text, text.contains("testdoc"));
  236. extr.setSlidesByDefault(false);
  237. extr.setNotesByDefault(true);
  238. text = extr.getText();
  239. assertContains(text, "testdoc");
  240. assertNotNull(text);
  241. }
  242. // extract text from a powerpoint that has a footer in the master-slide
  243. final File footerFile = slTests.getFile("45541_Footer.pptx");
  244. //noinspection rawtypes
  245. try (SlideShowExtractor extr = (SlideShowExtractor)ExtractorFactory.createExtractor(footerFile)) {
  246. String text = extr.getText();
  247. assertNotContained(text, "testdoc");
  248. extr.setSlidesByDefault(false);
  249. extr.setNotesByDefault(true);
  250. text = extr.getText();
  251. assertNotContained(text, "testdoc");
  252. extr.setSlidesByDefault(false);
  253. extr.setNotesByDefault(false);
  254. extr.setMasterByDefault(true);
  255. text = extr.getText();
  256. assertNotContained(text, "testdoc");
  257. }
  258. }
  259. @Test
  260. public void bug54570() throws IOException {
  261. try (XMLSlideShow xml = openPPTX("bug54570.pptx");
  262. SlideShowExtractor<XSLFShape, XSLFTextParagraph> extractor = new SlideShowExtractor<>(xml)) {
  263. String text = extractor.getText();
  264. assertNotNull(text);
  265. }
  266. }
  267. private XMLSlideShow openPPTX(String file) throws IOException {
  268. try (InputStream is = slTests.openResourceAsStream(file)) {
  269. return new XMLSlideShow(is);
  270. }
  271. }
  272. @Test
  273. public void setSlTests() throws IOException {
  274. try (XMLSlideShow xml = openPPTX("aascu.org_hbcu_leadershipsummit_cooper_.pptx")) {
  275. SlideShowExtractor<XSLFShape, XSLFTextParagraph> extractor = new SlideShowExtractor<>(xml);
  276. assertNotNull(extractor);
  277. extractor.setSlidesByDefault(true);
  278. extractor.setNotesByDefault(true);
  279. extractor.setMasterByDefault(true);
  280. assertNotNull(extractor.getText());
  281. }
  282. }
  283. }