Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

TestExtractor.java 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hslf.extractor;
  16. import java.io.ByteArrayOutputStream;
  17. import java.io.InputStream;
  18. import java.util.List;
  19. import junit.framework.TestCase;
  20. import org.apache.poi.POIDataSamples;
  21. import org.apache.poi.POITestCase;
  22. import org.apache.poi.hslf.HSLFSlideShow;
  23. import org.apache.poi.hslf.model.OLEShape;
  24. import org.apache.poi.hslf.usermodel.SlideShow;
  25. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  26. import org.apache.poi.hwpf.HWPFDocument;
  27. import org.apache.poi.poifs.filesystem.DirectoryNode;
  28. import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
  29. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  30. import org.apache.poi.util.IOUtils;
  31. /**
  32. * Tests that the extractor correctly gets the text out of our sample file
  33. */
  34. public final class TestExtractor extends POITestCase {
  35. /** Extractor primed on the 2 page basic test data */
  36. private PowerPointExtractor ppe;
  37. private static final String expectText = "This is a test title\nThis is a test subtitle\nThis is on page 1\nThis is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n";
  38. /** Extractor primed on the 1 page but text-box'd test data */
  39. private PowerPointExtractor ppe2;
  40. private static final String expectText2 = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";
  41. /** Where our embeded files live */
  42. //private String pdirname;
  43. private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
  44. //private String pdirname;
  45. protected void setUp() throws Exception {
  46. ppe = new PowerPointExtractor(slTests.openResourceAsStream("basic_test_ppt_file.ppt"));
  47. ppe2 = new PowerPointExtractor(slTests.openResourceAsStream("with_textbox.ppt"));
  48. }
  49. private static void assertContains(String reason, String haystack, String needle) {
  50. assertTrue(reason, haystack.contains(needle));
  51. }
  52. public void testReadSheetText() {
  53. // Basic 2 page example
  54. String sheetText = ppe.getText();
  55. ensureTwoStringsTheSame(expectText, sheetText);
  56. // 1 page example with text boxes
  57. sheetText = ppe2.getText();
  58. ensureTwoStringsTheSame(expectText2, sheetText);
  59. }
  60. public void testReadNoteText() {
  61. // Basic 2 page example
  62. String notesText = ppe.getNotes();
  63. String expectText = "These are the notes for page 1\nThese are the notes on page two, again lacking formatting\n";
  64. ensureTwoStringsTheSame(expectText, notesText);
  65. // Other one doesn't have notes
  66. notesText = ppe2.getNotes();
  67. expectText = "";
  68. ensureTwoStringsTheSame(expectText, notesText);
  69. }
  70. public void testReadBoth() {
  71. String[] slText = new String[] {
  72. "This is a test title\nThis is a test subtitle\nThis is on page 1\n",
  73. "This is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n"
  74. };
  75. String[] ntText = new String[] {
  76. "These are the notes for page 1\n",
  77. "These are the notes on page two, again lacking formatting\n"
  78. };
  79. ppe.setSlidesByDefault(true);
  80. ppe.setNotesByDefault(false);
  81. assertEquals(slText[0]+slText[1], ppe.getText());
  82. ppe.setSlidesByDefault(false);
  83. ppe.setNotesByDefault(true);
  84. assertEquals(ntText[0]+ntText[1], ppe.getText());
  85. ppe.setSlidesByDefault(true);
  86. ppe.setNotesByDefault(true);
  87. assertEquals(slText[0]+slText[1]+"\n"+ntText[0]+ntText[1], ppe.getText());
  88. }
  89. /**
  90. * Test that when presented with a PPT file missing the odd
  91. * core record, we can still get the rest of the text out
  92. * @throws Exception
  93. */
  94. public void testMissingCoreRecords() throws Exception {
  95. ppe = new PowerPointExtractor(slTests.openResourceAsStream("missing_core_records.ppt"));
  96. String text = ppe.getText(true, false);
  97. String nText = ppe.getNotes();
  98. assertNotNull(text);
  99. assertNotNull(nText);
  100. // Notes record were corrupt, so don't expect any
  101. assertEquals(nText.length(), 0);
  102. // Slide records were fine
  103. assertTrue(text.startsWith("Using Disease Surveillance and Response"));
  104. }
  105. private void ensureTwoStringsTheSame(String exp, String act) {
  106. assertEquals(exp.length(),act.length());
  107. char[] expC = exp.toCharArray();
  108. char[] actC = act.toCharArray();
  109. for(int i=0; i<expC.length; i++) {
  110. assertEquals("Char " + i, expC[i], actC[i]);
  111. }
  112. assertEquals(exp,act);
  113. }
  114. public void testExtractFromEmbeded() throws Exception {
  115. POIFSFileSystem fs = new POIFSFileSystem(
  116. POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls")
  117. );
  118. HSLFSlideShow ss;
  119. DirectoryNode dirA = (DirectoryNode)
  120. fs.getRoot().getEntry("MBD0000A3B6");
  121. DirectoryNode dirB = (DirectoryNode)
  122. fs.getRoot().getEntry("MBD0000A3B3");
  123. assertNotNull(dirA.getEntry("PowerPoint Document"));
  124. assertNotNull(dirB.getEntry("PowerPoint Document"));
  125. // Check the first file
  126. ss = new HSLFSlideShow(dirA);
  127. ppe = new PowerPointExtractor(ss);
  128. assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
  129. ppe.getText(true, false)
  130. );
  131. // And the second
  132. ss = new HSLFSlideShow(dirB);
  133. ppe = new PowerPointExtractor(ss);
  134. assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
  135. ppe.getText(true, false)
  136. );
  137. }
  138. /**
  139. * A powerpoint file with embeded powerpoint files
  140. */
  141. public void testExtractFromOwnEmbeded() throws Exception {
  142. String path = "ppt_with_embeded.ppt";
  143. ppe = new PowerPointExtractor(POIDataSamples.getSlideShowInstance().openResourceAsStream(path));
  144. List<OLEShape> shapes = ppe.getOLEShapes();
  145. assertEquals("Expected 6 ole shapes in " + path, 6, shapes.size());
  146. int num_ppt = 0, num_doc = 0, num_xls = 0;
  147. for(OLEShape ole : shapes) {
  148. String name = ole.getInstanceName();
  149. InputStream data = ole.getObjectData().getData();
  150. if ("Worksheet".equals(name)) {
  151. HSSFWorkbook wb = new HSSFWorkbook(data);
  152. num_xls++;
  153. } else if ("Document".equals(name)) {
  154. HWPFDocument doc = new HWPFDocument(data);
  155. num_doc++;
  156. } else if ("Presentation".equals(name)) {
  157. num_ppt++;
  158. SlideShow ppt = new SlideShow(data);
  159. }
  160. }
  161. assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
  162. assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
  163. assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
  164. }
  165. /**
  166. * A powerpoint file with embeded powerpoint files
  167. */
  168. public void test52991() throws Exception {
  169. String path = "badzip.ppt";
  170. ppe = new PowerPointExtractor(POIDataSamples.getSlideShowInstance().openResourceAsStream(path));
  171. List<OLEShape> shapes = ppe.getOLEShapes();
  172. for (OLEShape shape : shapes) {
  173. IOUtils.copy(shape.getObjectData().getData(), new ByteArrayOutputStream());
  174. }
  175. }
  176. /**
  177. * From bug #45543
  178. */
  179. public void testWithComments() throws Exception {
  180. ppe = new PowerPointExtractor(slTests.openResourceAsStream("WithComments.ppt"));
  181. String text = ppe.getText();
  182. assertFalse("Comments not in by default", text.contains("This is a test comment"));
  183. ppe.setCommentsByDefault(true);
  184. text = ppe.getText();
  185. assertContains(text, "This is a test comment");
  186. // And another file
  187. ppe = new PowerPointExtractor(slTests.openResourceAsStream("45543.ppt"));
  188. text = ppe.getText();
  189. assertFalse("Comments not in by default", text.contains("testdoc"));
  190. ppe.setCommentsByDefault(true);
  191. text = ppe.getText();
  192. assertContains(text, "testdoc");
  193. }
  194. /**
  195. * From bug #45537
  196. */
  197. public void testHeaderFooter() throws Exception {
  198. String text;
  199. // With a header on the notes
  200. HSLFSlideShow hslf = new HSLFSlideShow(slTests.openResourceAsStream("45537_Header.ppt"));
  201. SlideShow ss = new SlideShow(hslf);
  202. assertNotNull(ss.getNotesHeadersFooters());
  203. assertEquals("testdoc test phrase", ss.getNotesHeadersFooters().getHeaderText());
  204. ppe = new PowerPointExtractor(hslf);
  205. text = ppe.getText();
  206. assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
  207. assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
  208. ppe.setNotesByDefault(true);
  209. text = ppe.getText();
  210. assertContains(text, "testdoc");
  211. assertContains(text, "test phrase");
  212. // And with a footer, also on notes
  213. hslf = new HSLFSlideShow(slTests.openResourceAsStream("45537_Footer.ppt"));
  214. ss = new SlideShow(hslf);
  215. assertNotNull(ss.getNotesHeadersFooters());
  216. assertEquals("testdoc test phrase", ss.getNotesHeadersFooters().getFooterText());
  217. ppe = new PowerPointExtractor(slTests.openResourceAsStream("45537_Footer.ppt"));
  218. text = ppe.getText();
  219. assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
  220. assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
  221. ppe.setNotesByDefault(true);
  222. text = ppe.getText();
  223. assertContains(text, "testdoc");
  224. assertContains(text, "test phrase");
  225. }
  226. public void testSlideMasterText() throws Exception {
  227. String masterTitleText = "This is the Master Title";
  228. String masterRandomText = "This text comes from the Master Slide";
  229. String masterFooterText = "Footer from the master slide";
  230. HSLFSlideShow hslf = new HSLFSlideShow(slTests.openResourceAsStream("WithMaster.ppt"));
  231. ppe = new PowerPointExtractor(hslf);
  232. String text = ppe.getText();
  233. //assertContains(text, masterTitleText); // TODO Is this available in PPT?
  234. //assertContains(text, masterRandomText); // TODO Extract
  235. assertContains(text, masterFooterText);
  236. }
  237. public void testMasterText() throws Exception {
  238. ppe = new PowerPointExtractor(slTests.openResourceAsStream("master_text.ppt"));
  239. // Initially not there
  240. String text = ppe.getText();
  241. assertFalse(text.contains("Text that I added to the master slide"));
  242. // Enable, shows up
  243. ppe.setMasterByDefault(true);
  244. text = ppe.getText();
  245. assertTrue(text.contains("Text that I added to the master slide"));
  246. // Make sure placeholder text does not come out
  247. assertFalse(text.contains("Click to edit Master"));
  248. // Now with another file only containing master text
  249. // Will always show up
  250. String masterText = "Footer from the master slide";
  251. HSLFSlideShow hslf = new HSLFSlideShow(slTests.openResourceAsStream("WithMaster.ppt"));
  252. ppe = new PowerPointExtractor(hslf);
  253. text = ppe.getText();
  254. assertContains(text.toLowerCase(), "master");
  255. assertContains(text, masterText);
  256. }
  257. /**
  258. * Bug #54880 Chinese text not extracted properly
  259. */
  260. public void testChineseText() throws Exception {
  261. HSLFSlideShow hslf = new HSLFSlideShow(slTests.openResourceAsStream("54880_chinese.ppt"));
  262. ppe = new PowerPointExtractor(hslf);
  263. String text = ppe.getText();
  264. // Check for the english text line
  265. assertContains(text, "Single byte");
  266. // Check for the english text in the mixed line
  267. assertContains(text, "Mix");
  268. // Check for the chinese text in the mixed line - 表
  269. assertContains(text, "\u8868");
  270. // Check for the chinese only text line - ハンカク
  271. assertContains(text, "\uff8a\uff9d\uff76\uff78");
  272. }
  273. /**
  274. * Tests that we can work with both {@link POIFSFileSystem}
  275. * and {@link NPOIFSFileSystem}
  276. */
  277. public void testDifferentPOIFS() throws Exception {
  278. // Open the two filesystems
  279. DirectoryNode[] files = new DirectoryNode[2];
  280. files[0] = (new POIFSFileSystem(slTests.openResourceAsStream("basic_test_ppt_file.ppt"))).getRoot();
  281. files[1] = (new NPOIFSFileSystem(slTests.getFile("basic_test_ppt_file.ppt"))).getRoot();
  282. // Open directly
  283. for(DirectoryNode dir : files) {
  284. PowerPointExtractor extractor = new PowerPointExtractor(dir, null);
  285. assertEquals(expectText, extractor.getText());
  286. }
  287. // Open via a HWPFDocument
  288. for(DirectoryNode dir : files) {
  289. HSLFSlideShow slideshow = new HSLFSlideShow(dir);
  290. PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
  291. assertEquals(expectText, extractor.getText());
  292. }
  293. }
  294. public void testTable() throws Exception{
  295. ppe = new PowerPointExtractor(slTests.openResourceAsStream("54111.ppt"));
  296. String text = ppe.getText();
  297. String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
  298. "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
  299. "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
  300. "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
  301. "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
  302. "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
  303. assertTrue(text.contains(target));
  304. ppe = new PowerPointExtractor(slTests.openResourceAsStream("54722.ppt"));
  305. text = ppe.getText();
  306. target = "this\tText\tis\twithin\ta\n"+
  307. "table\t1\t2\t3\t4";
  308. assertTrue(text.contains(target));
  309. }
  310. }