You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PowerPointExtractor.java 9.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hslf.extractor;
  16. import java.io.File;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import java.util.List;
  20. import org.apache.poi.EncryptedDocumentException;
  21. import org.apache.poi.extractor.POIOLE2TextExtractor;
  22. import org.apache.poi.hslf.usermodel.HSLFObjectShape;
  23. import org.apache.poi.hslf.usermodel.HSLFShape;
  24. import org.apache.poi.hslf.usermodel.HSLFSlideShow;
  25. import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
  26. import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
  27. import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
  28. import org.apache.poi.poifs.filesystem.DirectoryNode;
  29. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  30. import org.apache.poi.sl.extractor.SlideShowExtractor;
  31. import org.apache.poi.sl.usermodel.SlideShow;
  32. import org.apache.poi.sl.usermodel.SlideShowFactory;
  33. import org.apache.poi.util.Removal;
  34. /**
  35. * This class can be used to extract text from a PowerPoint file. Can optionally
  36. * also get the notes from one.
  37. *
  38. * @deprecated in POI 4.0.0, use {@link SlideShowExtractor} instead
  39. */
  40. @SuppressWarnings("WeakerAccess")
  41. @Deprecated
  42. @Removal(version="5.0.0")
  43. public final class PowerPointExtractor extends POIOLE2TextExtractor {
  44. private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
  45. private boolean slidesByDefault = true;
  46. private boolean notesByDefault;
  47. private boolean commentsByDefault;
  48. private boolean masterByDefault;
  49. /**
  50. * Basic extractor. Returns all the text, and optionally all the notes
  51. */
  52. public static void main(String[] args) throws IOException {
  53. if (args.length < 1) {
  54. System.err.println("Usage:");
  55. System.err.println("\tPowerPointExtractor [-notes] <file>");
  56. System.exit(1);
  57. }
  58. boolean notes = false;
  59. boolean comments = false;
  60. boolean master = true;
  61. String file;
  62. if (args.length > 1) {
  63. notes = true;
  64. file = args[1];
  65. if (args.length > 2) {
  66. comments = true;
  67. }
  68. } else {
  69. file = args[0];
  70. }
  71. try (PowerPointExtractor ppe = new PowerPointExtractor(file)) {
  72. System.out.println(ppe.getText(true, notes, comments, master));
  73. }
  74. }
  75. public PowerPointExtractor(final HSLFSlideShow slideShow) {
  76. super(slideShow.getSlideShowImpl());
  77. setFilesystem(slideShow);
  78. delegate = new SlideShowExtractor<>(slideShow);
  79. }
  80. /**
  81. * Creates a PowerPointExtractor, from a file
  82. *
  83. * @param fileName The name of the file to extract from
  84. */
  85. public PowerPointExtractor(String fileName) throws IOException {
  86. this(createHSLF(new File(fileName), Biff8EncryptionKey.getCurrentUserPassword(), true));
  87. }
  88. /**
  89. * Creates a PowerPointExtractor, from an Input Stream
  90. *
  91. * @param iStream The input stream containing the PowerPoint document
  92. */
  93. public PowerPointExtractor(InputStream iStream) throws IOException {
  94. this(createHSLF(iStream, Biff8EncryptionKey.getCurrentUserPassword()));
  95. }
  96. /**
  97. * Creates a PowerPointExtractor, from an open POIFSFileSystem
  98. *
  99. * @param fs the POIFSFileSystem containing the PowerPoint document
  100. */
  101. public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
  102. this(createHSLF(fs, Biff8EncryptionKey.getCurrentUserPassword()));
  103. }
  104. /**
  105. * Creates a PowerPointExtractor, from a specific place
  106. * inside an open {@link POIFSFileSystem}
  107. *
  108. * @param dir the POIFS Directory containing the PowerPoint document
  109. */
  110. public PowerPointExtractor(DirectoryNode dir) throws IOException {
  111. this(new HSLFSlideShow(dir));
  112. }
  113. /**
  114. * Creates a PowerPointExtractor, from a HSLFSlideShow
  115. *
  116. * @param ss the HSLFSlideShow to extract text from
  117. */
  118. public PowerPointExtractor(HSLFSlideShowImpl ss) {
  119. this(new HSLFSlideShow(ss));
  120. }
  121. /**
  122. * Should a call to getText() return slide text? Default is yes
  123. */
  124. public void setSlidesByDefault(final boolean slidesByDefault) {
  125. this.slidesByDefault = slidesByDefault;
  126. delegate.setSlidesByDefault(slidesByDefault);
  127. }
  128. /**
  129. * Should a call to getText() return notes text? Default is no
  130. */
  131. public void setNotesByDefault(final boolean notesByDefault) {
  132. this.notesByDefault = notesByDefault;
  133. delegate.setNotesByDefault(notesByDefault);
  134. }
  135. /**
  136. * Should a call to getText() return comments text? Default is no
  137. */
  138. public void setCommentsByDefault(final boolean commentsByDefault) {
  139. this.commentsByDefault = commentsByDefault;
  140. delegate.setCommentsByDefault(commentsByDefault);
  141. }
  142. /**
  143. * Should a call to getText() return text from master? Default is no
  144. */
  145. public void setMasterByDefault(final boolean masterByDefault) {
  146. this.masterByDefault = masterByDefault;
  147. delegate.setMasterByDefault(masterByDefault);
  148. }
  149. /**
  150. * Fetches all the slide text from the slideshow, but not the notes, unless
  151. * you've called setSlidesByDefault() and setNotesByDefault() to change this
  152. */
  153. @Override
  154. public String getText() {
  155. return delegate.getText();
  156. }
  157. /**
  158. * Fetches text from the slideshow, be it slide text or note text. Because
  159. * the final block of text in a TextRun normally have their last \n
  160. * stripped, we add it back
  161. *
  162. * @param getSlideText fetch slide text
  163. * @param getNoteText fetch note text
  164. */
  165. public String getText(boolean getSlideText, boolean getNoteText) {
  166. return getText(getSlideText,getNoteText,commentsByDefault,masterByDefault);
  167. }
  168. public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) {
  169. delegate.setSlidesByDefault(getSlideText);
  170. delegate.setNotesByDefault(getNoteText);
  171. delegate.setCommentsByDefault(getCommentText);
  172. delegate.setMasterByDefault(getMasterText);
  173. try {
  174. return delegate.getText();
  175. } finally {
  176. delegate.setSlidesByDefault(slidesByDefault);
  177. delegate.setNotesByDefault(notesByDefault);
  178. delegate.setCommentsByDefault(commentsByDefault);
  179. delegate.setMasterByDefault(masterByDefault);
  180. }
  181. }
  182. /**
  183. * Fetches all the notes text from the slideshow, but not the slide text
  184. */
  185. public String getNotes() {
  186. return getText(false, true, false, false);
  187. }
  188. @SuppressWarnings("unchecked")
  189. public List<HSLFObjectShape> getOLEShapes() {
  190. return (List<HSLFObjectShape>)delegate.getOLEShapes();
  191. }
  192. /**
  193. * Helper method to avoid problems with compiling code in Eclipse
  194. *
  195. * Eclipse javac has some bugs with complex casts, this method tries
  196. * to work around this.
  197. *
  198. * @param fs The {@link POIFSFileSystem} to read the document from
  199. * @param password The password that should be used or null if no password is necessary.
  200. *
  201. * @return The created SlideShow
  202. *
  203. * @throws IOException if an error occurs while reading the data
  204. */
  205. private static HSLFSlideShow createHSLF(POIFSFileSystem fs, String password) throws IOException, EncryptedDocumentException {
  206. // Note: don't change the code here, it is required for Eclipse to compile the code
  207. SlideShow slideShowOrig = SlideShowFactory.create(fs, password);
  208. return (HSLFSlideShow)slideShowOrig;
  209. }
  210. /**
  211. * Helper method to avoid problems with compiling code in Eclipse
  212. *
  213. * Eclipse javac has some bugs with complex casts, this method tries
  214. * to work around this.
  215. *
  216. * @param inp The {@link InputStream} to read data from.
  217. * @param password The password that should be used or null if no password is necessary.
  218. *
  219. * @return The created SlideShow
  220. *
  221. * @throws IOException if an error occurs while reading the data
  222. * @throws EncryptedDocumentException If the wrong password is given for a protected file
  223. */
  224. private static HSLFSlideShow createHSLF(InputStream inp, String password) throws IOException, EncryptedDocumentException {
  225. // Note: don't change the code here, it is required for Eclipse to compile the code
  226. SlideShow slideShowOrig = SlideShowFactory.create(inp, password);
  227. return (HSLFSlideShow)slideShowOrig;
  228. }
  229. /**
  230. * Helper method to avoid problems with compiling code in Eclipse
  231. *
  232. * Eclipse javac has some bugs with complex casts, this method tries
  233. * to work around this.
  234. *
  235. * @param file The file to read data from.
  236. * @param password The password that should be used or null if no password is necessary.
  237. * @param readOnly If the SlideShow should be opened in read-only mode to avoid writing back
  238. * changes when the document is closed.
  239. *
  240. * @return The created SlideShow
  241. *
  242. * @throws IOException if an error occurs while reading the data
  243. * @throws EncryptedDocumentException If the wrong password is given for a protected file
  244. */
  245. private static HSLFSlideShow createHSLF(File file, String password, boolean readOnly) throws IOException, EncryptedDocumentException {
  246. // Note: don't change the code here, it is required for Eclipse to compile the code
  247. SlideShow slideShowOrig = SlideShowFactory.create(file, password, readOnly);
  248. return (HSLFSlideShow)slideShowOrig;
  249. }
  250. }