You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PowerPointExtractor.java 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hslf.extractor;
  16. import java.io.FileInputStream;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import java.util.HashSet;
  20. import java.util.List;
  21. import java.util.ArrayList;
  22. import org.apache.poi.POIOLE2TextExtractor;
  23. import org.apache.poi.hslf.HSLFSlideShow;
  24. import org.apache.poi.hslf.model.*;
  25. import org.apache.poi.hslf.usermodel.SlideShow;
  26. import org.apache.poi.poifs.filesystem.DirectoryNode;
  27. import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
  28. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  29. /**
  30. * This class can be used to extract text from a PowerPoint file. Can optionally
  31. * also get the notes from one.
  32. *
  33. * @author Nick Burch
  34. */
  35. public final class PowerPointExtractor extends POIOLE2TextExtractor {
  36. private HSLFSlideShow _hslfshow;
  37. private SlideShow _show;
  38. private Slide[] _slides;
  39. private boolean _slidesByDefault = true;
  40. private boolean _notesByDefault = false;
  41. private boolean _commentsByDefault = false;
  42. private boolean _masterByDefault = false;
  43. /**
  44. * Basic extractor. Returns all the text, and optionally all the notes
  45. */
  46. public static void main(String args[]) throws IOException {
  47. if (args.length < 1) {
  48. System.err.println("Useage:");
  49. System.err.println("\tPowerPointExtractor [-notes] <file>");
  50. System.exit(1);
  51. }
  52. boolean notes = false;
  53. boolean comments = false;
  54. boolean master = true;
  55. String file;
  56. if (args.length > 1) {
  57. notes = true;
  58. file = args[1];
  59. if (args.length > 2) {
  60. comments = true;
  61. }
  62. } else {
  63. file = args[0];
  64. }
  65. PowerPointExtractor ppe = new PowerPointExtractor(file);
  66. System.out.println(ppe.getText(true, notes, comments, master));
  67. }
  68. /**
  69. * Creates a PowerPointExtractor, from a file
  70. *
  71. * @param fileName The name of the file to extract from
  72. */
  73. public PowerPointExtractor(String fileName) throws IOException {
  74. this(new FileInputStream(fileName));
  75. }
  76. /**
  77. * Creates a PowerPointExtractor, from an Input Stream
  78. *
  79. * @param iStream The input stream containing the PowerPoint document
  80. */
  81. public PowerPointExtractor(InputStream iStream) throws IOException {
  82. this(new POIFSFileSystem(iStream));
  83. }
  84. /**
  85. * Creates a PowerPointExtractor, from an open POIFSFileSystem
  86. *
  87. * @param fs the POIFSFileSystem containing the PowerPoint document
  88. */
  89. public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
  90. this(fs.getRoot());
  91. }
  92. /**
  93. * Creates a PowerPointExtractor, from an open NPOIFSFileSystem
  94. *
  95. * @param fs the NPOIFSFileSystem containing the PowerPoint document
  96. */
  97. public PowerPointExtractor(NPOIFSFileSystem fs) throws IOException {
  98. this(fs.getRoot());
  99. }
  100. /**
  101. * Creates a PowerPointExtractor, from a specific place
  102. * inside an open NPOIFSFileSystem
  103. *
  104. * @param dir the POIFS Directory containing the PowerPoint document
  105. */
  106. public PowerPointExtractor(DirectoryNode dir) throws IOException {
  107. this(new HSLFSlideShow(dir));
  108. }
  109. /**
  110. * @deprecated Use {@link #PowerPointExtractor(DirectoryNode)} instead
  111. */
  112. @Deprecated
  113. public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
  114. this(new HSLFSlideShow(dir, fs));
  115. }
  116. /**
  117. * Creates a PowerPointExtractor, from a HSLFSlideShow
  118. *
  119. * @param ss the HSLFSlideShow to extract text from
  120. */
  121. public PowerPointExtractor(HSLFSlideShow ss) {
  122. super(ss);
  123. _hslfshow = ss;
  124. _show = new SlideShow(_hslfshow);
  125. _slides = _show.getSlides();
  126. }
  127. /**
  128. * Should a call to getText() return slide text? Default is yes
  129. */
  130. public void setSlidesByDefault(boolean slidesByDefault) {
  131. this._slidesByDefault = slidesByDefault;
  132. }
  133. /**
  134. * Should a call to getText() return notes text? Default is no
  135. */
  136. public void setNotesByDefault(boolean notesByDefault) {
  137. this._notesByDefault = notesByDefault;
  138. }
  139. /**
  140. * Should a call to getText() return comments text? Default is no
  141. */
  142. public void setCommentsByDefault(boolean commentsByDefault) {
  143. this._commentsByDefault = commentsByDefault;
  144. }
  145. /**
  146. * Should a call to getText() return text from master? Default is no
  147. */
  148. public void setMasterByDefault(boolean masterByDefault) {
  149. this._masterByDefault = masterByDefault;
  150. }
  151. /**
  152. * Fetches all the slide text from the slideshow, but not the notes, unless
  153. * you've called setSlidesByDefault() and setNotesByDefault() to change this
  154. */
  155. public String getText() {
  156. return getText(_slidesByDefault, _notesByDefault, _commentsByDefault, _masterByDefault);
  157. }
  158. /**
  159. * Fetches all the notes text from the slideshow, but not the slide text
  160. */
  161. public String getNotes() {
  162. return getText(false, true);
  163. }
  164. public List<OLEShape> getOLEShapes() {
  165. List<OLEShape> list = new ArrayList<OLEShape>();
  166. for (int i = 0; i < _slides.length; i++) {
  167. Slide slide = _slides[i];
  168. Shape[] shapes = slide.getShapes();
  169. for (int j = 0; j < shapes.length; j++) {
  170. if (shapes[j] instanceof OLEShape) {
  171. list.add((OLEShape) shapes[j]);
  172. }
  173. }
  174. }
  175. return list;
  176. }
  177. /**
  178. * Fetches text from the slideshow, be it slide text or note text. Because
  179. * the final block of text in a TextRun normally have their last \n
  180. * stripped, we add it back
  181. *
  182. * @param getSlideText fetch slide text
  183. * @param getNoteText fetch note text
  184. */
  185. public String getText(boolean getSlideText, boolean getNoteText) {
  186. return getText(getSlideText, getNoteText, _commentsByDefault, _masterByDefault);
  187. }
  188. public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) {
  189. StringBuffer ret = new StringBuffer();
  190. if (getSlideText) {
  191. if (getMasterText) {
  192. for (SlideMaster master : _show.getSlidesMasters()) {
  193. for(Shape sh : master.getShapes()){
  194. if(sh instanceof TextShape){
  195. if(MasterSheet.isPlaceholder(sh)) {
  196. // don't bother about boiler
  197. // plate text on master
  198. // sheets
  199. continue;
  200. }
  201. TextShape tsh = (TextShape)sh;
  202. String text = tsh.getText();
  203. ret.append(text);
  204. if (!text.endsWith("\n")) {
  205. ret.append("\n");
  206. }
  207. }
  208. }
  209. }
  210. }
  211. for (int i = 0; i < _slides.length; i++) {
  212. Slide slide = _slides[i];
  213. // Slide header, if set
  214. HeadersFooters hf = slide.getHeadersFooters();
  215. if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
  216. ret.append(hf.getHeaderText() + "\n");
  217. }
  218. // Slide text
  219. textRunsToText(ret, slide.getTextRuns());
  220. // Table text
  221. for (Shape shape : slide.getShapes()){
  222. if (shape instanceof Table){
  223. extractTableText(ret, (Table)shape);
  224. }
  225. }
  226. // Slide footer, if set
  227. if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
  228. ret.append(hf.getFooterText() + "\n");
  229. }
  230. // Comments, if requested and present
  231. if (getCommentText) {
  232. Comment[] comments = slide.getComments();
  233. for (int j = 0; j < comments.length; j++) {
  234. ret.append(comments[j].getAuthor() + " - " + comments[j].getText() + "\n");
  235. }
  236. }
  237. }
  238. if (getNoteText) {
  239. ret.append("\n");
  240. }
  241. }
  242. if (getNoteText) {
  243. // Not currently using _notes, as that can have the notes of
  244. // master sheets in. Grab Slide list, then work from there,
  245. // but ensure no duplicates
  246. HashSet<Integer> seenNotes = new HashSet<Integer>();
  247. HeadersFooters hf = _show.getNotesHeadersFooters();
  248. for (int i = 0; i < _slides.length; i++) {
  249. Notes notes = _slides[i].getNotesSheet();
  250. if (notes == null) {
  251. continue;
  252. }
  253. Integer id = Integer.valueOf(notes._getSheetNumber());
  254. if (seenNotes.contains(id)) {
  255. continue;
  256. }
  257. seenNotes.add(id);
  258. // Repeat the Notes header, if set
  259. if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) {
  260. ret.append(hf.getHeaderText() + "\n");
  261. }
  262. // Notes text
  263. textRunsToText(ret, notes.getTextRuns());
  264. // Repeat the notes footer, if set
  265. if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) {
  266. ret.append(hf.getFooterText() + "\n");
  267. }
  268. }
  269. }
  270. return ret.toString();
  271. }
  272. private void extractTableText(StringBuffer ret, Table table) {
  273. for (int row = 0; row < table.getNumberOfRows(); row++){
  274. for (int col = 0; col < table.getNumberOfColumns(); col++){
  275. TableCell cell = table.getCell(row, col);
  276. //defensive null checks; don't know if they're necessary
  277. if (cell != null){
  278. String txt = cell.getText();
  279. txt = (txt == null) ? "" : txt;
  280. ret.append(txt);
  281. if (col < table.getNumberOfColumns()-1){
  282. ret.append("\t");
  283. }
  284. }
  285. }
  286. ret.append('\n');
  287. }
  288. }
  289. private void textRunsToText(StringBuffer ret, TextRun[] runs) {
  290. if (runs==null) {
  291. return;
  292. }
  293. for (int j = 0; j < runs.length; j++) {
  294. TextRun run = runs[j];
  295. if (run != null) {
  296. String text = run.getText();
  297. ret.append(text);
  298. if (!text.endsWith("\n")) {
  299. ret.append("\n");
  300. }
  301. }
  302. }
  303. }
  304. }