You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

SlideShowExtractor.java 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.sl.extractor;
  16. import java.util.ArrayList;
  17. import java.util.BitSet;
  18. import java.util.LinkedList;
  19. import java.util.List;
  20. import java.util.function.Consumer;
  21. import java.util.function.Function;
  22. import java.util.function.Predicate;
  23. import com.zaxxer.sparsebits.SparseBitSet;
  24. import org.apache.logging.log4j.LogManager;
  25. import org.apache.logging.log4j.Logger;
  26. import org.apache.poi.extractor.POITextExtractor;
  27. import org.apache.poi.sl.usermodel.MasterSheet;
  28. import org.apache.poi.sl.usermodel.Notes;
  29. import org.apache.poi.sl.usermodel.ObjectShape;
  30. import org.apache.poi.sl.usermodel.Placeholder;
  31. import org.apache.poi.sl.usermodel.PlaceholderDetails;
  32. import org.apache.poi.sl.usermodel.Shape;
  33. import org.apache.poi.sl.usermodel.ShapeContainer;
  34. import org.apache.poi.sl.usermodel.Sheet;
  35. import org.apache.poi.sl.usermodel.Slide;
  36. import org.apache.poi.sl.usermodel.SlideShow;
  37. import org.apache.poi.sl.usermodel.TableCell;
  38. import org.apache.poi.sl.usermodel.TableShape;
  39. import org.apache.poi.sl.usermodel.TextParagraph;
  40. import org.apache.poi.sl.usermodel.TextRun;
  41. import org.apache.poi.sl.usermodel.TextShape;
  42. import org.apache.poi.util.Internal;
  43. import org.apache.poi.util.LocaleUtil;
  44. import org.apache.poi.util.Removal;
  45. /**
  46. * Common SlideShow extractor
  47. *
  48. * @since POI 4.0.0
  49. */
  50. public class SlideShowExtractor<
  51. S extends Shape<S,P>,
  52. P extends TextParagraph<S,P,? extends TextRun>
  53. > implements POITextExtractor {
  54. private static final Logger LOG = LogManager.getLogger(SlideShowExtractor.class);
  55. // placeholder text for slide numbers
  56. private static final String SLIDE_NUMBER_PH = "‹#›";
  57. protected final SlideShow<S,P> slideshow;
  58. private boolean slidesByDefault = true;
  59. private boolean notesByDefault;
  60. private boolean commentsByDefault;
  61. private boolean masterByDefault;
  62. private Predicate<Object> filter = o -> true;
  63. private boolean doCloseFilesystem = true;
  64. public SlideShowExtractor(final SlideShow<S,P> slideshow) {
  65. this.slideshow = slideshow;
  66. }
  67. /**
  68. * Returns opened document
  69. *
  70. * @return the opened document
  71. */
  72. @Override
  73. public SlideShow<S,P> getDocument() {
  74. return slideshow;
  75. }
  76. /**
  77. * Should a call to getText() return slide text? Default is yes
  78. */
  79. public void setSlidesByDefault(final boolean slidesByDefault) {
  80. this.slidesByDefault = slidesByDefault;
  81. }
  82. /**
  83. * Should a call to getText() return notes text? Default is no
  84. */
  85. public void setNotesByDefault(final boolean notesByDefault) {
  86. this.notesByDefault = notesByDefault;
  87. }
  88. /**
  89. * Should a call to getText() return comments text? Default is no
  90. */
  91. public void setCommentsByDefault(final boolean commentsByDefault) {
  92. this.commentsByDefault = commentsByDefault;
  93. }
  94. /**
  95. * Should a call to getText() return text from master? Default is no
  96. */
  97. public void setMasterByDefault(final boolean masterByDefault) {
  98. this.masterByDefault = masterByDefault;
  99. }
  100. @Override
  101. public POITextExtractor getMetadataTextExtractor() {
  102. return slideshow.getMetadataTextExtractor();
  103. }
  104. /**
  105. * Fetches all the slide text from the slideshow, but not the notes, unless
  106. * you've called setSlidesByDefault() and setNotesByDefault() to change this
  107. */
  108. @Override
  109. public String getText() {
  110. final StringBuilder sb = new StringBuilder();
  111. for (final Slide<S, P> slide : slideshow.getSlides()) {
  112. getText(slide, sb::append);
  113. }
  114. return sb.toString();
  115. }
  116. public String getText(final Slide<S,P> slide) {
  117. final StringBuilder sb = new StringBuilder();
  118. getText(slide, sb::append);
  119. return sb.toString();
  120. }
  121. private void getText(final Slide<S,P> slide, final Consumer<String> consumer) {
  122. if (slidesByDefault) {
  123. printShapeText(slide, consumer);
  124. }
  125. if (masterByDefault) {
  126. final MasterSheet<S,P> ms = slide.getMasterSheet();
  127. printSlideMaster(ms, consumer);
  128. // only print slide layout, if it's a different instance
  129. final MasterSheet<S,P> sl = slide.getSlideLayout();
  130. if (sl != ms) {
  131. printSlideMaster(sl, consumer);
  132. }
  133. }
  134. if (commentsByDefault) {
  135. printComments(slide, consumer);
  136. }
  137. if (notesByDefault) {
  138. printNotes(slide, consumer);
  139. }
  140. }
  141. private void printSlideMaster(final MasterSheet<S,P> master, final Consumer<String> consumer) {
  142. if (master == null) {
  143. return;
  144. }
  145. for (final Shape<S,P> shape : master) {
  146. if (shape instanceof TextShape) {
  147. final TextShape<S,P> ts = (TextShape<S,P>)shape;
  148. final String text = ts.getText();
  149. if (text == null || text.isEmpty() || "*".equals(text)) {
  150. continue;
  151. }
  152. if (ts.isPlaceholder()) {
  153. // don't bother about boiler plate text on master sheets
  154. LOG.atInfo().log("Ignoring boiler plate (placeholder) text on slide master: {}", text);
  155. continue;
  156. }
  157. printTextParagraphs(ts.getTextParagraphs(), consumer);
  158. }
  159. }
  160. }
  161. private void printTextParagraphs(final List<P> paras, final Consumer<String> consumer) {
  162. printTextParagraphs(paras, consumer, "\n");
  163. }
  164. private void printTextParagraphs(final List<P> paras, final Consumer<String> consumer, String trailer) {
  165. printTextParagraphs(paras, consumer, trailer, SlideShowExtractor::replaceTextCap);
  166. }
  167. private void printTextParagraphs(final List<P> paras, final Consumer<String> consumer, String trailer, final Function<TextRun,String> converter) {
  168. for (P p : paras) {
  169. for (TextRun r : p) {
  170. if (filter.test(r)) {
  171. consumer.accept(converter.apply(r));
  172. }
  173. }
  174. if (!trailer.isEmpty() && filter.test(trailer)) {
  175. consumer.accept(trailer);
  176. }
  177. }
  178. }
  179. private void printHeaderFooter(final Sheet<S,P> sheet, final Consumer<String> consumer, final Consumer<String> footerCon) {
  180. final Sheet<S, P> m = (sheet instanceof Slide) ? sheet.getMasterSheet() : sheet;
  181. addSheetPlaceholderDatails(sheet, Placeholder.HEADER, consumer);
  182. addSheetPlaceholderDatails(sheet, Placeholder.FOOTER, footerCon);
  183. if (!masterByDefault) {
  184. return;
  185. }
  186. // write header texts and determine footer text
  187. for (Shape<S, P> s : m) {
  188. if (!(s instanceof TextShape)) {
  189. continue;
  190. }
  191. final TextShape<S, P> ts = (TextShape<S, P>) s;
  192. final PlaceholderDetails pd = ts.getPlaceholderDetails();
  193. if (pd == null || !pd.isVisible() || pd.getPlaceholder() == null) {
  194. continue;
  195. }
  196. switch (pd.getPlaceholder()) {
  197. case HEADER:
  198. printTextParagraphs(ts.getTextParagraphs(), consumer);
  199. break;
  200. case FOOTER:
  201. printTextParagraphs(ts.getTextParagraphs(), footerCon);
  202. break;
  203. case SLIDE_NUMBER:
  204. printTextParagraphs(ts.getTextParagraphs(), footerCon, "\n", SlideShowExtractor::replaceSlideNumber);
  205. break;
  206. case DATETIME:
  207. // currently not supported
  208. default:
  209. break;
  210. }
  211. }
  212. }
  213. private void addSheetPlaceholderDatails(final Sheet<S,P> sheet, final Placeholder placeholder, final Consumer<String> consumer) {
  214. final PlaceholderDetails headerPD = sheet.getPlaceholderDetails(placeholder);
  215. final String headerStr = (headerPD != null) ? headerPD.getText() : null;
  216. if (headerStr != null && filter.test(headerPD)) {
  217. consumer.accept(headerStr);
  218. }
  219. }
  220. private void printShapeText(final Sheet<S,P> sheet, final Consumer<String> consumer) {
  221. final List<String> footer = new LinkedList<>();
  222. printHeaderFooter(sheet, consumer, footer::add);
  223. printShapeText((ShapeContainer<S,P>)sheet, consumer);
  224. footer.forEach(consumer);
  225. }
  226. @SuppressWarnings("unchecked")
  227. private void printShapeText(final ShapeContainer<S,P> container, final Consumer<String> consumer) {
  228. for (Shape<S,P> shape : container) {
  229. if (shape instanceof TextShape) {
  230. printTextParagraphs(((TextShape<S,P>)shape).getTextParagraphs(), consumer);
  231. } else if (shape instanceof TableShape) {
  232. printShapeText((TableShape<S,P>)shape, consumer);
  233. } else if (shape instanceof ShapeContainer) {
  234. printShapeText((ShapeContainer<S,P>)shape, consumer);
  235. }
  236. }
  237. }
  238. @SuppressWarnings("Duplicates")
  239. private void printShapeText(final TableShape<S,P> shape, final Consumer<String> consumer) {
  240. final int nrows = shape.getNumberOfRows();
  241. final int ncols = shape.getNumberOfColumns();
  242. for (int row = 0; row < nrows; row++) {
  243. String trailer = "";
  244. for (int col = 0; col < ncols; col++){
  245. TableCell<S, P> cell = shape.getCell(row, col);
  246. //defensive null checks; don't know if they're necessary
  247. if (cell != null) {
  248. trailer = col < ncols-1 ? "\t" : "\n";
  249. printTextParagraphs(cell.getTextParagraphs(), consumer, trailer);
  250. }
  251. }
  252. if (!trailer.equals("\n") && filter.test("\n")) {
  253. consumer.accept("\n");
  254. }
  255. }
  256. }
  257. private void printComments(final Slide<S,P> slide, final Consumer<String> consumer) {
  258. slide.getComments().stream().filter(filter).map(c -> c.getAuthor()+" - "+c.getText()).forEach(consumer);
  259. }
  260. private void printNotes(final Slide<S,P> slide, final Consumer<String> consumer) {
  261. final Notes<S, P> notes = slide.getNotes();
  262. if (notes == null) {
  263. return;
  264. }
  265. List<String> footer = new LinkedList<>();
  266. printHeaderFooter(notes, consumer, footer::add);
  267. printShapeText(notes, consumer);
  268. footer.forEach(consumer);
  269. }
  270. public List<? extends ObjectShape<S,P>> getOLEShapes() {
  271. final List<ObjectShape<S,P>> oleShapes = new ArrayList<>();
  272. for (final Slide<S,P> slide : slideshow.getSlides()) {
  273. addOLEShapes(oleShapes, slide);
  274. }
  275. return oleShapes;
  276. }
  277. @SuppressWarnings("unchecked")
  278. private void addOLEShapes(final List<ObjectShape<S,P>> oleShapes, ShapeContainer<S,P> container) {
  279. for (Shape<S,P> shape : container) {
  280. if (shape instanceof ShapeContainer) {
  281. addOLEShapes(oleShapes, (ShapeContainer<S,P>)shape);
  282. } else if (shape instanceof ObjectShape) {
  283. oleShapes.add((ObjectShape<S,P>)shape);
  284. }
  285. }
  286. }
  287. private static String replaceSlideNumber(TextRun tr) {
  288. String raw = tr.getRawText();
  289. if (!raw.contains(SLIDE_NUMBER_PH)) {
  290. return raw;
  291. }
  292. TextParagraph<?,?,?> tp = tr.getParagraph();
  293. TextShape<?,?> ps = (tp != null) ? tp.getParentShape() : null;
  294. Sheet<?,?> sh = (ps != null) ? ps.getSheet() : null;
  295. String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide<?,?>)sh).getSlideNumber() + 1) : "";
  296. return raw.replace(SLIDE_NUMBER_PH, slideNr);
  297. }
  298. private static String replaceTextCap(TextRun tr) {
  299. final TextParagraph<?,?,?> tp = tr.getParagraph();
  300. final TextShape<?,?> sh = (tp != null) ? tp.getParentShape() : null;
  301. final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null;
  302. // 0xB acts like cariage return in page titles and like blank in the others
  303. final char sep = (
  304. ph == Placeholder.TITLE ||
  305. ph == Placeholder.CENTERED_TITLE ||
  306. ph == Placeholder.SUBTITLE
  307. ) ? '\n' : ' ';
  308. // PowerPoint seems to store files with \r as the line break
  309. // The messes things up on everything but a Mac, so translate them to \n
  310. String txt = tr.getRawText();
  311. txt = txt.replace('\r', '\n');
  312. txt = txt.replace((char) 0x0B, sep);
  313. switch (tr.getTextCap()) {
  314. case ALL:
  315. txt = txt.toUpperCase(LocaleUtil.getUserLocale());
  316. break;
  317. case SMALL:
  318. txt = txt.toLowerCase(LocaleUtil.getUserLocale());
  319. break;
  320. }
  321. return txt;
  322. }
  323. /**
  324. * Extract the used codepoints for font embedding / subsetting
  325. * @param typeface the typeface/font family of the textruns to examine
  326. * @param italic use {@code true} for italic TextRuns, {@code false} for non-italic ones and
  327. * {@code null} if it doesn't matter
  328. * @param bold use {@code true} for bold TextRuns, {@code false} for non-bold ones and
  329. * {@code null} if it doesn't matter
  330. * @return a bitset with the marked/used codepoints
  331. * @deprecated use {@link #getCodepointsInSparseBitSet(String, Boolean, Boolean)}
  332. */
  333. @Deprecated
  334. @Removal(version = "6.0.0")
  335. public BitSet getCodepoints(String typeface, Boolean italic, Boolean bold) {
  336. final BitSet glyphs = new BitSet();
  337. Predicate<Object> filterOld = filter;
  338. try {
  339. filter = o -> filterFonts(o, typeface, italic, bold);
  340. slideshow.getSlides().forEach(slide ->
  341. getText(slide, s -> s.codePoints().forEach(glyphs::set))
  342. );
  343. } finally {
  344. filter = filterOld;
  345. }
  346. return glyphs;
  347. }
  348. /**
  349. * Extract the used codepoints for font embedding / subsetting. This method is not intended for public use.
  350. *
  351. * @param typeface the typeface/font family of the textruns to examine
  352. * @param italic use {@code true} for italic TextRuns, {@code false} for non-italic ones and
  353. * {@code null} if it doesn't matter
  354. * @param bold use {@code true} for bold TextRuns, {@code false} for non-bold ones and
  355. * {@code null} if it doesn't matter
  356. * @return a bitset with the marked/used codepoints
  357. */
  358. @Internal
  359. public SparseBitSet getCodepointsInSparseBitSet(String typeface, Boolean italic, Boolean bold) {
  360. final SparseBitSet glyphs = new SparseBitSet();
  361. Predicate<Object> filterOld = filter;
  362. try {
  363. filter = o -> filterFonts(o, typeface, italic, bold);
  364. slideshow.getSlides().forEach(slide ->
  365. getText(slide, s -> s.codePoints().forEach(glyphs::set))
  366. );
  367. } finally {
  368. filter = filterOld;
  369. }
  370. return glyphs;
  371. }
  372. private static boolean filterFonts(Object o, String typeface, Boolean italic, Boolean bold) {
  373. if (!(o instanceof TextRun)) {
  374. return false;
  375. }
  376. TextRun tr = (TextRun)o;
  377. return
  378. typeface.equalsIgnoreCase(tr.getFontFamily()) &&
  379. (italic == null || tr.isItalic() == italic) &&
  380. (bold == null || tr.isBold() == bold);
  381. }
  382. @Override
  383. public void setCloseFilesystem(boolean doCloseFilesystem) {
  384. this.doCloseFilesystem = doCloseFilesystem;
  385. }
  386. @Override
  387. public boolean isCloseFilesystem() {
  388. return doCloseFilesystem;
  389. }
  390. @Override
  391. public SlideShow<S,P> getFilesystem() {
  392. return getDocument();
  393. }
  394. }