123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.sl.extractor;
-
- import java.util.ArrayList;
- import java.util.BitSet;
- import java.util.LinkedList;
- import java.util.List;
- import java.util.function.Consumer;
- import java.util.function.Function;
- import java.util.function.Predicate;
-
- import com.zaxxer.sparsebits.SparseBitSet;
- import org.apache.logging.log4j.LogManager;
- import org.apache.logging.log4j.Logger;
- import org.apache.poi.extractor.POITextExtractor;
- import org.apache.poi.sl.usermodel.MasterSheet;
- import org.apache.poi.sl.usermodel.Notes;
- import org.apache.poi.sl.usermodel.ObjectShape;
- import org.apache.poi.sl.usermodel.Placeholder;
- import org.apache.poi.sl.usermodel.PlaceholderDetails;
- import org.apache.poi.sl.usermodel.Shape;
- import org.apache.poi.sl.usermodel.ShapeContainer;
- import org.apache.poi.sl.usermodel.Sheet;
- import org.apache.poi.sl.usermodel.Slide;
- import org.apache.poi.sl.usermodel.SlideShow;
- import org.apache.poi.sl.usermodel.TableCell;
- import org.apache.poi.sl.usermodel.TableShape;
- import org.apache.poi.sl.usermodel.TextParagraph;
- import org.apache.poi.sl.usermodel.TextRun;
- import org.apache.poi.sl.usermodel.TextShape;
- import org.apache.poi.util.LocaleUtil;
-
- /**
- * Common SlideShow extractor
- *
- * @since POI 4.0.0
- */
- public class SlideShowExtractor<
- S extends Shape<S,P>,
- P extends TextParagraph<S,P,? extends TextRun>
- > implements POITextExtractor {
- private static final Logger LOG = LogManager.getLogger(SlideShowExtractor.class);
-
- // placeholder text for slide numbers
- private static final String SLIDE_NUMBER_PH = "‹#›";
-
-
- protected final SlideShow<S,P> slideshow;
-
- private boolean slidesByDefault = true;
- private boolean notesByDefault;
- private boolean commentsByDefault;
- private boolean masterByDefault;
-
- private Predicate<Object> filter = o -> true;
- private boolean doCloseFilesystem = true;
-
- public SlideShowExtractor(final SlideShow<S,P> slideshow) {
- this.slideshow = slideshow;
- }
-
- /**
- * Returns opened document
- *
- * @return the opened document
- */
- @Override
- public SlideShow<S,P> getDocument() {
- return slideshow;
- }
-
- /**
- * Should a call to getText() return slide text? Default is yes
- */
- public void setSlidesByDefault(final boolean slidesByDefault) {
- this.slidesByDefault = slidesByDefault;
- }
-
- /**
- * Should a call to getText() return notes text? Default is no
- */
- public void setNotesByDefault(final boolean notesByDefault) {
- this.notesByDefault = notesByDefault;
- }
-
- /**
- * Should a call to getText() return comments text? Default is no
- */
- public void setCommentsByDefault(final boolean commentsByDefault) {
- this.commentsByDefault = commentsByDefault;
- }
-
- /**
- * Should a call to getText() return text from master? Default is no
- */
- public void setMasterByDefault(final boolean masterByDefault) {
- this.masterByDefault = masterByDefault;
- }
-
- @Override
- public POITextExtractor getMetadataTextExtractor() {
- return slideshow.getMetadataTextExtractor();
- }
-
- /**
- * Fetches all the slide text from the slideshow, but not the notes, unless
- * you've called setSlidesByDefault() and setNotesByDefault() to change this
- */
- @Override
- public String getText() {
- final StringBuilder sb = new StringBuilder();
- for (final Slide<S, P> slide : slideshow.getSlides()) {
- getText(slide, sb::append);
- }
-
- return sb.toString();
- }
-
- public String getText(final Slide<S,P> slide) {
- final StringBuilder sb = new StringBuilder();
- getText(slide, sb::append);
- return sb.toString();
- }
-
-
- private void getText(final Slide<S,P> slide, final Consumer<String> consumer) {
- if (slidesByDefault) {
- printShapeText(slide, consumer);
- }
-
- if (masterByDefault) {
- final MasterSheet<S,P> ms = slide.getMasterSheet();
- printSlideMaster(ms, consumer);
-
- // only print slide layout, if it's a different instance
- final MasterSheet<S,P> sl = slide.getSlideLayout();
- if (sl != ms) {
- printSlideMaster(sl, consumer);
- }
- }
-
- if (commentsByDefault) {
- printComments(slide, consumer);
- }
-
- if (notesByDefault) {
- printNotes(slide, consumer);
- }
- }
-
- private void printSlideMaster(final MasterSheet<S,P> master, final Consumer<String> consumer) {
- if (master == null) {
- return;
- }
- for (final Shape<S,P> shape : master) {
- if (shape instanceof TextShape) {
- final TextShape<S,P> ts = (TextShape<S,P>)shape;
- final String text = ts.getText();
- if (text == null || text.isEmpty() || "*".equals(text)) {
- continue;
- }
-
- if (ts.isPlaceholder()) {
- // don't bother about boiler plate text on master sheets
- LOG.atInfo().log("Ignoring boiler plate (placeholder) text on slide master: {}", text);
- continue;
- }
-
- printTextParagraphs(ts.getTextParagraphs(), consumer);
- }
- }
- }
-
- private void printTextParagraphs(final List<P> paras, final Consumer<String> consumer) {
- printTextParagraphs(paras, consumer, "\n");
- }
-
-
- private void printTextParagraphs(final List<P> paras, final Consumer<String> consumer, String trailer) {
- printTextParagraphs(paras, consumer, trailer, SlideShowExtractor::replaceTextCap);
- }
-
- private void printTextParagraphs(final List<P> paras, final Consumer<String> consumer, String trailer, final Function<TextRun,String> converter) {
- for (P p : paras) {
- for (TextRun r : p) {
- if (filter.test(r)) {
- consumer.accept(converter.apply(r));
- }
- }
- if (!trailer.isEmpty() && filter.test(trailer)) {
- consumer.accept(trailer);
- }
- }
- }
-
- private void printHeaderFooter(final Sheet<S,P> sheet, final Consumer<String> consumer, final Consumer<String> footerCon) {
- final Sheet<S, P> m = (sheet instanceof Slide) ? sheet.getMasterSheet() : sheet;
- addSheetPlaceholderDatails(sheet, Placeholder.HEADER, consumer);
- addSheetPlaceholderDatails(sheet, Placeholder.FOOTER, footerCon);
-
- if (!masterByDefault) {
- return;
- }
-
- // write header texts and determine footer text
- for (Shape<S, P> s : m) {
- if (!(s instanceof TextShape)) {
- continue;
- }
- final TextShape<S, P> ts = (TextShape<S, P>) s;
- final PlaceholderDetails pd = ts.getPlaceholderDetails();
- if (pd == null || !pd.isVisible() || pd.getPlaceholder() == null) {
- continue;
- }
- switch (pd.getPlaceholder()) {
- case HEADER:
- printTextParagraphs(ts.getTextParagraphs(), consumer);
- break;
- case FOOTER:
- printTextParagraphs(ts.getTextParagraphs(), footerCon);
- break;
- case SLIDE_NUMBER:
- printTextParagraphs(ts.getTextParagraphs(), footerCon, "\n", SlideShowExtractor::replaceSlideNumber);
- break;
- case DATETIME:
- // currently not supported
- default:
- break;
- }
- }
- }
-
-
- private void addSheetPlaceholderDatails(final Sheet<S,P> sheet, final Placeholder placeholder, final Consumer<String> consumer) {
- final PlaceholderDetails headerPD = sheet.getPlaceholderDetails(placeholder);
- final String headerStr = (headerPD != null) ? headerPD.getText() : null;
- if (headerStr != null && filter.test(headerPD)) {
- consumer.accept(headerStr);
- }
- }
-
- private void printShapeText(final Sheet<S,P> sheet, final Consumer<String> consumer) {
- final List<String> footer = new LinkedList<>();
- printHeaderFooter(sheet, consumer, footer::add);
- printShapeText((ShapeContainer<S,P>)sheet, consumer);
- footer.forEach(consumer);
- }
-
- @SuppressWarnings("unchecked")
- private void printShapeText(final ShapeContainer<S,P> container, final Consumer<String> consumer) {
- for (Shape<S,P> shape : container) {
- if (shape instanceof TextShape) {
- printTextParagraphs(((TextShape<S,P>)shape).getTextParagraphs(), consumer);
- } else if (shape instanceof TableShape) {
- printShapeText((TableShape<S,P>)shape, consumer);
- } else if (shape instanceof ShapeContainer) {
- printShapeText((ShapeContainer<S,P>)shape, consumer);
- }
- }
- }
-
- @SuppressWarnings("Duplicates")
- private void printShapeText(final TableShape<S,P> shape, final Consumer<String> consumer) {
- final int nrows = shape.getNumberOfRows();
- final int ncols = shape.getNumberOfColumns();
- for (int row = 0; row < nrows; row++) {
- String trailer = "";
- for (int col = 0; col < ncols; col++){
- TableCell<S, P> cell = shape.getCell(row, col);
- //defensive null checks; don't know if they're necessary
- if (cell != null) {
- trailer = col < ncols-1 ? "\t" : "\n";
- printTextParagraphs(cell.getTextParagraphs(), consumer, trailer);
- }
- }
- if (!trailer.equals("\n") && filter.test("\n")) {
- consumer.accept("\n");
- }
- }
- }
-
- private void printComments(final Slide<S,P> slide, final Consumer<String> consumer) {
- slide.getComments().stream().filter(filter).map(c -> c.getAuthor()+" - "+c.getText()).forEach(consumer);
- }
-
- private void printNotes(final Slide<S,P> slide, final Consumer<String> consumer) {
- final Notes<S, P> notes = slide.getNotes();
- if (notes == null) {
- return;
- }
-
- List<String> footer = new LinkedList<>();
- printHeaderFooter(notes, consumer, footer::add);
- printShapeText(notes, consumer);
- footer.forEach(consumer);
- }
-
- public List<? extends ObjectShape<S,P>> getOLEShapes() {
- final List<ObjectShape<S,P>> oleShapes = new ArrayList<>();
-
- for (final Slide<S,P> slide : slideshow.getSlides()) {
- addOLEShapes(oleShapes, slide);
- }
-
- return oleShapes;
- }
-
- @SuppressWarnings("unchecked")
- private void addOLEShapes(final List<ObjectShape<S,P>> oleShapes, ShapeContainer<S,P> container) {
- for (Shape<S,P> shape : container) {
- if (shape instanceof ShapeContainer) {
- addOLEShapes(oleShapes, (ShapeContainer<S,P>)shape);
- } else if (shape instanceof ObjectShape) {
- oleShapes.add((ObjectShape<S,P>)shape);
- }
- }
- }
-
- private static String replaceSlideNumber(TextRun tr) {
- String raw = tr.getRawText();
-
- if (!raw.contains(SLIDE_NUMBER_PH)) {
- return raw;
- }
-
- TextParagraph<?,?,?> tp = tr.getParagraph();
- TextShape<?,?> ps = (tp != null) ? tp.getParentShape() : null;
- Sheet<?,?> sh = (ps != null) ? ps.getSheet() : null;
- String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide<?,?>)sh).getSlideNumber() + 1) : "";
-
- return raw.replace(SLIDE_NUMBER_PH, slideNr);
- }
-
- private static String replaceTextCap(TextRun tr) {
- final TextParagraph<?,?,?> tp = tr.getParagraph();
- final TextShape<?,?> sh = (tp != null) ? tp.getParentShape() : null;
- final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null;
-
- // 0xB acts like cariage return in page titles and like blank in the others
- final char sep = (
- ph == Placeholder.TITLE ||
- ph == Placeholder.CENTERED_TITLE ||
- ph == Placeholder.SUBTITLE
- ) ? '\n' : ' ';
-
- // PowerPoint seems to store files with \r as the line break
- // The messes things up on everything but a Mac, so translate them to \n
- String txt = tr.getRawText();
- txt = txt.replace('\r', '\n');
- txt = txt.replace((char) 0x0B, sep);
-
- switch (tr.getTextCap()) {
- case ALL:
- txt = txt.toUpperCase(LocaleUtil.getUserLocale());
- break;
- case SMALL:
- txt = txt.toLowerCase(LocaleUtil.getUserLocale());
- break;
- }
-
- return txt;
- }
-
- /**
- * Extract the used codepoints for font embedding / subsetting
- * @param typeface the typeface/font family of the textruns to examine
- * @param italic use {@code true} for italic TextRuns, {@code false} for non-italic ones and
- * {@code null} if it doesn't matter
- * @param bold use {@code true} for bold TextRuns, {@code false} for non-bold ones and
- * {@code null} if it doesn't matter
- * @return a bitset with the marked/used codepoints
- * @deprecated use {@link #getCodepointsInSparseBitSet(String, Boolean, Boolean)}
- */
- @Deprecated
- public BitSet getCodepoints(String typeface, Boolean italic, Boolean bold) {
- final BitSet glyphs = new BitSet();
-
- Predicate<Object> filterOld = filter;
- try {
- filter = o -> filterFonts(o, typeface, italic, bold);
- slideshow.getSlides().forEach(slide ->
- getText(slide, s -> s.codePoints().forEach(glyphs::set))
- );
- } finally {
- filter = filterOld;
- }
-
- return glyphs;
- }
-
- /**
- * Extract the used codepoints for font embedding / subsetting
- * @param typeface the typeface/font family of the textruns to examine
- * @param italic use {@code true} for italic TextRuns, {@code false} for non-italic ones and
- * {@code null} if it doesn't matter
- * @param bold use {@code true} for bold TextRuns, {@code false} for non-bold ones and
- * {@code null} if it doesn't matter
- * @return a bitset with the marked/used codepoints
- */
- public SparseBitSet getCodepointsInSparseBitSet(String typeface, Boolean italic, Boolean bold) {
- final SparseBitSet glyphs = new SparseBitSet();
-
- Predicate<Object> filterOld = filter;
- try {
- filter = o -> filterFonts(o, typeface, italic, bold);
- slideshow.getSlides().forEach(slide ->
- getText(slide, s -> s.codePoints().forEach(glyphs::set))
- );
- } finally {
- filter = filterOld;
- }
-
- return glyphs;
- }
- private static boolean filterFonts(Object o, String typeface, Boolean italic, Boolean bold) {
- if (!(o instanceof TextRun)) {
- return false;
- }
- TextRun tr = (TextRun)o;
- return
- typeface.equalsIgnoreCase(tr.getFontFamily()) &&
- (italic == null || tr.isItalic() == italic) &&
- (bold == null || tr.isBold() == bold);
- }
-
- @Override
- public void setCloseFilesystem(boolean doCloseFilesystem) {
- this.doCloseFilesystem = doCloseFilesystem;
- }
-
- @Override
- public boolean isCloseFilesystem() {
- return doCloseFilesystem;
- }
-
- @Override
- public SlideShow<S,P> getFilesystem() {
- return getDocument();
- }
- }
|