123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.xslf.extractor;
-
- import static org.apache.poi.POITestCase.assertContains;
- import static org.apache.poi.POITestCase.assertNotContained;
- import static org.junit.Assert.assertEquals;
- import static org.junit.Assert.assertFalse;
- import static org.junit.Assert.assertNotNull;
- import static org.junit.Assert.assertTrue;
-
- import java.io.IOException;
- import java.io.InputStream;
-
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.POITextExtractor;
- import org.apache.poi.extractor.ExtractorFactory;
- import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.xslf.usermodel.XMLSlideShow;
- import org.apache.xmlbeans.XmlException;
- import org.junit.Test;
-
- /**
- * Tests for XSLFPowerPointExtractor
- */
- public class TestXSLFPowerPointExtractor {
- private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
-
- /**
- * Get text out of the simple file
- * @throws XmlException
- * @throws OpenXML4JException
- */
- @Test
- public void testGetSimpleText()
- throws IOException, XmlException, OpenXML4JException {
- XMLSlideShow xmlA = openPPTX("sample.pptx");
- @SuppressWarnings("resource")
- OPCPackage pkg = xmlA.getPackage();
-
- new XSLFPowerPointExtractor(xmlA).close();
- new XSLFPowerPointExtractor(pkg).close();
-
- XSLFPowerPointExtractor extractor =
- new XSLFPowerPointExtractor(xmlA);
- extractor.getText();
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check Basics
- assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
- assertContains(text, "amet\n\n");
-
- // Our placeholder master text
- // This shouldn't show up in the output
- // String masterText =
- // "Click to edit Master title style\n" +
- // "Click to edit Master subtitle style\n" +
- // "\n\n\n\n\n\n" +
- // "Click to edit Master title style\n" +
- // "Click to edit Master text styles\n" +
- // "Second level\n" +
- // "Third level\n" +
- // "Fourth level\n" +
- // "Fifth level\n";
-
- // Just slides, no notes
- text = extractor.getText(true, false, false);
- String slideText =
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n";
- assertEquals(slideText, text);
-
- // Just notes, no slides
- text = extractor.getText(false, true);
- assertEquals("\n\n\n\n", text);
-
- // Both
- text = extractor.getText(true, true, false);
- String bothText =
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n\n\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n\n\n";
- assertEquals(bothText, text);
-
- // With Slides and Master Text
- text = extractor.getText(true, false, true);
- String smText =
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n";
- assertEquals(smText, text);
-
- // With Slides, Notes and Master Text
- text = extractor.getText(true, true, true);
- String snmText =
- "Lorem ipsum dolor sit amet\n" +
- "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
- "\n" +
- "\n\n" +
- "Lorem ipsum dolor sit amet\n" +
- "Lorem\n" +
- "ipsum\n" +
- "dolor\n" +
- "sit\n" +
- "amet\n" +
- "\n\n\n";
- assertEquals(snmText, text);
-
- // Via set defaults
- extractor.setSlidesByDefault(false);
- extractor.setNotesByDefault(true);
- text = extractor.getText();
- assertEquals("\n\n\n\n", text);
-
- extractor.close();
- xmlA.close();
- }
-
- public void testGetComments() throws IOException {
- XMLSlideShow xml = openPPTX("45545_Comment.pptx");
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check comments are there
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
-
- // Check the authors came through too
- assertContains(text, "XPVMWARE01");
-
- extractor.close();
- xml.close();
- }
-
- public void testGetMasterText() throws Exception {
- XMLSlideShow xml = openPPTX("WithMaster.pptx");
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
- extractor.setSlidesByDefault(true);
- extractor.setNotesByDefault(false);
- extractor.setMasterByDefault(true);
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check master text is there
- assertContains(text, "Footer from the master slide");
-
- // Theme text shouldn't show up
- // String themeText =
- // "Theme Master Title\n" +
- // "Theme Master first level\n" +
- // "And the 2nd level\n" +
- // "Our 3rd level goes here\n" +
- // "And onto the 4th, such fun....\n" +
- // "Finally is the Fifth level\n";
-
- // Check the whole text
- String wholeText =
- "First page title\n" +
- "First page subtitle\n" +
- "This is the Master Title\n" +
- "This text comes from the Master Slide\n" +
- "\n" +
- // TODO Detect we didn't have a title, and include the master one
- "2nd page subtitle\n" +
- "Footer from the master slide\n" +
- "This is the Master Title\n" +
- "This text comes from the Master Slide\n";
- assertEquals(wholeText, text);
-
- extractor.close();
- xml.close();
- }
-
- @Test
- public void testTable() throws Exception {
- XMLSlideShow xml = openPPTX("present1.pptx");
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-
- String text = extractor.getText();
- assertTrue(text.length() > 0);
-
- // Check comments are there
- assertTrue("Unable to find expected word in text\n" + text, text.contains("TEST"));
-
- extractor.close();
- xml.close();
- }
-
- /**
- * Test that we can get the text from macro enabled,
- * template, theme, slide enabled etc formats, as
- * well as from the normal file
- */
- @Test
- public void testDifferentSubformats() throws Exception {
- String[] extensions = new String[] {
- "pptx", "pptm", "ppsm", "ppsx", "thmx",
- // "xps" - Doesn't have a core document
- };
- for(String extension : extensions) {
- String filename = "testPPT." + extension;
- XMLSlideShow xml = openPPTX(filename);
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
-
- String text = extractor.getText();
- if (extension.equals("thmx")) {
- // Theme file doesn't have any textual content
- assertEquals(0, text.length());
- continue;
- }
-
- assertTrue(text.length() > 0);
- assertTrue(
- "Text missing for " + filename + "\n" + text,
- text.contains("Attachment Test")
- );
- assertTrue(
- "Text missing for " + filename + "\n" + text,
- text.contains("This is a test file data with the same content")
- );
- assertTrue(
- "Text missing for " + filename + "\n" + text,
- text.contains("content parsing")
- );
- assertTrue(
- "Text missing for " + filename + "\n" + text,
- text.contains("Different words to test against")
- );
- assertTrue(
- "Text missing for " + filename + "\n" + text,
- text.contains("Mystery")
- );
-
- extractor.close();
- xml.close();
- }
- }
-
- @Test
- public void test45541() throws Exception {
- // extract text from a powerpoint that has a header in the notes-element
- POITextExtractor extr = ExtractorFactory.createExtractor(
- slTests.getFile("45541_Header.pptx"));
- String text = extr.getText();
- assertNotNull(text);
- assertFalse("Had: " + text, text.contains("testdoc"));
-
- text = ((XSLFPowerPointExtractor)extr).getText(false, true);
- assertContains(text, "testdoc");
- extr.close();
- assertNotNull(text);
-
- // extract text from a powerpoint that has a footer in the master-slide
- extr = ExtractorFactory.createExtractor(
- slTests.getFile("45541_Footer.pptx"));
- text = extr.getText();
- assertNotContained(text, "testdoc");
-
- text = ((XSLFPowerPointExtractor)extr).getText(false, true);
- assertNotContained(text, "testdoc");
-
- text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
- assertNotContained(text, "testdoc");
-
- extr.close();
- }
-
-
- @Test
- public void bug54570() throws IOException {
- XMLSlideShow xml = openPPTX("bug54570.pptx");
- XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
- String text = extractor.getText();
- assertNotNull(text);
- extractor.close();
- xml.close();
- }
-
- private XMLSlideShow openPPTX(String file) throws IOException {
- InputStream is = slTests.openResourceAsStream(file);
- try {
- return new XMLSlideShow(is);
- } finally {
- is.close();
- }
- }
- }
|