|
|
@@ -16,104 +16,107 @@ |
|
|
|
==================================================================== */ |
|
|
|
package org.apache.poi.xslf.extractor; |
|
|
|
|
|
|
|
import static org.apache.poi.POITestCase.assertContains; |
|
|
|
import static org.apache.poi.POITestCase.assertNotContained; |
|
|
|
import static org.junit.Assert.assertEquals; |
|
|
|
import static org.junit.Assert.assertFalse; |
|
|
|
import static org.junit.Assert.assertNotNull; |
|
|
|
import static org.junit.Assert.assertTrue; |
|
|
|
|
|
|
|
import java.io.IOException; |
|
|
|
import java.io.InputStream; |
|
|
|
|
|
|
|
import org.apache.poi.POIDataSamples; |
|
|
|
import org.apache.poi.POITextExtractor; |
|
|
|
import org.apache.poi.extractor.ExtractorFactory; |
|
|
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; |
|
|
|
import org.apache.poi.openxml4j.opc.OPCPackage; |
|
|
|
import org.apache.poi.xslf.usermodel.XSLFSlideShow; |
|
|
|
|
|
|
|
import junit.framework.TestCase; |
|
|
|
import org.apache.poi.xslf.usermodel.XMLSlideShow; |
|
|
|
import org.apache.xmlbeans.XmlException; |
|
|
|
import org.junit.Test; |
|
|
|
|
|
|
|
/** |
|
|
|
* Tests for HXFPowerPointExtractor |
|
|
|
* Tests for XSLFPowerPointExtractor |
|
|
|
*/ |
|
|
|
public class TestXSLFPowerPointExtractor extends TestCase { |
|
|
|
/** |
|
|
|
* A simple file |
|
|
|
*/ |
|
|
|
private XSLFSlideShow xmlA; |
|
|
|
private OPCPackage pkg; |
|
|
|
|
|
|
|
private POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); |
|
|
|
|
|
|
|
protected void setUp() throws Exception { |
|
|
|
slTests = POIDataSamples.getSlideShowInstance(); |
|
|
|
pkg = OPCPackage.open(slTests.openResourceAsStream("sample.pptx")); |
|
|
|
xmlA = new XSLFSlideShow(pkg); |
|
|
|
} |
|
|
|
public class TestXSLFPowerPointExtractor { |
|
|
|
private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); |
|
|
|
|
|
|
|
/** |
|
|
|
* Get text out of the simple file |
|
|
|
* @throws XmlException |
|
|
|
* @throws OpenXML4JException |
|
|
|
*/ |
|
|
|
public void testGetSimpleText() throws Exception { |
|
|
|
new XSLFPowerPointExtractor(xmlA).close(); |
|
|
|
@Test |
|
|
|
public void testGetSimpleText() |
|
|
|
throws IOException, XmlException, OpenXML4JException { |
|
|
|
XMLSlideShow xmlA = openPPTX("sample.pptx"); |
|
|
|
@SuppressWarnings("resource") |
|
|
|
OPCPackage pkg = xmlA.getPackage(); |
|
|
|
|
|
|
|
new XSLFPowerPointExtractor(xmlA).close(); |
|
|
|
new XSLFPowerPointExtractor(pkg).close(); |
|
|
|
|
|
|
|
XSLFPowerPointExtractor extractor = |
|
|
|
|
|
|
|
XSLFPowerPointExtractor extractor = |
|
|
|
new XSLFPowerPointExtractor(xmlA); |
|
|
|
extractor.getText(); |
|
|
|
|
|
|
|
|
|
|
|
String text = extractor.getText(); |
|
|
|
assertTrue(text.length() > 0); |
|
|
|
|
|
|
|
|
|
|
|
// Check Basics |
|
|
|
assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n")); |
|
|
|
assertTrue(text.contains("amet\n\n")); |
|
|
|
assertContains(text, "amet\n\n"); |
|
|
|
|
|
|
|
// Our placeholder master text |
|
|
|
// This shouldn't show up in the output |
|
|
|
String masterText = |
|
|
|
"Click to edit Master title style\n" + |
|
|
|
"Click to edit Master subtitle style\n" + |
|
|
|
"\n\n\n\n\n\n" + |
|
|
|
"Click to edit Master title style\n" + |
|
|
|
"Click to edit Master text styles\n" + |
|
|
|
"Second level\n" + |
|
|
|
"Third level\n" + |
|
|
|
"Fourth level\n" + |
|
|
|
"Fifth level\n"; |
|
|
|
|
|
|
|
// String masterText = |
|
|
|
// "Click to edit Master title style\n" + |
|
|
|
// "Click to edit Master subtitle style\n" + |
|
|
|
// "\n\n\n\n\n\n" + |
|
|
|
// "Click to edit Master title style\n" + |
|
|
|
// "Click to edit Master text styles\n" + |
|
|
|
// "Second level\n" + |
|
|
|
// "Third level\n" + |
|
|
|
// "Fourth level\n" + |
|
|
|
// "Fifth level\n"; |
|
|
|
|
|
|
|
// Just slides, no notes |
|
|
|
text = extractor.getText(true, false, false); |
|
|
|
assertEquals( |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + |
|
|
|
"\n" + |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Lorem\n" + |
|
|
|
"ipsum\n" + |
|
|
|
"dolor\n" + |
|
|
|
"sit\n" + |
|
|
|
"amet\n" + |
|
|
|
"\n" |
|
|
|
, text |
|
|
|
); |
|
|
|
|
|
|
|
String slideText = |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + |
|
|
|
"\n" + |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Lorem\n" + |
|
|
|
"ipsum\n" + |
|
|
|
"dolor\n" + |
|
|
|
"sit\n" + |
|
|
|
"amet\n" + |
|
|
|
"\n"; |
|
|
|
assertEquals(slideText, text); |
|
|
|
|
|
|
|
// Just notes, no slides |
|
|
|
text = extractor.getText(false, true); |
|
|
|
assertEquals( |
|
|
|
"\n\n\n\n", text |
|
|
|
); |
|
|
|
|
|
|
|
assertEquals("\n\n\n\n", text); |
|
|
|
|
|
|
|
// Both |
|
|
|
text = extractor.getText(true, true, false); |
|
|
|
assertEquals( |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + |
|
|
|
String bothText = |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + |
|
|
|
"\n\n\n" + |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Lorem\n" + |
|
|
|
"ipsum\n" + |
|
|
|
"dolor\n" + |
|
|
|
"sit\n" + |
|
|
|
"amet\n" + |
|
|
|
"\n\n\n" |
|
|
|
, text |
|
|
|
); |
|
|
|
|
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Lorem\n" + |
|
|
|
"ipsum\n" + |
|
|
|
"dolor\n" + |
|
|
|
"sit\n" + |
|
|
|
"amet\n" + |
|
|
|
"\n\n\n"; |
|
|
|
assertEquals(bothText, text); |
|
|
|
|
|
|
|
// With Slides and Master Text |
|
|
|
text = extractor.getText(true, false, true); |
|
|
|
assertEquals( |
|
|
|
text = extractor.getText(true, false, true); |
|
|
|
String smText = |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + |
|
|
|
"\n" + |
|
|
@@ -123,13 +126,12 @@ public class TestXSLFPowerPointExtractor extends TestCase { |
|
|
|
"dolor\n" + |
|
|
|
"sit\n" + |
|
|
|
"amet\n" + |
|
|
|
"\n" |
|
|
|
, text |
|
|
|
); |
|
|
|
|
|
|
|
"\n"; |
|
|
|
assertEquals(smText, text); |
|
|
|
|
|
|
|
// With Slides, Notes and Master Text |
|
|
|
text = extractor.getText(true, true, true); |
|
|
|
assertEquals( |
|
|
|
text = extractor.getText(true, true, true); |
|
|
|
String snmText = |
|
|
|
"Lorem ipsum dolor sit amet\n" + |
|
|
|
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + |
|
|
|
"\n" + |
|
|
@@ -140,67 +142,61 @@ public class TestXSLFPowerPointExtractor extends TestCase { |
|
|
|
"dolor\n" + |
|
|
|
"sit\n" + |
|
|
|
"amet\n" + |
|
|
|
"\n\n\n" |
|
|
|
, text |
|
|
|
); |
|
|
|
|
|
|
|
"\n\n\n"; |
|
|
|
assertEquals(snmText, text); |
|
|
|
|
|
|
|
// Via set defaults |
|
|
|
extractor.setSlidesByDefault(false); |
|
|
|
extractor.setNotesByDefault(true); |
|
|
|
text = extractor.getText(); |
|
|
|
assertEquals( |
|
|
|
"\n\n\n\n", text |
|
|
|
); |
|
|
|
|
|
|
|
assertEquals("\n\n\n\n", text); |
|
|
|
|
|
|
|
extractor.close(); |
|
|
|
xmlA.close(); |
|
|
|
} |
|
|
|
|
|
|
|
public void testGetComments() throws Exception { |
|
|
|
XSLFSlideShow xml = |
|
|
|
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("45545_Comment.pptx"))); |
|
|
|
XSLFPowerPointExtractor extractor = |
|
|
|
new XSLFPowerPointExtractor(xml); |
|
|
|
|
|
|
|
String text = extractor.getText(); |
|
|
|
assertTrue(text.length() > 0); |
|
|
|
|
|
|
|
// Check comments are there |
|
|
|
assertTrue("Unable to find expected word in text\n" + text, text.contains("testdoc")); |
|
|
|
assertTrue("Unable to find expected word in text\n" + text, text.contains("test phrase")); |
|
|
|
|
|
|
|
// Check the authors came through too |
|
|
|
assertTrue("Unable to find expected word in text\n" + text, text.contains("XPVMWARE01")); |
|
|
|
|
|
|
|
extractor.close(); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
public void testGetComments() throws IOException { |
|
|
|
XMLSlideShow xml = openPPTX("45545_Comment.pptx"); |
|
|
|
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); |
|
|
|
|
|
|
|
String text = extractor.getText(); |
|
|
|
assertTrue(text.length() > 0); |
|
|
|
|
|
|
|
// Check comments are there |
|
|
|
assertContains(text, "testdoc"); |
|
|
|
assertContains(text, "test phrase"); |
|
|
|
|
|
|
|
// Check the authors came through too |
|
|
|
assertContains(text, "XPVMWARE01"); |
|
|
|
|
|
|
|
extractor.close(); |
|
|
|
xml.close(); |
|
|
|
} |
|
|
|
|
|
|
|
public void testGetMasterText() throws Exception { |
|
|
|
XSLFSlideShow xml = |
|
|
|
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx"))); |
|
|
|
XSLFPowerPointExtractor extractor = |
|
|
|
new XSLFPowerPointExtractor(xml); |
|
|
|
extractor.setSlidesByDefault(true); |
|
|
|
extractor.setNotesByDefault(false); |
|
|
|
extractor.setMasterByDefault(true); |
|
|
|
|
|
|
|
String text = extractor.getText(); |
|
|
|
assertTrue(text.length() > 0); |
|
|
|
|
|
|
|
// Check master text is there |
|
|
|
assertTrue("Unable to find expected word in text\n" + text, |
|
|
|
text.contains("Footer from the master slide")); |
|
|
|
|
|
|
|
// Theme text shouldn't show up |
|
|
|
String themeText = |
|
|
|
"Theme Master Title\n" + |
|
|
|
"Theme Master first level\n" + |
|
|
|
"And the 2nd level\n" + |
|
|
|
"Our 3rd level goes here\n" + |
|
|
|
"And onto the 4th, such fun....\n" + |
|
|
|
"Finally is the Fifth level\n"; |
|
|
|
|
|
|
|
// Check the whole text |
|
|
|
assertEquals( |
|
|
|
XMLSlideShow xml = openPPTX("WithMaster.pptx"); |
|
|
|
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); |
|
|
|
extractor.setSlidesByDefault(true); |
|
|
|
extractor.setNotesByDefault(false); |
|
|
|
extractor.setMasterByDefault(true); |
|
|
|
|
|
|
|
String text = extractor.getText(); |
|
|
|
assertTrue(text.length() > 0); |
|
|
|
|
|
|
|
// Check master text is there |
|
|
|
assertContains(text, "Footer from the master slide"); |
|
|
|
|
|
|
|
// Theme text shouldn't show up |
|
|
|
// String themeText = |
|
|
|
// "Theme Master Title\n" + |
|
|
|
// "Theme Master first level\n" + |
|
|
|
// "And the 2nd level\n" + |
|
|
|
// "Our 3rd level goes here\n" + |
|
|
|
// "And onto the 4th, such fun....\n" + |
|
|
|
// "Finally is the Fifth level\n"; |
|
|
|
|
|
|
|
// Check the whole text |
|
|
|
String wholeText = |
|
|
|
"First page title\n" + |
|
|
|
"First page subtitle\n" + |
|
|
|
"This is the Master Title\n" + |
|
|
@@ -210,108 +206,124 @@ public class TestXSLFPowerPointExtractor extends TestCase { |
|
|
|
"2nd page subtitle\n" + |
|
|
|
"Footer from the master slide\n" + |
|
|
|
"This is the Master Title\n" + |
|
|
|
"This text comes from the Master Slide\n" |
|
|
|
, text |
|
|
|
); |
|
|
|
|
|
|
|
"This text comes from the Master Slide\n"; |
|
|
|
assertEquals(wholeText, text); |
|
|
|
|
|
|
|
extractor.close(); |
|
|
|
xml.close(); |
|
|
|
} |
|
|
|
|
|
|
|
public void testTable() throws Exception { |
|
|
|
XSLFSlideShow xml = |
|
|
|
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("present1.pptx"))); |
|
|
|
XSLFPowerPointExtractor extractor = |
|
|
|
new XSLFPowerPointExtractor(xml); |
|
|
|
@Test |
|
|
|
public void testTable() throws Exception { |
|
|
|
XMLSlideShow xml = openPPTX("present1.pptx"); |
|
|
|
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); |
|
|
|
|
|
|
|
String text = extractor.getText(); |
|
|
|
assertTrue(text.length() > 0); |
|
|
|
|
|
|
|
// Check comments are there |
|
|
|
assertTrue("Unable to find expected word in text\n" + text, text.contains("TEST")); |
|
|
|
|
|
|
|
|
|
|
|
extractor.close(); |
|
|
|
xml.close(); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** |
|
|
|
* Test that we can get the text from macro enabled, |
|
|
|
* template, theme, slide enabled etc formats, as |
|
|
|
* template, theme, slide enabled etc formats, as |
|
|
|
* well as from the normal file |
|
|
|
*/ |
|
|
|
@Test |
|
|
|
public void testDifferentSubformats() throws Exception { |
|
|
|
String[] extensions = new String[] { |
|
|
|
"pptx", "pptm", "ppsm", "ppsx", |
|
|
|
"thmx", |
|
|
|
//"xps" // Doesn't have a core document |
|
|
|
}; |
|
|
|
for(String extension : extensions) { |
|
|
|
String filename = "testPPT." + extension; |
|
|
|
XSLFSlideShow xml = |
|
|
|
new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream(filename))); |
|
|
|
XSLFPowerPointExtractor extractor = |
|
|
|
new XSLFPowerPointExtractor(xml); |
|
|
|
|
|
|
|
String text = extractor.getText(); |
|
|
|
if(extension.equals("thmx")) { |
|
|
|
// Theme file doesn't have any textual content |
|
|
|
assertEquals(0, text.length()); |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
String[] extensions = new String[] { |
|
|
|
"pptx", "pptm", "ppsm", "ppsx", "thmx", |
|
|
|
// "xps" - Doesn't have a core document |
|
|
|
}; |
|
|
|
for(String extension : extensions) { |
|
|
|
String filename = "testPPT." + extension; |
|
|
|
XMLSlideShow xml = openPPTX(filename); |
|
|
|
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); |
|
|
|
|
|
|
|
String text = extractor.getText(); |
|
|
|
if (extension.equals("thmx")) { |
|
|
|
// Theme file doesn't have any textual content |
|
|
|
assertEquals(0, text.length()); |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
assertTrue(text.length() > 0); |
|
|
|
assertTrue( |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
text.contains("Attachment Test") |
|
|
|
); |
|
|
|
assertTrue( |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
text.contains("This is a test file data with the same content") |
|
|
|
); |
|
|
|
assertTrue( |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
text.contains("content parsing") |
|
|
|
); |
|
|
|
assertTrue( |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
text.contains("Different words to test against") |
|
|
|
); |
|
|
|
assertTrue( |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
"Text missing for " + filename + "\n" + text, |
|
|
|
text.contains("Mystery") |
|
|
|
); |
|
|
|
|
|
|
|
|
|
|
|
extractor.close(); |
|
|
|
xml.close(); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
public void test45541() throws Exception { |
|
|
|
// extract text from a powerpoint that has a header in the notes-element |
|
|
|
POITextExtractor extr = ExtractorFactory.createExtractor(slTests |
|
|
|
.openResourceAsStream("45541_Header.pptx")); |
|
|
|
String text = extr.getText(); |
|
|
|
assertNotNull(text); |
|
|
|
assertFalse("Had: " + text, text.contains("testdoc")); |
|
|
|
|
|
|
|
text = ((XSLFPowerPointExtractor)extr).getText(false, true); |
|
|
|
assertNotNull(text); |
|
|
|
assertTrue("Had: " + text, text.contains("testdoc")); |
|
|
|
extr.close(); |
|
|
|
assertNotNull(text); |
|
|
|
@Test |
|
|
|
public void test45541() throws Exception { |
|
|
|
// extract text from a powerpoint that has a header in the notes-element |
|
|
|
POITextExtractor extr = ExtractorFactory.createExtractor( |
|
|
|
slTests.getFile("45541_Header.pptx")); |
|
|
|
String text = extr.getText(); |
|
|
|
assertNotNull(text); |
|
|
|
assertFalse("Had: " + text, text.contains("testdoc")); |
|
|
|
|
|
|
|
// extract text from a powerpoint that has a footer in the master-slide |
|
|
|
extr = ExtractorFactory.createExtractor(slTests |
|
|
|
.openResourceAsStream("45541_Footer.pptx")); |
|
|
|
text = extr.getText(); |
|
|
|
assertNotNull(text); |
|
|
|
assertFalse("Had " + text, text.contains("testdoc")); |
|
|
|
|
|
|
|
text = ((XSLFPowerPointExtractor)extr).getText(false, true); |
|
|
|
text = ((XSLFPowerPointExtractor)extr).getText(false, true); |
|
|
|
assertContains(text, "testdoc"); |
|
|
|
extr.close(); |
|
|
|
assertNotNull(text); |
|
|
|
assertFalse("Had: " + text, text.contains("testdoc")); |
|
|
|
|
|
|
|
text = ((XSLFPowerPointExtractor)extr).getText(false, false, true); |
|
|
|
assertNotNull(text); |
|
|
|
assertFalse("Had: " + text, text.contains("testdoc")); |
|
|
|
// extract text from a powerpoint that has a footer in the master-slide |
|
|
|
extr = ExtractorFactory.createExtractor( |
|
|
|
slTests.getFile("45541_Footer.pptx")); |
|
|
|
text = extr.getText(); |
|
|
|
assertNotContained(text, "testdoc"); |
|
|
|
|
|
|
|
text = ((XSLFPowerPointExtractor)extr).getText(false, true); |
|
|
|
assertNotContained(text, "testdoc"); |
|
|
|
|
|
|
|
extr.close(); |
|
|
|
} |
|
|
|
text = ((XSLFPowerPointExtractor)extr).getText(false, false, true); |
|
|
|
assertNotContained(text, "testdoc"); |
|
|
|
|
|
|
|
extr.close(); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
@Test |
|
|
|
public void bug54570() throws IOException { |
|
|
|
XMLSlideShow xml = openPPTX("bug54570.pptx"); |
|
|
|
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml); |
|
|
|
String text = extractor.getText(); |
|
|
|
assertNotNull(text); |
|
|
|
extractor.close(); |
|
|
|
xml.close(); |
|
|
|
} |
|
|
|
|
|
|
|
private XMLSlideShow openPPTX(String file) throws IOException { |
|
|
|
InputStream is = slTests.openResourceAsStream(file); |
|
|
|
try { |
|
|
|
return new XMLSlideShow(is); |
|
|
|
} finally { |
|
|
|
is.close(); |
|
|
|
} |
|
|
|
} |
|
|
|
} |