/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
-
- package org.apache.poi.hslf.extractor;
-
- import java.io.ByteArrayOutputStream;
- import java.io.InputStream;
- import java.util.List;
-
- import junit.framework.TestCase;
-
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.hslf.HSLFSlideShow;
- import org.apache.poi.hslf.model.OLEShape;
- import org.apache.poi.hslf.usermodel.SlideShow;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.hwpf.HWPFDocument;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.IOUtils;
-
- /**
- * Tests that the extractor correctly gets the text out of our sample file
- *
- * @author Nick Burch (nick at torchbox dot com)
- */
- public final class TestExtractor extends TestCase {
- /** Extractor primed on the 2 page basic test data */
- private PowerPointExtractor ppe;
- private static final String expectText = "This is a test title\nThis is a test subtitle\nThis is on page 1\nThis is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n";
-
- /** Extractor primed on the 1 page but text-box'd test data */
- private PowerPointExtractor ppe2;
- private static final String expectText2 = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";
-
- /** Where our embeded files live */
- //private String pdirname;
- private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
- //private String pdirname;
-
- protected void setUp() throws Exception {
- ppe = new PowerPointExtractor(slTests.openResourceAsStream("basic_test_ppt_file.ppt"));
- ppe2 = new PowerPointExtractor(slTests.openResourceAsStream("with_textbox.ppt"));
- }
-
- private static void assertContains(String haystack, String needle) {
- assertContains(
- "Unable to find expected text '" + needle + "' in text:\n" + haystack,
- haystack, needle
- );
- }
- private static void assertContains(String reason, String haystack, String needle) {
- assertTrue(reason, haystack.contains(needle));
- }
-
- public void testReadSheetText() {
- // Basic 2 page example
- String sheetText = ppe.getText();
-
- ensureTwoStringsTheSame(expectText, sheetText);
-
-
- // 1 page example with text boxes
- sheetText = ppe2.getText();
-
- ensureTwoStringsTheSame(expectText2, sheetText);
- }
-
- public void testReadNoteText() {
- // Basic 2 page example
- String notesText = ppe.getNotes();
- String expectText = "These are the notes for page 1\nThese are the notes on page two, again lacking formatting\n";
-
- ensureTwoStringsTheSame(expectText, notesText);
-
- // Other one doesn't have notes
- notesText = ppe2.getNotes();
- expectText = "";
-
- ensureTwoStringsTheSame(expectText, notesText);
- }
-
- public void testReadBoth() {
- String[] slText = new String[] {
- "This is a test title\nThis is a test subtitle\nThis is on page 1\n",
- "This is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n"
- };
- String[] ntText = new String[] {
- "These are the notes for page 1\n",
- "These are the notes on page two, again lacking formatting\n"
- };
-
- ppe.setSlidesByDefault(true);
- ppe.setNotesByDefault(false);
- assertEquals(slText[0]+slText[1], ppe.getText());
-
- ppe.setSlidesByDefault(false);
- ppe.setNotesByDefault(true);
- assertEquals(ntText[0]+ntText[1], ppe.getText());
-
- ppe.setSlidesByDefault(true);
- ppe.setNotesByDefault(true);
- assertEquals(slText[0]+slText[1]+"\n"+ntText[0]+ntText[1], ppe.getText());
- }
-
- /**
- * Test that when presented with a PPT file missing the odd
- * core record, we can still get the rest of the text out
- * @throws Exception
- */
- public void testMissingCoreRecords() throws Exception {
- ppe = new PowerPointExtractor(slTests.openResourceAsStream("missing_core_records.ppt"));
-
- String text = ppe.getText(true, false);
- String nText = ppe.getNotes();
-
- assertNotNull(text);
- assertNotNull(nText);
-
- // Notes record were corrupt, so don't expect any
- assertEquals(nText.length(), 0);
-
- // Slide records were fine
- assertTrue(text.startsWith("Using Disease Surveillance and Response"));
- }
-
- private void ensureTwoStringsTheSame(String exp, String act) {
- assertEquals(exp.length(),act.length());
- char[] expC = exp.toCharArray();
- char[] actC = act.toCharArray();
- for(int i=0; i<expC.length; i++) {
- assertEquals("Char " + i, expC[i], actC[i]);
- }
- assertEquals(exp,act);
- }
-
- public void testExtractFromEmbeded() throws Exception {
- POIFSFileSystem fs = new POIFSFileSystem(
- POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls")
- );
- HSLFSlideShow ss;
-
- DirectoryNode dirA = (DirectoryNode)
- fs.getRoot().getEntry("MBD0000A3B6");
- DirectoryNode dirB = (DirectoryNode)
- fs.getRoot().getEntry("MBD0000A3B3");
-
- assertNotNull(dirA.getEntry("PowerPoint Document"));
- assertNotNull(dirB.getEntry("PowerPoint Document"));
-
- // Check the first file
- ss = new HSLFSlideShow(dirA);
- ppe = new PowerPointExtractor(ss);
- assertEquals("Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
- ppe.getText(true, false)
- );
-
- // And the second
- ss = new HSLFSlideShow(dirB);
- ppe = new PowerPointExtractor(ss);
- assertEquals("Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n",
- ppe.getText(true, false)
- );
- }
-
- /**
- * A powerpoint file with embeded powerpoint files
- */
- public void testExtractFromOwnEmbeded() throws Exception {
- String path = "ppt_with_embeded.ppt";
- ppe = new PowerPointExtractor(POIDataSamples.getSlideShowInstance().openResourceAsStream(path));
- List<OLEShape> shapes = ppe.getOLEShapes();
- assertEquals("Expected 6 ole shapes in " + path, 6, shapes.size());
- int num_ppt = 0, num_doc = 0, num_xls = 0;
- for(OLEShape ole : shapes) {
- String name = ole.getInstanceName();
- InputStream data = ole.getObjectData().getData();
- if ("Worksheet".equals(name)) {
- HSSFWorkbook wb = new HSSFWorkbook(data);
- num_xls++;
- } else if ("Document".equals(name)) {
- HWPFDocument doc = new HWPFDocument(data);
- num_doc++;
- } else if ("Presentation".equals(name)) {
- num_ppt++;
- SlideShow ppt = new SlideShow(data);
- }
- }
- assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
- assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
- assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
- }
-
- /**
- * A powerpoint file with embeded powerpoint files
- */
- public void test52991() throws Exception {
- String path = "badzip.ppt";
- ppe = new PowerPointExtractor(POIDataSamples.getSlideShowInstance().openResourceAsStream(path));
- List<OLEShape> shapes = ppe.getOLEShapes();
-
- for (OLEShape shape : shapes) {
- IOUtils.copy(shape.getObjectData().getData(), new ByteArrayOutputStream());
- }
- }
-
- /**
- * From bug #45543
- */
- public void testWithComments() throws Exception {
- ppe = new PowerPointExtractor(slTests.openResourceAsStream("WithComments.ppt"));
-
- String text = ppe.getText();
- assertFalse("Comments not in by default", text.contains("This is a test comment"));
-
- ppe.setCommentsByDefault(true);
-
- text = ppe.getText();
- assertContains(text, "This is a test comment");
-
-
- // And another file
- ppe = new PowerPointExtractor(slTests.openResourceAsStream("45543.ppt"));
-
- text = ppe.getText();
- assertFalse("Comments not in by default", text.contains("testdoc"));
-
- ppe.setCommentsByDefault(true);
-
- text = ppe.getText();
- assertContains(text, "testdoc");
- }
-
- /**
- * From bug #45537
- */
- public void testHeaderFooter() throws Exception {
- String text;
-
- // With a header on the notes
- HSLFSlideShow hslf = new HSLFSlideShow(slTests.openResourceAsStream("45537_Header.ppt"));
- SlideShow ss = new SlideShow(hslf);
- assertNotNull(ss.getNotesHeadersFooters());
- assertEquals("testdoc test phrase", ss.getNotesHeadersFooters().getHeaderText());
-
- ppe = new PowerPointExtractor(hslf);
-
- text = ppe.getText();
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
-
- ppe.setNotesByDefault(true);
- text = ppe.getText();
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
-
-
- // And with a footer, also on notes
- hslf = new HSLFSlideShow(slTests.openResourceAsStream("45537_Footer.ppt"));
- ss = new SlideShow(hslf);
- assertNotNull(ss.getNotesHeadersFooters());
- assertEquals("testdoc test phrase", ss.getNotesHeadersFooters().getFooterText());
-
- ppe = new PowerPointExtractor(slTests.openResourceAsStream("45537_Footer.ppt"));
-
- text = ppe.getText();
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
-
- ppe.setNotesByDefault(true);
- text = ppe.getText();
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
- }
-
- public void testSlideMasterText() throws Exception {
- String masterTitleText = "This is the Master Title";
- String masterRandomText = "This text comes from the Master Slide";
- String masterFooterText = "Footer from the master slide";
- HSLFSlideShow hslf = new HSLFSlideShow(slTests.openResourceAsStream("WithMaster.ppt"));
-
- ppe = new PowerPointExtractor(hslf);
-
- String text = ppe.getText();
- //assertContains(text, masterTitleText); // TODO Is this available in PPT?
- //assertContains(text, masterRandomText); // TODO Extract
- assertContains(text, masterFooterText);
- }
-
- public void testMasterText() throws Exception {
- ppe = new PowerPointExtractor(slTests.openResourceAsStream("master_text.ppt"));
-
- // Initially not there
- String text = ppe.getText();
- assertFalse(text.contains("Text that I added to the master slide"));
-
- // Enable, shows up
- ppe.setMasterByDefault(true);
- text = ppe.getText();
- assertTrue(text.contains("Text that I added to the master slide"));
-
- // Make sure placeholder text does not come out
- assertFalse(text.contains("Click to edit Master"));
-
- // Now with another file only containing master text
- // Will always show up
- String masterText = "Footer from the master slide";
- HSLFSlideShow hslf = new HSLFSlideShow(slTests.openResourceAsStream("WithMaster.ppt"));
-
- ppe = new PowerPointExtractor(hslf);
-
- text = ppe.getText();
- assertContains(text.toLowerCase(), "master");
- assertContains(text, masterText);
- }
-
-
- /**
- * Tests that we can work with both {@link POIFSFileSystem}
- * and {@link NPOIFSFileSystem}
- */
- public void testDifferentPOIFS() throws Exception {
- // Open the two filesystems
- DirectoryNode[] files = new DirectoryNode[2];
- files[0] = (new POIFSFileSystem(slTests.openResourceAsStream("basic_test_ppt_file.ppt"))).getRoot();
- files[1] = (new NPOIFSFileSystem(slTests.getFile("basic_test_ppt_file.ppt"))).getRoot();
-
- // Open directly
- for(DirectoryNode dir : files) {
- PowerPointExtractor extractor = new PowerPointExtractor(dir, null);
- assertEquals(expectText, extractor.getText());
- }
-
- // Open via a HWPFDocument
- for(DirectoryNode dir : files) {
- HSLFSlideShow slideshow = new HSLFSlideShow(dir);
- PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
- assertEquals(expectText, extractor.getText());
- }
- }
- }
|