123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.hslf.extractor;
-
- import static org.apache.poi.POITestCase.assertContains;
- import static org.apache.poi.POITestCase.assertContainsIgnoreCase;
- import static org.apache.poi.POITestCase.assertNotContained;
- import static org.junit.Assert.assertEquals;
- import static org.junit.Assert.assertFalse;
- import static org.junit.Assert.assertNotNull;
- import static org.junit.Assert.assertTrue;
-
- import java.io.ByteArrayOutputStream;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.List;
-
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.hslf.usermodel.HSLFObjectShape;
- import org.apache.poi.hslf.usermodel.HSLFSlideShow;
- import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.hwpf.HWPFDocument;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
- import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.IOUtils;
- import org.junit.Test;
-
- /**
- * Tests that the extractor correctly gets the text out of our sample file
- */
- public final class TestExtractor {
- /**
- * Extractor primed on the 2 page basic test data
- */
- private static final String expectText = "This is a test title\nThis is a test subtitle\nThis is on page 1\nThis is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n";
-
- /**
- * Extractor primed on the 1 page but text-box'd test data
- */
- private static final String expectText2 = "Hello, World!!!\nI am just a poor boy\nThis is Times New Roman\nPlain Text \n";
-
- /**
- * Where our embeded files live
- */
- private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
-
- // @Before
- // public void setUp() throws Exception {
- // ppe = new PowerPointExtractor(slTests.getFile("basic_test_ppt_file.ppt").getCanonicalPath());
- // ppe2 = new PowerPointExtractor(slTests.getFile("with_textbox.ppt").getCanonicalPath());
- // }
-
- // @After
- // public void closeResources() throws Exception {
- // ppe2.close();
- // ppe.close();
- // }
-
- private PowerPointExtractor openExtractor(String fileName) throws IOException {
- InputStream is = slTests.openResourceAsStream(fileName);
- try {
- return new PowerPointExtractor(is);
- } finally {
- is.close();
- }
- }
-
- @Test
- public void testReadSheetText() throws IOException {
- // Basic 2 page example
- PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
- assertEquals(expectText, ppe.getText());
- ppe.close();
-
- // 1 page example with text boxes
- PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
- assertEquals(expectText2, ppe2.getText());
- ppe2.close();
- }
-
- @Test
- public void testReadNoteText() throws IOException {
- // Basic 2 page example
- PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
- String notesText = ppe.getNotes();
- String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
- assertEquals(expText, notesText);
- ppe.close();
-
- // Other one doesn't have notes
- PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
- notesText = ppe2.getNotes();
- expText = "";
- assertEquals(expText, notesText);
- ppe2.close();
- }
-
- @Test
- public void testReadBoth() throws IOException {
- String[] slText = new String[]{
- "This is a test title\nThis is a test subtitle\nThis is on page 1\n",
- "This is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n"
- };
- String[] ntText = new String[]{
- "\nThese are the notes for page 1\n",
- "\nThese are the notes on page two, again lacking formatting\n"
- };
-
- PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
- ppe.setSlidesByDefault(true);
- ppe.setNotesByDefault(false);
- assertEquals(slText[0] + slText[1], ppe.getText());
-
- ppe.setSlidesByDefault(false);
- ppe.setNotesByDefault(true);
- assertEquals(ntText[0] + ntText[1], ppe.getText());
-
- ppe.setSlidesByDefault(true);
- ppe.setNotesByDefault(true);
- assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
- ppe.close();
- }
-
- /**
- * Test that when presented with a PPT file missing the odd
- * core record, we can still get the rest of the text out
- *
- * @throws Exception
- */
- @Test
- public void testMissingCoreRecords() throws IOException {
- PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
-
- String text = ppe.getText(true, false);
- String nText = ppe.getNotes();
-
- assertNotNull(text);
- assertNotNull(nText);
-
- // Notes record were corrupt, so don't expect any
- assertEquals(nText.length(), 0);
-
- // Slide records were fine
- assertContains(text, "Using Disease Surveillance and Response");
-
- ppe.close();
- }
-
- @Test
- public void testExtractFromEmbeded() throws IOException {
- InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
- POIFSFileSystem fs = new POIFSFileSystem(is);
- DirectoryNode root = fs.getRoot();
- PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
- PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
- ppe2.close();
- ppe1.close();
- fs.close();
- }
-
- private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
- throws IOException {
- DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
- assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
-
- // Check the first file
- HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
- PowerPointExtractor ppe = new PowerPointExtractor(ppt);
- assertEquals(expected, ppe.getText(true, false));
- return ppe;
- }
-
- /**
- * A powerpoint file with embeded powerpoint files
- */
- @Test
- public void testExtractFromOwnEmbeded() throws IOException {
- PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
- List<HSLFObjectShape> shapes = ppe.getOLEShapes();
- assertEquals("Expected 6 ole shapes", 6, shapes.size());
- int num_ppt = 0, num_doc = 0, num_xls = 0;
- for (HSLFObjectShape ole : shapes) {
- String name = ole.getInstanceName();
- InputStream data = ole.getObjectData().getInputStream();
- if ("Worksheet".equals(name)) {
- HSSFWorkbook wb = new HSSFWorkbook(data);
- num_xls++;
- wb.close();
- } else if ("Document".equals(name)) {
- HWPFDocument doc = new HWPFDocument(data);
- num_doc++;
- doc.close();
- } else if ("Presentation".equals(name)) {
- num_ppt++;
- HSLFSlideShow ppt = new HSLFSlideShow(data);
- ppt.close();
- }
- data.close();
- }
- assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
- assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
- assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
- ppe.close();
- }
-
- /**
- * A powerpoint file with embeded powerpoint files
- */
- @Test
- public void test52991() throws IOException {
- PowerPointExtractor ppe = openExtractor("badzip.ppt");
- for (HSLFObjectShape shape : ppe.getOLEShapes()) {
- IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
- }
- ppe.close();
- }
-
- /**
- * From bug #45543
- */
- @Test
- public void testWithComments() throws IOException {
- PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
- String text = ppe1.getText();
- assertFalse("Comments not in by default", text.contains("This is a test comment"));
-
- ppe1.setCommentsByDefault(true);
-
- text = ppe1.getText();
- assertContains(text, "This is a test comment");
- ppe1.close();
-
-
- // And another file
- PowerPointExtractor ppe2 = openExtractor("45543.ppt");
- text = ppe2.getText();
- assertFalse("Comments not in by default", text.contains("testdoc"));
-
- ppe2.setCommentsByDefault(true);
-
- text = ppe2.getText();
- assertContains(text, "testdoc");
- ppe2.close();
- }
-
- /**
- * From bug #45537
- */
- @Test
- public void testHeaderFooter() throws IOException {
- String text;
-
- // With a header on the notes
- InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
- HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
- is1.close();
- assertNotNull(ppt1.getNotesHeadersFooters());
- assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
-
- PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
-
- text = ppe1.getText();
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
-
- ppe1.setNotesByDefault(true);
- text = ppe1.getText();
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
- ppe1.close();
- ppt1.close();
-
- // And with a footer, also on notes
- InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
- HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
- is2.close();
-
- assertNotNull(ppt2.getNotesHeadersFooters());
- assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
- ppt2.close();
-
- PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
-
- text = ppe2.getText();
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
- assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
-
- ppe2.setNotesByDefault(true);
- text = ppe2.getText();
- assertContains(text, "testdoc");
- assertContains(text, "test phrase");
- ppe2.close();
- }
-
- @SuppressWarnings("unused")
- @Test
- public void testSlideMasterText() throws IOException {
- String masterTitleText = "This is the Master Title";
- String masterRandomText = "This text comes from the Master Slide";
- String masterFooterText = "Footer from the master slide";
- PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
- ppe.setMasterByDefault(true);
-
- String text = ppe.getText();
- assertContains(text, masterRandomText);
- assertContains(text, masterFooterText);
- ppe.close();
- }
-
- @Test
- public void testMasterText() throws IOException {
- PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
-
- // Initially not there
- String text = ppe1.getText();
- assertFalse(text.contains("Text that I added to the master slide"));
-
- // Enable, shows up
- ppe1.setMasterByDefault(true);
- text = ppe1.getText();
- assertContains(text, "Text that I added to the master slide");
-
- // Make sure placeholder text does not come out
- assertNotContained(text, "Click to edit Master");
- ppe1.close();
-
- // Now with another file only containing master text
- // Will always show up
- PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
- String masterText = "Footer from the master slide";
-
- text = ppe2.getText();
- assertContainsIgnoreCase(text, "master");
- assertContains(text, masterText);
- ppe2.close();
- }
-
- /**
- * Bug #54880 Chinese text not extracted properly
- */
- @Test
- public void testChineseText() throws IOException {
- PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
-
- String text = ppe.getText();
-
- // Check for the english text line
- assertContains(text, "Single byte");
-
- // Check for the english text in the mixed line
- assertContains(text, "Mix");
-
- // Check for the chinese text in the mixed line
- assertContains(text, "\u8868");
-
- // Check for the chinese only text line
- assertContains(text, "\uff8a\uff9d\uff76\uff78");
- ppe.close();
- }
-
- /**
- * Tests that we can work with both {@link POIFSFileSystem}
- * and {@link NPOIFSFileSystem}
- */
- @SuppressWarnings("resource")
- @Test
- public void testDifferentPOIFS() throws IOException {
- // Open the two filesystems
- File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
- InputStream is1 = new FileInputStream(pptFile);
- OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
- is1.close();
- NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
-
- DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
-
- // Open directly
- for (DirectoryNode dir : files) {
- PowerPointExtractor extractor = new PowerPointExtractor(dir);
- assertEquals(expectText, extractor.getText());
- }
-
- // Open via a HSLFSlideShow
- for (DirectoryNode dir : files) {
- HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
- PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
- assertEquals(expectText, extractor.getText());
- extractor.close();
- slideshow.close();
- }
-
- npoifs.close();
- }
-
- @Test
- public void testTable() throws Exception {
- PowerPointExtractor ppe1 = openExtractor("54111.ppt");
- String text1 = ppe1.getText();
- String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
- "Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n"+
- "Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n"+
- "Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n"+
- "Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n"+
- "Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
- assertContains(text1, target1);
- ppe1.close();
-
- PowerPointExtractor ppe2 = openExtractor("54722.ppt");
- String text2 = ppe2.getText();
-
- String target2 = "this\tText\tis\twithin\ta\n" +
- "table\t1\t2\t3\t4";
- assertContains(text2, target2);
- ppe2.close();
- }
-
- // bug 60003
- @Test
- public void testExtractMasterSlideFooterText() throws Exception {
- PowerPointExtractor ppe = openExtractor("60003.ppt");
- ppe.setMasterByDefault(true);
-
- String text = ppe.getText();
- assertContains(text, "Prague");
- ppe.close();
- }
-
- @Test
- public void testExtractGroupedShapeText() throws Exception {
- try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
- final String text = ppe.getText();
-
- //this tests that we're ignoring text shapes at depth=0
- //i.e. POI has already included them in the slide's getTextParagraphs()
- assertContains(text, "Text box1");
- assertEquals(1, countMatches(text,"Text box1"));
-
-
- //the WordArt and text box count tests will fail
- //if this content is available via getTextParagraphs() of the slide in POI
- //i.e. when POI is fixed, these tests will fail, and
- //we'll have to remove the workaround in HSLFExtractor's extractGroupText(...)
- assertEquals(1, countMatches(text,"WordArt1"));
- assertEquals(1, countMatches(text,"WordArt2"));
- assertEquals(1, countMatches(text,"Ungrouped text box"));//should only be 1
- assertContains(text, "Text box2");
- assertContains(text, "Text box3");
- assertContains(text, "Text box4");
- assertContains(text, "Text box5");
-
- //see below -- need to extract hyperlinks
- assertContains(text, "tika");
- assertContains(text, "MyTitle");
-
- }
- }
-
- private static int countMatches(final String base, final String find) {
- return base.split(find).length-1;
- }
- }
|