Bug 62092 - Text not extracted from grouped text shapes in HSLF

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1829453 13f79535-47bb-0310-9956-ffa450edef68
author: Andreas Beeker <kiwiwings@apache.org> 2018-04-18 15:02:02 +0000
committer: Andreas Beeker <kiwiwings@apache.org> 2018-04-18 15:02:02 +0000
commit: f395630abd891b9d4ee09932c309d566d8360860 (patch)
tree: 13657f32c2e9f9ee2fc933a11cd47791f328aff9 /src/scratchpad/testcases/org
parent: 1079e66ea7cc4a53d4d4a7e54a615bddbfeaaa12 (diff)
download: poi-f395630abd891b9d4ee09932c309d566d8360860.tar.gz
poi-f395630abd891b9d4ee09932c309d566d8360860.zip
2 files changed, 48 insertions, 29 deletions
diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
index f2700d4fac..6a34d1af3f 100644
--- a/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hslf/extractor/TestExtractor.java
@@ -33,9 +33,9 @@ import java.io.InputStream;
 import java.util.List;
 
 import org.apache.poi.POIDataSamples;
+import org.apache.poi.hslf.usermodel.HSLFObjectShape;
 import org.apache.poi.hslf.usermodel.HSLFSlideShow;
 import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
-import org.apache.poi.hslf.usermodel.HSLFObjectShape;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
@@ -89,12 +89,12 @@ public final class TestExtractor {
     public void testReadSheetText() throws IOException {
         // Basic 2 page example
         PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
-        ensureTwoStringsTheSame(expectText, ppe.getText());
+        assertEquals(expectText, ppe.getText());
         ppe.close();
 
         // 1 page example with text boxes
         PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
-        ensureTwoStringsTheSame(expectText2, ppe2.getText());
+        assertEquals(expectText2, ppe2.getText());
         ppe2.close();
     }
 
@@ -103,15 +103,15 @@ public final class TestExtractor {
         // Basic 2 page example
         PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
         String notesText = ppe.getNotes();
-        String expText = "These are the notes for page 1\nThese are the notes on page two, again lacking formatting\n";
-        ensureTwoStringsTheSame(expText, notesText);
+        String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
+        assertEquals(expText, notesText);
         ppe.close();
 
         // Other one doesn't have notes
         PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
         notesText = ppe2.getNotes();
         expText = "";
-        ensureTwoStringsTheSame(expText, notesText);
+        assertEquals(expText, notesText);
         ppe2.close();
     }
 
@@ -122,8 +122,8 @@ public final class TestExtractor {
                 "This is the title on page 2\nThis is page two\nIt has several blocks of text\nNone of them have formatting\n"
         };
         String[] ntText = new String[]{
-                "These are the notes for page 1\n",
-                "These are the notes on page two, again lacking formatting\n"
+                "\nThese are the notes for page 1\n",
+                "\nThese are the notes on page two, again lacking formatting\n"
         };
 
         PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
@@ -137,7 +137,7 @@ public final class TestExtractor {
 
         ppe.setSlidesByDefault(true);
         ppe.setNotesByDefault(true);
-        assertEquals(slText[0] + slText[1] + "\n" + ntText[0] + ntText[1], ppe.getText());
+        assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
         ppe.close();
     }
 
@@ -166,16 +166,6 @@ public final class TestExtractor {
         ppe.close();
     }
 
-    private void ensureTwoStringsTheSame(String exp, String act) {
-        assertEquals(exp.length(), act.length());
-        char[] expC = exp.toCharArray();
-        char[] actC = act.toCharArray();
-        for (int i = 0; i < expC.length; i++) {
-            assertEquals("Char " + i, expC[i], actC[i]);
-        }
-        assertEquals(exp, act);
-    }
-
     @Test
     public void testExtractFromEmbeded() throws IOException {
         InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
@@ -454,4 +444,38 @@ public final class TestExtractor {
         assertContains(text, "Prague");
         ppe.close();
     }
+
+    @Test
+    public void testExtractGroupedShapeText() throws Exception {
+        try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
+            final String text = ppe.getText();
+            // Regression check for bug 62092 (text not extracted from grouped text shapes in HSLF).
+            // This tests that we're ignoring text shapes at depth=0,
+            // i.e. POI has already included them in the slide's getTextParagraphs()
+            assertContains(text, "Text box1");
+            assertEquals(1, countMatches(text,"Text box1"));
+
+
+            // The WordArt and text box count tests will fail
+            // if this content is available via getTextParagraphs() of the slide in POI,
+            // i.e. when POI is fixed, these tests will fail, and
+            // we'll have to remove the workaround in HSLFExtractor's extractGroupText(...)
+            assertEquals(1, countMatches(text,"WordArt1"));
+            assertEquals(1, countMatches(text,"WordArt2"));
+            assertEquals(1, countMatches(text,"Ungrouped text box"));//should only be 1
+            assertContains(text, "Text box2");
+            assertContains(text, "Text box3");
+            assertContains(text, "Text box4");
+            assertContains(text, "Text box5");
+
+            // Need to extract hyperlinks. NOTE(review): the original comment said "see below", but nothing follows in this patch -- confirm the target.
+            assertContains(text, "tika");
+            assertContains(text, "MyTitle");
+
+        }
+    }
+
+    private static int countMatches(final String base, final String find) { // number of literal, non-overlapping occurrences of find in base
+        return find.isEmpty() ? 0 : (base.length() - base.replace(find, "").length()) / find.length(); // split() is regex-based and drops trailing empty strings, so it miscounted a match at the end of base
+    }
 }
diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/record/TestRecordTypes.java b/src/scratchpad/testcases/org/apache/poi/hslf/record/TestRecordTypes.java
index 814259ca15..dbf89692bb 100644
--- a/src/scratchpad/testcases/org/apache/poi/hslf/record/TestRecordTypes.java
+++ b/src/scratchpad/testcases/org/apache/poi/hslf/record/TestRecordTypes.java
@@ -18,10 +18,10 @@
 package org.apache.poi.hslf.record;
 
 
-import static org.junit.Assert.assertEquals;
-
 import org.junit.Test;
 
+import static org.junit.Assert.assertEquals;
+
 /**
  * Tests that RecordTypes returns the right records and classes when asked
  */
@@ -42,20 +42,15 @@ public final class TestRecordTypes {
 
     @Test
 	public void testPPTClassLookups() {
-		assertEquals(Slide.class, RecordTypes.Slide.handlingClass);
-		assertEquals(TextCharsAtom.class, RecordTypes.TextCharsAtom.handlingClass);
-		assertEquals(TextBytesAtom.class, RecordTypes.TextBytesAtom.handlingClass);
-		assertEquals(SlideListWithText.class, RecordTypes.SlideListWithText.handlingClass);
-
 		// If this record is ever implemented, change to one that isn't!
 		// This is checking the "unhandled default" stuff works
-		assertEquals(UnknownRecordPlaceholder.class, RecordTypes.forTypeID(-10).handlingClass);
+		assertEquals(RecordTypes.UnknownRecordPlaceholder, RecordTypes.forTypeID(-10));
 	}
 
     @Test
     public void testEscherClassLookups() {
 		// Should all come back with null, as DDF handles them
-		assertEquals(null, RecordTypes.EscherDggContainer.handlingClass);
-		assertEquals(null, RecordTypes.EscherBStoreContainer.handlingClass);
+		assertEquals(null, RecordTypes.EscherDggContainer.recordConstructor);
+		assertEquals(null, RecordTypes.EscherBStoreContainer.recordConstructor);
 	}
 }
author	Andreas Beeker <kiwiwings@apache.org>	2018-04-18 15:02:02 +0000
committer	Andreas Beeker <kiwiwings@apache.org>	2018-04-18 15:02:02 +0000
commit	f395630abd891b9d4ee09932c309d566d8360860 (patch)
tree	13657f32c2e9f9ee2fc933a11cd47791f328aff9 /src/scratchpad/testcases/org
parent	1079e66ea7cc4a53d4d4a7e54a615bddbfeaaa12 (diff)
download	poi-f395630abd891b9d4ee09932c309d566d8360860.tar.gz poi-f395630abd891b9d4ee09932c309d566d8360860.zip