]> source.dussan.org Git - poi.git/commitdiff
55347 - integrate textbox text extraction with Excel extractors
author Tim Allison <tallison@apache.org>
Thu, 8 Aug 2013 14:04:07 +0000 (14:04 +0000)
committer Tim Allison <tallison@apache.org>
Thu, 8 Aug 2013 14:04:07 +0000 (14:04 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1511789 13f79535-47bb-0310-9956-ffa450edef68

src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFReader.java
src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java
src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java
src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestXSSFReader.java
src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java
src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java

index 59ea919c5c0b9ab40574ce9a4bb90773fbdf73f7..7c250948c37f29999b8291ee1615ba351f03f944 100644 (file)
@@ -21,6 +21,8 @@ import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.poi.POIXMLException;
@@ -37,7 +39,9 @@ import org.apache.poi.xssf.model.CommentsTable;
 import org.apache.poi.xssf.model.SharedStringsTable;
 import org.apache.poi.xssf.model.StylesTable;
 import org.apache.poi.xssf.model.ThemesTable;
+import org.apache.poi.xssf.usermodel.XSSFDrawing;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
 import org.apache.xmlbeans.XmlException;
 import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
 import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbook;
@@ -273,6 +277,35 @@ public class XSSFReader {
            return null;
         }
         
+        /**
+         * Returns the shapes associated with this sheet,
+         * an empty list or null if there is an exception
+         */
+        public List<XSSFShape> getShapes() {
+            PackagePart sheetPkg = getSheetPart();
+            List<XSSFShape> shapes= new LinkedList<XSSFShape>();
+           // Do we have a comments relationship? (Only ever one if so)
+           try {
+              PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation());
+              for (int i = 0; i < drawingsList.size(); i++){
+                  PackageRelationship drawings = drawingsList.getRelationship(i);
+                  PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI());
+                  PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName);
+                  XSSFDrawing drawing = new XSSFDrawing(drawingsPart, drawings);
+                  for (XSSFShape shape : drawing.getShapes()){
+                      shapes.add(shape);
+                  }
+              }
+           } catch (XmlException e){
+               return null;
+           } catch (InvalidFormatException e) {  
+              return null;
+           } catch (IOException e) {
+              return null;
+           }
+           return shapes;
+        }
+        
         public PackagePart getSheetPart() {
            String sheetId = ctSheet.getId();
            return sheetMap.get(sheetId);
index 0c31fe04d5a8ec1b8772f8cfde492ed7ccda1a56..a313271e58e00ee40811f0fcd1478a22b9e01e08 100644 (file)
@@ -18,6 +18,7 @@ package org.apache.poi.xssf.extractor;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.List;
 import java.util.Locale;
 
 import javax.xml.parsers.ParserConfigurationException;
@@ -37,6 +38,8 @@ import org.apache.poi.xssf.eventusermodel.XSSFReader;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
 import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
@@ -54,6 +57,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
    private Locale locale;
        private boolean includeSheetNames = true;
        private boolean formulasNotResults = false;
+       private boolean includeTextBoxes = true;
 
        public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
                this(OPCPackage.open(path));
@@ -89,6 +93,14 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
        public void setFormulasNotResults(boolean formulasNotResults) {
                this.formulasNotResults = formulasNotResults;
        }
+
+       /**
+     * Should text from textboxes be included? Default is true
+     */
+
+       public void setIncludeTextBoxes(boolean includeTextBoxes) {
+           this.includeTextBoxes = includeTextBoxes;
+       }
        
        public void setLocale(Locale locale) {
           this.locale = locale;
@@ -175,6 +187,9 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
                  text.append('\n');
               }
               processSheet(sheetExtractor, styles, strings, stream);
+              if (includeTextBoxes){
+                  processShapes(iter.getShapes(), text);
+              }
               stream.close();
           }
           
@@ -191,7 +206,20 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
        }
    }
    
-       @Override
+    /**
+     * Appends the text of each simple shape in the given list to the
+     *  buffer, followed by a newline. Shapes that are not
+     *  {@link XSSFSimpleShape}s, and shapes whose text is null or
+     *  empty, are skipped.
+     *
+     * @param shapes the shapes to read text from; if null, nothing is appended
+     * @param text the buffer that receives the shape text
+     */
+    private void processShapes(List<XSSFShape> shapes, StringBuffer text) {
+        if (shapes != null) {
+            for (XSSFShape candidate : shapes) {
+                if (!(candidate instanceof XSSFSimpleShape)) {
+                    continue;
+                }
+                String shapeText = ((XSSFSimpleShape) candidate).getText();
+                if (shapeText == null || shapeText.length() == 0) {
+                    continue;
+                }
+                text.append(shapeText);
+                text.append('\n');
+            }
+        }
+    }
+    @Override
        public void close() throws IOException {
                if (container != null) {
                        container.close();
index 32702fd8f088ce0c7a21f263887dff1239dcbd64..1d7c877151cdf72e5fa424a949e9f94829fef2e6 100644 (file)
@@ -31,8 +31,11 @@ import org.apache.poi.ss.usermodel.DataFormatter;
 import org.apache.poi.ss.usermodel.HeaderFooter;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.usermodel.XSSFDrawing;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
 import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.apache.xmlbeans.XmlException;
 
@@ -52,6 +55,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
     private boolean formulasNotResults = false;
     private boolean includeCellComments = false;
     private boolean includeHeadersFooters = true;
+    private boolean includeTextBoxes = true;
 
     /**
      * @deprecated  Use {@link #XSSFExcelExtractor(org.apache.poi.openxml4j.opc.OPCPackage)} instead.
@@ -103,6 +107,13 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
     public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
         this.includeHeadersFooters = includeHeadersFooters;
     }
+    /**
+     * Should text within textboxes be included? Default is true
+     * @param includeTextBoxes
+     */
+    public void setIncludeTextBoxes(boolean includeTextBoxes){
+        this.includeTextBoxes = includeTextBoxes;
+    }
     /**
      * What Locale should be used for formatting numbers (based
      *  on the styles applied to the cells)
@@ -180,7 +191,20 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
                 }
                 text.append("\n");
             }
-
+            
+            // Then the text of any textboxes (simple shapes) on the sheet
+            // NOTE(review): createDrawingPatriarch() is called for every sheet,
+            //  and by its name it may add a drawing to sheets that have none;
+            //  confirm that extracting text does not modify the workbook
+            if (includeTextBoxes){
+                XSSFDrawing drawing = sheet.createDrawingPatriarch();
+                for (XSSFShape shape : drawing.getShapes()){
+                    if (shape instanceof XSSFSimpleShape){
+                        String boxText = ((XSSFSimpleShape)shape).getText();
+                        // Skip shapes with no text rather than failing on
+                        //  them, as XSSFEventBasedExcelExtractor does
+                        if (boxText != null && boxText.length() > 0){
+                            text.append(boxText);
+                            text.append('\n');
+                        }
+                    }
+                }
+            }
             // Finally footer(s), if present
             if(includeHeadersFooters) {
                 text.append(
index 82c5f99d181a212a021f95dfb9ad8c245544aa30..100b539dea8d481ddcfdc653a89d65a004dd9955 100644 (file)
@@ -76,7 +76,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
      * @param rel  the package relationship holding this drawing,
      * the relationship type must be http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing
      */
-    protected XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
+    public XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
         super(part, rel);
         XmlOptions options  = new XmlOptions(DEFAULT_XML_OPTIONS);
         //Removing root element
index 5baaebb90a35443796e87d29e91ea283410359d0..85613d3e476ed2824e5a47b3f4ce2020ae540db7 100644 (file)
@@ -19,6 +19,7 @@ package org.apache.poi.xssf.eventusermodel;
 
 import java.io.InputStream;
 import java.util.Iterator;
+import java.util.List;
 
 import junit.framework.TestCase;
 
@@ -27,6 +28,8 @@ import org.apache.poi.util.IOUtils;
 import org.apache.poi.xssf.XSSFTestDataSamples;
 import org.apache.poi.xssf.model.CommentsTable;
 import org.apache.poi.xssf.usermodel.XSSFRichTextString;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.poi.POIDataSamples;
 
 /**
@@ -164,4 +167,33 @@ public final class TestXSSFReader extends TestCase {
           stream.close();
       }
    }
+   /**
+    * Test text extraction from text box using getShapes()
+    * @throws Exception
+    */
+   public void testShapes() throws Exception{
+       OPCPackage pkg =  XSSFTestDataSamples.openSamplePackage("WithTextBox.xlsx");
+       XSSFReader r = new XSSFReader(pkg);
+       XSSFReader.SheetIterator it = (XSSFReader.SheetIterator)r.getSheetsData();
+       
+       StringBuilder sb = new StringBuilder();
+       while(it.hasNext())
+       {    
+          it.next();
+          List<XSSFShape> shapes = it.getShapes();
+          if (shapes != null){
+              for (XSSFShape shape : shapes){
+                  if (shape instanceof XSSFSimpleShape){
+                      String t = ((XSSFSimpleShape)shape).getText();
+                      sb.append(t).append('\n');
+                  }
+              }
+          }
+       }
+       String text = sb.toString();
+       assertTrue(text.indexOf("Line 1") > -1);
+       assertTrue(text.indexOf("Line 2") > -1);
+       assertTrue(text.indexOf("Line 3") > -1);
+
+   }
 }
index eac3700e7db001c47ccf19f131b16eb096338d5d..1b0e6c4797a3c300def1f724528690be3402d5da 100644 (file)
@@ -17,6 +17,7 @@
 
 package org.apache.poi.xssf.extractor;
 
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -25,7 +26,11 @@ import junit.framework.TestCase;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.hssf.HSSFTestDataSamples;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xssf.XSSFTestDataSamples;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 
 /**
  * Tests for {@link XSSFEventBasedExcelExtractor}
@@ -167,4 +172,19 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase {
                ole2Extractor.close();
                ooxmlExtractor.close();
        }
+       
+        /**
+           * Test text extraction from text box using getShapes()
+           * @throws Exception
+           */
+    public void testShapes() throws Exception{
+           XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("WithTextBox.xlsx");
+              
+           String text = ooxmlExtractor.getText();
+
+           assertTrue(text.indexOf("Line 1") > -1);
+           assertTrue(text.indexOf("Line 2") > -1);
+           assertTrue(text.indexOf("Line 3") > -1);
+
+    }
 }
index bc86d6f9b9b190ad52a7c13a39d31b111878661b..b4872d1181a4a7dbc455279f1f063a362a1bc8e3 100644 (file)
@@ -211,4 +211,16 @@ public class TestXSSFExcelExtractor extends TestCase {
       
       extractor.close();
        }
+       /**
+        * Simple test for text box text
+        * @throws IOException
+        */
+       public void testTextBoxes() throws IOException {
+           XSSFExcelExtractor extractor = getExtractor("WithTextBox.xlsx");
+           extractor.setFormulasNotResults(true);
+           String text = extractor.getText();
+           assertTrue(text.indexOf("Line 1") > -1);
+           assertTrue(text.indexOf("Line 2") > -1);
+           assertTrue(text.indexOf("Line 3") > -1);
+       }
 }