]> source.dussan.org Git - poi.git/commitdiff
updated docs on extraction of embedded objects, misc changes in HSSF
authorYegor Kozlov <yegor@apache.org>
Sat, 18 Jul 2009 16:49:56 +0000 (16:49 +0000)
committerYegor Kozlov <yegor@apache.org>
Sat, 18 Jul 2009 16:49:56 +0000 (16:49 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@795394 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/spreadsheet/quick-guide.xml
src/documentation/content/xdocs/status.xml
src/documentation/content/xdocs/text-extraction.xml
src/examples/src/org/apache/poi/hssf/usermodel/examples/EmeddedObjects.java [new file with mode: 0755]
src/examples/src/org/apache/poi/xssf/usermodel/examples/EmbeddedObjects.java [new file with mode: 0755]
src/java/org/apache/poi/hssf/usermodel/HSSFShapeGroup.java
src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java

index 859adf961b057fa1b38e3f81b6efd8cb81f1f548..d577d0974e2bc045b794af58a4eadcac2f984ebf 100644 (file)
@@ -73,6 +73,7 @@
                     <li><link href="#Autofit">How to adjust column width to fit the contents</link></li>
                     <li><link href="#Hyperlinks">Hyperlinks</link></li>
                     <li><link href="#Validation">Data Validation</link></li>
+                    <li><link href="#Embedded">Embedded Objects</link></li>
                 </ul>
             </section>
             <section><title>Features</title>
@@ -1659,5 +1660,84 @@ Examples:
   dvConstraint = DVConstraint.createFormulaListConstraint("'Sheet1'!$A$1:$A$3");
       </source>
     </section>
+     <anchor id="Embedded"/>
+     <section><title>Embedded Objects</title>
+       <p>It is possible to perform more detailed processing of an embedded Excel, Word or PowerPoint document, 
+         or to work with any other type of embedded object.</p>
+       <p><strong>HSSF:</strong></p>
+       <source>
+  POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream("excel_with_embeded.xls"));
+  HSSFWorkbook workbook = new HSSFWorkbook(fs);
+  for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) {
+      //the OLE2 Class Name of the object
+      String oleName = obj.getOLE2ClassName();
+      if (oleName.equals("Worksheet")) {
+          DirectoryNode dn = (DirectoryNode) obj.getDirectory();
+          HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false);
+          //System.out.println(entry.getName() + ": " + embeddedWorkbook.getNumberOfSheets());
+      } else if (oleName.equals("Document")) {
+          DirectoryNode dn = (DirectoryNode) obj.getDirectory();
+          HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs);
+          //System.out.println(entry.getName() + ": " + embeddedWordDocument.getRange().text());
+      }  else if (oleName.equals("Presentation")) {
+          DirectoryNode dn = (DirectoryNode) obj.getDirectory();
+          SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn, fs));
+          //System.out.println(entry.getName() + ": " + embeddedPowerPointDocument.getSlides().length);
+      } else {
+          if(obj.hasDirectoryEntry()){
+              // The object has a DirectoryEntry, which is a DirectoryNode. Examine its entries to find out what it is
+              DirectoryNode dn = (DirectoryNode) obj.getDirectory();
+              for (Iterator entries = dn.getEntries(); entries.hasNext();) {
+                  Entry entry = (Entry) entries.next();
+                  //System.out.println(oleName + "." + entry.getName());
+              }
+          } else {
+              // There is no DirectoryEntry
+              // Recover the object's data from the HSSFObjectData instance.
+              byte[] objectData = obj.getObjectData();
+          }
+      }
+  }
+       </source>  
+       <p><strong>XSSF:</strong></p>
+       <source>
+  XSSFWorkbook workbook = new XSSFWorkbook("excel_with_embeded.xlsx");
+  for (PackagePart pPart : workbook.getAllEmbedds()) {
+      String contentType = pPart.getContentType();
+      // Excel Workbook - binary file format
+      if (contentType.equals("application/vnd.ms-excel")) {
+          HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream());
+      }
+      // Excel Workbook - OpenXML file format
+      else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
+          OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
+          XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(docPackage);
+      }
+      // Word Document - binary (OLE2CDF) file format
+      else if (contentType.equals("application/msword")) {
+          HWPFDocument document = new HWPFDocument(pPart.getInputStream());
+      }
+      // Word Document - OpenXML file format
+      else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
+          OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
+          XWPFDocument document = new XWPFDocument(docPackage);
+      }
+      // PowerPoint Document - binary file format
+      else if (contentType.equals("application/vnd.ms-powerpoint")) {
+          HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream());
+      }
+      // PowerPoint Document - OpenXML file format
+      else if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
+          OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
+          XSLFSlideShow slideShow = new XSLFSlideShow(docPackage);
+      }
+      // Any other type of embedded object.
+      else {
+          System.out.println("Unknown Embedded Document: " + contentType);
+          InputStream inputStream = pPart.getInputStream();
+      }
+  }
+       </source>  
+     </section>  
     </body>
 </document>
index 73abd537df69ac100c3d8f219fe7bac38bc268d3..2af0bd1e9fbb30602aa367a190ec0a07c491db17 100644 (file)
@@ -33,6 +33,7 @@
 
     <changes>
         <release version="3.5-beta7" date="2009-??-??">
+           <action dev="POI-DEVELOPERS" type="fix">47535 - fixed WordExtractor to tolerate files with empty footnote block</action>
            <action dev="POI-DEVELOPERS" type="fix">47517 - Fixed ExtractorFactory to support .xltx and .dotx files</action>
            <action dev="POI-DEVELOPERS" type="add">45556 - Support for extraction of footnotes from docx files</action>
            <action dev="POI-DEVELOPERS" type="add">45555 - Support for extraction of endnotes from docx files</action>
index d71a0bf10c518fa0a41d954986bf47c9d5d98a54..fa7474bc0fe1c31497fa40e60ebc1930897a79b6 100644 (file)
       <em>org.apache.poi.hdgf.extractor.VisioTextExtractor</em>, which 
       will return text for your file.</p>
     </section>
+
+    <section><title>Embedded Objects</title>
+      <p>Extractors already exist for Excel, Word, PowerPoint and Visio; 
+        if one of these objects is embedded into a worksheet, the ExtractorFactory class can be used to recover an extractor for it.     
+      </p>
+      <source>
+  FileInputStream fis = new FileInputStream(inputFile);
+  POIFSFileSystem fileSystem = new POIFSFileSystem(fis);
+  // Firstly, get an extractor for the Workbook
+  POIOLE2TextExtractor oleTextExtractor = ExtractorFactory.createExtractor(fileSystem);
+  // Then get an array of extractors for any Excel, Word, PowerPoint
+  // or Visio objects embedded into it.
+  POITextExtractor[] embeddedExtractors = ExtractorFactory.getEmbededDocsTextExtractors(oleTextExtractor);
+  for (POITextExtractor textExtractor : embeddedExtractors) {
+      // If the embedded object was an Excel spreadsheet.
+      if (textExtractor instanceof ExcelExtractor) {
+          ExcelExtractor excelExtractor = (ExcelExtractor) textExtractor;
+          System.out.println(excelExtractor.getText());
+      }
+      // A Word Document
+      else if (textExtractor instanceof WordExtractor) {
+          WordExtractor wordExtractor = (WordExtractor) textExtractor;
+          String[] paragraphText = wordExtractor.getParagraphText();
+          for (String paragraph : paragraphText) {
+              System.out.println(paragraph);
+          }
+          // Display the document's header and footer text
+          System.out.println("Footer text: " + wordExtractor.getFooterText());
+          System.out.println("Header text: " + wordExtractor.getHeaderText());
+      }
+      // PowerPoint Presentation.
+      else if (textExtractor instanceof PowerPointExtractor) {
+          PowerPointExtractor powerPointExtractor = (PowerPointExtractor) textExtractor;
+          System.out.println("Text: " + powerPointExtractor.getText());
+          System.out.println("Notes: " + powerPointExtractor.getNotes());
+      }
+      // Visio Drawing
+      else if (textExtractor instanceof VisioTextExtractor) {
+          VisioTextExtractor visioTextExtractor = (VisioTextExtractor) textExtractor;
+          System.out.println("Text: " + visioTextExtractor.getText());
+      }
+  }
+      </source>
+    </section>
   </body>
 
   <footer>
diff --git a/src/examples/src/org/apache/poi/hssf/usermodel/examples/EmeddedObjects.java b/src/examples/src/org/apache/poi/hssf/usermodel/examples/EmeddedObjects.java
new file mode 100755 (executable)
index 0000000..f27725b
--- /dev/null
@@ -0,0 +1,68 @@
+/* ====================================================================\r
+   Licensed to the Apache Software Foundation (ASF) under one or more\r
+   contributor license agreements.  See the NOTICE file distributed with\r
+   this work for additional information regarding copyright ownership.\r
+   The ASF licenses this file to You under the Apache License, Version 2.0\r
+   (the "License"); you may not use this file except in compliance with\r
+   the License.  You may obtain a copy of the License at\r
+\r
+       http://www.apache.org/licenses/LICENSE-2.0\r
+\r
+   Unless required by applicable law or agreed to in writing, software\r
+   distributed under the License is distributed on an "AS IS" BASIS,\r
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
+   See the License for the specific language governing permissions and\r
+   limitations under the License.\r
+==================================================================== */\r
+package org.apache.poi.hssf.usermodel.examples;\r
+\r
+import org.apache.poi.hssf.usermodel.*;\r
+import org.apache.poi.poifs.filesystem.DirectoryNode;\r
+import org.apache.poi.poifs.filesystem.Entry;\r
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;\r
+import org.apache.poi.hwpf.HWPFDocument;\r
+import org.apache.poi.hslf.HSLFSlideShow;\r
+import org.apache.poi.hslf.usermodel.SlideShow;\r
+\r
+import java.io.FileInputStream;\r
+import java.util.Iterator;\r
+\r
+/**\r
+ * Demonstrates how you can extract embedded data from a .xls file (its path is read from args[0])\r
+ */\r
+public class EmeddedObjects {\r
+    public static void main(String[] args) throws Exception {\r
+        POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(args[0]));\r
+        HSSFWorkbook workbook = new HSSFWorkbook(fs);\r
+        for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) {\r
+            // The OLE2 class name of the object; it selects which branch below handles the object\r
+            String oleName = obj.getOLE2ClassName();\r
+            if (oleName.equals("Worksheet")) {\r
+                DirectoryNode dn = (DirectoryNode) obj.getDirectory();\r
+                HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false);\r
+                //System.out.println(oleName + ": " + embeddedWorkbook.getNumberOfSheets());\r
+            } else if (oleName.equals("Document")) {\r
+                DirectoryNode dn = (DirectoryNode) obj.getDirectory();\r
+                HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs);\r
+                //System.out.println(oleName + ": " + embeddedWordDocument.getRange().text());\r
+            }  else if (oleName.equals("Presentation")) {\r
+                DirectoryNode dn = (DirectoryNode) obj.getDirectory();\r
+                SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn, fs));\r
+                //System.out.println(oleName + ": " + embeddedPowerPointDocument.getSlides().length);\r
+            } else {\r
+                if(obj.hasDirectoryEntry()){\r
+                    // The object has a directory entry, used as a DirectoryNode here. Examine its entries to find out what it is\r
+                    DirectoryNode dn = (DirectoryNode) obj.getDirectory();\r
+                    for (Iterator entries = dn.getEntries(); entries.hasNext();) {\r
+                        Entry entry = (Entry) entries.next();\r
+                        //System.out.println(oleName + "." + entry.getName());\r
+                    }\r
+                } else {\r
+                    // There is no DirectoryEntry\r
+                    // Recover the object's data from the HSSFObjectData instance.\r
+                    byte[] objectData = obj.getObjectData();\r
+                }\r
+            }\r
+        }\r
+    }\r
+}\r
diff --git a/src/examples/src/org/apache/poi/xssf/usermodel/examples/EmbeddedObjects.java b/src/examples/src/org/apache/poi/xssf/usermodel/examples/EmbeddedObjects.java
new file mode 100755 (executable)
index 0000000..07228f4
--- /dev/null
@@ -0,0 +1,72 @@
+/* ====================================================================\r
+   Licensed to the Apache Software Foundation (ASF) under one or more\r
+   contributor license agreements.  See the NOTICE file distributed with\r
+   this work for additional information regarding copyright ownership.\r
+   The ASF licenses this file to You under the Apache License, Version 2.0\r
+   (the "License"); you may not use this file except in compliance with\r
+   the License.  You may obtain a copy of the License at\r
+\r
+       http://www.apache.org/licenses/LICENSE-2.0\r
+\r
+   Unless required by applicable law or agreed to in writing, software\r
+   distributed under the License is distributed on an "AS IS" BASIS,\r
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
+   See the License for the specific language governing permissions and\r
+   limitations under the License.\r
+==================================================================== */\r
+package org.apache.poi.xssf.usermodel.examples;\r
+\r
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;\r
+import org.apache.poi.openxml4j.opc.OPCPackage;\r
+import org.apache.poi.openxml4j.opc.PackagePart;\r
+import org.apache.poi.xwpf.usermodel.XWPFDocument;\r
+import org.apache.poi.hslf.HSLFSlideShow;\r
+import org.apache.poi.hwpf.HWPFDocument;\r
+import org.apache.poi.xslf.XSLFSlideShow;\r
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;\r
+\r
+import java.io.InputStream;\r
+\r
+/**\r
+ * Demonstrates how you can extract embedded data from a .xlsx file (its path is read from args[0])\r
+ */\r
+public class EmbeddedObjects {\r
+    public static void main(String[] args) throws Exception {\r
+        XSSFWorkbook workbook = new XSSFWorkbook(args[0]);\r
+        for (PackagePart pPart : workbook.getAllEmbedds()) {\r
+            String contentType = pPart.getContentType();\r
+            // Excel Workbook - binary file format\r
+            if (contentType.equals("application/vnd.ms-excel")) {\r
+                HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream());\r
+            }\r
+            // Excel Workbook - OpenXML file format\r
+            else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {\r
+                OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());\r
+                XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(docPackage);\r
+            }\r
+            // Word Document - binary (OLE2CDF) file format\r
+            else if (contentType.equals("application/msword")) {\r
+                HWPFDocument document = new HWPFDocument(pPart.getInputStream());\r
+            }\r
+            // Word Document - OpenXML file format\r
+            else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {\r
+                OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());\r
+                XWPFDocument document = new XWPFDocument(docPackage);\r
+            }\r
+            // PowerPoint Document - binary file format\r
+            else if (contentType.equals("application/vnd.ms-powerpoint")) {\r
+                HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream());\r
+            }\r
+            // PowerPoint Document - OpenXML file format\r
+            else if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {\r
+                OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());\r
+                XSLFSlideShow slideShow = new XSLFSlideShow(docPackage);\r
+            }\r
+            // Any other type of embedded object: report its content type and obtain its raw stream.\r
+            else {\r
+                System.out.println("Unknown Embedded Document: " + contentType);\r
+                InputStream inputStream = pPart.getInputStream();\r
+            }\r
+        }\r
+    }\r
+}
\ No newline at end of file
index fa49528619533b75187c9919869bf7f613b414f0..d21604f3bbf7aca9e7cfce4e18192d2ea90d5d31 100644 (file)
@@ -31,7 +31,7 @@ public class HSSFShapeGroup
         extends HSSFShape
         implements HSSFShapeContainer
 {
-    List shapes = new ArrayList();
+    List<HSSFShape> shapes = new ArrayList<HSSFShape>();
     int x1 = 0;
     int y1  = 0 ;
     int x2 = 1023;
@@ -115,7 +115,7 @@ public class HSSFShapeGroup
     /**
      * Return all children contained by this shape.
      */
-    public List getChildren()
+    public List<HSSFShape> getChildren()
     {
         return shapes;
     }
index 7a78d4a08b1aa6e42a4a6ec69f8b532ba0b151d2..e0a08420e8141613c20bff2553049fc6780ec504 100644 (file)
@@ -1568,10 +1568,10 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
      *
      * @return the list of pictures (a list of {@link HSSFPictureData} objects.)
      */
-    public List getAllPictures()
+    public List<HSSFPictureData> getAllPictures()
     {
         // The drawing group record always exists at the top level, so we won't need to do this recursively.
-        List pictures = new ArrayList();
+        List<HSSFPictureData> pictures = new ArrayList<HSSFPictureData>();
         Iterator recordIter = workbook.getRecords().iterator();
         while (recordIter.hasNext())
         {
@@ -1592,7 +1592,7 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
      * @param escherRecords the escher records.
      * @param pictures the list to populate with the pictures.
      */
-    private void searchForPictures(List escherRecords, List pictures)
+    private void searchForPictures(List escherRecords, List<HSSFPictureData> pictures)
     {
         Iterator recordIter = escherRecords.iterator();
         while (recordIter.hasNext())
@@ -1646,9 +1646,9 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
      *
      * @return the list of embedded objects (a list of {@link HSSFObjectData} objects.)
      */
-    public List getAllEmbeddedObjects()
+    public List<HSSFObjectData> getAllEmbeddedObjects()
     {
-        List objects = new ArrayList();
+        List<HSSFObjectData> objects = new ArrayList<HSSFObjectData>();
         for (int i = 0; i < getNumberOfSheets(); i++)
         {
             getAllEmbeddedObjects(getSheetAt(i).getSheet().getRecords(), objects);
@@ -1662,7 +1662,7 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
      * @param records the list of records to search.
      * @param objects the list of embedded objects to populate.
      */
-    private void getAllEmbeddedObjects(List records, List objects)
+    private void getAllEmbeddedObjects(List records, List<HSSFObjectData> objects)
     {
         Iterator recordIter = records.iterator();
         while (recordIter.hasNext())