source.dussan.org Git - poi.git/commitdiff
EmbeddedExtractor (for *SSF) - added OOXML support
author: Andreas Beeker <kiwiwings@apache.org>
Thu, 5 Jan 2017 01:10:45 +0000 (01:10 +0000)
committer: Andreas Beeker <kiwiwings@apache.org>
Thu, 5 Jan 2017 01:10:45 +0000 (01:10 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1777394 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/hpsf/ClassID.java
src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java

index 6fca6dd188819c4a351de24c2a526304815ae1ac..0dacf801392342058c86100b707f02dc4f5a7f97 100644 (file)
@@ -30,17 +30,43 @@ import org.apache.poi.util.StringUtil;
  */
 public class ClassID
 {
-    public static final ClassID OLE10_PACKAGE = new ClassID("{0003000C-0000-0000-C000-000000000046}");
-    public static final ClassID PPT_SHOW      = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
-    public static final ClassID XLS_WORKBOOK  = new ClassID("{00020841-0000-0000-C000-000000000046}");
-    public static final ClassID TXT_ONLY      = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
-    public static final ClassID EXCEL97       = new ClassID("{00020820-0000-0000-C000-000000000046}");
-    public static final ClassID EXCEL95       = new ClassID("{00020810-0000-0000-C000-000000000046}");
-    public static final ClassID WORD97        = new ClassID("{00020906-0000-0000-C000-000000000046}");
-    public static final ClassID WORD95        = new ClassID("{00020900-0000-0000-C000-000000000046}");
-    public static final ClassID POWERPOINT97  = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
-    public static final ClassID POWERPOINT95  = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
-    public static final ClassID EQUATION30    = new ClassID("{0002CE02-0000-0000-C000-000000000046}");
+    public static final ClassID OLE10_PACKAGE  = new ClassID("{0003000C-0000-0000-C000-000000000046}");
+    public static final ClassID PPT_SHOW       = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
+    public static final ClassID XLS_WORKBOOK   = new ClassID("{00020841-0000-0000-C000-000000000046}");
+    public static final ClassID TXT_ONLY       = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
+
+    // Excel V3
+    public static final ClassID EXCEL_V3       = new ClassID("{00030000-0000-0000-C000-000000000046}");
+    public static final ClassID EXCEL_V3_CHART = new ClassID("{00030001-0000-0000-C000-000000000046}");
+    public static final ClassID EXCEL_V3_MACRO = new ClassID("{00030002-0000-0000-C000-000000000046}");
+    // Excel V5
+    public static final ClassID EXCEL95        = new ClassID("{00020810-0000-0000-C000-000000000046}");
+    public static final ClassID EXCEL95_CHART  = new ClassID("{00020811-0000-0000-C000-000000000046}");
+    // Excel V8
+    public static final ClassID EXCEL97        = new ClassID("{00020820-0000-0000-C000-000000000046}");
+    public static final ClassID EXCEL97_CHART  = new ClassID("{00020821-0000-0000-C000-000000000046}");
+    // Excel V11
+    public static final ClassID EXCEL2003      = new ClassID("{00020812-0000-0000-C000-000000000046}");
+    // Excel V12
+    public static final ClassID EXCEL2007      = new ClassID("{00020830-0000-0000-C000-000000000046}");
+    public static final ClassID EXCEL2007_MACRO= new ClassID("{00020832-0000-0000-C000-000000000046}");
+    public static final ClassID EXCEL2007_XLSB = new ClassID("{00020833-0000-0000-C000-000000000046}");
+    // Excel V14
+    public static final ClassID EXCEL2010      = new ClassID("{00024500-0000-0000-C000-000000000046}");
+    public static final ClassID EXCEL2010_CHART= new ClassID("{00024505-0014-0000-C000-000000000046}");
+    public static final ClassID EXCEL2010_ODS  = new ClassID("{EABCECDB-CC1C-4A6F-B4E3-7F888A5ADFC8}");
+    
+    public static final ClassID WORD97         = new ClassID("{00020906-0000-0000-C000-000000000046}");
+    public static final ClassID WORD95         = new ClassID("{00020900-0000-0000-C000-000000000046}");
+    public static final ClassID WORD2007       = new ClassID("{F4754C9B-64F5-4B40-8AF4-679732AC0607}");
+    public static final ClassID WORD2007_MACRO = new ClassID("{18A06B6B-2F3F-4E2B-A611-52BE631B2D22}");
+    
+    public static final ClassID POWERPOINT97   = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
+    public static final ClassID POWERPOINT95   = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
+    public static final ClassID POWERPOINT2007 = new ClassID("{CF4F55F4-8F87-4D47-80BB-5808164BB3F8}");
+    public static final ClassID POWERPOINT2007_MACRO = new ClassID("{DC020317-E6E2-4A62-B9FA-B3EFE16626F4}");
+    
+    public static final ClassID EQUATION30     = new ClassID("{0002CE02-0000-0000-C000-000000000046}");
        
     /** <p>The number of bytes occupied by this object in the byte
      * stream.</p> */
index b45985f8cc117313b46fa9cbe3103c3864d6a5a1..2e74f6c31c70470199fa224a7310749d4f1b1dbc 100644 (file)
@@ -27,10 +27,10 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Locale;
 
 import org.apache.poi.hpsf.ClassID;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
@@ -43,12 +43,18 @@ import org.apache.poi.ss.usermodel.Shape;
 import org.apache.poi.ss.usermodel.ShapeContainer;
 import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.util.Beta;
 import org.apache.poi.util.IOUtils;
 import org.apache.poi.util.LocaleUtil;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
 import org.apache.poi.xssf.usermodel.XSSFObjectData;
 
+/**
+ * This extractor class tries to identify various embedded documents within Excel files
+ * and provide them via a common interface, i.e. the EmbeddedData instances
+ */
+@Beta
 public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
     private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
     
@@ -58,19 +64,13 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
     private static final String CONTENT_TYPE_DOC = "application/msword";
     private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel";
 
-    // default file extension
-    private static final String PDF_EXT = ".pdf";
-    private static final String DOC_EXT = ".doc";
-    private static final String XLS_EXT = ".xls";
-    private static final String OLE_EXT = ".ole";
-
     /**
      * @return the list of known extractors, if you provide custom extractors, override this method
      */
     @Override
     public Iterator<EmbeddedExtractor> iterator() {
         EmbeddedExtractor[] ee = {
-            new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
+            new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor()
         };
         return Arrays.asList(ee).iterator();
     }
@@ -112,10 +112,11 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
                     if (od.hasDirectoryEntry()) {
                         data = extractOne((DirectoryNode)od.getDirectory());
                     } else {
+                        String contentType = CONTENT_TYPE_BYTES;
                         if (od instanceof XSSFObjectData) {
-                            String contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
+                            contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
                         }
-                        data = new EmbeddedData(od.getFileName(), od.getObjectData(), CONTENT_TYPE_BYTES);
+                        data = new EmbeddedData(od.getFileName(), od.getObjectData(), contentType);
                     }
                 } catch (Exception e) {
                     LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
@@ -211,7 +212,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
             InputStream is = dn.createDocumentInputStream("CONTENTS");
             IOUtils.copy(is, bos);
             is.close();
-            return new EmbeddedData(dn.getName() + PDF_EXT, bos.toByteArray(), CONTENT_TYPE_PDF);
+            return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF);
         }
         
         @Override
@@ -251,8 +252,8 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
             byte[] pdfBytes = new byte[pictureBytesLen];
             System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
             String filename = source.getShapeName().trim();
-            if (!endsWithIgnoreCase(filename, PDF_EXT)) {
-                filename += PDF_EXT;
+            if (!endsWithIgnoreCase(filename, ".pdf")) {
+                filename += ".pdf";
             }
             return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
         }
@@ -260,38 +261,96 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
 
     }
 
-    static class WordExtractor extends EmbeddedExtractor {
+    /**
+     * Extracts embedded OOXML documents, i.e. storages containing a "package" entry.
+     * The file extension and content type are derived from the storage class id.
+     */
+    static class OOXMLExtractor extends EmbeddedExtractor {
         @Override
         public boolean canExtract(DirectoryNode dn) {
-            ClassID clsId = dn.getStorageClsid();
-            return (ClassID.WORD95.equals(clsId)
-            || ClassID.WORD97.equals(clsId)
-            || dn.hasEntry("WordDocument"));
+            return dn.hasEntry("package");
         }
 
         @Override
         public EmbeddedData extract(DirectoryNode dn) throws IOException {
-            EmbeddedData ed = super.extract(dn);
-            ed.setFilename(dn.getName() + DOC_EXT);
-            ed.setContentType(CONTENT_TYPE_DOC);
-            return ed;
+
+            ClassID clsId = dn.getStorageClsid();
+
+            String contentType, ext;
+            if (ClassID.WORD2007.equals(clsId)) {
+                ext = ".docx";
+                contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+            } else if (ClassID.WORD2007_MACRO.equals(clsId)) {
+                ext = ".docm";
+                contentType = "application/vnd.ms-word.document.macroEnabled.12";
+            } else if (ClassID.EXCEL2007.equals(clsId) || ClassID.EXCEL2003.equals(clsId) || ClassID.EXCEL2010.equals(clsId)) {
+                ext = ".xlsx";
+                contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
+            } else if (ClassID.EXCEL2007_MACRO.equals(clsId)) {
+                ext = ".xlsm";
+                contentType = "application/vnd.ms-excel.sheet.macroEnabled.12";
+            } else if (ClassID.EXCEL2007_XLSB.equals(clsId)) {
+                ext = ".xlsb";
+                contentType = "application/vnd.ms-excel.sheet.binary.macroEnabled.12";
+            } else if (ClassID.POWERPOINT2007.equals(clsId)) {
+                ext = ".pptx";
+                contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
+            } else if (ClassID.POWERPOINT2007_MACRO.equals(clsId)) {
+                ext = ".pptm";
+                contentType = "application/vnd.ms-powerpoint.presentation.macroEnabled.12";
+            } else {
+                ext = ".zip";
+                contentType = "application/zip";
+            }
+
+            // read the whole "package" stream; close it even if reading fails
+            DocumentInputStream dis = dn.createDocumentInputStream("package");
+            byte[] data;
+            try {
+                data = IOUtils.toByteArray(dis);
+            } finally {
+                dis.close();
+            }
+            
+            return new EmbeddedData(dn.getName()+ext, data, contentType);
         }
     }
 
-    static class ExcelExtractor extends EmbeddedExtractor {
+    /**
+     * Extracts embedded legacy Excel and Word documents, which are identified by
+     * their storage class id or by a "Workbook" / "WordDocument" entry.
+     */
+    static class BiffExtractor extends EmbeddedExtractor {
         @Override
         public boolean canExtract(DirectoryNode dn) {
+            return canExtractExcel(dn) || canExtractWord(dn);
+        }
+        
+        protected boolean canExtractExcel(DirectoryNode dn) {
             ClassID clsId = dn.getStorageClsid();
             return (ClassID.EXCEL95.equals(clsId)
-                    || ClassID.EXCEL97.equals(clsId)
-                    || dn.hasEntry("Workbook") /*...*/);
+                || ClassID.EXCEL97.equals(clsId)
+                || dn.hasEntry("Workbook") /*...*/);
+        }
+
+        protected boolean canExtractWord(DirectoryNode dn) {
+            ClassID clsId = dn.getStorageClsid();
+            return (ClassID.WORD95.equals(clsId)
+                || ClassID.WORD97.equals(clsId)
+                || dn.hasEntry("WordDocument"));
         }
         
         @Override
         public EmbeddedData extract(DirectoryNode dn) throws IOException {
             EmbeddedData ed = super.extract(dn);
-            ed.setFilename(dn.getName() + XLS_EXT);
-            ed.setContentType(CONTENT_TYPE_XLS);
+            if (canExtractExcel(dn)) {
+                ed.setFilename(dn.getName() + ".xls");
+                ed.setContentType(CONTENT_TYPE_XLS);
+            } else if (canExtractWord(dn)) {
+                ed.setFilename(dn.getName() + ".doc");
+                ed.setContentType(CONTENT_TYPE_DOC);
+            }
+            
             return ed;
         }
     }
@@ -304,7 +350,7 @@ public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
         @Override
         public EmbeddedData extract(DirectoryNode dn) throws IOException {
             EmbeddedData ed = super.extract(dn);
-            ed.setFilename(dn.getName() + OLE_EXT);
+            ed.setFilename(dn.getName() + ".ole");
             // TODO: read the content type from CombObj stream
             return ed;
         }