]> source.dussan.org Git - poi.git/commitdiff
#60519 - Extractor for *SSF embeddings
authorAndreas Beeker <kiwiwings@apache.org>
Sat, 31 Dec 2016 21:50:47 +0000 (21:50 +0000)
committerAndreas Beeker <kiwiwings@apache.org>
Sat, 31 Dec 2016 21:50:47 +0000 (21:50 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1776819 13f79535-47bb-0310-9956-ffa450edef68

src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
src/java/org/apache/poi/hssf/usermodel/HSSFObjectData.java
src/java/org/apache/poi/ss/usermodel/ObjectData.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/openxml4j/opc/PackageRelationshipTypes.java
src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedData.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java
src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFObjectData.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFSheet.java
src/ooxml/testcases/org/apache/poi/ss/extractor/TestEmbeddedExtractor.java [new file with mode: 0644]
test-data/spreadsheet/Basic_Expense_Template_2011.xls [new file with mode: 0644]

index 123bfa7450711e6d4d9c4246a9df227b63c72c12..854dbf2fba2dffa3c821c9b517b4ba701901b543 100644 (file)
@@ -23,6 +23,8 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.ss.extractor.EmbeddedData;
+import org.apache.poi.ss.extractor.EmbeddedExtractor;
 import org.apache.poi.ss.usermodel.Cell;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
@@ -55,6 +57,8 @@ public abstract class SpreadsheetHandler extends AbstractFileHandler {
                
                readContent(read);
                
+               extractEmbedded(read);
+               
                modifyContent(read);
 
                read.close();
@@ -91,6 +95,18 @@ public abstract class SpreadsheetHandler extends AbstractFileHandler {
                        }
                }
        }
+
+       private void extractEmbedded(Workbook wb) throws IOException {
+        EmbeddedExtractor ee = new EmbeddedExtractor();
+
+        for (Sheet s : wb) {
+            for (EmbeddedData ed : ee.extractAll(s)) {
+                assertNotNull(ed.getFilename());
+                assertNotNull(ed.getEmbeddedData());
+                assertNotNull(ed.getShape());
+            }
+        }
+       }
        
        private void modifyContent(Workbook wb) {
                /* a number of file fail because of various things: udf, unimplemented functions, ...
index d92f216ab114da1d4cebd70f56ce64ae9d095aa4..201e65ed5d04171ff70b5af75e2a87efadb383cc 100644 (file)
@@ -25,6 +25,7 @@ import org.apache.poi.ddf.*;
 import org.apache.poi.hssf.record.*;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.ss.usermodel.ObjectData;
 import org.apache.poi.util.HexDump;
 
 /**
@@ -32,7 +33,7 @@ import org.apache.poi.util.HexDump;
  * <p/>
  * Right now, 13, july, 2012 can not be created from scratch
  */
-public final class HSSFObjectData extends HSSFPicture {
+public final class HSSFObjectData extends HSSFPicture implements ObjectData {
     /**
      * Reference to the filesystem root, required for retrieving the object data.
      */
@@ -43,20 +44,12 @@ public final class HSSFObjectData extends HSSFPicture {
         this._root = _root;
     }
 
-    /**
-     * Returns the OLE2 Class Name of the object
-     */
+    @Override
     public String getOLE2ClassName() {
         return findObjectRecord().getOLEClassName();
     }
 
-    /**
-     * Gets the object data. Only call for ones that have
-     * data though. See {@link #hasDirectoryEntry()}
-     *
-     * @return the object data as an OLE2 directory.
-     * @throws IOException if there was an error reading the data.
-     */
+    @Override
     public DirectoryEntry getDirectory() throws IOException {
         EmbeddedObjectRefSubRecord subRecord = findObjectRecord();
 
@@ -70,20 +63,12 @@ public final class HSSFObjectData extends HSSFPicture {
         throw new IOException("Stream " + streamName + " was not an OLE2 directory");
     }
 
-    /**
-     * Returns the data portion, for an ObjectData
-     * that doesn't have an associated POIFS Directory
-     * Entry
-     */
+    @Override
     public byte[] getObjectData() {
         return findObjectRecord().getObjectData();
     }
 
-    /**
-     * Does this ObjectData have an associated POIFS
-     * Directory Entry?
-     * (Not all do, those that don't have a data portion)
-     */
+    @Override
     public boolean hasDirectoryEntry() {
         EmbeddedObjectRefSubRecord subRecord = findObjectRecord();
 
diff --git a/src/java/org/apache/poi/ss/usermodel/ObjectData.java b/src/java/org/apache/poi/ss/usermodel/ObjectData.java
new file mode 100644 (file)
index 0000000..d157dba
--- /dev/null
@@ -0,0 +1,65 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.ss.usermodel;
+
+import java.io.IOException;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+
+/**
+ * Common interface for OLE shapes, i.e. shapes linked to embedded documents
+ * 
+ * @since POI 3.16-beta2
+ */
+public interface ObjectData extends SimpleShape {
+    /**
+     * @return the data portion, for an ObjectData that doesn't have an associated POIFS Directory Entry
+     */
+    byte[] getObjectData() throws IOException;
+
+    /**
+     * @return does this ObjectData have an associated POIFS Directory Entry?
+     * (Not all do, those that don't have a data portion)
+     */
+    boolean hasDirectoryEntry();
+
+    /**
+     * Gets the object data. Only call for ones that have
+     * data though. See {@link #hasDirectoryEntry()}.
+     * The caller has to close the corresponding POIFSFileSystem
+     *
+     * @return the object data as an OLE2 directory.
+     * @throws IOException if there was an error reading the data.
+     */
+    DirectoryEntry getDirectory() throws IOException;
+
+    /**
+     * @return the OLE2 Class Name of the object
+     */
+    String getOLE2ClassName();
+
+    /**
+     * @return a filename suggestion - inspecting/interpreting the Directory object probably gives a better result
+     */
+    String getFileName();
+
+    /**
+     * @return the preview picture
+     */
+    PictureData getPictureData();
+}
index 5d9801cd896daceb085af4db2eb225fc7c8dd83b..b9cd553ba084116c9fe36d5cb4ca3899800dbd45 100644 (file)
@@ -41,6 +41,11 @@ public interface PackageRelationshipTypes {
      */
     String CORE_PROPERTIES_ECMA376 = "http://schemas.openxmlformats.org/officedocument/2006/relationships/metadata/core-properties";
 
+    /**
+     * Namespace of Core properties relationship type as defined in ECMA 376
+     */
+    String CORE_PROPERTIES_ECMA376_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+    
     /**
      * Digital signature relationship type.
      */
diff --git a/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedData.java b/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedData.java
new file mode 100644 (file)
index 0000000..0e598b3
--- /dev/null
@@ -0,0 +1,104 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.ss.extractor;
+
+import org.apache.poi.ss.usermodel.Shape;
+
+/**
+ * A collection of embedded object information and content
+ */
+public class EmbeddedData {
+    private String filename;
+    private byte[] embeddedData;
+    private Shape shape;
+    private String contentType = "binary/octet-stream";
+
+    public EmbeddedData(String filename, byte[] embeddedData, String contentType) {
+        setFilename(filename);
+        setEmbeddedData(embeddedData);
+        setContentType(contentType);
+    }
+    
+    /**
+     * @return the filename
+     */
+    public String getFilename() {
+        return filename;
+    }
+    
+    /**
+     * Sets the filename 
+     *
+     * @param filename the filename
+     */
+    public void setFilename(String filename) {
+        if (filename == null) {
+            this.filename = "unknown.bin";
+        } else {
+            this.filename = filename.replaceAll("[^/\\\\]*[/\\\\]", "").trim();
+        }
+    }
+    
+    /**
+     * @return the embedded object byte array
+     */
+    public byte[] getEmbeddedData() {
+        return embeddedData;
+    }
+
+    /**
+     * Sets the embedded object as byte array
+     *
+     * @param embeddedData the embedded object byte array
+     */
+    public void setEmbeddedData(byte[] embeddedData) {
+        this.embeddedData = (embeddedData == null) ? null : embeddedData.clone();
+    }
+
+    /**
+     * @return the shape which links to the embedded object
+     */
+    public Shape getShape() {
+        return shape;
+    }
+
+    /**
+     * Sets the shape which links to the embedded object
+     *
+     * @param shape the shape
+     */
+    public void setShape(Shape shape) {
+        this.shape = shape;
+    }
+
+    /**
+     * @return the content-/mime-type of the embedded object, the default (if unknown) is {@code binary/octet-stream} 
+     */
+    public String getContentType() {
+        return contentType;
+    }
+
+    /**
+     * Sets the content-/mime-type
+     *
+     * @param contentType the content-type
+     */
+    public void setContentType(String contentType) {
+        this.contentType = contentType;
+    }
+}
\ No newline at end of file
diff --git a/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java b/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java
new file mode 100644 (file)
index 0000000..a06566b
--- /dev/null
@@ -0,0 +1,353 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.ss.extractor;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.poi.hpsf.ClassID;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.usermodel.Drawing;
+import org.apache.poi.ss.usermodel.ObjectData;
+import org.apache.poi.ss.usermodel.Picture;
+import org.apache.poi.ss.usermodel.PictureData;
+import org.apache.poi.ss.usermodel.Shape;
+import org.apache.poi.ss.usermodel.ShapeContainer;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LocaleUtil;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
+    private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
+    
+    /**
+     * @return the list of known extractors, if you provide custom extractors, override this method
+     */
+    @Override
+    public Iterator<EmbeddedExtractor> iterator() {
+        EmbeddedExtractor[] ee = {
+            new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
+        };
+        return Arrays.asList(ee).iterator();
+    }
+
+    public EmbeddedData extractOne(DirectoryNode src) throws IOException {
+        for (EmbeddedExtractor ee : this) {
+            if (ee.canExtract(src)) {
+                return ee.extract(src);
+            }
+        }
+        return null;
+    }
+
+    public EmbeddedData extractOne(Picture src) throws IOException {
+        for (EmbeddedExtractor ee : this) {
+            if (ee.canExtract(src)) {
+                return ee.extract(src);
+            }
+        }
+        return null;
+    }
+
+    public List<EmbeddedData> extractAll(Sheet sheet) throws IOException {
+        Drawing<?> patriarch = sheet.getDrawingPatriarch();
+        if (null == patriarch){
+            return Collections.emptyList();
+        }
+        List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>();
+        extractAll(patriarch, embeddings);
+        return embeddings;
+    }
+    
+    protected void extractAll(ShapeContainer<?> parent, List<EmbeddedData> embeddings) throws IOException {
+        for (Shape shape : parent) {
+            EmbeddedData data = null;
+            if (shape instanceof ObjectData) {
+                ObjectData od = (ObjectData)shape;
+                try {
+                    if (od.hasDirectoryEntry()) {
+                        data = extractOne((DirectoryNode)od.getDirectory());
+                    } else {
+                        data = new EmbeddedData(od.getFileName(), od.getObjectData(), "binary/octet-stream");
+                    }
+                } catch (Exception e) {
+                    LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
+                }
+            } else if (shape instanceof Picture) {
+                data = extractOne((Picture)shape);
+            } else if (shape instanceof ShapeContainer) {
+                extractAll((ShapeContainer<?>)shape, embeddings);
+            }
+            
+            if (data == null) {
+                continue;
+            }
+
+            data.setShape(shape);
+            String filename = data.getFilename();
+            String extension = (filename == null || filename.indexOf('.') == -1) ? ".bin" : filename.substring(filename.indexOf('.'));
+            
+            // try to find an alternative name
+            if (filename == null || "".equals(filename) || filename.startsWith("MBD") || filename.startsWith("Root Entry")) {
+                filename = shape.getShapeName();
+                if (filename != null) {
+                    filename += extension;
+                }
+            }
+            // default to dummy name
+            if (filename == null || "".equals(filename)) {
+                filename = "picture_"+embeddings.size()+extension;
+            }
+            filename = filename.trim();
+            data.setFilename(filename);
+            
+            embeddings.add(data);
+        }
+    }
+    
+
+    public boolean canExtract(DirectoryNode source) {
+        return false;
+    }
+
+    public boolean canExtract(Picture source) {
+        return false;
+    }
+
+    protected EmbeddedData extract(DirectoryNode dn) throws IOException {
+        assert(canExtract(dn));
+        POIFSFileSystem dest = new POIFSFileSystem();
+        copyNodes(dn, dest.getRoot());
+        // start with a reasonably big size
+        ByteArrayOutputStream bos = new ByteArrayOutputStream(20000);
+        dest.writeFilesystem(bos);
+        dest.close();
+
+        return new EmbeddedData(dn.getName(), bos.toByteArray(), "binary/octet-stream");
+    }
+
+    protected EmbeddedData extract(Picture source) throws IOException {
+        return null;
+    }
+    
+    public static class Ole10Extractor extends EmbeddedExtractor {
+        @Override
+        public boolean canExtract(DirectoryNode dn) {
+            ClassID clsId = dn.getStorageClsid();
+            return ClassID.OLE10_PACKAGE.equals(clsId);
+        }
+
+        @Override
+        public EmbeddedData extract(DirectoryNode dn) throws IOException {
+            try {
+                Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
+                return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), "binary/octet-stream");
+            } catch (Ole10NativeException e) {
+                throw new IOException(e);
+            }
+        }
+    }
+
+    static class PdfExtractor extends EmbeddedExtractor {
+        static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}");
+        @Override
+        public boolean canExtract(DirectoryNode dn) {
+            ClassID clsId = dn.getStorageClsid();
+            return (PdfClassID.equals(clsId)
+            || dn.hasEntry("CONTENTS"));
+        }
+
+        @Override
+        public EmbeddedData extract(DirectoryNode dn) throws IOException {
+            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            InputStream is = dn.createDocumentInputStream("CONTENTS");
+            IOUtils.copy(is, bos);
+            is.close();
+            return new EmbeddedData(dn.getName()+".pdf", bos.toByteArray(), "application/pdf");
+        }
+        
+        @Override
+        public boolean canExtract(Picture source) {
+            PictureData pd = source.getPictureData();
+            return (pd.getPictureType() == Workbook.PICTURE_TYPE_EMF);
+        }
+
+        /**
+         * Mac Office encodes embedded objects inside the picture, e.g. PDF is part of an EMF.
+         * If an embedded stream is inside an EMF picture, this method extracts the payload.
+         *
+         * @return the embedded data in an EMF picture or null if none is found
+         */
+        @Override
+        protected EmbeddedData extract(Picture source) throws IOException {
+            // check for emf+ embedded pdf (poor man's style :( )
+            // Mac Excel 2011 embeds pdf files with this method.
+            PictureData pd = source.getPictureData();
+            if (pd.getPictureType() != Workbook.PICTURE_TYPE_EMF) {
+                return null;
+            }
+
+            // TODO: investigate if this is just an EMF-hack or if other formats are also embedded in EMF
+            byte pictureBytes[] = pd.getData();
+            int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes(LocaleUtil.CHARSET_1252));
+            if (idxStart == -1) {
+                return null;
+            }
+            
+            int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes(LocaleUtil.CHARSET_1252));
+            if (idxEnd == -1) {
+                return null;
+            }
+            
+            int pictureBytesLen = idxEnd-idxStart+6;
+            byte[] pdfBytes = new byte[pictureBytesLen];
+            System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
+            String filename = source.getShapeName().trim();
+            if (!filename.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
+                filename += ".pdf";
+            }
+            return new EmbeddedData(filename, pdfBytes, "application/pdf");
+        }
+        
+
+    }
+
+    static class WordExtractor extends EmbeddedExtractor {
+        @Override
+        public boolean canExtract(DirectoryNode dn) {
+            ClassID clsId = dn.getStorageClsid();
+            return (ClassID.WORD95.equals(clsId)
+            || ClassID.WORD97.equals(clsId)
+            || dn.hasEntry("WordDocument"));
+        }
+
+        @Override
+        public EmbeddedData extract(DirectoryNode dn) throws IOException {
+            EmbeddedData ed = super.extract(dn);
+            ed.setFilename(dn.getName()+".doc");
+            return ed;
+        }
+    }
+
+    static class ExcelExtractor extends EmbeddedExtractor {
+        @Override
+        public boolean canExtract(DirectoryNode dn) {
+            ClassID clsId = dn.getStorageClsid();
+            return (ClassID.EXCEL95.equals(clsId)
+                    || ClassID.EXCEL97.equals(clsId)
+                    || dn.hasEntry("Workbook") /*...*/);
+        }
+        
+        @Override
+        public EmbeddedData extract(DirectoryNode dn) throws IOException {
+            EmbeddedData ed = super.extract(dn);
+            ed.setFilename(dn.getName()+".xls");
+            return ed;
+        }
+    }
+
+    static class FsExtractor extends EmbeddedExtractor {
+        @Override
+        public boolean canExtract(DirectoryNode dn) {
+            return true;
+        }
+        @Override
+        public EmbeddedData extract(DirectoryNode dn) throws IOException {
+            EmbeddedData ed = super.extract(dn);
+            ed.setFilename(dn.getName()+".ole");
+            // TODO: read the content type from CompObj stream
+            return ed;
+        }
+    }
+    
+    protected static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException {
+        for (Entry e : src) {
+            if (e instanceof DirectoryNode) {
+                DirectoryNode srcDir = (DirectoryNode)e;
+                DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
+                destDir.setStorageClsid(srcDir.getStorageClsid());
+                copyNodes(srcDir, destDir);
+            } else {
+                InputStream is = src.createDocumentInputStream(e);
+                dest.createDocument(e.getName(), is);
+                is.close();
+            }
+        }
+    }
+    
+    
+
+    /**
+     * Knuth-Morris-Pratt Algorithm for Pattern Matching
+     * Finds the first occurrence of the pattern in the text.
+     */
+    private static int indexOf(byte[] data, int offset, byte[] pattern) {
+        int[] failure = computeFailure(pattern);
+
+        int j = 0;
+        if (data.length == 0) return -1;
+
+        for (int i = offset; i < data.length; i++) {
+            while (j > 0 && pattern[j] != data[i]) {
+                j = failure[j - 1];
+            }
+            if (pattern[j] == data[i]) { j++; }
+            if (j == pattern.length) {
+                return i - pattern.length + 1;
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * Computes the failure function using a boot-strapping process,
+     * where the pattern is matched against itself.
+     */
+    private static int[] computeFailure(byte[] pattern) {
+        int[] failure = new int[pattern.length];
+
+        int j = 0;
+        for (int i = 1; i < pattern.length; i++) {
+            while (j > 0 && pattern[j] != pattern[i]) {
+                j = failure[j - 1];
+            }
+            if (pattern[j] == pattern[i]) {
+                j++;
+            }
+            failure[i] = j;
+        }
+
+        return failure;
+    }
+
+    
+}
index c7104a4172a7099e35df7518a2b7fc4912b77edc..8f409fce318b225d88ac70578ee5140a3f1a5245 100644 (file)
@@ -20,8 +20,10 @@ package org.apache.poi.xssf.usermodel;
 import static org.apache.poi.POIXMLTypeLoader.DEFAULT_XML_OPTIONS;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 
 import javax.xml.namespace.QName;
@@ -32,13 +34,21 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.ss.usermodel.ClientAnchor;
 import org.apache.poi.ss.usermodel.Drawing;
 import org.apache.poi.ss.util.CellAddress;
+import org.apache.poi.ss.util.ImageUtils;
 import org.apache.poi.util.Internal;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
 import org.apache.poi.util.Units;
 import org.apache.poi.xssf.model.CommentsTable;
 import org.apache.xmlbeans.XmlCursor;
 import org.apache.xmlbeans.XmlException;
 import org.apache.xmlbeans.XmlObject;
 import org.apache.xmlbeans.XmlOptions;
+import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTGroupTransform2D;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTPoint2D;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTPositiveSize2D;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTransform2D;
 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTConnector;
 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTDrawing;
 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTGraphicalObjectFrame;
@@ -53,7 +63,9 @@ import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.STEditAs;
 /**
  * Represents a SpreadsheetML drawing
  */
-public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
+public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing<XSSFShape> {
+    private static final POILogger LOG = POILogFactory.getLogger(XSSFDrawing.class);
+    
     /**
      * Root element of the SpreadsheetML Drawing part
      */
@@ -86,7 +98,12 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
         XmlOptions options  = new XmlOptions(DEFAULT_XML_OPTIONS);
         //Removing root element
         options.setLoadReplaceDocumentElement(null);
-        drawing = CTDrawing.Factory.parse(part.getInputStream(),options);
+        InputStream is = part.getInputStream();
+        try {
+            drawing = CTDrawing.Factory.parse(is,options);
+        } finally {
+            is.close();
+        }
     }
     
     /**
@@ -176,6 +193,8 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
         XSSFPicture shape = new XSSFPicture(this, ctShape);
         shape.anchor = anchor;
         shape.setPictureReference(rel);
+        ctShape.getSpPr().setXfrm(createXfrm(anchor));
+        
         return shape;
     }
 
@@ -202,6 +221,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
 
         XSSFGraphicFrame frame = createGraphicFrame(anchor);
         frame.setChart(chart, chartRelId);
+        frame.getCTGraphicalObjectFrame().setXfrm(createXfrm(anchor));
 
         return chart;
     }
@@ -241,6 +261,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
         CTShape ctShape = ctAnchor.addNewSp();
         ctShape.set(XSSFSimpleShape.prototype());
         ctShape.getNvSpPr().getCNvPr().setId(shapeId);
+        ctShape.getSpPr().setXfrm(createXfrm(anchor));
         XSSFSimpleShape shape = new XSSFSimpleShape(this, ctShape);
         shape.anchor = anchor;
         return shape;
@@ -278,6 +299,11 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
         CTTwoCellAnchor ctAnchor = createTwoCellAnchor(anchor);
         CTGroupShape ctGroup = ctAnchor.addNewGrpSp();
         ctGroup.set(XSSFShapeGroup.prototype());
+        CTTransform2D xfrm = createXfrm(anchor);
+        CTGroupTransform2D grpXfrm =ctGroup.getGrpSpPr().getXfrm();
+        grpXfrm.setOff(xfrm.getOff());
+        grpXfrm.setExt(xfrm.getExt());
+        grpXfrm.setChExt(xfrm.getExt());
 
         XSSFShapeGroup shape = new XSSFShapeGroup(this, ctGroup);
         shape.anchor = anchor;
@@ -333,6 +359,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
         CTTwoCellAnchor ctAnchor = createTwoCellAnchor(anchor);
         CTGraphicalObjectFrame ctGraphicFrame = ctAnchor.addNewGraphicFrame();
         ctGraphicFrame.set(XSSFGraphicFrame.prototype());
+        ctGraphicFrame.setXfrm(createXfrm(anchor));
 
         long frameId = numOfGraphicFrames++;
         XSSFGraphicFrame graphicFrame = new XSSFGraphicFrame(this, ctGraphicFrame);
@@ -378,39 +405,159 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
         return ctAnchor;
     }
 
+    /**
+     * Creates the transform (offset and extent) matching the given anchor.
+     * <p>
+     * The offset is taken from the dx1/dy1 values of the anchor. The extent is the sum of the
+     * pixel widths of the columns col1 (inclusive) to col2 (exclusive) resp. the pixel heights
+     * of the rows row1 (inclusive) to row2 (exclusive), converted to EMU, reduced by dx1/dy1
+     * and extended by dx2/dy2.
+     *
+     * @param anchor the anchor of the shape
+     * @return a newly created transform, which isn't attached to a shape yet
+     */
+    private CTTransform2D createXfrm(XSSFClientAnchor anchor) {
+        final XSSFSheet parentSheet = (XSSFSheet)getParent();
+        final CTTransform2D transform = CTTransform2D.Factory.newInstance();
+
+        final CTPoint2D offset = transform.addNewOff();
+        offset.setX(anchor.getDx1());
+        offset.setY(anchor.getDy1());
+
+        // accumulate the pixel sizes of all columns spanned by the anchor
+        double spanWidthPx = 0;
+        int colIdx = anchor.getCol1();
+        while (colIdx < anchor.getCol2()) {
+            spanWidthPx += parentSheet.getColumnWidthInPixels(colIdx);
+            colIdx++;
+        }
+
+        // ... and of all rows spanned by the anchor
+        double spanHeightPx = 0;
+        int rowIdx = anchor.getRow1();
+        while (rowIdx < anchor.getRow2()) {
+            spanHeightPx += ImageUtils.getRowHeightInPixels(parentSheet, rowIdx);
+            rowIdx++;
+        }
+
+        final int spanWidthEmu = Units.pixelToEMU((int)spanWidthPx);
+        final int spanHeightEmu = Units.pixelToEMU((int)spanHeightPx);
+        final CTPositiveSize2D extent = transform.addNewExt();
+        extent.setCx(spanWidthEmu - anchor.getDx1() + anchor.getDx2());
+        extent.setCy(spanHeightEmu - anchor.getDy1() + anchor.getDy2());
+
+        // TODO: handle vflip/hflip
+        return transform;
+    }
+    
     private long newShapeId(){
         return drawing.sizeOfTwoCellAnchorArray() + 1;
     }
 
     /**
-     *
      * @return list of shapes in this drawing
      */
-    public List<XSSFShape>  getShapes(){
+    public List<XSSFShape> getShapes(){
         List<XSSFShape> lst = new ArrayList<XSSFShape>();
-        for(XmlObject obj : drawing.selectPath("./*/*")) {
-            XSSFShape shape = null;
-            if(obj instanceof CTPicture) shape = new XSSFPicture(this, (CTPicture)obj) ;
-            else if(obj instanceof CTConnector) shape = new XSSFConnector(this, (CTConnector)obj) ;
-            else if(obj instanceof CTShape) shape = new XSSFSimpleShape(this, (CTShape)obj) ;
-            else if(obj instanceof CTGraphicalObjectFrame) shape = new XSSFGraphicFrame(this, (CTGraphicalObjectFrame)obj) ;
-            else if(obj instanceof CTGroupShape) shape = new XSSFShapeGroup(this, (CTGroupShape)obj) ;
-
-            if(shape != null){
-                shape.anchor = getAnchorFromParent(obj);
-                lst.add(shape);
+        XmlCursor cur = drawing.newCursor();
+        try {
+            if (cur.toFirstChild()) {
+                addShapes(cur, lst);
             }
+        } finally {
+            cur.dispose();
         }
         return lst;
     }
 
+    /**
+     * Collects the shapes found below the given shape group.
+     * <p>
+     * A cursor is opened on the xml object of the group and handed over to
+     * {@code addShapes}, which appends a shape wrapper for every recognized child element.
+     * <p>
+     * NOTE(review): {@code addShapes} doesn't stop at the element the cursor starts on, but
+     * also moves on to its following siblings and scans their children - confirm that this
+     * is intended for groups which are followed by other elements, e.g. nested sibling groups.
+     *
+     * @param groupshape the shape group to be scanned
+     * @return list of shapes in this shape group
+     */
+    public List<XSSFShape> getShapes(XSSFShapeGroup groupshape){
+        List<XSSFShape> lst = new ArrayList<XSSFShape>();
+        XmlCursor cur = groupshape.getCTGroupShape().newCursor();
+        try {
+            addShapes(cur, lst);
+        } finally {
+            cur.dispose();
+        }
+        return lst;
+    }
+    
+    /**
+     * Appends a shape wrapper to the given list for every recognized child element of the
+     * element the cursor currently points to and of each of its following siblings.
+     * <p>
+     * Pictures, connectors, shapes, graphic frames and group shapes are wrapped into their
+     * corresponding {@code XSSF*} classes; a shape for which {@code hasOleLink} returns
+     * {@code true} becomes a {@link XSSFObjectData}. Marker elements and unknown elements
+     * are skipped. Untyped elements ({@code XmlAnyTypeImpl}) are parsed again as a drawing
+     * and scanned recursively.
+     * <p>
+     * The cursor is disposed before this method returns, hence it must not be used by the
+     * caller afterwards.
+     *
+     * @param cur the cursor pointing to the first element, whose children are to be scanned
+     * @param lst the list the created shapes are appended to
+     */
+    private void addShapes(XmlCursor cur, List<XSSFShape> lst) {
+        try {
+            do {
+                // remember the current element, so we can return to it after visiting its children
+                cur.push();
+                if (cur.toFirstChild()) {
+                    do {
+                        XmlObject obj = cur.getObject();
+    
+                        XSSFShape shape;
+                        if (obj instanceof CTMarker) {
+                            // ignore anchor elements
+                            continue;
+                        } else if (obj instanceof CTPicture) {
+                            shape = new XSSFPicture(this, (CTPicture)obj) ;
+                        } else if(obj instanceof CTConnector) {
+                            shape = new XSSFConnector(this, (CTConnector)obj) ;
+                        } else if(obj instanceof CTShape) {
+                            shape = hasOleLink(obj) 
+                                ? new XSSFObjectData(this, (CTShape)obj)
+                                : new XSSFSimpleShape(this, (CTShape)obj) ;
+                        } else if(obj instanceof CTGraphicalObjectFrame) {
+                            shape = new XSSFGraphicFrame(this, (CTGraphicalObjectFrame)obj) ;
+                        } else if(obj instanceof CTGroupShape) {
+                            shape = new XSSFShapeGroup(this, (CTGroupShape)obj) ;
+                        } else if(obj instanceof XmlAnyTypeImpl) {
+                            LOG.log(POILogger.WARN, "trying to parse AlternateContent, "
+                                    + "this unlinks the returned Shapes from the underlying xml content, "
+                                    + "so those shapes can't be used to modify the drawing, "
+                                    + "i.e. modifications will be ignored!");
+                            
+                            // XmlAnyTypeImpl is returned for AlternateContent parts, which might contain a CTDrawing
+                            cur.push();
+                            cur.toFirstChild();
+                            XmlCursor cur2 = null;
+                            try {
+                                // need to parse AlternateContent again, otherwise the child elements aren't typed,
+                                // but also XmlAnyTypes
+                                CTDrawing alterWS = CTDrawing.Factory.parse(cur.newXMLStreamReader());
+                                cur2 = alterWS.newCursor();
+                                if (cur2.toFirstChild()) {
+                                    // the recursive call disposes cur2 on its own, the dispose below is a safety net
+                                    addShapes(cur2, lst);
+                                }
+                            } catch (XmlException e) {
+                                LOG.log(POILogger.WARN, "unable to parse CTDrawing in alternate content.", e);
+                            } finally {
+                                if (cur2 != null) {
+                                    cur2.dispose();
+                                }
+                                // back to the untyped element, so the sibling iteration can go on
+                                cur.pop();
+                            }
+                            continue;
+                        } else {
+                            // ignore anything else
+                            continue;
+                        }
+
+                        assert(shape != null);
+                        shape.anchor = getAnchorFromParent(obj);
+                        lst.add(shape);
+                        
+                    } while (cur.toNextSibling());
+                }
+                // return to the element remembered above and proceed with its next sibling
+                cur.pop();
+            } while (cur.toNextSibling());
+        } finally {
+            cur.dispose();
+        }
+    }
+
+    /**
+     * Checks if the given shape contains the DrawingML extension entry, which this class
+     * uses to tell object data shapes apart from simple shapes.
+     *
+     * @param shape the xml object of the shape to be inspected
+     * @return {@code true}, if an {@code a:extLst/a:ext} element with the expected
+     *         {@code uri} attribute exists below the shape
+     */
+    private boolean hasOleLink(XmlObject shape) {
+        QName uriName = new QName(null, "uri");
+        String xquery = "declare namespace a='"+XSSFRelation.NS_DRAWINGML+"' .//a:extLst/a:ext";
+        XmlCursor cur = shape.newCursor();
+        try {
+            // run the query inside the try block, so the cursor is also released if the query fails
+            cur.selectPath(xquery);
+            while (cur.toNextSelection()) {
+                String uri = cur.getAttributeText(uriName);
+                if ("{63B3BB69-23CF-44E3-9099-C40C66FF867C}".equals(uri)) {
+                    return true;
+                }
+            }
+        } finally {
+            cur.dispose();
+        }
+        return false;
+    }
 
     private XSSFAnchor getAnchorFromParent(XmlObject obj){
         XSSFAnchor anchor = null;
 
         XmlObject parentXbean = null;
         XmlCursor cursor = obj.newCursor();
-        if(cursor.toParent()) parentXbean = cursor.getObject();
+        if(cursor.toParent()) {
+            parentXbean = cursor.getObject();
+        }
         cursor.dispose();
         if(parentXbean != null){
             if (parentXbean instanceof CTTwoCellAnchor) {
@@ -424,4 +571,8 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
         return anchor;
     }
 
+    /**
+     * @return an iterator over the list returned by {@link #getShapes()}
+     */
+    @Override
+    public Iterator<XSSFShape> iterator() {
+        return getShapes().iterator();
+    }
 }
diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFObjectData.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFObjectData.java
new file mode 100644 (file)
index 0000000..ab51df8
--- /dev/null
@@ -0,0 +1,169 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xssf.usermodel;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+import javax.xml.namespace.QName;
+
+import org.apache.poi.POIXMLDocumentPart;
+import org.apache.poi.POIXMLException;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.usermodel.ObjectData;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+import org.apache.xmlbeans.XmlCursor;
+import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
+import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTOleObject;
+
+/**
+ * Represents binary object (i.e. OLE) data stored in the file.  Eg. A GIF, JPEG etc...
+ * <p>
+ * The shape itself lives in the drawing, whereas the link to the embedded data is looked up
+ * via the shape id in the ole objects of the sheet the drawing belongs to.
+ */
+public class XSSFObjectData extends XSSFSimpleShape implements ObjectData {
+    private static final POILogger LOG = POILogFactory.getLogger(XSSFObjectData.class);
+
+    /**
+     * A default instance of CTShape used for creating new shapes.
+     */
+    private static CTShape prototype = null;
+
+    /**
+     * The ole object linked to this shape - looked up on first access, see {@link #getOleObject()}
+     */
+    private CTOleObject oleObject;
+
+    /**
+     * @param drawing the drawing containing the shape
+     * @param ctShape the xml object of the shape
+     */
+    protected XSSFObjectData(XSSFDrawing drawing, CTShape ctShape) {
+        super(drawing, ctShape);
+    }
+
+    /**
+     * Prototype with the default structure of a new auto-shape.
+     *
+     * @return the prototype, which is taken from {@link XSSFSimpleShape#prototype()} on first call
+     */
+    protected static CTShape prototype() {
+        if(prototype == null) {
+            prototype = XSSFSimpleShape.prototype();
+        }
+        return prototype;
+    }
+
+    /**
+     * @return the prog id of the linked ole object
+     */
+    @Override
+    public String getOLE2ClassName() {
+        return getOleObject().getProgId();
+    }
+
+    /**
+     * Looks up the ole object of the sheet by the id of this shape. The result is cached,
+     * i.e. the sheet is only queried on the first call.
+     *
+     * @return the CTOleObject associated with the shape
+     * @throws POIXMLException if the sheet doesn't contain an ole object for this shape
+     */
+    public CTOleObject getOleObject() {
+        if (oleObject == null) {
+            long shapeId = getCTShape().getNvSpPr().getCNvPr().getId();
+            oleObject = getSheet().readOleObject(shapeId);
+            if (oleObject == null) {
+                throw new POIXMLException("Ole object not found in sheet container - it's probably a control element");
+            }
+        }
+        return oleObject;
+    }
+
+    /**
+     * Reads the complete content of the object part into memory.
+     *
+     * @return the bytes of the object part
+     * @throws IOException if the part can't be read
+     */
+    @Override
+    public byte[] getObjectData() throws IOException {
+        InputStream is = getObjectPart().getInputStream();
+        try {
+            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            IOUtils.copy(is, bos);
+            return bos.toByteArray();
+        } finally {
+            // release the part stream, even if copying fails
+            is.close();
+        }
+    }
+
+    /**
+     * @return the package part of the object data or {@code null}, if the sheet has no
+     *         relation with the id of the ole object
+     * @throws POIXMLException if the ole object has no relation id set
+     */
+    public PackagePart getObjectPart() {
+        if (!getOleObject().isSetId()) {
+            throw new POIXMLException("Invalid ole object found in sheet container");
+        }
+        POIXMLDocumentPart pdp = getSheet().getRelationById(getOleObject().getId());
+        return (pdp == null) ? null : pdp.getPackagePart();
+    }
+
+    /**
+     * Peeks into the first 8 bytes of the object part to check for the POIFS header.
+     *
+     * @return {@code true}, if the object part starts with the POIFS header;
+     *         {@code false}, if it doesn't or if the part can't be read
+     */
+    @Override
+    public boolean hasDirectoryEntry() {
+        InputStream is = null;
+        try {
+            is = getObjectPart().getInputStream();
+
+            // If clearly doesn't do mark/reset, wrap up
+            if (! is.markSupported()) {
+                is = new PushbackInputStream(is, 8);
+            }
+
+            // Ensure that there is at least some data there
+            byte[] header8 = IOUtils.peekFirst8Bytes(is);
+
+            // Try to create
+            return NPOIFSFileSystem.hasPOIFSHeader(header8);
+        } catch (IOException e) {
+            LOG.log(POILogger.WARN, "can't determine if directory entry exists", e);
+            return false;
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+    }
+
+    /**
+     * Opens the object part as POIFS filesystem.
+     *
+     * @return the root directory of the filesystem
+     * @throws IOException if the part can't be read as POIFS filesystem
+     */
+    @Override
+    @SuppressWarnings("resource")
+    public DirectoryEntry getDirectory() throws IOException {
+        InputStream is = null;
+        try {
+            is = getObjectPart().getInputStream();
+            return new POIFSFileSystem(is).getRoot();
+        } finally {
+            IOUtils.closeQuietly(is);
+        }
+    }
+
+    /**
+     * @return the name of the package part containing the object data
+     */
+    @Override
+    public String getFileName() {
+        return getObjectPart().getPartName().getName();
+    }
+
+    /**
+     * @return the sheet, which is the parent of the drawing of this shape
+     */
+    protected XSSFSheet getSheet() {
+        return (XSSFSheet)getDrawing().getParent();
+    }
+
+    /**
+     * Determines the picture referenced by the {@code objectPr} child of the ole object.
+     *
+     * @return the picture data, which is looked up in the relations of the drawing, or
+     *         {@code null} if the ole object has no {@code objectPr} child
+     */
+    @Override
+    public XSSFPictureData getPictureData() {
+        XmlCursor cur = getOleObject().newCursor();
+        try {
+            if (cur.toChild(XSSFRelation.NS_SPREADSHEETML, "objectPr")) {
+                String blipId = cur.getAttributeText(new QName(PackageRelationshipTypes.CORE_PROPERTIES_ECMA376_NS, "id"));
+                return (XSSFPictureData)getDrawing().getRelationById(blipId);
+            }
+            return null;
+        } finally {
+            cur.dispose();
+        }
+    }
+}
index 5f1529604e535547bad990f1563c92d7b2d79123..20432104962bf309b78c39c5f9b140eec33cb65d 100644 (file)
@@ -40,6 +40,8 @@ import java.util.SortedMap;
 import java.util.TreeMap;
 
 import javax.xml.namespace.QName;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
 
 import org.apache.poi.POIXMLDocumentPart;
 import org.apache.poi.POIXMLException;
@@ -86,7 +88,9 @@ import org.apache.poi.xssf.usermodel.XSSFPivotTable.PivotTableReferenceConfigura
 import org.apache.poi.xssf.usermodel.helpers.ColumnHelper;
 import org.apache.poi.xssf.usermodel.helpers.XSSFIgnoredErrorHelper;
 import org.apache.poi.xssf.usermodel.helpers.XSSFRowShifter;
+import org.apache.xmlbeans.XmlCursor;
 import org.apache.xmlbeans.XmlException;
+import org.apache.xmlbeans.XmlObject;
 import org.apache.xmlbeans.XmlOptions;
 import org.openxmlformats.schemas.spreadsheetml.x2006.main.*;
 
@@ -4371,4 +4375,64 @@ public class XSSFSheet extends POIXMLDocumentPart implements Sheet  {
         CTIgnoredError ctIgnoredError = ctIgnoredErrors.addNewIgnoredError();
         XSSFIgnoredErrorHelper.addIgnoredErrors(ctIgnoredError, ref, ignoredErrorTypes);
     }
+
+    /**
+     * Determine the OleObject which links shapes with embedded resources
+     * <p>
+     * All {@code oleObject} elements below the {@code oleObjects} element of the worksheet are
+     * scanned for the given shape id. If several elements match, the scan stops at the first
+     * one having an {@code objectPr} child, otherwise the last matching element is returned.
+     *
+     * @param shapeId the shape id
+     * @return the CTOleObject of the shape or {@code null}, if the worksheet has no ole objects
+     *         or none of them refers to the given shape id
+     */
+    protected CTOleObject readOleObject(long shapeId) {
+        if (!getCTWorksheet().isSetOleObjects()) {
+            return null;
+        }
+
+        // we use a XmlCursor here to handle oleObject with-/out AlternateContent wrappers
+        String xquery = "declare namespace p='"+XSSFRelation.NS_SPREADSHEETML+"' .//p:oleObject";
+        // the attribute name doesn't change between the selections, so create it only once
+        QName shapeIdName = new QName(null, "shapeId");
+        XmlCursor cur = getCTWorksheet().getOleObjects().newCursor();
+        try {
+            cur.selectPath(xquery);
+            CTOleObject coo = null;
+            while (cur.toNextSelection()) {
+                String sId = cur.getAttributeText(shapeIdName);
+                if (sId == null || Long.parseLong(sId) != shapeId) {
+                    continue;
+                }
+
+                XmlObject xObj = cur.getObject();
+                if (xObj instanceof CTOleObject) {
+                    // the unusual case ...
+                    coo = (CTOleObject)xObj;
+                } else {
+                    // the element isn't typed, so it needs to be parsed again
+                    XMLStreamReader reader = cur.newXMLStreamReader();
+                    try {
+                        CTOleObjects coos = CTOleObjects.Factory.parse(reader);
+                        if (coos.sizeOfOleObjectArray() == 0) {
+                            continue;
+                        }
+                        coo = coos.getOleObjectArray(0);
+                    } catch (XmlException e) {
+                        logger.log(POILogger.INFO, "can't parse CTOleObjects", e);
+                    } finally {
+                        try {
+                            reader.close();
+                        } catch (XMLStreamException e) {
+                            logger.log(POILogger.INFO, "can't close reader", e);
+                        }
+                    }
+                }
+
+                // there are choice and fallback OleObject ... we prefer the one having the objectPr element,
+                // which is in the choice element
+                if (cur.toChild(XSSFRelation.NS_SPREADSHEETML, "objectPr")) {
+                    break;
+                }
+            }
+            return coo;
+        } finally {
+            cur.dispose();
+        }
+    }
+
+
 }
diff --git a/src/ooxml/testcases/org/apache/poi/ss/extractor/TestEmbeddedExtractor.java b/src/ooxml/testcases/org/apache/poi/ss/extractor/TestEmbeddedExtractor.java
new file mode 100644 (file)
index 0000000..225e506
--- /dev/null
@@ -0,0 +1,115 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.ss.extractor;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.xml.bind.DatatypeConverter;
+
+import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.usermodel.WorkbookFactory;
+import org.junit.Test;
+
+/**
+ * Tests the extraction of embedded objects from the sheets of sample workbooks. The
+ * extracted entries are verified by their filename, shape name and the MD5 hash of their data.
+ */
+public class TestEmbeddedExtractor {
+    private static final POIDataSamples samples = POIDataSamples.getSpreadSheetInstance();
+
+    /**
+     * Extracts the embedded entries of the binary sample workbook and checks both entries found.
+     */
+    @Test
+    public void extractPDFfromEMF() throws Exception {
+        InputStream fis = samples.openResourceAsStream("Basic_Expense_Template_2011.xls");
+        Workbook wb = WorkbookFactory.create(fis);
+        fis.close();
+
+        EmbeddedExtractor ee = new EmbeddedExtractor();
+        List<EmbeddedData> edList = new ArrayList<EmbeddedData>();
+        for (Sheet s : wb) {
+            edList.addAll(ee.extractAll(s));
+        }
+        wb.close();
+
+        assertEquals(2, edList.size());
+
+        String filename1 = "Sample.pdf";
+        EmbeddedData ed0 = edList.get(0);
+        assertEquals(filename1, ed0.getFilename());
+        assertEquals(filename1, ed0.getShape().getShapeName().trim());
+        assertEquals("uNplB1QpYug+LWappiTh0w==", md5hash(ed0.getEmbeddedData()));
+
+        String filename2 = "kalastuslupa_jiyjhnj_yuiyuiyuio_uyte_sldfsdfsdf_sfsdfsdf_sfsssfsf_sdfsdfsdfsdf_sdfsdfsdf.pdf";
+        EmbeddedData ed1 = edList.get(1);
+        assertEquals(filename2, ed1.getFilename());
+        assertEquals(filename2, ed1.getShape().getShapeName().trim());
+        assertEquals("QjLuAZ+cd7KbhVz4sj+QdA==", md5hash(ed1.getEmbeddedData()));
+    }
+
+    /**
+     * Extracts the embedded entries of the OOXML sample workbook and checks all four entries found.
+     */
+    @Test
+    public void extractFromXSSF() throws IOException, EncryptedDocumentException, InvalidFormatException {
+        InputStream fis = samples.openResourceAsStream("58325_db.xlsx");
+        Workbook wb = WorkbookFactory.create(fis);
+        fis.close();
+
+        EmbeddedExtractor ee = new EmbeddedExtractor();
+        List<EmbeddedData> edList = new ArrayList<EmbeddedData>();
+        for (Sheet s : wb) {
+            edList.addAll(ee.extractAll(s));
+        }
+        wb.close();
+
+        assertEquals(4, edList.size());
+        EmbeddedData ed0 = edList.get(0);
+        assertEquals("Object 1.pdf", ed0.getFilename());
+        assertEquals("Object 1", ed0.getShape().getShapeName().trim());
+        assertEquals("Oyys6UtQU1gbHYBYqA4NFA==", md5hash(ed0.getEmbeddedData()));
+
+        EmbeddedData ed1 = edList.get(1);
+        assertEquals("Object 2.pdf", ed1.getFilename());
+        assertEquals("Object 2", ed1.getShape().getShapeName().trim());
+        assertEquals("xLScPUS0XH+5CTZ2A3neNw==", md5hash(ed1.getEmbeddedData()));
+
+        EmbeddedData ed2 = edList.get(2);
+        assertEquals("Object 3.pdf", ed2.getFilename());
+        assertEquals("Object 3", ed2.getShape().getShapeName().trim());
+        assertEquals("rX4klZqJAeM5npb54Gi2+Q==", md5hash(ed2.getEmbeddedData()));
+
+        EmbeddedData ed3 = edList.get(3);
+        assertEquals("Microsoft_Excel_Worksheet1.xlsx", ed3.getFilename());
+        assertEquals("Object 1", ed3.getShape().getShapeName().trim());
+        assertEquals("4m4N8ji2tjpEGPQuw2YwGA==", md5hash(ed3.getEmbeddedData()));
+    }
+
+    /**
+     * Calculates the MD5 hash of the given bytes.
+     *
+     * @param input the bytes to be hashed
+     * @return the base64 encoded MD5 hash
+     * @throws IllegalStateException if the MD5 algorithm isn't provided by the runtime
+     */
+    public static String md5hash(byte[] input) {
+        try {
+            MessageDigest md = MessageDigest.getInstance("MD5");
+            byte[] hash = md.digest(input);
+            return DatatypeConverter.printBase64Binary(hash);
+        } catch (NoSuchAlgorithmException e) {
+            // not expected to happen - but fail loudly instead of returning a dummy hash,
+            // which would only show up as a misleading hash mismatch in the assertions
+            throw new IllegalStateException("MD5 message digest not available", e);
+        }
+    }
+}
diff --git a/test-data/spreadsheet/Basic_Expense_Template_2011.xls b/test-data/spreadsheet/Basic_Expense_Template_2011.xls
new file mode 100644 (file)
index 0000000..5df7ae3
Binary files /dev/null and b/test-data/spreadsheet/Basic_Expense_Template_2011.xls differ