diff options
author | Andreas Beeker <kiwiwings@apache.org> | 2016-12-31 21:50:47 +0000 |
---|---|---|
committer | Andreas Beeker <kiwiwings@apache.org> | 2016-12-31 21:50:47 +0000 |
commit | 6a4428260860c2dab150c03c4ca7579f6a1cc518 (patch) | |
tree | 3c2d874da1351e2ffcb15359ce12e852a08ebd80 /src | |
parent | 155bc83d5833218489017237a0ff519f73041db1 (diff) | |
download | poi-6a4428260860c2dab150c03c4ca7579f6a1cc518.tar.gz poi-6a4428260860c2dab150c03c4ca7579f6a1cc518.zip |
#60519 - Extractor for *SSF embeddings
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1776819 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
10 files changed, 1064 insertions, 37 deletions
diff --git a/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java b/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java index 123bfa7450..854dbf2fba 100644 --- a/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java +++ b/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java @@ -23,6 +23,8 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.ss.extractor.EmbeddedData; +import org.apache.poi.ss.extractor.EmbeddedExtractor; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; @@ -55,6 +57,8 @@ public abstract class SpreadsheetHandler extends AbstractFileHandler { readContent(read); + extractEmbedded(read); + modifyContent(read); read.close(); @@ -91,6 +95,18 @@ public abstract class SpreadsheetHandler extends AbstractFileHandler { } } } + + private void extractEmbedded(Workbook wb) throws IOException { + EmbeddedExtractor ee = new EmbeddedExtractor(); + + for (Sheet s : wb) { + for (EmbeddedData ed : ee.extractAll(s)) { + assertNotNull(ed.getFilename()); + assertNotNull(ed.getEmbeddedData()); + assertNotNull(ed.getShape()); + } + } + } private void modifyContent(Workbook wb) { /* a number of file fail because of various things: udf, unimplemented functions, ... diff --git a/src/java/org/apache/poi/hssf/usermodel/HSSFObjectData.java b/src/java/org/apache/poi/hssf/usermodel/HSSFObjectData.java index d92f216ab1..201e65ed5d 100644 --- a/src/java/org/apache/poi/hssf/usermodel/HSSFObjectData.java +++ b/src/java/org/apache/poi/hssf/usermodel/HSSFObjectData.java @@ -25,6 +25,7 @@ import org.apache.poi.ddf.*; import org.apache.poi.hssf.record.*; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.ss.usermodel.ObjectData; import org.apache.poi.util.HexDump; /** @@ -32,7 +33,7 @@ import org.apache.poi.util.HexDump; * <p/> * Right now, 13, july, 2012 can not be created from scratch */ -public final class HSSFObjectData extends HSSFPicture { +public final class HSSFObjectData extends HSSFPicture implements ObjectData { /** * Reference to the filesystem root, required for retrieving the object data. */ @@ -43,20 +44,12 @@ public final class HSSFObjectData extends HSSFPicture { this._root = _root; } - /** - * Returns the OLE2 Class Name of the object - */ + @Override public String getOLE2ClassName() { return findObjectRecord().getOLEClassName(); } - /** - * Gets the object data. Only call for ones that have - * data though. See {@link #hasDirectoryEntry()} - * - * @return the object data as an OLE2 directory. - * @throws IOException if there was an error reading the data. - */ + @Override public DirectoryEntry getDirectory() throws IOException { EmbeddedObjectRefSubRecord subRecord = findObjectRecord(); @@ -70,20 +63,12 @@ public final class HSSFObjectData extends HSSFPicture { throw new IOException("Stream " + streamName + " was not an OLE2 directory"); } - /** - * Returns the data portion, for an ObjectData - * that doesn't have an associated POIFS Directory - * Entry - */ + @Override public byte[] getObjectData() { return findObjectRecord().getObjectData(); } - /** - * Does this ObjectData have an associated POIFS - * Directory Entry? - * (Not all do, those that don't have a data portion) - */ + @Override public boolean hasDirectoryEntry() { EmbeddedObjectRefSubRecord subRecord = findObjectRecord(); diff --git a/src/java/org/apache/poi/ss/usermodel/ObjectData.java b/src/java/org/apache/poi/ss/usermodel/ObjectData.java new file mode 100644 index 0000000000..d157dba53d --- /dev/null +++ b/src/java/org/apache/poi/ss/usermodel/ObjectData.java @@ -0,0 +1,65 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.ss.usermodel; + +import java.io.IOException; + +import org.apache.poi.poifs.filesystem.DirectoryEntry; + +/** + * Common interface for OLE shapes, i.e. shapes linked to embedded documents + * + * @since POI 3.16-beta2 + */ +public interface ObjectData extends SimpleShape { + /** + * @return the data portion, for an ObjectData that doesn't have an associated POIFS Directory Entry + */ + byte[] getObjectData() throws IOException; + + /** + * @return does this ObjectData have an associated POIFS Directory Entry? + * (Not all do, those that don't have a data portion) + */ + boolean hasDirectoryEntry(); + + /** + * Gets the object data. Only call for ones that have + * data though. See {@link #hasDirectoryEntry()}. + * The caller has to close the corresponding POIFSFileSystem + * + * @return the object data as an OLE2 directory. + * @throws IOException if there was an error reading the data. + */ + DirectoryEntry getDirectory() throws IOException; + + /** + * @return the OLE2 Class Name of the object + */ + String getOLE2ClassName(); + + /** + * @return a filename suggestion - inspecting/interpreting the Directory object probably gives a better result + */ + String getFileName(); + + /** + * @return the preview picture + */ + PictureData getPictureData(); +} diff --git a/src/ooxml/java/org/apache/poi/openxml4j/opc/PackageRelationshipTypes.java b/src/ooxml/java/org/apache/poi/openxml4j/opc/PackageRelationshipTypes.java index 5d9801cd89..b9cd553ba0 100644 --- a/src/ooxml/java/org/apache/poi/openxml4j/opc/PackageRelationshipTypes.java +++ b/src/ooxml/java/org/apache/poi/openxml4j/opc/PackageRelationshipTypes.java @@ -42,6 +42,11 @@ public interface PackageRelationshipTypes { String CORE_PROPERTIES_ECMA376 = "http://schemas.openxmlformats.org/officedocument/2006/relationships/metadata/core-properties"; /** + * Namespace of Core properties relationship type as defiend in ECMA 376 + */ + String CORE_PROPERTIES_ECMA376_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"; + + /** * Digital signature relationship type. */ String DIGITAL_SIGNATURE = "http://schemas.openxmlformats.org/package/2006/relationships/digital-signature/signature"; diff --git a/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedData.java b/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedData.java new file mode 100644 index 0000000000..0e598b3175 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedData.java @@ -0,0 +1,104 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.ss.extractor; + +import org.apache.poi.ss.usermodel.Shape; + +/** + * A collection of embedded object informations and content + */ +public class EmbeddedData { + private String filename; + private byte[] embeddedData; + private Shape shape; + private String contentType = "binary/octet-stream"; + + public EmbeddedData(String filename, byte[] embeddedData, String contentType) { + setFilename(filename); + setEmbeddedData(embeddedData); + setContentType(contentType); + } + + /** + * @return the filename + */ + public String getFilename() { + return filename; + } + + /** + * Sets the filename + * + * @param filename the filename + */ + public void setFilename(String filename) { + if (filename == null) { + this.filename = "unknown.bin"; + } else { + this.filename = filename.replaceAll("[^/\\\\]*[/\\\\]", "").trim(); + } + } + + /** + * @return the embedded object byte array + */ + public byte[] getEmbeddedData() { + return embeddedData; + } + + /** + * Sets the embedded object as byte array + * + * @param embeddedData the embedded object byte array + */ + public void setEmbeddedData(byte[] embeddedData) { + this.embeddedData = (embeddedData == null) ? null : embeddedData.clone(); + } + + /** + * @return the shape which links to the embedded object + */ + public Shape getShape() { + return shape; + } + + /** + * Sets the shape which links to the embedded object + * + * @param shape the shape + */ + public void setShape(Shape shape) { + this.shape = shape; + } + + /** + * @return the content-/mime-type of the embedded object, the default (if unknown) is {@code binary/octet-stream} + */ + public String getContentType() { + return contentType; + } + + /** + * Sets the content-/mime-type + * + * @param contentType the content-type + */ + public void setContentType(String contentType) { + this.contentType = contentType; + } +}
\ No newline at end of file diff --git a/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java b/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java new file mode 100644 index 0000000000..a06566b54c --- /dev/null +++ b/src/ooxml/java/org/apache/poi/ss/extractor/EmbeddedExtractor.java @@ -0,0 +1,353 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.ss.extractor; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; + +import org.apache.poi.hpsf.ClassID; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.Entry; +import org.apache.poi.poifs.filesystem.Ole10Native; +import org.apache.poi.poifs.filesystem.Ole10NativeException; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.ss.usermodel.Drawing; +import org.apache.poi.ss.usermodel.ObjectData; +import org.apache.poi.ss.usermodel.Picture; +import org.apache.poi.ss.usermodel.PictureData; +import org.apache.poi.ss.usermodel.Shape; +import org.apache.poi.ss.usermodel.ShapeContainer; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.ss.usermodel.Workbook; +import org.apache.poi.util.IOUtils; +import org.apache.poi.util.LocaleUtil; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> { + private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class); + + /** + * @return the list of known extractors, if you provide custom extractors, override this method + */ + @Override + public Iterator<EmbeddedExtractor> iterator() { + EmbeddedExtractor[] ee = { + new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor() + }; + return Arrays.asList(ee).iterator(); + } + + public EmbeddedData extractOne(DirectoryNode src) throws IOException { + for (EmbeddedExtractor ee : this) { + if (ee.canExtract(src)) { + return ee.extract(src); + } + } + return null; + } + + public EmbeddedData extractOne(Picture src) throws IOException { + for (EmbeddedExtractor ee : this) { + if (ee.canExtract(src)) { + return ee.extract(src); + } + } + return null; + } + + public List<EmbeddedData> extractAll(Sheet sheet) throws IOException { + Drawing<?> patriarch = sheet.getDrawingPatriarch(); + if (null == patriarch){ + return Collections.emptyList(); + } + List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>(); + extractAll(patriarch, embeddings); + return embeddings; + } + + protected void extractAll(ShapeContainer<?> parent, List<EmbeddedData> embeddings) throws IOException { + for (Shape shape : parent) { + EmbeddedData data = null; + if (shape instanceof ObjectData) { + ObjectData od = (ObjectData)shape; + try { + if (od.hasDirectoryEntry()) { + data = extractOne((DirectoryNode)od.getDirectory()); + } else { + data = new EmbeddedData(od.getFileName(), od.getObjectData(), "binary/octet-stream"); + } + } catch (Exception e) { + LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e); + } + } else if (shape instanceof Picture) { + data = extractOne((Picture)shape); + } else if (shape instanceof ShapeContainer) { + extractAll((ShapeContainer<?>)shape, embeddings); + } + + if (data == null) { + continue; + } + + data.setShape(shape); + String filename = data.getFilename(); + String extension = (filename == null || filename.indexOf('.') == -1) ? ".bin" : filename.substring(filename.indexOf('.')); + + // try to find an alternative name + if (filename == null || "".equals(filename) || filename.startsWith("MBD") || filename.startsWith("Root Entry")) { + filename = shape.getShapeName(); + if (filename != null) { + filename += extension; + } + } + // default to dummy name + if (filename == null || "".equals(filename)) { + filename = "picture_"+embeddings.size()+extension; + } + filename = filename.trim(); + data.setFilename(filename); + + embeddings.add(data); + } + } + + + public boolean canExtract(DirectoryNode source) { + return false; + } + + public boolean canExtract(Picture source) { + return false; + } + + protected EmbeddedData extract(DirectoryNode dn) throws IOException { + assert(canExtract(dn)); + POIFSFileSystem dest = new POIFSFileSystem(); + copyNodes(dn, dest.getRoot()); + // start with a reasonable big size + ByteArrayOutputStream bos = new ByteArrayOutputStream(20000); + dest.writeFilesystem(bos); + dest.close(); + + return new EmbeddedData(dn.getName(), bos.toByteArray(), "binary/octet-stream"); + } + + protected EmbeddedData extract(Picture source) throws IOException { + return null; + } + + public static class Ole10Extractor extends EmbeddedExtractor { + @Override + public boolean canExtract(DirectoryNode dn) { + ClassID clsId = dn.getStorageClsid(); + return ClassID.OLE10_PACKAGE.equals(clsId); + } + + @Override + public EmbeddedData extract(DirectoryNode dn) throws IOException { + try { + Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn); + return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), "binary/octet-stream"); + } catch (Ole10NativeException e) { + throw new IOException(e); + } + } + } + + static class PdfExtractor extends EmbeddedExtractor { + static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}"); + @Override + public boolean canExtract(DirectoryNode dn) { + ClassID clsId = dn.getStorageClsid(); + return (PdfClassID.equals(clsId) + || dn.hasEntry("CONTENTS")); + } + + @Override + public EmbeddedData extract(DirectoryNode dn) throws IOException { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + InputStream is = dn.createDocumentInputStream("CONTENTS"); + IOUtils.copy(is, bos); + is.close(); + return new EmbeddedData(dn.getName()+".pdf", bos.toByteArray(), "application/pdf"); + } + + @Override + public boolean canExtract(Picture source) { + PictureData pd = source.getPictureData(); + return (pd.getPictureType() == Workbook.PICTURE_TYPE_EMF); + } + + /** + * Mac Office encodes embedded objects inside the picture, e.g. PDF is part of an EMF. + * If an embedded stream is inside an EMF picture, this method extracts the payload. + * + * @return the embedded data in an EMF picture or null if none is found + */ + @Override + protected EmbeddedData extract(Picture source) throws IOException { + // check for emf+ embedded pdf (poor mans style :( ) + // Mac Excel 2011 embeds pdf files with this method. + PictureData pd = source.getPictureData(); + if (pd.getPictureType() != Workbook.PICTURE_TYPE_EMF) { + return null; + } + + // TODO: investigate if this is just an EMF-hack or if other formats are also embedded in EMF + byte pictureBytes[] = pd.getData(); + int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes(LocaleUtil.CHARSET_1252)); + if (idxStart == -1) { + return null; + } + + int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes(LocaleUtil.CHARSET_1252)); + if (idxEnd == -1) { + return null; + } + + int pictureBytesLen = idxEnd-idxStart+6; + byte[] pdfBytes = new byte[pictureBytesLen]; + System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen); + String filename = source.getShapeName().trim(); + if (!filename.toLowerCase(Locale.ROOT).endsWith(".pdf")) { + filename += ".pdf"; + } + return new EmbeddedData(filename, pdfBytes, "application/pdf"); + } + + + } + + static class WordExtractor extends EmbeddedExtractor { + @Override + public boolean canExtract(DirectoryNode dn) { + ClassID clsId = dn.getStorageClsid(); + return (ClassID.WORD95.equals(clsId) + || ClassID.WORD97.equals(clsId) + || dn.hasEntry("WordDocument")); + } + + @Override + public EmbeddedData extract(DirectoryNode dn) throws IOException { + EmbeddedData ed = super.extract(dn); + ed.setFilename(dn.getName()+".doc"); + return ed; + } + } + + static class ExcelExtractor extends EmbeddedExtractor { + @Override + public boolean canExtract(DirectoryNode dn) { + ClassID clsId = dn.getStorageClsid(); + return (ClassID.EXCEL95.equals(clsId) + || ClassID.EXCEL97.equals(clsId) + || dn.hasEntry("Workbook") /*...*/); + } + + @Override + public EmbeddedData extract(DirectoryNode dn) throws IOException { + EmbeddedData ed = super.extract(dn); + ed.setFilename(dn.getName()+".xls"); + return ed; + } + } + + static class FsExtractor extends EmbeddedExtractor { + @Override + public boolean canExtract(DirectoryNode dn) { + return true; + } + @Override + public EmbeddedData extract(DirectoryNode dn) throws IOException { + EmbeddedData ed = super.extract(dn); + ed.setFilename(dn.getName()+".ole"); + // TODO: read the content type from CombObj stream + return ed; + } + } + + protected static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException { + for (Entry e : src) { + if (e instanceof DirectoryNode) { + DirectoryNode srcDir = (DirectoryNode)e; + DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName()); + destDir.setStorageClsid(srcDir.getStorageClsid()); + copyNodes(srcDir, destDir); + } else { + InputStream is = src.createDocumentInputStream(e); + dest.createDocument(e.getName(), is); + is.close(); + } + } + } + + + + /** + * Knuth-Morris-Pratt Algorithm for Pattern Matching + * Finds the first occurrence of the pattern in the text. + */ + private static int indexOf(byte[] data, int offset, byte[] pattern) { + int[] failure = computeFailure(pattern); + + int j = 0; + if (data.length == 0) return -1; + + for (int i = offset; i < data.length; i++) { + while (j > 0 && pattern[j] != data[i]) { + j = failure[j - 1]; + } + if (pattern[j] == data[i]) { j++; } + if (j == pattern.length) { + return i - pattern.length + 1; + } + } + return -1; + } + + /** + * Computes the failure function using a boot-strapping process, + * where the pattern is matched against itself. + */ + private static int[] computeFailure(byte[] pattern) { + int[] failure = new int[pattern.length]; + + int j = 0; + for (int i = 1; i < pattern.length; i++) { + while (j > 0 && pattern[j] != pattern[i]) { + j = failure[j - 1]; + } + if (pattern[j] == pattern[i]) { + j++; + } + failure[i] = j; + } + + return failure; + } + + +} diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java index c7104a4172..8f409fce31 100644 --- a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java +++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java @@ -20,8 +20,10 @@ package org.apache.poi.xssf.usermodel; import static org.apache.poi.POIXMLTypeLoader.DEFAULT_XML_OPTIONS; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import javax.xml.namespace.QName; @@ -32,13 +34,21 @@ import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.ss.usermodel.ClientAnchor; import org.apache.poi.ss.usermodel.Drawing; import org.apache.poi.ss.util.CellAddress; +import org.apache.poi.ss.util.ImageUtils; import org.apache.poi.util.Internal; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; import org.apache.poi.util.Units; import org.apache.poi.xssf.model.CommentsTable; import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlObject; import org.apache.xmlbeans.XmlOptions; +import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl; +import org.openxmlformats.schemas.drawingml.x2006.main.CTGroupTransform2D; +import org.openxmlformats.schemas.drawingml.x2006.main.CTPoint2D; +import org.openxmlformats.schemas.drawingml.x2006.main.CTPositiveSize2D; +import org.openxmlformats.schemas.drawingml.x2006.main.CTTransform2D; import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTConnector; import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTDrawing; import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTGraphicalObjectFrame; @@ -53,7 +63,9 @@ import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.STEditAs; /** * Represents a SpreadsheetML drawing */ -public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { +public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing<XSSFShape> { + private static final POILogger LOG = POILogFactory.getLogger(XSSFDrawing.class); + /** * Root element of the SpreadsheetML Drawing part */ @@ -86,7 +98,12 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { XmlOptions options = new XmlOptions(DEFAULT_XML_OPTIONS); //Removing root element options.setLoadReplaceDocumentElement(null); - drawing = CTDrawing.Factory.parse(part.getInputStream(),options); + InputStream is = part.getInputStream(); + try { + drawing = CTDrawing.Factory.parse(is,options); + } finally { + is.close(); + } } /** @@ -176,6 +193,8 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { XSSFPicture shape = new XSSFPicture(this, ctShape); shape.anchor = anchor; shape.setPictureReference(rel); + ctShape.getSpPr().setXfrm(createXfrm(anchor)); + return shape; } @@ -202,6 +221,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { XSSFGraphicFrame frame = createGraphicFrame(anchor); frame.setChart(chart, chartRelId); + frame.getCTGraphicalObjectFrame().setXfrm(createXfrm(anchor)); return chart; } @@ -241,6 +261,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { CTShape ctShape = ctAnchor.addNewSp(); ctShape.set(XSSFSimpleShape.prototype()); ctShape.getNvSpPr().getCNvPr().setId(shapeId); + ctShape.getSpPr().setXfrm(createXfrm(anchor)); XSSFSimpleShape shape = new XSSFSimpleShape(this, ctShape); shape.anchor = anchor; return shape; @@ -278,6 +299,11 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { CTTwoCellAnchor ctAnchor = createTwoCellAnchor(anchor); CTGroupShape ctGroup = ctAnchor.addNewGrpSp(); ctGroup.set(XSSFShapeGroup.prototype()); + CTTransform2D xfrm = createXfrm(anchor); + CTGroupTransform2D grpXfrm =ctGroup.getGrpSpPr().getXfrm(); + grpXfrm.setOff(xfrm.getOff()); + grpXfrm.setExt(xfrm.getExt()); + grpXfrm.setChExt(xfrm.getExt()); XSSFShapeGroup shape = new XSSFShapeGroup(this, ctGroup); shape.anchor = anchor; @@ -333,6 +359,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { CTTwoCellAnchor ctAnchor = createTwoCellAnchor(anchor); CTGraphicalObjectFrame ctGraphicFrame = ctAnchor.addNewGraphicFrame(); ctGraphicFrame.set(XSSFGraphicFrame.prototype()); + ctGraphicFrame.setXfrm(createXfrm(anchor)); long frameId = numOfGraphicFrames++; XSSFGraphicFrame graphicFrame = new XSSFGraphicFrame(this, ctGraphicFrame); @@ -378,39 +405,159 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { return ctAnchor; } + private CTTransform2D createXfrm(XSSFClientAnchor anchor) { + CTTransform2D xfrm = CTTransform2D.Factory.newInstance(); + CTPoint2D off = xfrm.addNewOff(); + off.setX(anchor.getDx1()); + off.setY(anchor.getDy1()); + XSSFSheet sheet = (XSSFSheet)getParent(); + double widthPx = 0; + for (int col=anchor.getCol1(); col<anchor.getCol2(); col++) { + widthPx += sheet.getColumnWidthInPixels(col); + } + double heightPx = 0; + for (int row=anchor.getRow1(); row<anchor.getRow2(); row++) { + heightPx += ImageUtils.getRowHeightInPixels(sheet, row); + } + int width = Units.pixelToEMU((int)widthPx); + int height = Units.pixelToEMU((int)heightPx); + CTPositiveSize2D ext = xfrm.addNewExt(); + ext.setCx(width - anchor.getDx1() + anchor.getDx2()); + ext.setCy(height - anchor.getDy1() + anchor.getDy2()); + + // TODO: handle vflip/hflip + return xfrm; + } + private long newShapeId(){ return drawing.sizeOfTwoCellAnchorArray() + 1; } /** - * * @return list of shapes in this drawing */ - public List<XSSFShape> getShapes(){ + public List<XSSFShape> getShapes(){ List<XSSFShape> lst = new ArrayList<XSSFShape>(); - for(XmlObject obj : drawing.selectPath("./*/*")) { - XSSFShape shape = null; - if(obj instanceof CTPicture) shape = new XSSFPicture(this, (CTPicture)obj) ; - else if(obj instanceof CTConnector) shape = new XSSFConnector(this, (CTConnector)obj) ; - else if(obj instanceof CTShape) shape = new XSSFSimpleShape(this, (CTShape)obj) ; - else if(obj instanceof CTGraphicalObjectFrame) shape = new XSSFGraphicFrame(this, (CTGraphicalObjectFrame)obj) ; - else if(obj instanceof CTGroupShape) shape = new XSSFShapeGroup(this, (CTGroupShape)obj) ; - - if(shape != null){ - shape.anchor = getAnchorFromParent(obj); - lst.add(shape); + XmlCursor cur = drawing.newCursor(); + try { + if (cur.toFirstChild()) { + addShapes(cur, lst); } + } finally { + cur.dispose(); } return lst; } + /** + * @return list of shapes in this shape group + */ + public List<XSSFShape> getShapes(XSSFShapeGroup groupshape){ + List<XSSFShape> lst = new ArrayList<XSSFShape>(); + XmlCursor cur = groupshape.getCTGroupShape().newCursor(); + try { + addShapes(cur, lst); + } finally { + cur.dispose(); + } + return lst; + } + + private void addShapes(XmlCursor cur, List<XSSFShape> lst) { + try { + do { + cur.push(); + if (cur.toFirstChild()) { + do { + XmlObject obj = cur.getObject(); + + XSSFShape shape; + if (obj instanceof CTMarker) { + // ignore anchor elements + continue; + } else if (obj instanceof CTPicture) { + shape = new XSSFPicture(this, (CTPicture)obj) ; + } else if(obj instanceof CTConnector) { + shape = new XSSFConnector(this, (CTConnector)obj) ; + } else if(obj instanceof CTShape) { + shape = hasOleLink(obj) + ? new XSSFObjectData(this, (CTShape)obj) + : new XSSFSimpleShape(this, (CTShape)obj) ; + } else if(obj instanceof CTGraphicalObjectFrame) { + shape = new XSSFGraphicFrame(this, (CTGraphicalObjectFrame)obj) ; + } else if(obj instanceof CTGroupShape) { + shape = new XSSFShapeGroup(this, (CTGroupShape)obj) ; + } else if(obj instanceof XmlAnyTypeImpl) { + LOG.log(POILogger.WARN, "trying to parse AlternateContent, " + + "this unlinks the returned Shapes from the underlying xml content, " + + "so those shapes can't be used to modify the drawing, " + + "i.e. modifications will be ignored!"); + + // XmlAnyTypeImpl is returned for AlternateContent parts, which might contain a CTDrawing + cur.push(); + cur.toFirstChild(); + XmlCursor cur2 = null; + try { + // need to parse AlternateContent again, otherwise the child elements aren't typed, + // but also XmlAnyTypes + CTDrawing alterWS = CTDrawing.Factory.parse(cur.newXMLStreamReader()); + cur2 = alterWS.newCursor(); + if (cur2.toFirstChild()) { + addShapes(cur2, lst); + } + } catch (XmlException e) { + LOG.log(POILogger.WARN, "unable to parse CTDrawing in alternate content.", e); + } finally { + if (cur2 != null) { + cur2.dispose(); + } + cur.pop(); + } + continue; + } else { + // ignore anything else + continue; + } + + assert(shape != null); + shape.anchor = getAnchorFromParent(obj); + lst.add(shape); + + } while (cur.toNextSibling()); + } + cur.pop(); + } while (cur.toNextSibling()); + } finally { + cur.dispose(); + } + } + + private boolean hasOleLink(XmlObject shape) { + QName uriName = new QName(null, "uri"); + String xquery = "declare namespace a='"+XSSFRelation.NS_DRAWINGML+"' .//a:extLst/a:ext"; + XmlCursor cur = shape.newCursor(); + cur.selectPath(xquery); + try { + while (cur.toNextSelection()) { + String uri = cur.getAttributeText(uriName); + if ("{63B3BB69-23CF-44E3-9099-C40C66FF867C}".equals(uri)) { + return true; + } + } + } finally { + cur.dispose(); + } + return false; + } private XSSFAnchor getAnchorFromParent(XmlObject obj){ XSSFAnchor anchor = null; XmlObject parentXbean = null; XmlCursor cursor = obj.newCursor(); - if(cursor.toParent()) parentXbean = cursor.getObject(); + if(cursor.toParent()) { + parentXbean = cursor.getObject(); + } cursor.dispose(); if(parentXbean != null){ if (parentXbean instanceof CTTwoCellAnchor) { @@ -424,4 +571,8 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing { return anchor; } + @Override + public Iterator<XSSFShape> iterator() { + return getShapes().iterator(); + } } diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFObjectData.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFObjectData.java new file mode 100644 index 0000000000..ab51df81ee --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFObjectData.java @@ -0,0 +1,169 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.xssf.usermodel; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; + +import javax.xml.namespace.QName; + +import org.apache.poi.POIXMLDocumentPart; +import org.apache.poi.POIXMLException; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; +import org.apache.poi.poifs.filesystem.DirectoryEntry; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.ss.usermodel.ObjectData; +import org.apache.poi.util.IOUtils; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.apache.xmlbeans.XmlCursor; +import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape; +import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTOleObject; + +/** + * Represents binary object (i.e. OLE) data stored in the file. Eg. A GIF, JPEG etc... + */ +public class XSSFObjectData extends XSSFSimpleShape implements ObjectData { + private static final POILogger LOG = POILogFactory.getLogger(XSSFObjectData.class); + + /** + * A default instance of CTShape used for creating new shapes. + */ + private static CTShape prototype = null; + + private CTOleObject oleObject; + + protected XSSFObjectData(XSSFDrawing drawing, CTShape ctShape) { + super(drawing, ctShape); + } + + /** + * Prototype with the default structure of a new auto-shape. + */ + protected static CTShape prototype() { + if(prototype == null) { + prototype = XSSFSimpleShape.prototype(); + } + return prototype; + } + + @Override + public String getOLE2ClassName() { + return getOleObject().getProgId(); + } + + /** + * @return the CTOleObject associated with the shape + */ + public CTOleObject getOleObject() { + if (oleObject == null) { + long shapeId = getCTShape().getNvSpPr().getCNvPr().getId(); + oleObject = getSheet().readOleObject(shapeId); + if (oleObject == null) { + throw new POIXMLException("Ole object not found in sheet container - it's probably a control element"); + } + } + return oleObject; + } + + @Override + public byte[] getObjectData() throws IOException { + InputStream is = getObjectPart().getInputStream(); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + IOUtils.copy(is, bos); + is.close(); + return bos.toByteArray(); + } + + /** + * @return the package part of the object data + */ + public PackagePart getObjectPart() { + if (!getOleObject().isSetId()) { + throw new POIXMLException("Invalid ole object found in sheet container"); + } + POIXMLDocumentPart pdp = getSheet().getRelationById(getOleObject().getId()); + return (pdp == null) ? null : pdp.getPackagePart(); + } + + @Override + public boolean hasDirectoryEntry() { + InputStream is = null; + try { + is = getObjectPart().getInputStream(); + + // If clearly doesn't do mark/reset, wrap up + if (! is.markSupported()) { + is = new PushbackInputStream(is, 8); + } + + // Ensure that there is at least some data there + byte[] header8 = IOUtils.peekFirst8Bytes(is); + + // Try to create + return NPOIFSFileSystem.hasPOIFSHeader(header8); + } catch (IOException e) { + LOG.log(POILogger.WARN, "can't determine if directory entry exists", e); + return false; + } finally { + IOUtils.closeQuietly(is); + } + } + + @Override + @SuppressWarnings("resource") + public DirectoryEntry getDirectory() throws IOException { + InputStream is = null; + try { + is = getObjectPart().getInputStream(); + return new POIFSFileSystem(is).getRoot(); + } finally { + IOUtils.closeQuietly(is); + } + } + + /** + * The filename of the embedded image + */ + @Override + public String getFileName() { + return getObjectPart().getPartName().getName(); + } + + protected XSSFSheet getSheet() { + return (XSSFSheet)getDrawing().getParent(); + } + + @Override + public XSSFPictureData getPictureData() { + XmlCursor cur = getOleObject().newCursor(); + try { + if (cur.toChild(XSSFRelation.NS_SPREADSHEETML, "objectPr")) { + String blipId = cur.getAttributeText(new QName(PackageRelationshipTypes.CORE_PROPERTIES_ECMA376_NS, "id")); + return (XSSFPictureData)getDrawing().getRelationById(blipId); + } + return null; + } finally { + cur.dispose(); + } + } +} diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFSheet.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFSheet.java index 5f1529604e..2043210496 100644 --- a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFSheet.java +++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFSheet.java @@ -40,6 +40,8 @@ import java.util.SortedMap; import java.util.TreeMap; import javax.xml.namespace.QName; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; import org.apache.poi.POIXMLDocumentPart; import org.apache.poi.POIXMLException; @@ -86,7 +88,9 @@ import org.apache.poi.xssf.usermodel.XSSFPivotTable.PivotTableReferenceConfigura import org.apache.poi.xssf.usermodel.helpers.ColumnHelper; import org.apache.poi.xssf.usermodel.helpers.XSSFIgnoredErrorHelper; import org.apache.poi.xssf.usermodel.helpers.XSSFRowShifter; +import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlException; +import org.apache.xmlbeans.XmlObject; import org.apache.xmlbeans.XmlOptions; import org.openxmlformats.schemas.spreadsheetml.x2006.main.*; @@ -4371,4 +4375,64 @@ public class XSSFSheet extends POIXMLDocumentPart implements Sheet { CTIgnoredError ctIgnoredError = ctIgnoredErrors.addNewIgnoredError(); XSSFIgnoredErrorHelper.addIgnoredErrors(ctIgnoredError, ref, ignoredErrorTypes); } + + /** + * Determine the OleObject which links shapes with embedded resources + * + * @param shapeId the shape id + * @return the CTOleObject of the shape + */ + protected CTOleObject readOleObject(long shapeId) { + if (!getCTWorksheet().isSetOleObjects()) { + return null; + } + + // we use a XmlCursor here to handle oleObject with-/out AlternateContent wrappers + String xquery = "declare namespace p='"+XSSFRelation.NS_SPREADSHEETML+"' .//p:oleObject"; + XmlCursor cur = getCTWorksheet().getOleObjects().newCursor(); + try { + cur.selectPath(xquery); + CTOleObject coo = null; + while (cur.toNextSelection()) { + String sId = cur.getAttributeText(new QName(null, "shapeId")); + if (sId == null || Long.parseLong(sId) != shapeId) { + continue; + } + + XmlObject xObj = cur.getObject(); + if (xObj instanceof CTOleObject) { + // the unusual case ... + coo = (CTOleObject)xObj; + } else { + XMLStreamReader reader = cur.newXMLStreamReader(); + try { + CTOleObjects coos = CTOleObjects.Factory.parse(reader); + if (coos.sizeOfOleObjectArray() == 0) { + continue; + } + coo = coos.getOleObjectArray(0); + } catch (XmlException e) { + logger.log(POILogger.INFO, "can't parse CTOleObjects", e); + } finally { + try { + reader.close(); + } catch (XMLStreamException e) { + logger.log(POILogger.INFO, "can't close reader", e); + } + } + } + + // there are choice and fallback OleObject ... we prefer the one having the objectPr element, + // which is in the choice element + if (cur.toChild(XSSFRelation.NS_SPREADSHEETML, "objectPr")) { + break; + } + } + return (coo == null) ? null : coo; + } finally { + cur.dispose(); + } + } + + } diff --git a/src/ooxml/testcases/org/apache/poi/ss/extractor/TestEmbeddedExtractor.java b/src/ooxml/testcases/org/apache/poi/ss/extractor/TestEmbeddedExtractor.java new file mode 100644 index 0000000000..225e50660b --- /dev/null +++ b/src/ooxml/testcases/org/apache/poi/ss/extractor/TestEmbeddedExtractor.java @@ -0,0 +1,115 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.ss.extractor; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.io.InputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.List; + +import javax.xml.bind.DatatypeConverter; + +import org.apache.poi.EncryptedDocumentException; +import org.apache.poi.POIDataSamples; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.ss.usermodel.Workbook; +import org.apache.poi.ss.usermodel.WorkbookFactory; +import org.junit.Test; + +public class TestEmbeddedExtractor { + private static final POIDataSamples samples = POIDataSamples.getSpreadSheetInstance(); + + @Test + public void extractPDFfromEMF() throws Exception { + InputStream fis = samples.openResourceAsStream("Basic_Expense_Template_2011.xls"); + Workbook wb = WorkbookFactory.create(fis); + fis.close(); + + EmbeddedExtractor ee = new EmbeddedExtractor(); + List<EmbeddedData> edList = new ArrayList<EmbeddedData>(); + for (Sheet s : wb) { + edList.addAll(ee.extractAll(s)); + } + wb.close(); + + assertEquals(2, edList.size()); + + String filename1 = "Sample.pdf"; + EmbeddedData ed0 = edList.get(0); + assertEquals(filename1, ed0.getFilename()); + assertEquals(filename1, ed0.getShape().getShapeName().trim()); + assertEquals("uNplB1QpYug+LWappiTh0w==", md5hash(ed0.getEmbeddedData())); + + String filename2 = "kalastuslupa_jiyjhnj_yuiyuiyuio_uyte_sldfsdfsdf_sfsdfsdf_sfsssfsf_sdfsdfsdfsdf_sdfsdfsdf.pdf"; + EmbeddedData ed1 = edList.get(1); + assertEquals(filename2, ed1.getFilename()); + assertEquals(filename2, ed1.getShape().getShapeName().trim()); + assertEquals("QjLuAZ+cd7KbhVz4sj+QdA==", md5hash(ed1.getEmbeddedData())); + } + + @Test + public void extractFromXSSF() throws IOException, EncryptedDocumentException, InvalidFormatException { + InputStream fis = samples.openResourceAsStream("58325_db.xlsx"); + Workbook wb = WorkbookFactory.create(fis); + fis.close(); + + EmbeddedExtractor ee = new EmbeddedExtractor(); + List<EmbeddedData> edList = new ArrayList<EmbeddedData>(); + for (Sheet s : wb) { + edList.addAll(ee.extractAll(s)); + } + wb.close(); + + assertEquals(4, edList.size()); + EmbeddedData ed0 = edList.get(0); + assertEquals("Object 1.pdf", ed0.getFilename()); + assertEquals("Object 1", ed0.getShape().getShapeName().trim()); + assertEquals("Oyys6UtQU1gbHYBYqA4NFA==", md5hash(ed0.getEmbeddedData())); + + EmbeddedData ed1 = edList.get(1); + assertEquals("Object 2.pdf", ed1.getFilename()); + assertEquals("Object 2", ed1.getShape().getShapeName().trim()); + assertEquals("xLScPUS0XH+5CTZ2A3neNw==", md5hash(ed1.getEmbeddedData())); + + EmbeddedData ed2 = edList.get(2); + assertEquals("Object 3.pdf", ed2.getFilename()); + assertEquals("Object 3", ed2.getShape().getShapeName().trim()); + assertEquals("rX4klZqJAeM5npb54Gi2+Q==", md5hash(ed2.getEmbeddedData())); + + EmbeddedData ed3 = edList.get(3); + assertEquals("Microsoft_Excel_Worksheet1.xlsx", ed3.getFilename()); + assertEquals("Object 1", ed3.getShape().getShapeName().trim()); + assertEquals("4m4N8ji2tjpEGPQuw2YwGA==", md5hash(ed3.getEmbeddedData())); + } + + public static String md5hash(byte[] input) { + try { + MessageDigest md = MessageDigest.getInstance("MD5"); + byte hash[] = md.digest(input); + return DatatypeConverter.printBase64Binary(hash); + } catch (NoSuchAlgorithmException e) { + // doesn't happen + return ""; + } + } +} |