123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.ss.extractor;
-
- import static org.apache.poi.util.StringUtil.endsWithIgnoreCase;
-
- import java.io.ByteArrayOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Collections;
- import java.util.Iterator;
- import java.util.List;
-
- import org.apache.poi.hpsf.ClassID;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.DocumentInputStream;
- import org.apache.poi.poifs.filesystem.Entry;
- import org.apache.poi.poifs.filesystem.Ole10Native;
- import org.apache.poi.poifs.filesystem.Ole10NativeException;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.ss.usermodel.Drawing;
- import org.apache.poi.ss.usermodel.ObjectData;
- import org.apache.poi.ss.usermodel.Picture;
- import org.apache.poi.ss.usermodel.PictureData;
- import org.apache.poi.ss.usermodel.Shape;
- import org.apache.poi.ss.usermodel.ShapeContainer;
- import org.apache.poi.ss.usermodel.Sheet;
- import org.apache.poi.ss.usermodel.Workbook;
- import org.apache.poi.util.Beta;
- import org.apache.poi.util.IOUtils;
- import org.apache.poi.util.LocaleUtil;
- import org.apache.poi.util.POILogFactory;
- import org.apache.poi.util.POILogger;
- import org.apache.poi.xssf.usermodel.XSSFObjectData;
-
- /**
- * This extractor class tries to identify various embedded documents within Excel files
- * and provide them via a common interface, i.e. the EmbeddedData instances
- */
- @Beta
- public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
- private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
-
- // contentType
- private static final String CONTENT_TYPE_BYTES = "binary/octet-stream";
- private static final String CONTENT_TYPE_PDF = "application/pdf";
- private static final String CONTENT_TYPE_DOC = "application/msword";
- private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel";
-
- /**
- * @return the list of known extractors, if you provide custom extractors, override this method
- */
- @Override
- public Iterator<EmbeddedExtractor> iterator() {
- EmbeddedExtractor[] ee = {
- new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor()
- };
- return Arrays.asList(ee).iterator();
- }
-
- public EmbeddedData extractOne(DirectoryNode src) throws IOException {
- for (EmbeddedExtractor ee : this) {
- if (ee.canExtract(src)) {
- return ee.extract(src);
- }
- }
- return null;
- }
-
- public EmbeddedData extractOne(Picture src) throws IOException {
- for (EmbeddedExtractor ee : this) {
- if (ee.canExtract(src)) {
- return ee.extract(src);
- }
- }
- return null;
- }
-
- public List<EmbeddedData> extractAll(Sheet sheet) throws IOException {
- Drawing<?> patriarch = sheet.getDrawingPatriarch();
- if (null == patriarch){
- return Collections.emptyList();
- }
- List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>();
- extractAll(patriarch, embeddings);
- return embeddings;
- }
-
- protected void extractAll(ShapeContainer<?> parent, List<EmbeddedData> embeddings) throws IOException {
- for (Shape shape : parent) {
- EmbeddedData data = null;
- if (shape instanceof ObjectData) {
- ObjectData od = (ObjectData)shape;
- try {
- if (od.hasDirectoryEntry()) {
- data = extractOne((DirectoryNode)od.getDirectory());
- } else {
- String contentType = CONTENT_TYPE_BYTES;
- if (od instanceof XSSFObjectData) {
- contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
- }
- data = new EmbeddedData(od.getFileName(), od.getObjectData(), contentType);
- }
- } catch (Exception e) {
- LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
- }
- } else if (shape instanceof Picture) {
- data = extractOne((Picture)shape);
- } else if (shape instanceof ShapeContainer) {
- extractAll((ShapeContainer<?>)shape, embeddings);
- }
-
- if (data == null) {
- continue;
- }
-
- data.setShape(shape);
- String filename = data.getFilename();
- String extension = (filename == null || filename.lastIndexOf('.') == -1) ? ".bin" : filename.substring(filename.lastIndexOf('.'));
-
- // try to find an alternative name
- if (filename == null || "".equals(filename) || filename.startsWith("MBD") || filename.startsWith("Root Entry")) {
- filename = shape.getShapeName();
- if (filename != null) {
- filename += extension;
- }
- }
- // default to dummy name
- if (filename == null || "".equals(filename)) {
- filename = "picture_" + embeddings.size() + extension;
- }
- filename = filename.trim();
- data.setFilename(filename);
-
- embeddings.add(data);
- }
- }
-
-
- public boolean canExtract(DirectoryNode source) {
- return false;
- }
-
- public boolean canExtract(Picture source) {
- return false;
- }
-
- protected EmbeddedData extract(DirectoryNode dn) throws IOException {
- assert(canExtract(dn));
- POIFSFileSystem dest = new POIFSFileSystem();
- copyNodes(dn, dest.getRoot());
- // start with a reasonable big size
- ByteArrayOutputStream bos = new ByteArrayOutputStream(20000);
- dest.writeFilesystem(bos);
- dest.close();
-
- return new EmbeddedData(dn.getName(), bos.toByteArray(), CONTENT_TYPE_BYTES);
- }
-
- protected EmbeddedData extract(Picture source) throws IOException {
- return null;
- }
-
- public static class Ole10Extractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return ClassID.OLE10_PACKAGE.equals(clsId);
- }
-
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- try {
- // TODO: inspect the CompObj record for more details, i.e. the content type
- Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
- return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), CONTENT_TYPE_BYTES);
- } catch (Ole10NativeException e) {
- throw new IOException(e);
- }
- }
- }
-
- static class PdfExtractor extends EmbeddedExtractor {
- static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}");
- @Override
- public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return (PdfClassID.equals(clsId)
- || dn.hasEntry("CONTENTS"));
- }
-
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- InputStream is = dn.createDocumentInputStream("CONTENTS");
- IOUtils.copy(is, bos);
- is.close();
- return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF);
- }
-
- @Override
- public boolean canExtract(Picture source) {
- PictureData pd = source.getPictureData();
- return (pd != null && pd.getPictureType() == Workbook.PICTURE_TYPE_EMF);
- }
-
- /**
- * Mac Office encodes embedded objects inside the picture, e.g. PDF is part of an EMF.
- * If an embedded stream is inside an EMF picture, this method extracts the payload.
- *
- * @return the embedded data in an EMF picture or null if none is found
- */
- @Override
- protected EmbeddedData extract(Picture source) throws IOException {
- // check for emf+ embedded pdf (poor mans style :( )
- // Mac Excel 2011 embeds pdf files with this method.
- PictureData pd = source.getPictureData();
- if (pd != null && pd.getPictureType() != Workbook.PICTURE_TYPE_EMF) {
- return null;
- }
-
- // TODO: investigate if this is just an EMF-hack or if other formats are also embedded in EMF
- byte pictureBytes[] = pd.getData();
- int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes(LocaleUtil.CHARSET_1252));
- if (idxStart == -1) {
- return null;
- }
-
- int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes(LocaleUtil.CHARSET_1252));
- if (idxEnd == -1) {
- return null;
- }
-
- int pictureBytesLen = idxEnd-idxStart+6;
- byte[] pdfBytes = new byte[pictureBytesLen];
- System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
- String filename = source.getShapeName().trim();
- if (!endsWithIgnoreCase(filename, ".pdf")) {
- filename += ".pdf";
- }
- return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
- }
-
-
- }
-
- static class OOXMLExtractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- return dn.hasEntry("package");
- }
-
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
-
- ClassID clsId = dn.getStorageClsid();
-
- String contentType, ext;
- if (ClassID.WORD2007.equals(clsId)) {
- ext = ".docx";
- contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
- } else if (ClassID.WORD2007_MACRO.equals(clsId)) {
- ext = ".docm";
- contentType = "application/vnd.ms-word.document.macroEnabled.12";
- } else if (ClassID.EXCEL2007.equals(clsId) || ClassID.EXCEL2003.equals(clsId) || ClassID.EXCEL2010.equals(clsId)) {
- ext = ".xlsx";
- contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
- } else if (ClassID.EXCEL2007_MACRO.equals(clsId)) {
- ext = ".xlsm";
- contentType = "application/vnd.ms-excel.sheet.macroEnabled.12";
- } else if (ClassID.EXCEL2007_XLSB.equals(clsId)) {
- ext = ".xlsb";
- contentType = "application/vnd.ms-excel.sheet.binary.macroEnabled.12";
- } else if (ClassID.POWERPOINT2007.equals(clsId)) {
- ext = ".pptx";
- contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
- } else if (ClassID.POWERPOINT2007_MACRO.equals(clsId)) {
- ext = ".ppsm";
- contentType = "application/vnd.ms-powerpoint.slideshow.macroEnabled.12";
- } else {
- ext = ".zip";
- contentType = "application/zip";
- }
-
- DocumentInputStream dis = dn.createDocumentInputStream("package");
- byte data[] = IOUtils.toByteArray(dis);
- dis.close();
-
- return new EmbeddedData(dn.getName()+ext, data, contentType);
- }
- }
-
- static class BiffExtractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- return canExtractExcel(dn) || canExtractWord(dn);
- }
-
- protected boolean canExtractExcel(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return (ClassID.EXCEL95.equals(clsId)
- || ClassID.EXCEL97.equals(clsId)
- || dn.hasEntry("Workbook") /*...*/);
- }
-
- protected boolean canExtractWord(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return (ClassID.WORD95.equals(clsId)
- || ClassID.WORD97.equals(clsId)
- || dn.hasEntry("WordDocument"));
- }
-
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- EmbeddedData ed = super.extract(dn);
- if (canExtractExcel(dn)) {
- ed.setFilename(dn.getName() + ".xls");
- ed.setContentType(CONTENT_TYPE_XLS);
- } else if (canExtractWord(dn)) {
- ed.setFilename(dn.getName() + ".doc");
- ed.setContentType(CONTENT_TYPE_DOC);
- }
-
- return ed;
- }
- }
-
- static class FsExtractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- return true;
- }
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- EmbeddedData ed = super.extract(dn);
- ed.setFilename(dn.getName() + ".ole");
- // TODO: read the content type from CombObj stream
- return ed;
- }
- }
-
- protected static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException {
- for (Entry e : src) {
- if (e instanceof DirectoryNode) {
- DirectoryNode srcDir = (DirectoryNode)e;
- DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
- destDir.setStorageClsid(srcDir.getStorageClsid());
- copyNodes(srcDir, destDir);
- } else {
- InputStream is = src.createDocumentInputStream(e);
- try {
- dest.createDocument(e.getName(), is);
- } finally {
- is.close();
- }
- }
- }
- }
-
-
-
- /**
- * Knuth-Morris-Pratt Algorithm for Pattern Matching
- * Finds the first occurrence of the pattern in the text.
- */
- private static int indexOf(byte[] data, int offset, byte[] pattern) {
- int[] failure = computeFailure(pattern);
-
- int j = 0;
- if (data.length == 0) {
- return -1;
- }
-
- for (int i = offset; i < data.length; i++) {
- while (j > 0 && pattern[j] != data[i]) {
- j = failure[j - 1];
- }
- if (pattern[j] == data[i]) { j++; }
- if (j == pattern.length) {
- return i - pattern.length + 1;
- }
- }
- return -1;
- }
-
- /**
- * Computes the failure function using a boot-strapping process,
- * where the pattern is matched against itself.
- */
- private static int[] computeFailure(byte[] pattern) {
- int[] failure = new int[pattern.length];
-
- int j = 0;
- for (int i = 1; i < pattern.length; i++) {
- while (j > 0 && pattern[j] != pattern[i]) {
- j = failure[j - 1];
- }
- if (pattern[j] == pattern[i]) {
- j++;
- }
- failure[i] = j;
- }
-
- return failure;
- }
-
-
- }
|