- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.ss.extractor;
-
- import static org.apache.poi.util.StringUtil.endsWithIgnoreCase;
-
- import java.io.ByteArrayOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Collections;
- import java.util.Iterator;
- import java.util.List;
-
- import org.apache.logging.log4j.LogManager;
- import org.apache.logging.log4j.Logger;
- import org.apache.poi.hpsf.ClassID;
- import org.apache.poi.hpsf.ClassIDPredefined;
- import org.apache.poi.poifs.filesystem.DirectoryNode;
- import org.apache.poi.poifs.filesystem.DocumentInputStream;
- import org.apache.poi.poifs.filesystem.Entry;
- import org.apache.poi.poifs.filesystem.Ole10Native;
- import org.apache.poi.poifs.filesystem.Ole10NativeException;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.ss.usermodel.Drawing;
- import org.apache.poi.ss.usermodel.ObjectData;
- import org.apache.poi.ss.usermodel.Picture;
- import org.apache.poi.ss.usermodel.PictureData;
- import org.apache.poi.ss.usermodel.Shape;
- import org.apache.poi.ss.usermodel.ShapeContainer;
- import org.apache.poi.ss.usermodel.Sheet;
- import org.apache.poi.ss.usermodel.Workbook;
- import org.apache.poi.util.Beta;
- import org.apache.poi.util.IOUtils;
- import org.apache.poi.util.LocaleUtil;
-
- /**
- * This extractor class tries to identify various embedded documents within Excel files
- * and provide them via a common interface, i.e. the EmbeddedData instances
- */
- @Beta
- public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
- private static final Logger LOG = LogManager.getLogger(EmbeddedExtractor.class);
- //arbitrarily selected; may need to increase
- private static final int MAX_RECORD_LENGTH = 1_000_000;
-
- // contentType
- private static final String CONTENT_TYPE_BYTES = "binary/octet-stream";
- private static final String CONTENT_TYPE_PDF = "application/pdf";
- private static final String CONTENT_TYPE_DOC = "application/msword";
- private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel";
-
- /**
- * @return the list of known extractors, if you provide custom extractors, override this method
- */
- @Override
- public Iterator<EmbeddedExtractor> iterator() {
- EmbeddedExtractor[] ee = {
- new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor()
- };
- return Arrays.asList(ee).iterator();
- }
-
- public EmbeddedData extractOne(DirectoryNode src) throws IOException {
- for (EmbeddedExtractor ee : this) {
- if (ee.canExtract(src)) {
- return ee.extract(src);
- }
- }
- return null;
- }
-
- public EmbeddedData extractOne(Picture src) throws IOException {
- for (EmbeddedExtractor ee : this) {
- if (ee.canExtract(src)) {
- return ee.extract(src);
- }
- }
- return null;
- }
-
- public List<EmbeddedData> extractAll(Sheet sheet) throws IOException {
- Drawing<?> patriarch = sheet.getDrawingPatriarch();
- if (null == patriarch){
- return Collections.emptyList();
- }
- List<EmbeddedData> embeddings = new ArrayList<>();
- extractAll(patriarch, embeddings);
- return embeddings;
- }
-
- protected void extractAll(ShapeContainer<?> parent, List<EmbeddedData> embeddings) throws IOException {
- for (Shape shape : parent) {
- EmbeddedData data = null;
- if (shape instanceof ObjectData) {
- ObjectData od = (ObjectData)shape;
- try {
- if (od.hasDirectoryEntry()) {
- data = extractOne((DirectoryNode)od.getDirectory());
- } else {
- data = new EmbeddedData(od.getFileName(), od.getObjectData(), od.getContentType());
- }
- } catch (Exception e) {
- LOG.atWarn().withThrowable(e).log("Entry not found / readable - ignoring OLE embedding");
- }
- } else if (shape instanceof Picture) {
- data = extractOne((Picture)shape);
- } else if (shape instanceof ShapeContainer) {
- extractAll((ShapeContainer<?>)shape, embeddings);
- }
-
- if (data == null) {
- continue;
- }
-
- data.setShape(shape);
- String filename = data.getFilename();
- String extension = (filename == null || filename.lastIndexOf('.') == -1) ? ".bin" : filename.substring(filename.lastIndexOf('.'));
-
- // try to find an alternative name
- if (filename == null || filename.isEmpty() || filename.startsWith("MBD") || filename.startsWith("Root Entry")) {
- filename = shape.getShapeName();
- if (filename != null) {
- filename += extension;
- }
- }
- // default to dummy name
- if (filename == null || filename.isEmpty()) {
- filename = "picture_" + embeddings.size() + extension;
- }
- filename = filename.trim();
- data.setFilename(filename);
-
- embeddings.add(data);
- }
- }
-
-
- public boolean canExtract(DirectoryNode source) {
- return false;
- }
-
- public boolean canExtract(Picture source) {
- return false;
- }
-
- protected EmbeddedData extract(DirectoryNode dn) throws IOException {
- assert(canExtract(dn));
- ByteArrayOutputStream bos = new ByteArrayOutputStream(20000);
- try (POIFSFileSystem dest = new POIFSFileSystem()) {
- copyNodes(dn, dest.getRoot());
- // start with a reasonable big size
- dest.writeFilesystem(bos);
- }
-
- return new EmbeddedData(dn.getName(), bos.toByteArray(), CONTENT_TYPE_BYTES);
- }
-
- protected EmbeddedData extract(Picture source) throws IOException {
- return null;
- }
-
- public static class Ole10Extractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return ClassIDPredefined.lookup(clsId) == ClassIDPredefined.OLE_V1_PACKAGE;
- }
-
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- try {
- // TODO: inspect the CompObj record for more details, i.e. the content type
- Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
- return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), CONTENT_TYPE_BYTES);
- } catch (Ole10NativeException e) {
- throw new IOException(e);
- }
- }
- }
-
- static class PdfExtractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return (ClassIDPredefined.PDF.equals(clsId) || dn.hasEntry("CONTENTS"));
- }
-
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- try(ByteArrayOutputStream bos = new ByteArrayOutputStream();
- InputStream is = dn.createDocumentInputStream("CONTENTS")) {
- IOUtils.copy(is, bos);
- return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF);
- }
- }
-
- @Override
- public boolean canExtract(Picture source) {
- PictureData pd = source.getPictureData();
- return (pd != null && pd.getPictureType() == Workbook.PICTURE_TYPE_EMF);
- }
-
- /**
- * Mac Office encodes embedded objects inside the picture, e.g. PDF is part of an EMF.
- * If an embedded stream is inside an EMF picture, this method extracts the payload.
- *
- * @return the embedded data in an EMF picture or null if none is found
- */
- @Override
- protected EmbeddedData extract(Picture source) throws IOException {
- // check for emf+ embedded pdf (poor mans style :( )
- // Mac Excel 2011 embeds pdf files with this method.
- PictureData pd = source.getPictureData();
- if (pd == null || pd.getPictureType() != Workbook.PICTURE_TYPE_EMF) {
- return null;
- }
-
- // TODO: investigate if this is just an EMF-hack or if other formats are also embedded in EMF
- byte[] pictureBytes = pd.getData();
- int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes(LocaleUtil.CHARSET_1252));
- if (idxStart == -1) {
- return null;
- }
-
- int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes(LocaleUtil.CHARSET_1252));
- if (idxEnd == -1) {
- return null;
- }
-
- int pictureBytesLen = idxEnd-idxStart+6;
- byte[] pdfBytes = IOUtils.safelyClone(pictureBytes, idxStart, pictureBytesLen, MAX_RECORD_LENGTH);
- String filename = source.getShapeName().trim();
- if (!endsWithIgnoreCase(filename, ".pdf")) {
- filename += ".pdf";
- }
- return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
- }
-
-
- }
-
- static class OOXMLExtractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- return dn.hasEntry("package");
- }
-
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
-
- ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid());
-
- String contentType = null;
- String ext = null;
-
- if (clsId != null) {
- contentType = clsId.getContentType();
- ext = clsId.getFileExtension();
- }
-
- if (contentType == null || ext == null) {
- contentType = "application/zip";
- ext = ".zip";
- }
-
- DocumentInputStream dis = dn.createDocumentInputStream("package");
- byte[] data = IOUtils.toByteArray(dis);
- dis.close();
-
- return new EmbeddedData(dn.getName()+ext, data, contentType);
- }
- }
-
- static class BiffExtractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- return canExtractExcel(dn) || canExtractWord(dn);
- }
-
- protected boolean canExtractExcel(DirectoryNode dn) {
- ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid());
- return (ClassIDPredefined.EXCEL_V7 == clsId
- || ClassIDPredefined.EXCEL_V8 == clsId
- || dn.hasEntry("Workbook") /*...*/);
- }
-
- protected boolean canExtractWord(DirectoryNode dn) {
- ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid());
- return (ClassIDPredefined.WORD_V7 == clsId
- || ClassIDPredefined.WORD_V8 == clsId
- || dn.hasEntry("WordDocument"));
- }
-
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- EmbeddedData ed = super.extract(dn);
- if (canExtractExcel(dn)) {
- ed.setFilename(dn.getName() + ".xls");
- ed.setContentType(CONTENT_TYPE_XLS);
- } else if (canExtractWord(dn)) {
- ed.setFilename(dn.getName() + ".doc");
- ed.setContentType(CONTENT_TYPE_DOC);
- }
-
- return ed;
- }
- }
-
- static class FsExtractor extends EmbeddedExtractor {
- @Override
- public boolean canExtract(DirectoryNode dn) {
- return true;
- }
- @Override
- public EmbeddedData extract(DirectoryNode dn) throws IOException {
- EmbeddedData ed = super.extract(dn);
- ed.setFilename(dn.getName() + ".ole");
- // TODO: read the content type from CombObj stream
- return ed;
- }
- }
-
- protected static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException {
- for (Entry e : src) {
- if (e instanceof DirectoryNode) {
- DirectoryNode srcDir = (DirectoryNode)e;
- DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
- destDir.setStorageClsid(srcDir.getStorageClsid());
- copyNodes(srcDir, destDir);
- } else {
- try (InputStream is = src.createDocumentInputStream(e)) {
- dest.createDocument(e.getName(), is);
- }
- }
- }
- }
-
-
-
- /**
- * Knuth-Morris-Pratt Algorithm for Pattern Matching
- * Finds the first occurrence of the pattern in the text.
- */
- private static int indexOf(byte[] data, int offset, byte[] pattern) {
- int[] failure = computeFailure(pattern);
-
- int j = 0;
- if (data.length == 0) {
- return -1;
- }
-
- for (int i = offset; i < data.length; i++) {
- while (j > 0 && pattern[j] != data[i]) {
- j = failure[j - 1];
- }
- if (pattern[j] == data[i]) { j++; }
- if (j == pattern.length) {
- return i - pattern.length + 1;
- }
- }
- return -1;
- }
-
- /**
- * Computes the failure function using a boot-strapping process,
- * where the pattern is matched against itself.
- */
- private static int[] computeFailure(byte[] pattern) {
- int[] failure = new int[pattern.length];
-
- int j = 0;
- for (int i = 1; i < pattern.length; i++) {
- while (j > 0 && pattern[j] != pattern[i]) {
- j = failure[j - 1];
- }
- if (pattern[j] == pattern[i]) {
- j++;
- }
- failure[i] = j;
- }
-
- return failure;
- }
-
-
- }
|