*/
public class ClassID
{
- public static final ClassID OLE10_PACKAGE = new ClassID("{0003000C-0000-0000-C000-000000000046}");
- public static final ClassID PPT_SHOW = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
- public static final ClassID XLS_WORKBOOK = new ClassID("{00020841-0000-0000-C000-000000000046}");
- public static final ClassID TXT_ONLY = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
- public static final ClassID EXCEL97 = new ClassID("{00020820-0000-0000-C000-000000000046}");
- public static final ClassID EXCEL95 = new ClassID("{00020810-0000-0000-C000-000000000046}");
- public static final ClassID WORD97 = new ClassID("{00020906-0000-0000-C000-000000000046}");
- public static final ClassID WORD95 = new ClassID("{00020900-0000-0000-C000-000000000046}");
- public static final ClassID POWERPOINT97 = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
- public static final ClassID POWERPOINT95 = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
- public static final ClassID EQUATION30 = new ClassID("{0002CE02-0000-0000-C000-000000000046}");
+ public static final ClassID OLE10_PACKAGE = new ClassID("{0003000C-0000-0000-C000-000000000046}");
+ public static final ClassID PPT_SHOW = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
+ public static final ClassID XLS_WORKBOOK = new ClassID("{00020841-0000-0000-C000-000000000046}");
+ public static final ClassID TXT_ONLY = new ClassID("{5e941d80-bf96-11cd-b579-08002b30bfeb}");
+
+ // Excel V3
+ public static final ClassID EXCEL_V3 = new ClassID("{00030000-0000-0000-C000-000000000046}");
+ public static final ClassID EXCEL_V3_CHART = new ClassID("{00030001-0000-0000-C000-000000000046}");
+ public static final ClassID EXCEL_V3_MACRO = new ClassID("{00030002-0000-0000-C000-000000000046}");
+ // Excel V5
+ public static final ClassID EXCEL95 = new ClassID("{00020810-0000-0000-C000-000000000046}");
+ public static final ClassID EXCEL95_CHART = new ClassID("{00020811-0000-0000-C000-000000000046}");
+ // Excel V8
+ public static final ClassID EXCEL97 = new ClassID("{00020820-0000-0000-C000-000000000046}");
+ public static final ClassID EXCEL97_CHART = new ClassID("{00020821-0000-0000-C000-000000000046}");
+ // Excel V11
+ public static final ClassID EXCEL2003 = new ClassID("{00020812-0000-0000-C000-000000000046}");
+ // Excel V12
+ public static final ClassID EXCEL2007 = new ClassID("{00020830-0000-0000-C000-000000000046}");
+ public static final ClassID EXCEL2007_MACRO= new ClassID("{00020832-0000-0000-C000-000000000046}");
+ public static final ClassID EXCEL2007_XLSB = new ClassID("{00020833-0000-0000-C000-000000000046}");
+ // Excel V14
+ public static final ClassID EXCEL2010 = new ClassID("{00024500-0000-0000-C000-000000000046}");
+ public static final ClassID EXCEL2010_CHART= new ClassID("{00024505-0014-0000-C000-000000000046}");
+ public static final ClassID EXCEL2010_ODS = new ClassID("{EABCECDB-CC1C-4A6F-B4E3-7F888A5ADFC8}");
+
// Word
+ public static final ClassID WORD97 = new ClassID("{00020906-0000-0000-C000-000000000046}");
+ public static final ClassID WORD95 = new ClassID("{00020900-0000-0000-C000-000000000046}");
+ public static final ClassID WORD2007 = new ClassID("{F4754C9B-64F5-4B40-8AF4-679732AC0607}");
+ public static final ClassID WORD2007_MACRO = new ClassID("{18A06B6B-2F3F-4E2B-A611-52BE631B2D22}");
+
// PowerPoint - note that POWERPOINT97 is built from the same id string as PPT_SHOW
+ public static final ClassID POWERPOINT97 = new ClassID("{64818D10-4F9B-11CF-86EA-00AA00B929E8}");
+ public static final ClassID POWERPOINT95 = new ClassID("{EA7BAE70-FB3B-11CD-A903-00AA00510EA3}");
+ public static final ClassID POWERPOINT2007 = new ClassID("{CF4F55F4-8F87-4D47-80BB-5808164BB3F8}");
+ public static final ClassID POWERPOINT2007_MACRO = new ClassID("{DC020317-E6E2-4A62-B9FA-B3EFE16626F4}");
+
+ public static final ClassID EQUATION30 = new ClassID("{0002CE02-0000-0000-C000-000000000046}");
/** <p>The number of bytes occupied by this object in the byte
* stream.</p> */
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
-import java.util.Locale;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.ss.usermodel.ShapeContainer;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.util.Beta;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.poi.xssf.usermodel.XSSFObjectData;
+/**
+ * This extractor class tries to identify various embedded documents within Excel files
+ * and provide them via a common interface, i.e. the EmbeddedData instances
+ */
+@Beta
public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
private static final String CONTENT_TYPE_DOC = "application/msword";
private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel";
- // default file extension
- private static final String PDF_EXT = ".pdf";
- private static final String DOC_EXT = ".doc";
- private static final String XLS_EXT = ".xls";
- private static final String OLE_EXT = ".ole";
-
/**
* @return the list of known extractors, if you provide custom extractors, override this method
*/
@Override
public Iterator<EmbeddedExtractor> iterator() {
// A fresh set of the built-in extractors is created on every call and
// handed out in exactly the order listed here.
// NOTE(review): presumably the caller uses the first extractor whose
// canExtract() accepts a node, which would make this order significant - confirm.
EmbeddedExtractor[] ee = {
- new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
+ new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor()
};
return Arrays.asList(ee).iterator();
}
if (od.hasDirectoryEntry()) {
data = extractOne((DirectoryNode)od.getDirectory());
} else {
+ String contentType = CONTENT_TYPE_BYTES;
if (od instanceof XSSFObjectData) {
- String contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
+ contentType = ((XSSFObjectData)od).getObjectPart().getContentType();
}
- data = new EmbeddedData(od.getFileName(), od.getObjectData(), CONTENT_TYPE_BYTES);
+ data = new EmbeddedData(od.getFileName(), od.getObjectData(), contentType);
}
} catch (Exception e) {
LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
InputStream is = dn.createDocumentInputStream("CONTENTS");
IOUtils.copy(is, bos);
is.close();
- return new EmbeddedData(dn.getName() + PDF_EXT, bos.toByteArray(), CONTENT_TYPE_PDF);
+ return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF);
}
@Override
byte[] pdfBytes = new byte[pictureBytesLen];
System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
String filename = source.getShapeName().trim();
- if (!endsWithIgnoreCase(filename, PDF_EXT)) {
- filename += PDF_EXT;
+ if (!endsWithIgnoreCase(filename, ".pdf")) {
+ filename += ".pdf";
}
return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
}
}
- static class WordExtractor extends EmbeddedExtractor {
+ /**
+ * Extractor for storages whose payload is kept in a single stream named
+ * "package". The storage class id selects the file extension and content
+ * type of the result; unknown class ids fall back to a generic zip.
+ */
+ static class OOXMLExtractor extends EmbeddedExtractor {
@Override
public boolean canExtract(DirectoryNode dn) {
- ClassID clsId = dn.getStorageClsid();
- return (ClassID.WORD95.equals(clsId)
- || ClassID.WORD97.equals(clsId)
- || dn.hasEntry("WordDocument"));
+ // the class id is not consulted here, only the presence of the "package" entry
+ return dn.hasEntry("package");
}
@Override
public EmbeddedData extract(DirectoryNode dn) throws IOException {
- EmbeddedData ed = super.extract(dn);
- ed.setFilename(dn.getName() + DOC_EXT);
- ed.setContentType(CONTENT_TYPE_DOC);
- return ed;
+
+ ClassID clsId = dn.getStorageClsid();
+
+ // map the storage class id to a file extension and content type
+ String contentType, ext;
+ if (ClassID.WORD2007.equals(clsId)) {
+ ext = ".docx";
+ contentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
+ } else if (ClassID.WORD2007_MACRO.equals(clsId)) {
+ ext = ".docm";
+ contentType = "application/vnd.ms-word.document.macroEnabled.12";
+ } else if (ClassID.EXCEL2007.equals(clsId) || ClassID.EXCEL2003.equals(clsId) || ClassID.EXCEL2010.equals(clsId)) {
+ ext = ".xlsx";
+ contentType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
+ } else if (ClassID.EXCEL2007_MACRO.equals(clsId)) {
+ ext = ".xlsm";
+ contentType = "application/vnd.ms-excel.sheet.macroEnabled.12";
+ } else if (ClassID.EXCEL2007_XLSB.equals(clsId)) {
+ ext = ".xlsb";
+ contentType = "application/vnd.ms-excel.sheet.binary.macroEnabled.12";
+ } else if (ClassID.POWERPOINT2007.equals(clsId)) {
+ ext = ".pptx";
+ contentType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
+ } else if (ClassID.POWERPOINT2007_MACRO.equals(clsId)) {
+ // NOTE(review): ".ppsm" is the slide-show flavour; confirm this class id
+ // should not map to ".pptm" / presentation.macroEnabled.12 instead
+ ext = ".ppsm";
+ contentType = "application/vnd.ms-powerpoint.slideshow.macroEnabled.12";
+ } else {
+ // unknown class id - hand the package out as a plain zip
+ ext = ".zip";
+ contentType = "application/zip";
+ }
+
+ // close the stream even if reading it fails, so it is not leaked
+ DocumentInputStream dis = dn.createDocumentInputStream("package");
+ byte data[];
+ try {
+ data = IOUtils.toByteArray(dis);
+ } finally {
+ dis.close();
+ }
+
+ return new EmbeddedData(dn.getName()+ext, data, contentType);
}
}
/**
 * After this change a single extractor covers both the Excel and the Word
 * checks that were previously handled by separate ExcelExtractor and
 * WordExtractor classes.
 */
- static class ExcelExtractor extends EmbeddedExtractor {
+ static class BiffExtractor extends EmbeddedExtractor {
/**
 * @return true if the node passes either the Excel or the Word check
 */
@Override
public boolean canExtract(DirectoryNode dn) {
+ return canExtractExcel(dn) || canExtractWord(dn);
+ }
+
/**
 * @return true if the storage class id is EXCEL95 or EXCEL97, or the node
 * has a "Workbook" entry
 */
+ protected boolean canExtractExcel(DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return (ClassID.EXCEL95.equals(clsId)
- || ClassID.EXCEL97.equals(clsId)
- || dn.hasEntry("Workbook") /*...*/);
+ || ClassID.EXCEL97.equals(clsId)
+ || dn.hasEntry("Workbook") /*...*/);
+ }
+
/**
 * @return true if the storage class id is WORD95 or WORD97, or the node
 * has a "WordDocument" entry
 */
+ protected boolean canExtractWord(DirectoryNode dn) {
+ ClassID clsId = dn.getStorageClsid();
+ return (ClassID.WORD95.equals(clsId)
+ || ClassID.WORD97.equals(clsId)
+ || dn.hasEntry("WordDocument"));
}
/**
 * Delegates the extraction to the superclass and then sets filename and
 * content type: ".xls" / CONTENT_TYPE_XLS if the Excel check matches,
 * otherwise ".doc" / CONTENT_TYPE_DOC if the Word check matches. The Excel
 * check is evaluated first, so it wins when both would match. If neither
 * matches, the data is returned as the superclass produced it.
 */
@Override
public EmbeddedData extract(DirectoryNode dn) throws IOException {
EmbeddedData ed = super.extract(dn);
- ed.setFilename(dn.getName() + XLS_EXT);
- ed.setContentType(CONTENT_TYPE_XLS);
+ if (canExtractExcel(dn)) {
+ ed.setFilename(dn.getName() + ".xls");
+ ed.setContentType(CONTENT_TYPE_XLS);
+ } else if (canExtractWord(dn)) {
+ ed.setFilename(dn.getName() + ".doc");
+ ed.setContentType(CONTENT_TYPE_DOC);
+ }
+
return ed;
}
}
/**
 * Delegates the extraction to the superclass and renames the result to the
 * node name plus ".ole". The content type chosen by the superclass is left
 * untouched (see the TODO below).
 * NOTE(review): the stream meant by the TODO is presumably "CompObj" - confirm.
 */
@Override
public EmbeddedData extract(DirectoryNode dn) throws IOException {
EmbeddedData ed = super.extract(dn);
- ed.setFilename(dn.getName() + OLE_EXT);
+ ed.setFilename(dn.getName() + ".ole");
// TODO: read the content type from CombObj stream
return ed;
}