You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

EmbeddedExtractor.java 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.ss.extractor;
  16. import java.io.ByteArrayOutputStream;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import java.util.ArrayList;
  20. import java.util.Arrays;
  21. import java.util.Collections;
  22. import java.util.Iterator;
  23. import java.util.List;
  24. import java.util.Locale;
  25. import org.apache.poi.hpsf.ClassID;
  26. import org.apache.poi.poifs.filesystem.DirectoryNode;
  27. import org.apache.poi.poifs.filesystem.Entry;
  28. import org.apache.poi.poifs.filesystem.Ole10Native;
  29. import org.apache.poi.poifs.filesystem.Ole10NativeException;
  30. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  31. import org.apache.poi.ss.usermodel.Drawing;
  32. import org.apache.poi.ss.usermodel.ObjectData;
  33. import org.apache.poi.ss.usermodel.Picture;
  34. import org.apache.poi.ss.usermodel.PictureData;
  35. import org.apache.poi.ss.usermodel.Shape;
  36. import org.apache.poi.ss.usermodel.ShapeContainer;
  37. import org.apache.poi.ss.usermodel.Sheet;
  38. import org.apache.poi.ss.usermodel.Workbook;
  39. import org.apache.poi.util.IOUtils;
  40. import org.apache.poi.util.LocaleUtil;
  41. import org.apache.poi.util.POILogFactory;
  42. import org.apache.poi.util.POILogger;
  43. public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
  44. private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
  45. /**
  46. * @return the list of known extractors, if you provide custom extractors, override this method
  47. */
  48. @Override
  49. public Iterator<EmbeddedExtractor> iterator() {
  50. EmbeddedExtractor[] ee = {
  51. new Ole10Extractor(), new PdfExtractor(), new WordExtractor(), new ExcelExtractor(), new FsExtractor()
  52. };
  53. return Arrays.asList(ee).iterator();
  54. }
  55. public EmbeddedData extractOne(DirectoryNode src) throws IOException {
  56. for (EmbeddedExtractor ee : this) {
  57. if (ee.canExtract(src)) {
  58. return ee.extract(src);
  59. }
  60. }
  61. return null;
  62. }
  63. public EmbeddedData extractOne(Picture src) throws IOException {
  64. for (EmbeddedExtractor ee : this) {
  65. if (ee.canExtract(src)) {
  66. return ee.extract(src);
  67. }
  68. }
  69. return null;
  70. }
  71. public List<EmbeddedData> extractAll(Sheet sheet) throws IOException {
  72. Drawing<?> patriarch = sheet.getDrawingPatriarch();
  73. if (null == patriarch){
  74. return Collections.emptyList();
  75. }
  76. List<EmbeddedData> embeddings = new ArrayList<EmbeddedData>();
  77. extractAll(patriarch, embeddings);
  78. return embeddings;
  79. }
  80. protected void extractAll(ShapeContainer<?> parent, List<EmbeddedData> embeddings) throws IOException {
  81. for (Shape shape : parent) {
  82. EmbeddedData data = null;
  83. if (shape instanceof ObjectData) {
  84. ObjectData od = (ObjectData)shape;
  85. try {
  86. if (od.hasDirectoryEntry()) {
  87. data = extractOne((DirectoryNode)od.getDirectory());
  88. } else {
  89. data = new EmbeddedData(od.getFileName(), od.getObjectData(), "binary/octet-stream");
  90. }
  91. } catch (Exception e) {
  92. LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
  93. }
  94. } else if (shape instanceof Picture) {
  95. data = extractOne((Picture)shape);
  96. } else if (shape instanceof ShapeContainer) {
  97. extractAll((ShapeContainer<?>)shape, embeddings);
  98. }
  99. if (data == null) {
  100. continue;
  101. }
  102. data.setShape(shape);
  103. String filename = data.getFilename();
  104. String extension = (filename == null || filename.indexOf('.') == -1) ? ".bin" : filename.substring(filename.indexOf('.'));
  105. // try to find an alternative name
  106. if (filename == null || "".equals(filename) || filename.startsWith("MBD") || filename.startsWith("Root Entry")) {
  107. filename = shape.getShapeName();
  108. if (filename != null) {
  109. filename += extension;
  110. }
  111. }
  112. // default to dummy name
  113. if (filename == null || "".equals(filename)) {
  114. filename = "picture_"+embeddings.size()+extension;
  115. }
  116. filename = filename.trim();
  117. data.setFilename(filename);
  118. embeddings.add(data);
  119. }
  120. }
  121. public boolean canExtract(DirectoryNode source) {
  122. return false;
  123. }
  124. public boolean canExtract(Picture source) {
  125. return false;
  126. }
  127. protected EmbeddedData extract(DirectoryNode dn) throws IOException {
  128. assert(canExtract(dn));
  129. POIFSFileSystem dest = new POIFSFileSystem();
  130. copyNodes(dn, dest.getRoot());
  131. // start with a reasonable big size
  132. ByteArrayOutputStream bos = new ByteArrayOutputStream(20000);
  133. dest.writeFilesystem(bos);
  134. dest.close();
  135. return new EmbeddedData(dn.getName(), bos.toByteArray(), "binary/octet-stream");
  136. }
  137. protected EmbeddedData extract(Picture source) throws IOException {
  138. return null;
  139. }
  140. public static class Ole10Extractor extends EmbeddedExtractor {
  141. @Override
  142. public boolean canExtract(DirectoryNode dn) {
  143. ClassID clsId = dn.getStorageClsid();
  144. return ClassID.OLE10_PACKAGE.equals(clsId);
  145. }
  146. @Override
  147. public EmbeddedData extract(DirectoryNode dn) throws IOException {
  148. try {
  149. Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
  150. return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), "binary/octet-stream");
  151. } catch (Ole10NativeException e) {
  152. throw new IOException(e);
  153. }
  154. }
  155. }
  156. static class PdfExtractor extends EmbeddedExtractor {
  157. static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}");
  158. @Override
  159. public boolean canExtract(DirectoryNode dn) {
  160. ClassID clsId = dn.getStorageClsid();
  161. return (PdfClassID.equals(clsId)
  162. || dn.hasEntry("CONTENTS"));
  163. }
  164. @Override
  165. public EmbeddedData extract(DirectoryNode dn) throws IOException {
  166. ByteArrayOutputStream bos = new ByteArrayOutputStream();
  167. InputStream is = dn.createDocumentInputStream("CONTENTS");
  168. IOUtils.copy(is, bos);
  169. is.close();
  170. return new EmbeddedData(dn.getName()+".pdf", bos.toByteArray(), "application/pdf");
  171. }
  172. @Override
  173. public boolean canExtract(Picture source) {
  174. PictureData pd = source.getPictureData();
  175. return (pd.getPictureType() == Workbook.PICTURE_TYPE_EMF);
  176. }
  177. /**
  178. * Mac Office encodes embedded objects inside the picture, e.g. PDF is part of an EMF.
  179. * If an embedded stream is inside an EMF picture, this method extracts the payload.
  180. *
  181. * @return the embedded data in an EMF picture or null if none is found
  182. */
  183. @Override
  184. protected EmbeddedData extract(Picture source) throws IOException {
  185. // check for emf+ embedded pdf (poor mans style :( )
  186. // Mac Excel 2011 embeds pdf files with this method.
  187. PictureData pd = source.getPictureData();
  188. if (pd.getPictureType() != Workbook.PICTURE_TYPE_EMF) {
  189. return null;
  190. }
  191. // TODO: investigate if this is just an EMF-hack or if other formats are also embedded in EMF
  192. byte pictureBytes[] = pd.getData();
  193. int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes(LocaleUtil.CHARSET_1252));
  194. if (idxStart == -1) {
  195. return null;
  196. }
  197. int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes(LocaleUtil.CHARSET_1252));
  198. if (idxEnd == -1) {
  199. return null;
  200. }
  201. int pictureBytesLen = idxEnd-idxStart+6;
  202. byte[] pdfBytes = new byte[pictureBytesLen];
  203. System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
  204. String filename = source.getShapeName().trim();
  205. if (!filename.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
  206. filename += ".pdf";
  207. }
  208. return new EmbeddedData(filename, pdfBytes, "application/pdf");
  209. }
  210. }
  211. static class WordExtractor extends EmbeddedExtractor {
  212. @Override
  213. public boolean canExtract(DirectoryNode dn) {
  214. ClassID clsId = dn.getStorageClsid();
  215. return (ClassID.WORD95.equals(clsId)
  216. || ClassID.WORD97.equals(clsId)
  217. || dn.hasEntry("WordDocument"));
  218. }
  219. @Override
  220. public EmbeddedData extract(DirectoryNode dn) throws IOException {
  221. EmbeddedData ed = super.extract(dn);
  222. ed.setFilename(dn.getName()+".doc");
  223. return ed;
  224. }
  225. }
  226. static class ExcelExtractor extends EmbeddedExtractor {
  227. @Override
  228. public boolean canExtract(DirectoryNode dn) {
  229. ClassID clsId = dn.getStorageClsid();
  230. return (ClassID.EXCEL95.equals(clsId)
  231. || ClassID.EXCEL97.equals(clsId)
  232. || dn.hasEntry("Workbook") /*...*/);
  233. }
  234. @Override
  235. public EmbeddedData extract(DirectoryNode dn) throws IOException {
  236. EmbeddedData ed = super.extract(dn);
  237. ed.setFilename(dn.getName()+".xls");
  238. return ed;
  239. }
  240. }
  241. static class FsExtractor extends EmbeddedExtractor {
  242. @Override
  243. public boolean canExtract(DirectoryNode dn) {
  244. return true;
  245. }
  246. @Override
  247. public EmbeddedData extract(DirectoryNode dn) throws IOException {
  248. EmbeddedData ed = super.extract(dn);
  249. ed.setFilename(dn.getName()+".ole");
  250. // TODO: read the content type from CombObj stream
  251. return ed;
  252. }
  253. }
  254. protected static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException {
  255. for (Entry e : src) {
  256. if (e instanceof DirectoryNode) {
  257. DirectoryNode srcDir = (DirectoryNode)e;
  258. DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
  259. destDir.setStorageClsid(srcDir.getStorageClsid());
  260. copyNodes(srcDir, destDir);
  261. } else {
  262. InputStream is = src.createDocumentInputStream(e);
  263. dest.createDocument(e.getName(), is);
  264. is.close();
  265. }
  266. }
  267. }
  268. /**
  269. * Knuth-Morris-Pratt Algorithm for Pattern Matching
  270. * Finds the first occurrence of the pattern in the text.
  271. */
  272. private static int indexOf(byte[] data, int offset, byte[] pattern) {
  273. int[] failure = computeFailure(pattern);
  274. int j = 0;
  275. if (data.length == 0) return -1;
  276. for (int i = offset; i < data.length; i++) {
  277. while (j > 0 && pattern[j] != data[i]) {
  278. j = failure[j - 1];
  279. }
  280. if (pattern[j] == data[i]) { j++; }
  281. if (j == pattern.length) {
  282. return i - pattern.length + 1;
  283. }
  284. }
  285. return -1;
  286. }
  287. /**
  288. * Computes the failure function using a boot-strapping process,
  289. * where the pattern is matched against itself.
  290. */
  291. private static int[] computeFailure(byte[] pattern) {
  292. int[] failure = new int[pattern.length];
  293. int j = 0;
  294. for (int i = 1; i < pattern.length; i++) {
  295. while (j > 0 && pattern[j] != pattern[i]) {
  296. j = failure[j - 1];
  297. }
  298. if (pattern[j] == pattern[i]) {
  299. j++;
  300. }
  301. failure[i] = j;
  302. }
  303. return failure;
  304. }
  305. }