You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ExtractorFactory.java 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.ooxml.extractor;
  16. import java.io.File;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import java.lang.reflect.Method;
  20. import java.util.ArrayList;
  21. import java.util.Iterator;
  22. import java.util.List;
  23. import org.apache.poi.EncryptedDocumentException;
  24. import org.apache.poi.extractor.OLE2ExtractorFactory;
  25. import org.apache.poi.extractor.POIOLE2TextExtractor;
  26. import org.apache.poi.extractor.POITextExtractor;
  27. import org.apache.poi.hssf.extractor.ExcelExtractor;
  28. import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
  29. import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  30. import org.apache.poi.openxml4j.opc.OPCPackage;
  31. import org.apache.poi.openxml4j.opc.PackageAccess;
  32. import org.apache.poi.openxml4j.opc.PackagePart;
  33. import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
  34. import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
  35. import org.apache.poi.poifs.crypt.Decryptor;
  36. import org.apache.poi.poifs.crypt.EncryptionInfo;
  37. import org.apache.poi.poifs.filesystem.DirectoryEntry;
  38. import org.apache.poi.poifs.filesystem.DirectoryNode;
  39. import org.apache.poi.poifs.filesystem.Entry;
  40. import org.apache.poi.poifs.filesystem.FileMagic;
  41. import org.apache.poi.poifs.filesystem.NotOLE2FileException;
  42. import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
  43. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  44. import org.apache.poi.sl.extractor.SlideShowExtractor;
  45. import org.apache.poi.util.IOUtils;
  46. import org.apache.poi.util.NotImplemented;
  47. import org.apache.poi.util.POILogFactory;
  48. import org.apache.poi.util.POILogger;
  49. import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
  50. import org.apache.poi.xslf.usermodel.XMLSlideShow;
  51. import org.apache.poi.xslf.usermodel.XSLFRelation;
  52. import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
  53. import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
  54. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  55. import org.apache.poi.xssf.usermodel.XSSFRelation;
  56. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  57. import org.apache.poi.xwpf.usermodel.XWPFRelation;
  58. import org.apache.xmlbeans.XmlException;
  59. /**
  60. * Figures out the correct POITextExtractor for your supplied
  61. * document, and returns it.
  62. *
  63. * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
  64. * not present on the runtime classpath</p>
  65. * <p>Note 2 - rather than using this, for most cases you would be better
  66. * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
  67. */
  68. @SuppressWarnings("WeakerAccess")
  69. public final class ExtractorFactory {
  /** Logger used when embedded documents are skipped or cannot be identified. */
  private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class);

  /** Relationship type looked up first when searching a package for its core document. */
  public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;

  /** Relationship type used to recognise Visio packages. */
  private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;

  /** Relationship type used to recognise OOXML-Strict packages. */
  private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;

  /** Presentation relations whose content type is handed to the slideshow extractor. */
  private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = {
    XSLFRelation.MAIN,
    XSLFRelation.MACRO,
    XSLFRelation.MACRO_TEMPLATE,
    XSLFRelation.PRESENTATIONML,
    XSLFRelation.PRESENTATIONML_TEMPLATE,
    XSLFRelation.PRESENTATION_MACRO
  };

  /** Not instantiable; every member of this class is static. */
  private ExtractorFactory() {
    // no instances
  }
  81. /**
  82. * Should this thread prefer event based over usermodel based extractors?
  83. * (usermodel extractors tend to be more accurate, but use more memory)
  84. * Default is false.
  85. */
  86. public static boolean getThreadPrefersEventExtractors() {
  87. return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
  88. }
  89. /**
  90. * Should all threads prefer event based over usermodel based extractors?
  91. * (usermodel extractors tend to be more accurate, but use more memory)
  92. * Default is to use the thread level setting, which defaults to false.
  93. */
  94. public static Boolean getAllThreadsPreferEventExtractors() {
  95. return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
  96. }
  97. /**
  98. * Should this thread prefer event based over usermodel based extractors?
  99. * Will only be used if the All Threads setting is null.
  100. */
  101. public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
  102. OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
  103. }
  104. /**
  105. * Should all threads prefer event based over usermodel based extractors?
  106. * If set, will take preference over the Thread level setting.
  107. */
  108. public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
  109. OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
  110. }
  111. /**
  112. * Should this thread use event based extractors is available?
  113. * Checks the all-threads one first, then thread specific.
  114. */
  115. public static boolean getPreferEventExtractor() {
  116. return OLE2ExtractorFactory.getPreferEventExtractor();
  117. }
  118. @SuppressWarnings("unchecked")
  119. public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
  120. POIFSFileSystem fs = null;
  121. try {
  122. fs = new POIFSFileSystem(f);
  123. if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
  124. return (T)createEncryptedOOXMLExtractor(fs);
  125. }
  126. POITextExtractor extractor = createExtractor(fs);
  127. extractor.setFilesystem(fs);
  128. return (T)extractor;
  129. } catch (OfficeXmlFileException e) {
  130. // ensure file-handle release
  131. IOUtils.closeQuietly(fs);
  132. OPCPackage pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
  133. T t = (T)createExtractor(pkg);
  134. t.setFilesystem(pkg);
  135. return t;
  136. } catch (NotOLE2FileException ne) {
  137. // ensure file-handle release
  138. IOUtils.closeQuietly(fs);
  139. throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne);
  140. } catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR
  141. // ensure file-handle release
  142. IOUtils.closeQuietly(fs);
  143. throw e;
  144. }
  145. }
  146. public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException {
  147. InputStream is = FileMagic.prepareToCheckMagic(inp);
  148. FileMagic fm = FileMagic.valueOf(is);
  149. switch (fm) {
  150. case OLE2:
  151. POIFSFileSystem fs = new POIFSFileSystem(is);
  152. boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY);
  153. return isEncrypted ? createEncryptedOOXMLExtractor(fs) : createExtractor(fs);
  154. case OOXML:
  155. return createExtractor(OPCPackage.open(is));
  156. default:
  157. throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream, found type: " + fm);
  158. }
  159. }
  160. /**
  161. * Tries to determine the actual type of file and produces a matching text-extractor for it.
  162. *
  163. * @param pkg An {@link OPCPackage}.
  164. * @return A {@link POIXMLTextExtractor} for the given file.
  165. * @throws IOException If an error occurs while reading the file
  166. * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
  167. * @throws XmlException If an XML parsing error occurs.
  168. * @throws IllegalArgumentException If no matching file type could be found.
  169. */
  170. public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
  171. try {
  172. // Check for the normal Office core document
  173. PackageRelationshipCollection core;
  174. core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
  175. // If nothing was found, try some of the other OOXML-based core types
  176. if (core.size() == 0) {
  177. // Could it be an OOXML-Strict one?
  178. core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
  179. }
  180. if (core.size() == 0) {
  181. // Could it be a visio one?
  182. core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
  183. if (core.size() == 1)
  184. return new XDGFVisioExtractor(pkg);
  185. }
  186. // Should just be a single core document, complain if not
  187. if (core.size() != 1) {
  188. throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
  189. }
  190. // Grab the core document part, and try to identify from that
  191. final PackagePart corePart = pkg.getPart(core.getRelationship(0));
  192. final String contentType = corePart.getContentType();
  193. // Is it XSSF?
  194. for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
  195. if ( rel.getContentType().equals( contentType ) ) {
  196. if (getPreferEventExtractor()) {
  197. return new XSSFEventBasedExcelExtractor(pkg);
  198. }
  199. return new XSSFExcelExtractor(pkg);
  200. }
  201. }
  202. // Is it XWPF?
  203. for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
  204. if ( rel.getContentType().equals( contentType ) ) {
  205. return new XWPFWordExtractor(pkg);
  206. }
  207. }
  208. // Is it XSLF?
  209. for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) {
  210. if ( rel.getContentType().equals( contentType ) ) {
  211. return new SlideShowExtractor<>(new XMLSlideShow(pkg));
  212. }
  213. }
  214. // special handling for SlideShow-Theme-files,
  215. if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
  216. return new SlideShowExtractor<>(new XMLSlideShow(pkg));
  217. }
  218. // How about xlsb?
  219. for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
  220. if (rel.getContentType().equals(contentType)) {
  221. return new XSSFBEventBasedExcelExtractor(pkg);
  222. }
  223. }
  224. throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")");
  225. } catch (IOException | Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
  226. // ensure that we close the package again if there is an error opening it, however
  227. // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
  228. pkg.revert();
  229. throw e;
  230. }
  231. }
  232. public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
  233. return createExtractor(fs.getRoot());
  234. }
  235. @SuppressWarnings("unchecked")
  236. public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
  237. {
  238. // First, check for OOXML
  239. for (String entryName : poifsDir.getEntryNames()) {
  240. if (entryName.equals("Package")) {
  241. OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
  242. return (T)createExtractor(pkg);
  243. }
  244. }
  245. // If not, ask the OLE2 code to check, with Scratchpad if possible
  246. return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
  247. }
  248. /**
  249. * Returns an array of text extractors, one for each of
  250. * the embedded documents in the file (if there are any).
  251. * If there are no embedded documents, you'll get back an
  252. * empty array. Otherwise, you'll get one open
  253. * {@link POITextExtractor} for each embedded file.
  254. */
  255. public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
  256. // All the embedded directories we spotted
  257. ArrayList<Entry> dirs = new ArrayList<>();
  258. // For anything else not directly held in as a POIFS directory
  259. ArrayList<InputStream> nonPOIFS = new ArrayList<>();
  260. // Find all the embedded directories
  261. DirectoryEntry root = ext.getRoot();
  262. if (root == null) {
  263. throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
  264. }
  265. // provide ExcelExtractor also in OOXML module, because scratchpad is not necessary for it
  266. if (ext instanceof ExcelExtractor) {
  267. // These are in MBD... under the root
  268. Iterator<Entry> it = root.getEntries();
  269. while (it.hasNext()) {
  270. Entry entry = it.next();
  271. if (entry.getName().startsWith("MBD")) {
  272. dirs.add(entry);
  273. }
  274. }
  275. } else {
  276. try {
  277. Class<?> clazz = Class.forName("org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory");
  278. Method m = clazz.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
  279. m.invoke(null, ext, dirs, nonPOIFS);
  280. } catch (ReflectiveOperationException e) {
  281. logger.log(POILogger.WARN, "POI Scratchpad jar not included ", e.getLocalizedMessage());
  282. return new POITextExtractor[0];
  283. }
  284. }
  285. // Create the extractors
  286. if (dirs.size() == 0 && nonPOIFS.size() == 0){
  287. return new POITextExtractor[0];
  288. }
  289. ArrayList<POITextExtractor> textExtractors = new ArrayList<>();
  290. for (Entry dir : dirs) {
  291. textExtractors.add(createExtractor((DirectoryNode) dir));
  292. }
  293. for (InputStream nonPOIF : nonPOIFS) {
  294. try {
  295. textExtractors.add(createExtractor(nonPOIF));
  296. } catch (IllegalArgumentException e) {
  297. // Ignore, just means it didn't contain
  298. // a format we support as yet
  299. logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
  300. } catch (XmlException | OpenXML4JException e) {
  301. throw new IOException(e.getMessage(), e);
  302. }
  303. }
  304. return textExtractors.toArray(new POITextExtractor[0]);
  305. }
  306. /**
  307. * Returns an array of text extractors, one for each of
  308. * the embedded documents in the file (if there are any).
  309. * If there are no embedded documents, you'll get back an
  310. * empty array. Otherwise, you'll get one open
  311. * {@link POITextExtractor} for each embedded file.
  312. */
  313. @NotImplemented
  314. @SuppressWarnings({"UnusedParameters", "UnusedReturnValue"})
  315. public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) {
  316. throw new IllegalStateException("Not yet supported");
  317. }
  318. private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs)
  319. throws IOException {
  320. String pass = Biff8EncryptionKey.getCurrentUserPassword();
  321. if (pass == null) {
  322. pass = Decryptor.DEFAULT_PASSWORD;
  323. }
  324. EncryptionInfo ei = new EncryptionInfo(fs);
  325. Decryptor dec = ei.getDecryptor();
  326. InputStream is = null;
  327. try {
  328. if (!dec.verifyPassword(pass)) {
  329. throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor");
  330. }
  331. is = dec.getDataStream(fs);
  332. return createExtractor(OPCPackage.open(is));
  333. } catch (IOException e) {
  334. throw e;
  335. } catch (Exception e) {
  336. throw new EncryptedDocumentException(e);
  337. } finally {
  338. IOUtils.closeQuietly(is);
  339. // also close the POIFSFileSystem here as we read all the data
  340. // while decrypting
  341. fs.close();
  342. }
  343. }
  344. }