You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ExtractorFactory.java 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor;
  16. import static org.apache.poi.hssf.record.crypto.Biff8EncryptionKey.getCurrentUserPassword;
  17. import static org.apache.poi.poifs.crypt.EncryptionInfo.ENCRYPTION_INFO_ENTRY;
  18. import java.io.File;
  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.util.ArrayList;
  22. import java.util.List;
  23. import java.util.ServiceLoader;
  24. import java.util.stream.StreamSupport;
  25. import org.apache.poi.EmptyFileException;
  26. import org.apache.poi.hssf.extractor.ExcelExtractor;
  27. import org.apache.poi.poifs.crypt.Decryptor;
  28. import org.apache.poi.poifs.filesystem.DirectoryEntry;
  29. import org.apache.poi.poifs.filesystem.DirectoryNode;
  30. import org.apache.poi.poifs.filesystem.Entry;
  31. import org.apache.poi.poifs.filesystem.FileMagic;
  32. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  33. import org.apache.poi.util.IOUtils;
  34. import org.apache.poi.util.POILogFactory;
  35. import org.apache.poi.util.POILogger;
  36. /**
  37. * Figures out the correct POIOLE2TextExtractor for your supplied
  38. * document, and returns it.
  39. *
  40. * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
  41. * not present on the runtime classpath</p>
  42. * <p>Note 2 - for text extractor creation across all formats, use
  43. * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
  44. * the OOXML jar.</p>
  45. * <p>Note 3 - rather than using this, for most cases you would be better
  46. * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
  47. */
  48. @SuppressWarnings({"WeakerAccess", "JavadocReference"})
  49. public final class ExtractorFactory {
  50. /**
  51. * Some OPCPackages are packed in side an OLE2 container.
  52. * If encrypted, the {@link DirectoryNode} is called {@link Decryptor#DEFAULT_POIFS_ENTRY "EncryptedPackage"},
  53. * otherwise the node is called "Packge"
  54. */
  55. public static final String OOXML_PACKAGE = "Package";
  56. private static final POILogger LOGGER = POILogFactory.getLogger(ExtractorFactory.class);
  57. /** Should this thread prefer event based over usermodel based extractors? */
  58. private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);
  59. /** Should all threads prefer event based over usermodel based extractors? */
  60. private static Boolean allPreferEventExtractors;
  61. private static class Singleton {
  62. private static final ExtractorFactory INSTANCE = new ExtractorFactory();
  63. }
  64. private interface ProviderMethod {
  65. POITextExtractor create(ExtractorProvider prov) throws IOException;
  66. }
  67. private final List<ExtractorProvider> provider = new ArrayList<>();
  68. private ExtractorFactory() {
  69. ClassLoader cl = ExtractorFactory.class.getClassLoader();
  70. ServiceLoader.load(ExtractorProvider.class, cl).forEach(provider::add);
  71. }
  72. /**
  73. * Should this thread prefer event based over usermodel based extractors?
  74. * (usermodel extractors tend to be more accurate, but use more memory)
  75. * Default is false.
  76. *
  77. * @return true if event extractors should be preferred in the current thread, fals otherwise.
  78. */
  79. public static boolean getThreadPrefersEventExtractors() {
  80. return threadPreferEventExtractors.get();
  81. }
  82. /**
  83. * Should all threads prefer event based over usermodel based extractors?
  84. * (usermodel extractors tend to be more accurate, but use more memory)
  85. * Default is to use the thread level setting, which defaults to false.
  86. *
  87. * @return true if event extractors should be preferred in all threads, fals otherwise.
  88. */
  89. public static Boolean getAllThreadsPreferEventExtractors() {
  90. return allPreferEventExtractors;
  91. }
  92. /**
  93. * Should this thread prefer event based over usermodel based extractors?
  94. * Will only be used if the All Threads setting is null.
  95. *
  96. * @param preferEventExtractors If this threads should prefer event based extractors.
  97. */
  98. public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
  99. threadPreferEventExtractors.set(preferEventExtractors);
  100. }
  101. /**
  102. * Should all threads prefer event based over usermodel based extractors?
  103. * If set, will take preference over the Thread level setting.
  104. *
  105. * @param preferEventExtractors If all threads should prefer event based extractors.
  106. */
  107. public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
  108. allPreferEventExtractors = preferEventExtractors;
  109. }
  110. /**
  111. * Should this thread use event based extractors is available?
  112. * Checks the all-threads one first, then thread specific.
  113. *
  114. * @return If the current thread should use event based extractors.
  115. */
  116. public static boolean getPreferEventExtractor() {
  117. return (allPreferEventExtractors != null) ? allPreferEventExtractors : threadPreferEventExtractors.get();
  118. }
  119. public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
  120. return createExtractor(fs, getCurrentUserPassword());
  121. }
  122. public static POITextExtractor createExtractor(POIFSFileSystem fs, String password) throws IOException {
  123. return createExtractor(fs.getRoot(), password);
  124. }
  125. public static POITextExtractor createExtractor(InputStream input) throws IOException {
  126. return createExtractor(input, getCurrentUserPassword());
  127. }
  128. public static POITextExtractor createExtractor(InputStream input, String password) throws IOException {
  129. final InputStream is = FileMagic.prepareToCheckMagic(input);
  130. byte[] emptyFileCheck = new byte[1];
  131. is.mark(emptyFileCheck.length);
  132. if (is.read(emptyFileCheck) < emptyFileCheck.length) {
  133. throw new EmptyFileException();
  134. }
  135. is.reset();
  136. final FileMagic fm = FileMagic.valueOf(is);
  137. if (FileMagic.OOXML == fm) {
  138. return wp(fm, w -> w.create(is, password));
  139. }
  140. if (FileMagic.OLE2 != fm) {
  141. throw new IOException("Can't create extractor - unsupported file type: "+fm);
  142. }
  143. POIFSFileSystem poifs = new POIFSFileSystem(is);
  144. boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
  145. return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
  146. }
  147. public static POITextExtractor createExtractor(File file) throws IOException {
  148. return createExtractor(file, getCurrentUserPassword());
  149. }
  150. public static POITextExtractor createExtractor(File file, String password) throws IOException {
  151. if (file.length() == 0) {
  152. throw new EmptyFileException();
  153. }
  154. final FileMagic fm = FileMagic.valueOf(file);
  155. if (FileMagic.OOXML == fm) {
  156. return wp(fm, w -> w.create(file, password));
  157. }
  158. if (FileMagic.OLE2 != fm) {
  159. throw new IOException("Can't create extractor - unsupported file type: "+fm);
  160. }
  161. POIFSFileSystem poifs = new POIFSFileSystem(file, true);
  162. try {
  163. boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
  164. return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
  165. } catch (IOException | RuntimeException e) {
  166. IOUtils.closeQuietly(poifs);
  167. throw e;
  168. }
  169. }
  170. /**
  171. * Create the Extractor, if possible. Generally needs the Scratchpad jar.
  172. * Note that this won't check for embedded OOXML resources either, use
  173. * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
  174. *
  175. * @param root The {@link DirectoryNode} pointing to a document.
  176. *
  177. * @return The resulting {@link POITextExtractor}, an exception is thrown if
  178. * no TextExtractor can be created for some reason.
  179. *
  180. * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
  181. * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
  182. * an unsupported version of Excel.
  183. * @throws IllegalArgumentException If creating the Extractor fails
  184. */
  185. public static POITextExtractor createExtractor(DirectoryNode root) throws IOException {
  186. return createExtractor(root, getCurrentUserPassword());
  187. }
  188. public static POITextExtractor createExtractor(final DirectoryNode root, String password) throws IOException {
  189. // Encrypted OOXML files go inside OLE2 containers, is this one?
  190. if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY) || root.hasEntry(OOXML_PACKAGE)) {
  191. return wp(FileMagic.OOXML, w -> w.create(root, password));
  192. } else {
  193. return wp(FileMagic.OLE2, w -> w.create(root, password));
  194. }
  195. }
  196. /**
  197. * Returns an array of text extractors, one for each of
  198. * the embedded documents in the file (if there are any).
  199. * If there are no embedded documents, you'll get back an
  200. * empty array. Otherwise, you'll get one open
  201. * {@link POITextExtractor} for each embedded file.
  202. *
  203. * @param ext The extractor to look at for embedded documents
  204. *
  205. * @return An array of resulting extractors. Empty if no embedded documents are found.
  206. *
  207. * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
  208. * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
  209. * an unsupported version of Excel.
  210. * @throws IllegalArgumentException If creating the Extractor fails
  211. */
  212. public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
  213. if (ext == null) {
  214. throw new IllegalStateException("extractor must be given");
  215. }
  216. // All the embedded directories we spotted
  217. List<Entry> dirs = new ArrayList<>();
  218. // For anything else not directly held in as a POIFS directory
  219. List<InputStream> nonPOIFS = new ArrayList<>();
  220. // Find all the embedded directories
  221. DirectoryEntry root = ext.getRoot();
  222. if(root == null) {
  223. throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
  224. }
  225. if(ext instanceof ExcelExtractor) {
  226. // These are in MBD... under the root
  227. StreamSupport.stream(root.spliterator(), false)
  228. .filter(entry -> entry.getName().startsWith("MBD"))
  229. .forEach(dirs::add);
  230. } else {
  231. for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
  232. if (prov.accepts(FileMagic.OLE2)) {
  233. prov.identifyEmbeddedResources(ext, dirs, nonPOIFS);
  234. break;
  235. }
  236. }
  237. }
  238. // Create the extractors
  239. if(dirs.size() == 0 && nonPOIFS.size() == 0){
  240. return new POITextExtractor[0];
  241. }
  242. ArrayList<POITextExtractor> textExtractors = new ArrayList<>();
  243. for (Entry dir : dirs) {
  244. textExtractors.add(createExtractor((DirectoryNode) dir));
  245. }
  246. for (InputStream stream : nonPOIFS) {
  247. try {
  248. textExtractors.add(createExtractor(stream));
  249. } catch (IOException e) {
  250. // Ignore, just means it didn't contain a format we support as yet
  251. LOGGER.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
  252. }
  253. }
  254. return textExtractors.toArray(new POITextExtractor[0]);
  255. }
  256. private static POITextExtractor wp(FileMagic fm, ProviderMethod fun) throws IOException {
  257. for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
  258. if (prov.accepts(fm)) {
  259. POITextExtractor ext = fun.create(prov);
  260. if (ext != null) {
  261. return ext;
  262. }
  263. }
  264. }
  265. throw new IOException("Your InputStream was neither an OLE2 stream, nor an OOXML stream " +
  266. "or you haven't provide the poi-ooxml*.jar and/or poi-scratchpad*.jar in the classpath/modulepath - FileMagic: "+fm);
  267. }
  268. public static void addProvider(ExtractorProvider provider){
  269. Singleton.INSTANCE.provider.add(provider);
  270. }
  271. public static void removeProvider(Class<? extends ExtractorProvider> provider){
  272. Singleton.INSTANCE.provider.removeIf(p -> p.getClass().getName().equals(provider.getName()));
  273. }
  274. }