You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

OLE2ScratchpadExtractorFactory.java 8.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.extractor.ole2;
  16. import java.io.ByteArrayInputStream;
  17. import java.io.File;
  18. import java.io.FileNotFoundException;
  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.util.List;
  22. import java.util.stream.StreamSupport;
  23. import org.apache.poi.extractor.ExtractorFactory;
  24. import org.apache.poi.extractor.ExtractorProvider;
  25. import org.apache.poi.extractor.POIOLE2TextExtractor;
  26. import org.apache.poi.extractor.POITextExtractor;
  27. import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  28. import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
  29. import org.apache.poi.hslf.usermodel.HSLFShape;
  30. import org.apache.poi.hslf.usermodel.HSLFSlideShow;
  31. import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
  32. import org.apache.poi.hsmf.MAPIMessage;
  33. import org.apache.poi.hsmf.datatypes.AttachmentChunks;
  34. import org.apache.poi.hsmf.extractor.OutlookTextExtractor;
  35. import org.apache.poi.hssf.extractor.ExcelExtractor;
  36. import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
  37. import org.apache.poi.hwpf.OldWordFileFormatException;
  38. import org.apache.poi.hwpf.extractor.Word6Extractor;
  39. import org.apache.poi.hwpf.extractor.WordExtractor;
  40. import org.apache.poi.poifs.filesystem.DirectoryEntry;
  41. import org.apache.poi.poifs.filesystem.DirectoryNode;
  42. import org.apache.poi.poifs.filesystem.Entry;
  43. import org.apache.poi.poifs.filesystem.FileMagic;
  44. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  45. import org.apache.poi.sl.extractor.SlideShowExtractor;
  46. import org.apache.poi.sl.usermodel.SlideShowFactory;
  47. import org.apache.poi.util.POILogFactory;
  48. import org.apache.poi.util.POILogger;
  49. /**
  50. * Scratchpad-specific logic for {@link ExtractorFactory} and
  51. * {@link org.apache.poi.extractor.ExtractorFactory}, which permit the other two to run with
  52. * no Scratchpad jar (though without functionality!)
  53. * <p>Note - should not be used standalone, always use via the other
  54. * two classes</p>
  55. */
  56. @SuppressWarnings("WeakerAccess")
  57. public class OLE2ScratchpadExtractorFactory implements ExtractorProvider {
  58. private static final POILogger logger = POILogFactory.getLogger(OLE2ScratchpadExtractorFactory.class);
  59. @Override
  60. public boolean accepts(FileMagic fm) {
  61. return FileMagic.OLE2 == fm;
  62. }
  63. @Override
  64. public POITextExtractor create(File file, String password) throws IOException {
  65. return create(new POIFSFileSystem(file, true).getRoot(), password);
  66. }
  67. @Override
  68. public POITextExtractor create(InputStream inputStream, String password) throws IOException {
  69. return create(new POIFSFileSystem(inputStream).getRoot(), password);
  70. }
  71. /**
  72. * Look for certain entries in the stream, to figure it
  73. * out what format is desired
  74. * Note - doesn't check for core-supported formats!
  75. * Note - doesn't check for OOXML-supported formats
  76. *
  77. * @param poifsDir the directory node to be inspected
  78. * @return the format specific text extractor
  79. *
  80. * @throws IOException when the format specific extraction fails because of invalid entires
  81. */
  82. public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
  83. final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
  84. try {
  85. Biff8EncryptionKey.setCurrentUserPassword(password);
  86. if (poifsDir.hasEntry("WordDocument")) {
  87. // Old or new style word document?
  88. try {
  89. return new WordExtractor(poifsDir);
  90. } catch (OldWordFileFormatException e) {
  91. return new Word6Extractor(poifsDir);
  92. }
  93. }
  94. if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
  95. return new SlideShowExtractor<HSLFShape, HSLFTextParagraph>(SlideShowFactory.create(poifsDir));
  96. }
  97. if (poifsDir.hasEntry("VisioDocument")) {
  98. return new VisioTextExtractor(poifsDir);
  99. }
  100. if (poifsDir.hasEntry("Quill")) {
  101. return new PublisherTextExtractor(poifsDir);
  102. }
  103. final String[] outlookEntryNames = new String[]{
  104. // message bodies, saved as plain text (PtypString)
  105. // The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
  106. // the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
  107. // https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx
  108. // @see org.apache.poi.hsmf.Types.MAPIType
  109. "__substg1.0_1000001E", //PidTagBody ASCII
  110. "__substg1.0_1000001F", //PidTagBody Unicode
  111. "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
  112. "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
  113. "__substg1.0_0037001E", //PidTagSubject ASCII
  114. "__substg1.0_0037001F", //PidTagSubject Unicode
  115. };
  116. for (String entryName : outlookEntryNames) {
  117. if (poifsDir.hasEntry(entryName)) {
  118. return new OutlookTextExtractor(poifsDir);
  119. }
  120. }
  121. } finally {
  122. Biff8EncryptionKey.setCurrentUserPassword(oldPW);
  123. }
  124. return null;
  125. }
  126. /**
  127. * Returns an array of text extractors, one for each of
  128. * the embedded documents in the file (if there are any).
  129. * If there are no embedded documents, you'll get back an
  130. * empty array. Otherwise, you'll get one open
  131. * {@link POITextExtractor} for each embedded file.
  132. *
  133. * @param ext the extractor holding the directory to start parsing
  134. * @param dirs a list to be filled with directory references holding embedded
  135. * @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
  136. */
  137. @Override
  138. public void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) {
  139. // Find all the embedded directories
  140. DirectoryEntry root = ext.getRoot();
  141. if (root == null) {
  142. throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
  143. }
  144. if (ext instanceof ExcelExtractor) {
  145. // These are in MBD... under the root
  146. StreamSupport.stream(root.spliterator(), false)
  147. .filter(entry -> entry.getName().startsWith("MBD"))
  148. .forEach(dirs::add);
  149. } else if (ext instanceof WordExtractor) {
  150. // These are in ObjectPool -> _... under the root
  151. try {
  152. DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
  153. StreamSupport.stream(op.spliterator(), false)
  154. .filter(entry -> entry.getName().startsWith("_"))
  155. .forEach(dirs::add);
  156. } catch(FileNotFoundException e) {
  157. logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage());
  158. // ignored here
  159. }
  160. //} else if(ext instanceof PowerPointExtractor) {
  161. // Tricky, not stored directly in poifs
  162. // TODO
  163. } else if (ext instanceof OutlookTextExtractor) {
  164. // Stored in the Attachment blocks
  165. MAPIMessage msg = ((OutlookTextExtractor)ext).getMAPIMessage();
  166. for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
  167. if (attachment.getAttachData() != null) {
  168. byte[] data = attachment.getAttachData().getValue();
  169. nonPOIFS.add( new ByteArrayInputStream(data) );
  170. } else if (attachment.getAttachmentDirectory() != null) {
  171. dirs.add(attachment.getAttachmentDirectory().getDirectory());
  172. }
  173. }
  174. }
  175. }
  176. }