You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RecordFactoryInputStream.java 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.record;
  16. import java.io.InputStream;
  17. import java.security.GeneralSecurityException;
  18. import java.util.ArrayList;
  19. import java.util.List;
  20. import org.apache.poi.EncryptedDocumentException;
  21. import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
  22. import org.apache.poi.hssf.eventusermodel.HSSFListener;
  23. import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
  24. import org.apache.poi.poifs.crypt.Decryptor;
  25. import org.apache.poi.poifs.crypt.EncryptionInfo;
  26. import org.apache.poi.util.RecordFormatException;
  27. /**
  28. * A stream based way to get at complete records, with
  29. * as low a memory footprint as possible.
  30. * This handles reading from a RecordInputStream, turning
  31. * the data into full records, processing continue records
  32. * etc.
  33. * Most users should use {@link HSSFEventFactory} /
  34. * {@link HSSFListener} and have new records pushed to
  35. * them, but this does allow for a "pull" style of coding.
  36. */
  37. public final class RecordFactoryInputStream {
  38. /**
  39. * Keeps track of the sizes of the initial records up to and including {@link FilePassRecord}
  40. * Needed for protected files because each byte is encrypted with respect to its absolute
  41. * position from the start of the stream.
  42. */
  43. private static final class StreamEncryptionInfo {
  44. private final int _initialRecordsSize;
  45. private final FilePassRecord _filePassRec;
  46. private final Record _lastRecord;
  47. private final boolean _hasBOFRecord;
  48. public StreamEncryptionInfo(RecordInputStream rs, List<org.apache.poi.hssf.record.Record> outputRecs) {
  49. Record rec;
  50. rs.nextRecord();
  51. int recSize = 4 + rs.remaining();
  52. rec = RecordFactory.createSingleRecord(rs);
  53. outputRecs.add(rec);
  54. FilePassRecord fpr = null;
  55. if (rec instanceof BOFRecord) {
  56. _hasBOFRecord = true;
  57. // Fetch the next record, and see if it indicates whether
  58. // the document is encrypted or not
  59. if (rs.hasNextRecord()) {
  60. rs.nextRecord();
  61. rec = RecordFactory.createSingleRecord(rs);
  62. recSize += rec.getRecordSize();
  63. outputRecs.add(rec);
  64. // Encrypted is normally BOF then FILEPASS
  65. // May sometimes be BOF, WRITEPROTECT, FILEPASS
  66. if (rec instanceof WriteProtectRecord && rs.hasNextRecord()) {
  67. rs.nextRecord();
  68. rec = RecordFactory.createSingleRecord(rs);
  69. recSize += rec.getRecordSize();
  70. outputRecs.add(rec);
  71. }
  72. // If it's a FILEPASS, track it specifically
  73. if (rec instanceof FilePassRecord) {
  74. fpr = (FilePassRecord) rec;
  75. }
  76. // workbook not encrypted (typical case)
  77. if (rec instanceof EOFRecord) {
  78. // A workbook stream is never empty, so crash instead
  79. // of trying to keep track of nesting level
  80. throw new IllegalStateException("Nothing between BOF and EOF");
  81. }
  82. }
  83. } else {
  84. // Invalid in a normal workbook stream.
  85. // However, some test cases work on sub-sections of
  86. // the workbook stream that do not begin with BOF
  87. _hasBOFRecord = false;
  88. }
  89. _initialRecordsSize = recSize;
  90. _filePassRec = fpr;
  91. _lastRecord = rec;
  92. }
  93. @SuppressWarnings({"squid:S2068"})
  94. public RecordInputStream createDecryptingStream(InputStream original) {
  95. String userPassword = Biff8EncryptionKey.getCurrentUserPassword();
  96. if (userPassword == null) {
  97. userPassword = Decryptor.DEFAULT_PASSWORD;
  98. }
  99. EncryptionInfo info = _filePassRec.getEncryptionInfo();
  100. try {
  101. if (!info.getDecryptor().verifyPassword(userPassword)) {
  102. throw new EncryptedDocumentException(
  103. (Decryptor.DEFAULT_PASSWORD.equals(userPassword) ? "Default" : "Supplied")
  104. + " password is invalid for salt/verifier/verifierHash");
  105. }
  106. } catch (GeneralSecurityException e) {
  107. throw new EncryptedDocumentException(e);
  108. }
  109. return new RecordInputStream(original, info, _initialRecordsSize);
  110. }
  111. public boolean hasEncryption() {
  112. return _filePassRec != null;
  113. }
  114. /**
  115. * @return last record scanned while looking for encryption info.
  116. * This will typically be the first or second record read. Possibly <code>null</code>
  117. * if stream was empty
  118. */
  119. public Record getLastRecord() {
  120. return _lastRecord;
  121. }
  122. /**
  123. * <code>false</code> in some test cases
  124. */
  125. public boolean hasBOFRecord() {
  126. return _hasBOFRecord;
  127. }
  128. }
  129. private final RecordInputStream _recStream;
  130. private final boolean _shouldIncludeContinueRecords;
  131. /**
  132. * Temporarily stores a group of {@link Record}s, for future return by {@link #nextRecord()}.
  133. * This is used at the start of the workbook stream, and also when the most recently read
  134. * underlying record is a {@link MulRKRecord}
  135. */
  136. private Record[] _unreadRecordBuffer;
  137. /**
  138. * used to help iterating over the unread records
  139. */
  140. private int _unreadRecordIndex = -1;
  141. /**
  142. * The most recent record that we gave to the user
  143. */
  144. private Record _lastRecord;
  145. /**
  146. * The most recent DrawingRecord seen
  147. */
  148. private DrawingRecord _lastDrawingRecord = new DrawingRecord();
  149. private int _bofDepth;
  150. private boolean _lastRecordWasEOFLevelZero;
  151. /**
  152. * @param in the InputStream to read from
  153. *
  154. * @param shouldIncludeContinueRecords caller can pass <code>false</code> if loose
  155. * {@link ContinueRecord}s should be skipped (this is sometimes useful in event based
  156. * processing).
  157. */
  158. public RecordFactoryInputStream(InputStream in, boolean shouldIncludeContinueRecords) {
  159. RecordInputStream rs = new RecordInputStream(in);
  160. List<org.apache.poi.hssf.record.Record> records = new ArrayList<>();
  161. StreamEncryptionInfo sei = new StreamEncryptionInfo(rs, records);
  162. if (sei.hasEncryption()) {
  163. rs = sei.createDecryptingStream(in);
  164. } else {
  165. // typical case - non-encrypted stream
  166. }
  167. if (!records.isEmpty()) {
  168. _unreadRecordBuffer = new Record[records.size()];
  169. records.toArray(_unreadRecordBuffer);
  170. _unreadRecordIndex =0;
  171. }
  172. _recStream = rs;
  173. _shouldIncludeContinueRecords = shouldIncludeContinueRecords;
  174. _lastRecord = sei.getLastRecord();
  175. /*
  176. * How to recognise end of stream?
  177. * In the best case, the underlying input stream (in) ends just after the last EOF record
  178. * Usually however, the stream is padded with an arbitrary byte count. Excel and most apps
  179. * reliably use zeros for padding and if this were always the case, this code could just
  180. * skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with
  181. * non-zero padding that is read OK by Excel (Excel also fixes the padding).
  182. *
  183. * So to properly detect the workbook end of stream, this code has to identify the last
  184. * EOF record. This is not so easy because the worbook bof+eof pair do not bracket the
  185. * whole stream. The worksheets follow the workbook, but it is not easy to tell how many
  186. * sheet sub-streams should be present. Hence we are looking for an EOF record that is not
  187. * immediately followed by a BOF record. One extra complication is that bof+eof sub-
  188. * streams can be nested within worksheet streams and it's not clear in these cases what
  189. * record might follow any EOF record. So we also need to keep track of the bof/eof
  190. * nesting level.
  191. */
  192. _bofDepth = sei.hasBOFRecord() ? 1 : 0;
  193. _lastRecordWasEOFLevelZero = false;
  194. }
  195. /**
  196. * @return the next (complete) record from the stream, or null if there are no more.
  197. */
  198. public Record nextRecord() {
  199. Record r;
  200. r = getNextUnreadRecord();
  201. if (r != null) {
  202. // found an unread record
  203. return r;
  204. }
  205. while (true) {
  206. if (!_recStream.hasNextRecord()) {
  207. // recStream is exhausted;
  208. return null;
  209. }
  210. if (_lastRecordWasEOFLevelZero) {
  211. // Potential place for ending the workbook stream
  212. // Check that the next record is not BOFRecord(0x0809)
  213. // Normally the input stream contains only zero padding after the last EOFRecord,
  214. // but bug 46987 and 48068 suggests that the padding may be garbage.
  215. // This code relies on the padding bytes not starting with BOFRecord.sid
  216. if (_recStream.getNextSid() != BOFRecord.sid) {
  217. return null;
  218. }
  219. // else - another sheet substream starting here
  220. }
  221. // step underlying RecordInputStream to the next record
  222. _recStream.nextRecord();
  223. r = readNextRecord();
  224. if (r == null) {
  225. // some record types may get skipped (e.g. DBCellRecord and ContinueRecord)
  226. continue;
  227. }
  228. return r;
  229. }
  230. }
  231. /**
  232. * @return the next {@link Record} from the multiple record group as expanded from
  233. * a recently read {@link MulRKRecord}. <code>null</code> if not present.
  234. */
  235. private Record getNextUnreadRecord() {
  236. if (_unreadRecordBuffer != null) {
  237. int ix = _unreadRecordIndex;
  238. if (ix < _unreadRecordBuffer.length) {
  239. Record result = _unreadRecordBuffer[ix];
  240. _unreadRecordIndex = ix + 1;
  241. return result;
  242. }
  243. _unreadRecordIndex = -1;
  244. _unreadRecordBuffer = null;
  245. }
  246. return null;
  247. }
  248. /**
  249. * @return the next available record, or <code>null</code> if
  250. * this pass didn't return a record that's
  251. * suitable for returning (eg was a continue record).
  252. */
  253. private Record readNextRecord() {
  254. Record record = RecordFactory.createSingleRecord(_recStream);
  255. _lastRecordWasEOFLevelZero = false;
  256. if (record instanceof BOFRecord) {
  257. _bofDepth++;
  258. return record;
  259. }
  260. if (record instanceof EOFRecord) {
  261. _bofDepth--;
  262. if (_bofDepth < 1) {
  263. _lastRecordWasEOFLevelZero = true;
  264. }
  265. return record;
  266. }
  267. if (record instanceof DBCellRecord) {
  268. // Not needed by POI. Regenerated from scratch by POI when spreadsheet is written
  269. return null;
  270. }
  271. if (record instanceof RKRecord) {
  272. return RecordFactory.convertToNumberRecord((RKRecord) record);
  273. }
  274. if (record instanceof MulRKRecord) {
  275. Record[] records = RecordFactory.convertRKRecords((MulRKRecord) record);
  276. _unreadRecordBuffer = records;
  277. _unreadRecordIndex = 1;
  278. return records[0];
  279. }
  280. if (record.getSid() == DrawingGroupRecord.sid
  281. && _lastRecord instanceof DrawingGroupRecord) {
  282. DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) _lastRecord;
  283. lastDGRecord.join((AbstractEscherHolderRecord) record);
  284. return null;
  285. }
  286. if (record.getSid() == ContinueRecord.sid) {
  287. ContinueRecord contRec = (ContinueRecord) record;
  288. if (_lastRecord instanceof ObjRecord || _lastRecord instanceof TextObjectRecord) {
  289. // Drawing records have a very strange continue behaviour.
  290. //There can actually be OBJ records mixed between the continues.
  291. _lastDrawingRecord.processContinueRecord(contRec.getData());
  292. //we must remember the position of the continue record.
  293. //in the serialization procedure the original structure of records must be preserved
  294. if (_shouldIncludeContinueRecords) {
  295. return record;
  296. }
  297. return null;
  298. }
  299. if (_lastRecord instanceof DrawingGroupRecord) {
  300. ((DrawingGroupRecord) _lastRecord).processContinueRecord(contRec.getData());
  301. return null;
  302. }
  303. if (_lastRecord instanceof DrawingRecord) {
  304. // ((DrawingRecord) _lastRecord).appendContinueRecord(contRec.getData());
  305. return contRec;
  306. }
  307. if (_lastRecord instanceof UnknownRecord) {
  308. //Gracefully handle records that we don't know about,
  309. //that happen to be continued
  310. return record;
  311. }
  312. if (_lastRecord instanceof EOFRecord) {
  313. // This is really odd, but excel still sometimes
  314. // outputs a file like this all the same
  315. return record;
  316. }
  317. throw new RecordFormatException("Unhandled Continue Record followining " + _lastRecord.getClass());
  318. }
  319. _lastRecord = record;
  320. if (record instanceof DrawingRecord) {
  321. _lastDrawingRecord = (DrawingRecord) record;
  322. }
  323. return record;
  324. }
  325. }