You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RecordFactoryInputStream.java 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.record;
  16. import java.io.InputStream;
  17. import java.util.ArrayList;
  18. import java.util.List;
  19. import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
  20. import org.apache.poi.hssf.eventusermodel.HSSFListener;
  21. import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
  22. import org.apache.poi.EncryptedDocumentException;
  23. /**
  24. * A stream based way to get at complete records, with
  25. * as low a memory footprint as possible.
  26. * This handles reading from a RecordInputStream, turning
  27. * the data into full records, processing continue records
  28. * etc.
  29. * Most users should use {@link HSSFEventFactory} /
  30. * {@link HSSFListener} and have new records pushed to
  31. * them, but this does allow for a "pull" style of coding.
  32. */
  33. public final class RecordFactoryInputStream {
  34. /**
  35. * Keeps track of the sizes of the initial records up to and including {@link FilePassRecord}
  36. * Needed for protected files because each byte is encrypted with respect to its absolute
  37. * position from the start of the stream.
  38. */
  39. private static final class StreamEncryptionInfo {
  40. private final int _initialRecordsSize;
  41. private final FilePassRecord _filePassRec;
  42. private final Record _lastRecord;
  43. private final boolean _hasBOFRecord;
  44. public StreamEncryptionInfo(RecordInputStream rs, List<Record> outputRecs) {
  45. Record rec;
  46. rs.nextRecord();
  47. int recSize = 4 + rs.remaining();
  48. rec = RecordFactory.createSingleRecord(rs);
  49. outputRecs.add(rec);
  50. FilePassRecord fpr = null;
  51. if (rec instanceof BOFRecord) {
  52. _hasBOFRecord = true;
  53. // Fetch the next record, and see if it indicates whether
  54. // the document is encrypted or not
  55. if (rs.hasNextRecord()) {
  56. rs.nextRecord();
  57. rec = RecordFactory.createSingleRecord(rs);
  58. recSize += rec.getRecordSize();
  59. outputRecs.add(rec);
  60. // Encrypted is normally BOF then FILEPASS
  61. // May sometimes be BOF, WRITEPROTECT, FILEPASS
  62. if (rec instanceof WriteProtectRecord && rs.hasNextRecord()) {
  63. rs.nextRecord();
  64. rec = RecordFactory.createSingleRecord(rs);
  65. recSize += rec.getRecordSize();
  66. outputRecs.add(rec);
  67. }
  68. // If it's a FILEPASS, track it specifically but
  69. // don't include it in the main stream
  70. if (rec instanceof FilePassRecord) {
  71. fpr = (FilePassRecord) rec;
  72. outputRecs.remove(outputRecs.size()-1);
  73. // TODO - add fpr not added to outputRecs
  74. rec = outputRecs.get(0);
  75. } else {
  76. // workbook not encrypted (typical case)
  77. if (rec instanceof EOFRecord) {
  78. // A workbook stream is never empty, so crash instead
  79. // of trying to keep track of nesting level
  80. throw new IllegalStateException("Nothing between BOF and EOF");
  81. }
  82. }
  83. }
  84. } else {
  85. // Invalid in a normal workbook stream.
  86. // However, some test cases work on sub-sections of
  87. // the workbook stream that do not begin with BOF
  88. _hasBOFRecord = false;
  89. }
  90. _initialRecordsSize = recSize;
  91. _filePassRec = fpr;
  92. _lastRecord = rec;
  93. }
  94. public RecordInputStream createDecryptingStream(InputStream original) {
  95. FilePassRecord fpr = _filePassRec;
  96. String userPassword = Biff8EncryptionKey.getCurrentUserPassword();
  97. Biff8EncryptionKey key;
  98. if (userPassword == null) {
  99. key = Biff8EncryptionKey.create(fpr.getDocId());
  100. } else {
  101. key = Biff8EncryptionKey.create(userPassword, fpr.getDocId());
  102. }
  103. if (!key.validate(fpr.getSaltData(), fpr.getSaltHash())) {
  104. throw new EncryptedDocumentException(
  105. (userPassword == null ? "Default" : "Supplied")
  106. + " password is invalid for docId/saltData/saltHash");
  107. }
  108. return new RecordInputStream(original, key, _initialRecordsSize);
  109. }
  110. public boolean hasEncryption() {
  111. return _filePassRec != null;
  112. }
  113. /**
  114. * @return last record scanned while looking for encryption info.
  115. * This will typically be the first or second record read. Possibly <code>null</code>
  116. * if stream was empty
  117. */
  118. public Record getLastRecord() {
  119. return _lastRecord;
  120. }
  121. /**
  122. * <code>false</code> in some test cases
  123. */
  124. public boolean hasBOFRecord() {
  125. return _hasBOFRecord;
  126. }
  127. }
  128. private final RecordInputStream _recStream;
  129. private final boolean _shouldIncludeContinueRecords;
  130. /**
  131. * Temporarily stores a group of {@link Record}s, for future return by {@link #nextRecord()}.
  132. * This is used at the start of the workbook stream, and also when the most recently read
  133. * underlying record is a {@link MulRKRecord}
  134. */
  135. private Record[] _unreadRecordBuffer;
  136. /**
  137. * used to help iterating over the unread records
  138. */
  139. private int _unreadRecordIndex = -1;
  140. /**
  141. * The most recent record that we gave to the user
  142. */
  143. private Record _lastRecord = null;
  144. /**
  145. * The most recent DrawingRecord seen
  146. */
  147. private DrawingRecord _lastDrawingRecord = new DrawingRecord();
  148. private int _bofDepth;
  149. private boolean _lastRecordWasEOFLevelZero;
  150. /**
  151. * @param shouldIncludeContinueRecords caller can pass <code>false</code> if loose
  152. * {@link ContinueRecord}s should be skipped (this is sometimes useful in event based
  153. * processing).
  154. */
  155. public RecordFactoryInputStream(InputStream in, boolean shouldIncludeContinueRecords) {
  156. RecordInputStream rs = new RecordInputStream(in);
  157. List<Record> records = new ArrayList<Record>();
  158. StreamEncryptionInfo sei = new StreamEncryptionInfo(rs, records);
  159. if (sei.hasEncryption()) {
  160. rs = sei.createDecryptingStream(in);
  161. } else {
  162. // typical case - non-encrypted stream
  163. }
  164. if (!records.isEmpty()) {
  165. _unreadRecordBuffer = new Record[records.size()];
  166. records.toArray(_unreadRecordBuffer);
  167. _unreadRecordIndex =0;
  168. }
  169. _recStream = rs;
  170. _shouldIncludeContinueRecords = shouldIncludeContinueRecords;
  171. _lastRecord = sei.getLastRecord();
  172. /*
  173. * How to recognise end of stream?
  174. * In the best case, the underlying input stream (in) ends just after the last EOF record
  175. * Usually however, the stream is padded with an arbitrary byte count. Excel and most apps
  176. * reliably use zeros for padding and if this were always the case, this code could just
  177. * skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with
  178. * non-zero padding that is read OK by Excel (Excel also fixes the padding).
  179. *
  180. * So to properly detect the workbook end of stream, this code has to identify the last
  181. * EOF record. This is not so easy because the worbook bof+eof pair do not bracket the
  182. * whole stream. The worksheets follow the workbook, but it is not easy to tell how many
  183. * sheet sub-streams should be present. Hence we are looking for an EOF record that is not
  184. * immediately followed by a BOF record. One extra complication is that bof+eof sub-
  185. * streams can be nested within worksheet streams and it's not clear in these cases what
  186. * record might follow any EOF record. So we also need to keep track of the bof/eof
  187. * nesting level.
  188. */
  189. _bofDepth = sei.hasBOFRecord() ? 1 : 0;
  190. _lastRecordWasEOFLevelZero = false;
  191. }
  192. /**
  193. * Returns the next (complete) record from the
  194. * stream, or null if there are no more.
  195. */
  196. public Record nextRecord() {
  197. Record r;
  198. r = getNextUnreadRecord();
  199. if (r != null) {
  200. // found an unread record
  201. return r;
  202. }
  203. while (true) {
  204. if (!_recStream.hasNextRecord()) {
  205. // recStream is exhausted;
  206. return null;
  207. }
  208. if (_lastRecordWasEOFLevelZero) {
  209. // Potential place for ending the workbook stream
  210. // Check that the next record is not BOFRecord(0x0809)
  211. // Normally the input stream contains only zero padding after the last EOFRecord,
  212. // but bug 46987 and 48068 suggests that the padding may be garbage.
  213. // This code relies on the padding bytes not starting with BOFRecord.sid
  214. if (_recStream.getNextSid() != BOFRecord.sid) {
  215. return null;
  216. }
  217. // else - another sheet substream starting here
  218. }
  219. // step underlying RecordInputStream to the next record
  220. _recStream.nextRecord();
  221. r = readNextRecord();
  222. if (r == null) {
  223. // some record types may get skipped (e.g. DBCellRecord and ContinueRecord)
  224. continue;
  225. }
  226. return r;
  227. }
  228. }
  229. /**
  230. * @return the next {@link Record} from the multiple record group as expanded from
  231. * a recently read {@link MulRKRecord}. <code>null</code> if not present.
  232. */
  233. private Record getNextUnreadRecord() {
  234. if (_unreadRecordBuffer != null) {
  235. int ix = _unreadRecordIndex;
  236. if (ix < _unreadRecordBuffer.length) {
  237. Record result = _unreadRecordBuffer[ix];
  238. _unreadRecordIndex = ix + 1;
  239. return result;
  240. }
  241. _unreadRecordIndex = -1;
  242. _unreadRecordBuffer = null;
  243. }
  244. return null;
  245. }
  246. /**
  247. * @return the next available record, or <code>null</code> if
  248. * this pass didn't return a record that's
  249. * suitable for returning (eg was a continue record).
  250. */
  251. private Record readNextRecord() {
  252. Record record = RecordFactory.createSingleRecord(_recStream);
  253. _lastRecordWasEOFLevelZero = false;
  254. if (record instanceof BOFRecord) {
  255. _bofDepth++;
  256. return record;
  257. }
  258. if (record instanceof EOFRecord) {
  259. _bofDepth--;
  260. if (_bofDepth < 1) {
  261. _lastRecordWasEOFLevelZero = true;
  262. }
  263. return record;
  264. }
  265. if (record instanceof DBCellRecord) {
  266. // Not needed by POI. Regenerated from scratch by POI when spreadsheet is written
  267. return null;
  268. }
  269. if (record instanceof RKRecord) {
  270. return RecordFactory.convertToNumberRecord((RKRecord) record);
  271. }
  272. if (record instanceof MulRKRecord) {
  273. Record[] records = RecordFactory.convertRKRecords((MulRKRecord) record);
  274. _unreadRecordBuffer = records;
  275. _unreadRecordIndex = 1;
  276. return records[0];
  277. }
  278. if (record.getSid() == DrawingGroupRecord.sid
  279. && _lastRecord instanceof DrawingGroupRecord) {
  280. DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) _lastRecord;
  281. lastDGRecord.join((AbstractEscherHolderRecord) record);
  282. return null;
  283. }
  284. if (record.getSid() == ContinueRecord.sid) {
  285. ContinueRecord contRec = (ContinueRecord) record;
  286. if (_lastRecord instanceof ObjRecord || _lastRecord instanceof TextObjectRecord) {
  287. // Drawing records have a very strange continue behaviour.
  288. //There can actually be OBJ records mixed between the continues.
  289. _lastDrawingRecord.processContinueRecord(contRec.getData());
  290. //we must remember the position of the continue record.
  291. //in the serialization procedure the original structure of records must be preserved
  292. if (_shouldIncludeContinueRecords) {
  293. return record;
  294. }
  295. return null;
  296. }
  297. if (_lastRecord instanceof DrawingGroupRecord) {
  298. ((DrawingGroupRecord) _lastRecord).processContinueRecord(contRec.getData());
  299. return null;
  300. }
  301. if (_lastRecord instanceof DrawingRecord) {
  302. ((DrawingRecord) _lastRecord).processContinueRecord(contRec.getData());
  303. return null;
  304. }
  305. if (_lastRecord instanceof UnknownRecord) {
  306. //Gracefully handle records that we don't know about,
  307. //that happen to be continued
  308. return record;
  309. }
  310. if (_lastRecord instanceof EOFRecord) {
  311. // This is really odd, but excel still sometimes
  312. // outputs a file like this all the same
  313. return record;
  314. }
  315. throw new RecordFormatException("Unhandled Continue Record followining " + _lastRecord.getClass());
  316. }
  317. _lastRecord = record;
  318. if (record instanceof DrawingRecord) {
  319. _lastDrawingRecord = (DrawingRecord) record;
  320. }
  321. return record;
  322. }
  323. }