You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RecordFactoryInputStream.java 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.record;
  16. import java.io.InputStream;
  17. import java.util.ArrayList;
  18. import java.util.List;
  19. import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
  20. import org.apache.poi.hssf.eventusermodel.HSSFListener;
  21. import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
  22. import org.apache.poi.EncryptedDocumentException;
  23. /**
  24. * A stream based way to get at complete records, with
  25. * as low a memory footprint as possible.
  26. * This handles reading from a RecordInputStream, turning
  27. * the data into full records, processing continue records
  28. * etc.
  29. * Most users should use {@link HSSFEventFactory} /
  30. * {@link HSSFListener} and have new records pushed to
  31. * them, but this does allow for a "pull" style of coding.
  32. */
  33. public final class RecordFactoryInputStream {
  34. /**
  35. * Keeps track of the sizes of the initial records up to and including {@link FilePassRecord}
  36. * Needed for protected files because each byte is encrypted with respect to its absolute
  37. * position from the start of the stream.
  38. */
  39. private static final class StreamEncryptionInfo {
  40. private final int _initialRecordsSize;
  41. private final FilePassRecord _filePassRec;
  42. private final Record _lastRecord;
  43. private final boolean _hasBOFRecord;
  44. public StreamEncryptionInfo(RecordInputStream rs, List<Record> outputRecs) {
  45. Record rec;
  46. rs.nextRecord();
  47. int recSize = 4 + rs.remaining();
  48. rec = RecordFactory.createSingleRecord(rs);
  49. outputRecs.add(rec);
  50. FilePassRecord fpr = null;
  51. if (rec instanceof BOFRecord) {
  52. _hasBOFRecord = true;
  53. if (rs.hasNextRecord()) {
  54. rs.nextRecord();
  55. rec = RecordFactory.createSingleRecord(rs);
  56. recSize += rec.getRecordSize();
  57. outputRecs.add(rec);
  58. if (rec instanceof FilePassRecord) {
  59. fpr = (FilePassRecord) rec;
  60. outputRecs.remove(outputRecs.size()-1);
  61. // TODO - add fpr not added to outputRecs
  62. rec = outputRecs.get(0);
  63. } else {
  64. // workbook not encrypted (typical case)
  65. if (rec instanceof EOFRecord) {
  66. // A workbook stream is never empty, so crash instead
  67. // of trying to keep track of nesting level
  68. throw new IllegalStateException("Nothing between BOF and EOF");
  69. }
  70. }
  71. }
  72. } else {
  73. // Invalid in a normal workbook stream.
  74. // However, some test cases work on sub-sections of
  75. // the workbook stream that do not begin with BOF
  76. _hasBOFRecord = false;
  77. }
  78. _initialRecordsSize = recSize;
  79. _filePassRec = fpr;
  80. _lastRecord = rec;
  81. }
  82. public RecordInputStream createDecryptingStream(InputStream original) {
  83. FilePassRecord fpr = _filePassRec;
  84. String userPassword = Biff8EncryptionKey.getCurrentUserPassword();
  85. Biff8EncryptionKey key;
  86. if (userPassword == null) {
  87. key = Biff8EncryptionKey.create(fpr.getDocId());
  88. } else {
  89. key = Biff8EncryptionKey.create(userPassword, fpr.getDocId());
  90. }
  91. if (!key.validate(fpr.getSaltData(), fpr.getSaltHash())) {
  92. throw new EncryptedDocumentException(
  93. (userPassword == null ? "Default" : "Supplied")
  94. + " password is invalid for docId/saltData/saltHash");
  95. }
  96. return new RecordInputStream(original, key, _initialRecordsSize);
  97. }
  98. public boolean hasEncryption() {
  99. return _filePassRec != null;
  100. }
  101. /**
  102. * @return last record scanned while looking for encryption info.
  103. * This will typically be the first or second record read. Possibly <code>null</code>
  104. * if stream was empty
  105. */
  106. public Record getLastRecord() {
  107. return _lastRecord;
  108. }
  109. /**
  110. * <code>false</code> in some test cases
  111. */
  112. public boolean hasBOFRecord() {
  113. return _hasBOFRecord;
  114. }
  115. }
  116. private final RecordInputStream _recStream;
  117. private final boolean _shouldIncludeContinueRecords;
  118. /**
  119. * Temporarily stores a group of {@link Record}s, for future return by {@link #nextRecord()}.
  120. * This is used at the start of the workbook stream, and also when the most recently read
  121. * underlying record is a {@link MulRKRecord}
  122. */
  123. private Record[] _unreadRecordBuffer;
  124. /**
  125. * used to help iterating over the unread records
  126. */
  127. private int _unreadRecordIndex = -1;
  128. /**
  129. * The most recent record that we gave to the user
  130. */
  131. private Record _lastRecord = null;
  132. /**
  133. * The most recent DrawingRecord seen
  134. */
  135. private DrawingRecord _lastDrawingRecord = new DrawingRecord();
  136. private int _bofDepth;
  137. private boolean _lastRecordWasEOFLevelZero;
  138. /**
  139. * @param shouldIncludeContinueRecords caller can pass <code>false</code> if loose
  140. * {@link ContinueRecord}s should be skipped (this is sometimes useful in event based
  141. * processing).
  142. */
  143. public RecordFactoryInputStream(InputStream in, boolean shouldIncludeContinueRecords) {
  144. RecordInputStream rs = new RecordInputStream(in);
  145. List<Record> records = new ArrayList<Record>();
  146. StreamEncryptionInfo sei = new StreamEncryptionInfo(rs, records);
  147. if (sei.hasEncryption()) {
  148. rs = sei.createDecryptingStream(in);
  149. } else {
  150. // typical case - non-encrypted stream
  151. }
  152. if (!records.isEmpty()) {
  153. _unreadRecordBuffer = new Record[records.size()];
  154. records.toArray(_unreadRecordBuffer);
  155. _unreadRecordIndex =0;
  156. }
  157. _recStream = rs;
  158. _shouldIncludeContinueRecords = shouldIncludeContinueRecords;
  159. _lastRecord = sei.getLastRecord();
  160. /*
  161. * How to recognise end of stream?
  162. * In the best case, the underlying input stream (in) ends just after the last EOF record
  163. * Usually however, the stream is padded with an arbitrary byte count. Excel and most apps
  164. * reliably use zeros for padding and if this were always the case, this code could just
  165. * skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with
  166. * non-zero padding that is read OK by Excel (Excel also fixes the padding).
  167. *
  168. * So to properly detect the workbook end of stream, this code has to identify the last
  169. * EOF record. This is not so easy because the worbook bof+eof pair do not bracket the
  170. * whole stream. The worksheets follow the workbook, but it is not easy to tell how many
  171. * sheet sub-streams should be present. Hence we are looking for an EOF record that is not
  172. * immediately followed by a BOF record. One extra complication is that bof+eof sub-
  173. * streams can be nested within worksheet streams and it's not clear in these cases what
  174. * record might follow any EOF record. So we also need to keep track of the bof/eof
  175. * nesting level.
  176. */
  177. _bofDepth = sei.hasBOFRecord() ? 1 : 0;
  178. _lastRecordWasEOFLevelZero = false;
  179. }
  180. /**
  181. * Returns the next (complete) record from the
  182. * stream, or null if there are no more.
  183. */
  184. public Record nextRecord() {
  185. Record r;
  186. r = getNextUnreadRecord();
  187. if (r != null) {
  188. // found an unread record
  189. return r;
  190. }
  191. while (true) {
  192. if (!_recStream.hasNextRecord()) {
  193. // recStream is exhausted;
  194. return null;
  195. }
  196. // step underlying RecordInputStream to the next record
  197. _recStream.nextRecord();
  198. if (_lastRecordWasEOFLevelZero) {
  199. // Potential place for ending the workbook stream
  200. // Check that the next record is not BOFRecord(0x0809)
  201. // Normally the input stream contains only zero padding after the last EOFRecord,
  202. // but bug 46987 suggests that the padding may be garbage.
  203. // This code relies on the padding bytes not starting with BOFRecord.sid
  204. if (_recStream.getSid() != BOFRecord.sid) {
  205. return null;
  206. }
  207. // else - another sheet substream starting here
  208. }
  209. r = readNextRecord();
  210. if (r == null) {
  211. // some record types may get skipped (e.g. DBCellRecord and ContinueRecord)
  212. continue;
  213. }
  214. return r;
  215. }
  216. }
  217. /**
  218. * @return the next {@link Record} from the multiple record group as expanded from
  219. * a recently read {@link MulRKRecord}. <code>null</code> if not present.
  220. */
  221. private Record getNextUnreadRecord() {
  222. if (_unreadRecordBuffer != null) {
  223. int ix = _unreadRecordIndex;
  224. if (ix < _unreadRecordBuffer.length) {
  225. Record result = _unreadRecordBuffer[ix];
  226. _unreadRecordIndex = ix + 1;
  227. return result;
  228. }
  229. _unreadRecordIndex = -1;
  230. _unreadRecordBuffer = null;
  231. }
  232. return null;
  233. }
  234. /**
  235. * @return the next available record, or <code>null</code> if
  236. * this pass didn't return a record that's
  237. * suitable for returning (eg was a continue record).
  238. */
  239. private Record readNextRecord() {
  240. Record record = RecordFactory.createSingleRecord(_recStream);
  241. _lastRecordWasEOFLevelZero = false;
  242. if (record instanceof BOFRecord) {
  243. _bofDepth++;
  244. return record;
  245. }
  246. if (record instanceof EOFRecord) {
  247. _bofDepth--;
  248. if (_bofDepth < 1) {
  249. _lastRecordWasEOFLevelZero = true;
  250. }
  251. return record;
  252. }
  253. if (record instanceof DBCellRecord) {
  254. // Not needed by POI. Regenerated from scratch by POI when spreadsheet is written
  255. return null;
  256. }
  257. if (record instanceof RKRecord) {
  258. return RecordFactory.convertToNumberRecord((RKRecord) record);
  259. }
  260. if (record instanceof MulRKRecord) {
  261. Record[] records = RecordFactory.convertRKRecords((MulRKRecord) record);
  262. _unreadRecordBuffer = records;
  263. _unreadRecordIndex = 1;
  264. return records[0];
  265. }
  266. if (record.getSid() == DrawingGroupRecord.sid
  267. && _lastRecord instanceof DrawingGroupRecord) {
  268. DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) _lastRecord;
  269. lastDGRecord.join((AbstractEscherHolderRecord) record);
  270. return null;
  271. }
  272. if (record.getSid() == ContinueRecord.sid) {
  273. ContinueRecord contRec = (ContinueRecord) record;
  274. if (_lastRecord instanceof ObjRecord || _lastRecord instanceof TextObjectRecord) {
  275. // Drawing records have a very strange continue behaviour.
  276. //There can actually be OBJ records mixed between the continues.
  277. _lastDrawingRecord.processContinueRecord(contRec.getData());
  278. //we must remember the position of the continue record.
  279. //in the serialization procedure the original structure of records must be preserved
  280. if (_shouldIncludeContinueRecords) {
  281. return record;
  282. }
  283. return null;
  284. }
  285. if (_lastRecord instanceof DrawingGroupRecord) {
  286. ((DrawingGroupRecord) _lastRecord).processContinueRecord(contRec.getData());
  287. return null;
  288. }
  289. if (_lastRecord instanceof DrawingRecord) {
  290. ((DrawingRecord) _lastRecord).processContinueRecord(contRec.getData());
  291. return null;
  292. }
  293. if (_lastRecord instanceof UnknownRecord) {
  294. //Gracefully handle records that we don't know about,
  295. //that happen to be continued
  296. return record;
  297. }
  298. if (_lastRecord instanceof EOFRecord) {
  299. // This is really odd, but excel still sometimes
  300. // outputs a file like this all the same
  301. return record;
  302. }
  303. throw new RecordFormatException("Unhandled Continue Record");
  304. }
  305. _lastRecord = record;
  306. if (record instanceof DrawingRecord) {
  307. _lastDrawingRecord = (DrawingRecord) record;
  308. }
  309. return record;
  310. }
  311. }