You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RecordInputStream.java 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.record;
  16. import java.io.ByteArrayOutputStream;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import java.util.Locale;
  20. import org.apache.poi.hssf.dev.BiffViewer;
  21. import org.apache.poi.hssf.record.crypto.Biff8DecryptingStream;
  22. import org.apache.poi.poifs.crypt.EncryptionInfo;
  23. import org.apache.poi.util.IOUtils;
  24. import org.apache.poi.util.Internal;
  25. import org.apache.poi.util.LittleEndianConsts;
  26. import org.apache.poi.util.LittleEndianInput;
  27. import org.apache.poi.util.LittleEndianInputStream;
  28. import org.apache.poi.util.RecordFormatException;
  29. /**
  30. * Title: Record Input Stream
  31. *
  32. * Description: Wraps a stream and provides helper methods for the construction of records.
  33. */
  34. public final class RecordInputStream implements LittleEndianInput {
  35. /** Maximum size of a single record (minus the 4 byte header) without a continue*/
  36. public final static short MAX_RECORD_DATA_SIZE = 8224;
  37. private static final int INVALID_SID_VALUE = -1;
  38. //arbitrarily selected; may need to increase
  39. private static final int MAX_RECORD_LENGTH = 100_000;
  40. /**
  41. * When {@link #_currentDataLength} has this value, it means that the previous BIFF record is
  42. * finished, the next sid has been properly read, but the data size field has not been read yet.
  43. */
  44. private static final int DATA_LEN_NEEDS_TO_BE_READ = -1;
  45. private static final byte[] EMPTY_BYTE_ARRAY = { };
  46. /**
  47. * For use in {@link BiffViewer} which may construct {@link Record}s that don't completely
  48. * read all available data. This exception should never be thrown otherwise.
  49. */
  50. @SuppressWarnings("serial")
  51. public static final class LeftoverDataException extends RuntimeException {
  52. public LeftoverDataException(int sid, int remainingByteCount) {
  53. super("Initialisation of record 0x" + Integer.toHexString(sid).toUpperCase(Locale.ROOT)
  54. + "(" + getRecordName(sid) + ") left " + remainingByteCount
  55. + " bytes remaining still to be read.");
  56. }
  57. private static String getRecordName(int sid) {
  58. Class<? extends Record> recordClass = RecordFactory.getRecordClass(sid);
  59. if(recordClass == null) {
  60. return null;
  61. }
  62. return recordClass.getSimpleName();
  63. }
  64. }
  65. /** Header {@link LittleEndianInput} facet of the wrapped {@link InputStream} */
  66. private final BiffHeaderInput _bhi;
  67. /** Data {@link LittleEndianInput} facet of the wrapped {@link InputStream} */
  68. private final LittleEndianInput _dataInput;
  69. /** the record identifier of the BIFF record currently being read */
  70. private int _currentSid;
  71. /**
  72. * Length of the data section of the current BIFF record (always 4 less than the total record size).
  73. * When uninitialised, this field is set to {@link #DATA_LEN_NEEDS_TO_BE_READ}.
  74. */
  75. private int _currentDataLength;
  76. /**
  77. * The BIFF record identifier for the next record is read just as the current record
  78. * is finished.
  79. * This field is only really valid during the time that ({@link #_currentDataLength} ==
  80. * {@link #DATA_LEN_NEEDS_TO_BE_READ}). At most other times its value is not really the
  81. * 'sid of the next record'. While mid-record, this field coincidentally holds the sid
  82. * of the current record.
  83. */
  84. private int _nextSid;
  85. /**
  86. * index within the data section of the current BIFF record
  87. */
  88. private int _currentDataOffset;
  89. /**
  90. * index within the data section when mark() was called
  91. */
  92. private int _markedDataOffset;
  93. private static final class SimpleHeaderInput implements BiffHeaderInput {
  94. private final LittleEndianInput _lei;
  95. private SimpleHeaderInput(LittleEndianInput lei) {
  96. _lei = lei;
  97. }
  98. @Override
  99. public int available() {
  100. return _lei.available();
  101. }
  102. @Override
  103. public int readDataSize() {
  104. return _lei.readUShort();
  105. }
  106. @Override
  107. public int readRecordSID() {
  108. return _lei.readUShort();
  109. }
  110. }
  /**
   * Creates a record stream without decryption. Equivalent to calling the
   * three-argument constructor with a {@code null} key and an initial offset of 0.
   *
   * @param in the stream to read BIFF records from
   * @throws RecordFormatException if the first record identifier read is invalid
   */
  public RecordInputStream(InputStream in) throws RecordFormatException {
      this(in, null, 0);
  }
  /**
   * Creates a record stream, optionally decrypting the content, and reads the
   * sid of the first record so that {@link #hasNextRecord()} can be answered.
   *
   * @param in the stream to read BIFF records from
   * @param key encryption info; when {@code null} the stream is read as-is, otherwise
   *            header and data are both read through a {@link Biff8DecryptingStream}
   * @param initialOffset passed to the {@link Biff8DecryptingStream}; unused when
   *            {@code key} is {@code null}
   * @throws RecordFormatException if the first record identifier read is invalid
   */
  public RecordInputStream(InputStream in, EncryptionInfo key, int initialOffset) throws RecordFormatException {
      if (key == null) {
          // Use the shared helper instead of repeating its instanceof/wrap logic here.
          _dataInput = getLEI(in);
          // Header and data are read through the very same input instance.
          _bhi = new SimpleHeaderInput(_dataInput);
      } else {
          Biff8DecryptingStream bds = new Biff8DecryptingStream(in, initialOffset, key);
          _dataInput = bds;
          _bhi = bds;
      }
      _nextSid = readNextSid();
  }
  129. static LittleEndianInput getLEI(InputStream is) {
  130. if (is instanceof LittleEndianInput) {
  131. // accessing directly is an optimisation
  132. return (LittleEndianInput) is;
  133. }
  134. // less optimal, but should work OK just the same. Often occurs in junit tests.
  135. return new LittleEndianInputStream(is);
  136. }
  137. /**
  138. * @return the number of bytes available in the current BIFF record
  139. * @see #remaining()
  140. */
  141. @Override
  142. public int available() {
  143. return remaining();
  144. }
  145. public int read(byte[] b, int off, int len) {
  146. int limit = Math.min(len, remaining());
  147. if (limit == 0) {
  148. return 0;
  149. }
  150. readFully(b, off,limit);
  151. return limit;
  152. }
  153. public short getSid() {
  154. return (short) _currentSid;
  155. }
  156. /**
  157. * Note - this method is expected to be called only when completed reading the current BIFF
  158. * record.
  159. *
  160. * @return true, if there's another record in the stream
  161. *
  162. * @throws LeftoverDataException if this method is called before reaching the end of the
  163. * current record.
  164. */
  165. public boolean hasNextRecord() throws LeftoverDataException {
  166. if (_currentDataLength != -1 && _currentDataLength != _currentDataOffset) {
  167. throw new LeftoverDataException(_currentSid, remaining());
  168. }
  169. if (_currentDataLength != DATA_LEN_NEEDS_TO_BE_READ) {
  170. _nextSid = readNextSid();
  171. }
  172. return _nextSid != INVALID_SID_VALUE;
  173. }
  174. /**
  175. * @return the sid of the next record or {@link #INVALID_SID_VALUE} if at end of stream
  176. */
  177. private int readNextSid() {
  178. int nAvailable = _bhi.available();
  179. if (nAvailable < EOFRecord.ENCODED_SIZE) {
  180. // some scrap left over, if nAvailable > 0?
  181. // ex45582-22397.xls has one extra byte after the last record
  182. // Excel reads that file OK
  183. return INVALID_SID_VALUE;
  184. }
  185. int result = _bhi.readRecordSID();
  186. if (result == INVALID_SID_VALUE) {
  187. throw new RecordFormatException("Found invalid sid (" + result + ")");
  188. }
  189. _currentDataLength = DATA_LEN_NEEDS_TO_BE_READ;
  190. return result;
  191. }
  192. /** Moves to the next record in the stream.
  193. *
  194. * <i>Note: The auto continue flag is reset to true</i>
  195. */
  196. public void nextRecord() throws RecordFormatException {
  197. if (_nextSid == INVALID_SID_VALUE) {
  198. throw new IllegalStateException("EOF - next record not available");
  199. }
  200. if (_currentDataLength != DATA_LEN_NEEDS_TO_BE_READ) {
  201. throw new IllegalStateException("Cannot call nextRecord() without checking hasNextRecord() first");
  202. }
  203. _currentSid = _nextSid;
  204. _currentDataOffset = 0;
  205. _currentDataLength = _bhi.readDataSize();
  206. if (_currentDataLength > MAX_RECORD_DATA_SIZE) {
  207. throw new RecordFormatException("The content of an excel record cannot exceed "
  208. + MAX_RECORD_DATA_SIZE + " bytes");
  209. }
  210. }
  211. private void checkRecordPosition(int requiredByteCount) {
  212. int nAvailable = remaining();
  213. if (nAvailable >= requiredByteCount) {
  214. // all OK
  215. return;
  216. }
  217. if (nAvailable == 0 && isContinueNext()) {
  218. nextRecord();
  219. return;
  220. }
  221. throw new RecordFormatException("Not enough data (" + nAvailable
  222. + ") to read requested (" + requiredByteCount +") bytes");
  223. }
  224. /**
  225. * Reads an 8 bit, signed value
  226. */
  227. @Override
  228. public byte readByte() {
  229. checkRecordPosition(LittleEndianConsts.BYTE_SIZE);
  230. _currentDataOffset += LittleEndianConsts.BYTE_SIZE;
  231. return _dataInput.readByte();
  232. }
  233. /**
  234. * Reads a 16 bit, signed value
  235. */
  236. @Override
  237. public short readShort() {
  238. checkRecordPosition(LittleEndianConsts.SHORT_SIZE);
  239. _currentDataOffset += LittleEndianConsts.SHORT_SIZE;
  240. return _dataInput.readShort();
  241. }
  242. /**
  243. * Reads a 32 bit, signed value
  244. */
  245. @Override
  246. public int readInt() {
  247. checkRecordPosition(LittleEndianConsts.INT_SIZE);
  248. _currentDataOffset += LittleEndianConsts.INT_SIZE;
  249. return _dataInput.readInt();
  250. }
  251. /**
  252. * Reads a 64 bit, signed value
  253. */
  254. @Override
  255. public long readLong() {
  256. checkRecordPosition(LittleEndianConsts.LONG_SIZE);
  257. _currentDataOffset += LittleEndianConsts.LONG_SIZE;
  258. return _dataInput.readLong();
  259. }
  260. /**
  261. * Reads an 8 bit, unsigned value
  262. */
  263. @Override
  264. public int readUByte() {
  265. return readByte() & 0x00FF;
  266. }
  267. /**
  268. * Reads a 16 bit, unsigned value.
  269. */
  270. @Override
  271. public int readUShort() {
  272. checkRecordPosition(LittleEndianConsts.SHORT_SIZE);
  273. _currentDataOffset += LittleEndianConsts.SHORT_SIZE;
  274. return _dataInput.readUShort();
  275. }
  276. @Override
  277. public double readDouble() {
  278. // YK: Excel doesn't write NaN but instead converts the cell type into {@link CellType#ERROR}.
  279. return Double.longBitsToDouble(readLong());
  280. }
  281. public void readPlain(byte[] buf, int off, int len) {
  282. readFully(buf, 0, buf.length, true);
  283. }
  284. @Override
  285. public void readFully(byte[] buf) {
  286. readFully(buf, 0, buf.length, false);
  287. }
  288. @Override
  289. public void readFully(byte[] buf, int off, int len) {
  290. readFully(buf, off, len, false);
  291. }
  292. private void readFully(byte[] buf, int off, int len, boolean isPlain) {
  293. int origLen = len;
  294. if (buf == null) {
  295. throw new NullPointerException();
  296. } else if (off < 0 || len < 0 || len > buf.length - off) {
  297. throw new IndexOutOfBoundsException();
  298. }
  299. while (len > 0) {
  300. int nextChunk = Math.min(available(),len);
  301. if (nextChunk == 0) {
  302. if (!hasNextRecord()) {
  303. throw new RecordFormatException("Can't read the remaining "+len+" bytes of the requested "+origLen+" bytes. No further record exists.");
  304. } else {
  305. nextRecord();
  306. nextChunk = Math.min(available(),len);
  307. assert(nextChunk > 0);
  308. }
  309. }
  310. checkRecordPosition(nextChunk);
  311. if (isPlain) {
  312. _dataInput.readPlain(buf, off, nextChunk);
  313. } else {
  314. _dataInput.readFully(buf, off, nextChunk);
  315. }
  316. _currentDataOffset+=nextChunk;
  317. off += nextChunk;
  318. len -= nextChunk;
  319. }
  320. }
  321. public String readString() {
  322. int requestedLength = readUShort();
  323. byte compressFlag = readByte();
  324. return readStringCommon(requestedLength, compressFlag == 0);
  325. }
  326. /**
  327. * given a byte array of 16-bit unicode characters, compress to 8-bit and
  328. * return a string
  329. *
  330. * { 0x16, 0x00 } -0x16
  331. *
  332. * @param requestedLength the length of the final string
  333. * @return the converted string
  334. * @exception IllegalArgumentException if len is too large (i.e.,
  335. * there is not enough data in string to create a String of that
  336. * length)
  337. */
  338. public String readUnicodeLEString(int requestedLength) {
  339. return readStringCommon(requestedLength, false);
  340. }
  341. public String readCompressedUnicode(int requestedLength) {
  342. return readStringCommon(requestedLength, true);
  343. }
  344. private String readStringCommon(int requestedLength, boolean pIsCompressedEncoding) {
  345. // Sanity check to detect garbage string lengths
  346. if (requestedLength < 0 || requestedLength > 0x100000) { // 16 million chars?
  347. throw new IllegalArgumentException("Bad requested string length (" + requestedLength + ")");
  348. }
  349. char[] buf = new char[requestedLength];
  350. boolean isCompressedEncoding = pIsCompressedEncoding;
  351. int curLen = 0;
  352. while(true) {
  353. int availableChars =isCompressedEncoding ? remaining() : remaining() / LittleEndianConsts.SHORT_SIZE;
  354. if (requestedLength - curLen <= availableChars) {
  355. // enough space in current record, so just read it out
  356. while(curLen < requestedLength) {
  357. char ch;
  358. if (isCompressedEncoding) {
  359. ch = (char)readUByte();
  360. } else {
  361. ch = (char)readShort();
  362. }
  363. buf[curLen] = ch;
  364. curLen++;
  365. }
  366. return new String(buf);
  367. }
  368. // else string has been spilled into next continue record
  369. // so read what's left of the current record
  370. while(availableChars > 0) {
  371. char ch;
  372. if (isCompressedEncoding) {
  373. ch = (char)readUByte();
  374. } else {
  375. ch = (char)readShort();
  376. }
  377. buf[curLen] = ch;
  378. curLen++;
  379. availableChars--;
  380. }
  381. if (!isContinueNext()) {
  382. throw new RecordFormatException("Expected to find a ContinueRecord in order to read remaining "
  383. + (requestedLength-curLen) + " of " + requestedLength + " chars");
  384. }
  385. if(remaining() != 0) {
  386. throw new RecordFormatException("Odd number of bytes(" + remaining() + ") left behind");
  387. }
  388. nextRecord();
  389. // note - the compressed flag may change on the fly
  390. byte compressFlag = readByte();
  391. assert(compressFlag == 0 || compressFlag == 1);
  392. isCompressedEncoding = (compressFlag == 0);
  393. }
  394. }
  395. /** Returns the remaining bytes for the current record.
  396. *
  397. * @return The remaining bytes of the current record.
  398. */
  399. public byte[] readRemainder() {
  400. int size = remaining();
  401. if (size ==0) {
  402. return EMPTY_BYTE_ARRAY;
  403. }
  404. byte[] result = IOUtils.safelyAllocate(size, MAX_RECORD_LENGTH);
  405. readFully(result);
  406. return result;
  407. }
  408. /**
  409. * Reads all byte data for the current record, including any that overlaps
  410. * into any following continue records.
  411. *
  412. * @return all byte data for the current record
  413. *
  414. * @deprecated POI 2.0 Best to write a input stream that wraps this one
  415. * where there is special sub record that may overlap continue
  416. * records.
  417. */
  418. @Deprecated
  419. public byte[] readAllContinuedRemainder() {
  420. ByteArrayOutputStream out = new ByteArrayOutputStream(2 * MAX_RECORD_DATA_SIZE);
  421. while (true) {
  422. byte[] b = readRemainder();
  423. out.write(b, 0, b.length);
  424. if (!isContinueNext()) {
  425. break;
  426. }
  427. nextRecord();
  428. }
  429. return out.toByteArray();
  430. }
  431. /** The remaining number of bytes in the <i>current</i> record.
  432. *
  433. * @return The number of bytes remaining in the current record
  434. */
  435. public int remaining() {
  436. if (_currentDataLength == DATA_LEN_NEEDS_TO_BE_READ) {
  437. // already read sid of next record. so current one is finished
  438. return 0;
  439. }
  440. return _currentDataLength - _currentDataOffset;
  441. }
  442. /**
  443. *
  444. * @return <code>true</code> when a {@link ContinueRecord} is next.
  445. */
  446. private boolean isContinueNext() {
  447. if (_currentDataLength != DATA_LEN_NEEDS_TO_BE_READ && _currentDataOffset != _currentDataLength) {
  448. throw new IllegalStateException("Should never be called before end of current record");
  449. }
  450. if (!hasNextRecord()) {
  451. return false;
  452. }
  453. // At what point are records continued?
  454. // - Often from within the char data of long strings (caller is within readStringCommon()).
  455. // - From UnicodeString construction (many different points - call via checkRecordPosition)
  456. // - During TextObjectRecord construction (just before the text, perhaps within the text,
  457. // and before the formatting run data)
  458. return _nextSid == ContinueRecord.sid;
  459. }
  460. /**
  461. @return sid of next record. Can be called after hasNextRecord()
  462. */
  463. public int getNextSid() {
  464. return _nextSid;
  465. }
  466. /**
  467. * Mark the stream position - experimental function
  468. *
  469. * @param readlimit the read ahead limit
  470. *
  471. * @see InputStream#mark(int)
  472. */
  473. @Internal
  474. public void mark(int readlimit) {
  475. ((InputStream)_dataInput).mark(readlimit);
  476. _markedDataOffset = _currentDataOffset;
  477. }
  478. /**
  479. * Resets the stream position to the previously marked position.
  480. * Experimental function - this only works, when nextRecord() wasn't called in the meantime.
  481. *
  482. * @throws IOException if marking is not supported
  483. *
  484. * @see InputStream#reset()
  485. */
  486. @Internal
  487. public void reset() throws IOException {
  488. ((InputStream)_dataInput).reset();
  489. _currentDataOffset = _markedDataOffset;
  490. }
  491. }