You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

LZWDecompresser.java 7.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.util;
  16. import java.io.ByteArrayOutputStream;
  17. import java.io.IOException;
  18. import java.io.InputStream;
  19. import java.io.OutputStream;
  20. /**
  21. * This class provides common functionality for the
  22. * various LZW implementations in the different file
  23. * formats.
  24. * It's currently used by HDGF and HMEF.
  25. * <p>
  26. * Two good resources on LZW are:
  27. * http://en.wikipedia.org/wiki/LZW
  28. * http://marknelson.us/1989/10/01/lzw-data-compression/
  29. */
  30. public abstract class LZWDecompresser {
  31. /** the size of our dictionary */
  32. public static final int DICT_SIZE = 0x1000;
  33. /** the mask for calculating / wrapping dictionary offsets */
  34. public static final int DICT_MASK = 0xFFF;
  35. //arbitrarily selected; may need to increase
  36. private static final int MAX_RECORD_LENGTH = 1_000_000;
  37. /**
  38. * Does the mask bit mean it's compressed or uncompressed?
  39. */
  40. private final boolean maskMeansCompressed;
  41. /**
  42. * How much to append to the code length in the stream
  43. * to get the real code length? Normally 2 or 3
  44. */
  45. private final int codeLengthIncrease;
  46. /**
  47. * Does the 12 bits of the position get stored in
  48. * Little Endian or Big Endian form?
  49. * This controls whether a pos+length of 0x12 0x34
  50. * becomes a position of 0x123 or 0x312
  51. */
  52. private final boolean positionIsBigEndian;
  53. protected LZWDecompresser(boolean maskMeansCompressed,
  54. int codeLengthIncrease, boolean positionIsBigEndian) {
  55. this.maskMeansCompressed = maskMeansCompressed;
  56. this.codeLengthIncrease = codeLengthIncrease;
  57. this.positionIsBigEndian = positionIsBigEndian;
  58. }
  59. /**
  60. * Populates the dictionary, and returns where in it
  61. * to begin writing new codes.
  62. * Generally, if the dictionary is pre-populated, then new
  63. * codes should be placed at the end of that block.
  64. * Equally, if the dictionary is left with all zeros, then
  65. * usually the new codes can go in at the start.
  66. */
  67. protected abstract int populateDictionary(byte[] dict);
  68. /**
  69. * Adjusts the position offset if needed when looking
  70. * something up in the dictionary.
  71. */
  72. protected abstract int adjustDictionaryOffset(int offset);
  73. /**
  74. * Decompresses the given input stream, returning the array of bytes
  75. * of the decompressed input.
  76. */
  77. public byte[] decompress(InputStream src) throws IOException {
  78. ByteArrayOutputStream res = new ByteArrayOutputStream();
  79. decompress(src, res);
  80. return res.toByteArray();
  81. }
  82. /**
  83. * Perform a streaming decompression of the input.
  84. * Works by:
  85. * 1) Reading a flag byte, the 8 bits of which tell you if the
  86. * following 8 codes are compressed our un-compressed
  87. * 2) Consider the 8 bits in turn
  88. * 3) If the bit is set, the next code is un-compressed, so
  89. * add it to the dictionary and output it
  90. * 4) If the bit isn't set, then read in the length and start
  91. * position in the dictionary, and output the bytes there
  92. * 5) Loop until we've done all 8 bits, then read in the next
  93. * flag byte
  94. */
  95. public void decompress(InputStream src, OutputStream res) throws IOException {
  96. // How far through the output we've got
  97. // (This is normally used &4095, so it nicely wraps)
  98. // The initial value is set when populating the dictionary
  99. int pos;
  100. // The flag byte is treated as its 8 individual
  101. // bits, which tell us if the following 8 codes
  102. // are compressed or un-compressed
  103. int flag;
  104. // The mask, between 1 and 255, which is used when
  105. // processing each bit of the flag byte in turn
  106. int mask;
  107. // We use 12 bit codes:
  108. // * 0-255 are real bytes
  109. // * 256-4095 are the substring codes
  110. // Java handily initialises our buffer / dictionary
  111. // to all zeros
  112. final byte[] buffer = new byte[DICT_SIZE];
  113. pos = populateDictionary(buffer);
  114. // These are bytes as looked up in the dictionary
  115. // It needs to be signed, as it'll get passed on to
  116. // the output stream
  117. final byte[] dataB = IOUtils.safelyAllocate(16L + codeLengthIncrease, MAX_RECORD_LENGTH);
  118. // This is an unsigned byte read from the stream
  119. // It needs to be unsigned, so that bit stuff works
  120. int dataI;
  121. // The compressed code sequence is held over 2 bytes
  122. int dataIPt1, dataIPt2;
  123. // How long a code sequence is, and where in the
  124. // dictionary to start at
  125. int len, pntr;
  126. while ((flag = src.read()) != -1) {
  127. // Compare each bit in our flag byte in turn:
  128. for (mask = 1; mask < 0x100; mask <<= 1) {
  129. // Is this a new code (un-compressed), or
  130. // the use of existing codes (compressed)?
  131. boolean isMaskSet = (flag & mask) > 0;
  132. if (isMaskSet ^ maskMeansCompressed) {
  133. // Retrieve the un-compressed code
  134. if ((dataI = src.read()) != -1) {
  135. // Save the byte into the dictionary
  136. buffer[pos++ & DICT_MASK] = (byte) dataI;
  137. // And output the byte
  138. res.write(dataI);
  139. }
  140. } else {
  141. // We have a compressed sequence
  142. // Grab the next 16 bits of data
  143. dataIPt1 = src.read();
  144. dataIPt2 = src.read();
  145. if (dataIPt1 == -1 || dataIPt2 == -1) break;
  146. // Build up how long the code sequence is, and
  147. // what position of the code to start at
  148. // (The position is the usually the first 12 bits,
  149. // and the length is usually the last 4 bits)
  150. len = (dataIPt2 & 0x0F) + codeLengthIncrease;
  151. if (positionIsBigEndian) {
  152. pntr = (dataIPt1 << 4) + (dataIPt2 >>> 4);
  153. } else {
  154. pntr = dataIPt1 + ((dataIPt2 & 0xF0) << 4);
  155. }
  156. // Adjust the pointer as needed
  157. pntr = adjustDictionaryOffset(pntr);
  158. // Loop over the codes, outputting what they correspond to
  159. for (int i = 0; i < len; i++) {
  160. dataB[i] = buffer[(pntr + i) & DICT_MASK];
  161. buffer[(pos + i) & DICT_MASK] = dataB[i];
  162. }
  163. res.write(dataB, 0, len);
  164. // Record how far along the stream we have moved
  165. pos += len;
  166. }
  167. }
  168. }
  169. }
  170. }