You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

LZWDecompresser.java 8.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.util;
  16. import java.io.IOException;
  17. import java.io.InputStream;
  18. import java.io.OutputStream;
  19. import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
  20. /**
  21. * This class provides common functionality for the
  22. * various LZW implementations in the different file
  23. * formats.
  24. * It's currently used by HDGF and HMEF.
  25. * <p>
  26. * Two good resources on LZW are:
  27. * http://en.wikipedia.org/wiki/LZW
  28. * http://marknelson.us/1989/10/01/lzw-data-compression/
  29. */
  30. public abstract class LZWDecompresser {
  31. /** the size of our dictionary */
  32. public static final int DICT_SIZE = 0x1000;
  33. /** the mask for calculating / wrapping dictionary offsets */
  34. public static final int DICT_MASK = 0xFFF;
  35. //arbitrarily selected; may need to increase
  36. private static final int DEFAULT_MAX_RECORD_LENGTH = 1_000_000;
  37. private static int MAX_RECORD_LENGTH = DEFAULT_MAX_RECORD_LENGTH;
  38. /**
  39. * Does the mask bit mean it's compressed or uncompressed?
  40. */
  41. private final boolean maskMeansCompressed;
  42. /**
  43. * How much to append to the code length in the stream
  44. * to get the real code length? Normally 2 or 3
  45. */
  46. private final int codeLengthIncrease;
  47. /**
  48. * Does the 12 bits of the position get stored in
  49. * Little Endian or Big Endian form?
  50. * This controls whether a pos+length of 0x12 0x34
  51. * becomes a position of 0x123 or 0x312
  52. */
  53. private final boolean positionIsBigEndian;
  54. /**
  55. * @param length the max record length allowed for LZWDecompresser
  56. */
  57. public static void setMaxRecordLength(int length) {
  58. MAX_RECORD_LENGTH = length;
  59. }
  60. /**
  61. * @return the max record length allowed for LZWDecompresser
  62. */
  63. public static int getMaxRecordLength() {
  64. return MAX_RECORD_LENGTH;
  65. }
  66. protected LZWDecompresser(boolean maskMeansCompressed,
  67. int codeLengthIncrease, boolean positionIsBigEndian) {
  68. this.maskMeansCompressed = maskMeansCompressed;
  69. this.codeLengthIncrease = codeLengthIncrease;
  70. this.positionIsBigEndian = positionIsBigEndian;
  71. }
  72. /**
  73. * Populates the dictionary, and returns where in it
  74. * to begin writing new codes.
  75. * Generally, if the dictionary is pre-populated, then new
  76. * codes should be placed at the end of that block.
  77. * Equally, if the dictionary is left with all zeros, then
  78. * usually the new codes can go in at the start.
  79. */
  80. protected abstract int populateDictionary(byte[] dict);
  81. /**
  82. * Adjusts the position offset if needed when looking
  83. * something up in the dictionary.
  84. */
  85. protected abstract int adjustDictionaryOffset(int offset);
  86. /**
  87. * Decompresses the given input stream, returning the array of bytes
  88. * of the decompressed input.
  89. */
  90. public byte[] decompress(InputStream src) throws IOException {
  91. UnsynchronizedByteArrayOutputStream res = new UnsynchronizedByteArrayOutputStream();
  92. decompress(src, res);
  93. return res.toByteArray();
  94. }
  95. /**
  96. * Perform a streaming decompression of the input.
  97. * Works by:
  98. * 1) Reading a flag byte, the 8 bits of which tell you if the
  99. * following 8 codes are compressed our un-compressed
  100. * 2) Consider the 8 bits in turn
  101. * 3) If the bit is set, the next code is un-compressed, so
  102. * add it to the dictionary and output it
  103. * 4) If the bit isn't set, then read in the length and start
  104. * position in the dictionary, and output the bytes there
  105. * 5) Loop until we've done all 8 bits, then read in the next
  106. * flag byte
  107. */
  108. public void decompress(InputStream src, OutputStream res) throws IOException {
  109. // How far through the output we've got
  110. // (This is normally used &4095, so it nicely wraps)
  111. // The initial value is set when populating the dictionary
  112. int pos;
  113. // The flag byte is treated as its 8 individual
  114. // bits, which tell us if the following 8 codes
  115. // are compressed or un-compressed
  116. int flag;
  117. // The mask, between 1 and 255, which is used when
  118. // processing each bit of the flag byte in turn
  119. int mask;
  120. // We use 12 bit codes:
  121. // * 0-255 are real bytes
  122. // * 256-4095 are the substring codes
  123. // Java handily initialises our buffer / dictionary
  124. // to all zeros
  125. final byte[] buffer = new byte[DICT_SIZE];
  126. pos = populateDictionary(buffer);
  127. // These are bytes as looked up in the dictionary
  128. // It needs to be signed, as it'll get passed on to
  129. // the output stream
  130. final byte[] dataB = IOUtils.safelyAllocate(16L + codeLengthIncrease, MAX_RECORD_LENGTH);
  131. // This is an unsigned byte read from the stream
  132. // It needs to be unsigned, so that bit stuff works
  133. int dataI;
  134. // The compressed code sequence is held over 2 bytes
  135. int dataIPt1, dataIPt2;
  136. // How long a code sequence is, and where in the
  137. // dictionary to start at
  138. int len, pntr;
  139. while ((flag = src.read()) != -1) {
  140. // Compare each bit in our flag byte in turn:
  141. for (mask = 1; mask < 0x100; mask <<= 1) {
  142. // Is this a new code (un-compressed), or
  143. // the use of existing codes (compressed)?
  144. boolean isMaskSet = (flag & mask) > 0;
  145. if (isMaskSet ^ maskMeansCompressed) {
  146. // Retrieve the un-compressed code
  147. if ((dataI = src.read()) != -1) {
  148. // Save the byte into the dictionary
  149. buffer[pos++ & DICT_MASK] = (byte) dataI;
  150. // And output the byte
  151. res.write(dataI);
  152. }
  153. } else {
  154. // We have a compressed sequence
  155. // Grab the next 16 bits of data
  156. dataIPt1 = src.read();
  157. dataIPt2 = src.read();
  158. if (dataIPt1 == -1 || dataIPt2 == -1) break;
  159. // Build up how long the code sequence is, and
  160. // what position of the code to start at
  161. // (The position is the usually the first 12 bits,
  162. // and the length is usually the last 4 bits)
  163. len = (dataIPt2 & 0x0F) + codeLengthIncrease;
  164. if (positionIsBigEndian) {
  165. pntr = (dataIPt1 << 4) + (dataIPt2 >>> 4);
  166. } else {
  167. pntr = dataIPt1 + ((dataIPt2 & 0xF0) << 4);
  168. }
  169. // Adjust the pointer as needed
  170. pntr = adjustDictionaryOffset(pntr);
  171. // Loop over the codes, outputting what they correspond to
  172. for (int i = 0; i < len; i++) {
  173. dataB[i] = buffer[(pntr + i) & DICT_MASK];
  174. buffer[(pos + i) & DICT_MASK] = dataB[i];
  175. }
  176. res.write(dataB, 0, len);
  177. // Record how far along the stream we have moved
  178. pos += len;
  179. }
  180. }
  181. }
  182. }
  183. }