You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

HDGFLZWCompressor.java 8.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hdgf;
  16. import static org.apache.poi.util.LZWDecompresser.DICT_MASK;
  17. import static org.apache.poi.util.LZWDecompresser.DICT_SIZE;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.io.OutputStream;
  21. /**
  22. * Helper class to handle the Visio compatible streaming LZW compression.
  23. * Need our own class to handle keeping track of the code buffer, pending bytes to write out etc.
  24. * <p>
  25. * TODO Fix this, as it starts to go wrong on large streams
  26. */
  27. /* package */ final class HDGFLZWCompressor {
  28. // We use 12 bit codes:
  29. // * 0-255 are real bytes
  30. // * 256-4095 are the substring codes
  31. // Java handily initialises our buffer / dictionary
  32. // to all zeros
  33. private final byte[] dict = new byte[DICT_SIZE];
  34. // The next block of data to be written out, minus its mask byte
  35. private final byte[] buffer = new byte[16];
  36. // And how long it is
  37. // (Un-compressed codes are 1 byte each, compressed codes are two)
  38. private int bufferLen;
  39. // The raw length of a code is limited to 4 bits + 2
  40. private final byte[] rawCode = new byte[18];
  41. // And how much we're using
  42. private int rawCodeLen;
  43. // How far through the input and output streams we are
  44. private int posInp;
  45. private int posOut;
  46. // What the next mask byte to output will be
  47. private int nextMask;
  48. // And how many bits we've already set
  49. private int maskBitsSet;
  50. private final OutputStream res;
  51. public HDGFLZWCompressor(OutputStream res) {
  52. this.res = res;
  53. }
  54. /**
  55. * Returns the last place that the bytes from rawCode are found
  56. * at in the buffer, or -1 if they can't be found
  57. */
  58. private int findRawCodeInBuffer() {
  59. // Work our way through all the codes until we
  60. // find the right one. Visio starts from the end
  61. for (int i = rawCodeLen+1; i < DICT_SIZE; i++) {
  62. int pos = (posInp - i) & DICT_MASK;
  63. // in the example data it seems, that the compressor doesn't like to wrap beyond DICT_SIZE
  64. // if (pos + rawCodeLen > DICT_SIZE) continue;
  65. boolean matches = true;
  66. for (int j = 0; j < rawCodeLen; j++) {
  67. if (dict[(pos + j) & DICT_MASK] != rawCode[j]) {
  68. // Doesn't fit, can't be a match
  69. matches = false;
  70. break;
  71. }
  72. }
  73. // Was this position a match?
  74. if (matches) {
  75. return pos;
  76. }
  77. }
  78. // Not found
  79. return -1;
  80. }
  81. /**
  82. * Output the compressed representation for the bytes
  83. * found in rawCode
  84. */
  85. private void outputCompressed() throws IOException {
  86. // It's not worth compressing only 1 or two bytes, due to the overheads
  87. // So if asked, just output uncompressed
  88. if (rawCodeLen < 3) {
  89. final int rcl = rawCodeLen;
  90. for (int i = 0; i < rcl; i++) {
  91. outputUncompressed(rawCode[i]);
  92. }
  93. return;
  94. }
  95. // Grab where the data lives
  96. int codesAt = findRawCodeInBuffer();
  97. codesAt = (codesAt-18) & DICT_MASK;
  98. // Increment the mask bit count, we've done another code
  99. maskBitsSet++;
  100. // Add the length+code to the buffer
  101. // (The position is the first 12 bits, the length is the last 4 bits)
  102. int bp1 = (codesAt & 0xFF);
  103. int bp2 = (rawCodeLen - 3) + ((codesAt - bp1) >>> 4);
  104. buffer[bufferLen++] = (byte) bp1;
  105. buffer[bufferLen++] = (byte) bp2;
  106. assert(maskBitsSet <= 8);
  107. // If we're now at 8 codes, output
  108. if (maskBitsSet == 8) {
  109. output8Codes();
  110. }
  111. rawCodeLen = 0;
  112. }
  113. /**
  114. * Output the un-compressed byte
  115. */
  116. private void outputUncompressed(byte b) throws IOException {
  117. // Set the mask bit for us
  118. nextMask += (1 << maskBitsSet);
  119. maskBitsSet++;
  120. // And add us to the buffer + dictionary
  121. buffer[bufferLen++] = b;
  122. // If we're now at 8 codes, output
  123. if (maskBitsSet == 8) {
  124. output8Codes();
  125. }
  126. rawCodeLen = 0;
  127. }
  128. /**
  129. * We've got 8 code worth to write out, so
  130. * output along with the header
  131. */
  132. private void output8Codes() throws IOException {
  133. // Output the mask and the data
  134. res.write(nextMask);
  135. res.write(buffer, 0, bufferLen);
  136. posOut += 1 + bufferLen;
  137. // Reset things
  138. nextMask = 0;
  139. maskBitsSet = 0;
  140. bufferLen = 0;
  141. }
  142. /**
  143. * Does the compression
  144. */
  145. public void compress(InputStream src) throws IOException {
  146. int dataI = -1;
  147. while (true) {
  148. if (dataI > -1) {
  149. // copy the last read byte into the dictionary.
  150. // the example data compressor used self references, so we don't wait for filling the dictionary
  151. // until we know if it's a un-/compressed token.
  152. dict[(posInp++) & DICT_MASK] = (byte)dataI;
  153. }
  154. // This is an unsigned byte read from the stream
  155. // It needs to be unsigned, so that bit stuff works
  156. dataI = src.read();
  157. // If we've run out of data, output anything that's pending then finish
  158. if (dataI == -1) {
  159. if (rawCodeLen > 0) {
  160. outputCompressed();
  161. if (maskBitsSet > 0) {
  162. output8Codes();
  163. }
  164. }
  165. break;
  166. }
  167. // This is a byte as looked up in the dictionary
  168. // It needs to be signed, as it'll get passed on to the output stream
  169. byte dataB = (byte) dataI;
  170. // Try adding this new byte onto rawCode, and see if all of that is still found
  171. // in the buffer dictionary or not
  172. rawCode[rawCodeLen++] = dataB;
  173. int rawAt = findRawCodeInBuffer();
  174. if (rawAt > -1) {
  175. // If we found it and are now at 18 bytes, we need to output our pending code block
  176. if (rawCodeLen == 18) {
  177. outputCompressed();
  178. }
  179. // If we did find all of rawCode with our new byte added on,
  180. // we can wait to see what happens with the next byte
  181. continue;
  182. }
  183. // If we get here, then the rawCode + this byte weren't found in the dictionary
  184. // If there was something in rawCode before, then that was
  185. // found in the dictionary, so output that compressed
  186. rawCodeLen--;
  187. if (rawCodeLen > 0) {
  188. // Output the old rawCode
  189. outputCompressed();
  190. // Can this byte start a new rawCode, or does it need outputting itself?
  191. rawCode[0] = dataB;
  192. rawCodeLen = 1;
  193. if (findRawCodeInBuffer() > -1) {
  194. // Fits in, wait for next byte
  195. continue;
  196. }
  197. // Doesn't fit, output
  198. outputUncompressed(dataB);
  199. } else {
  200. // Nothing in rawCode before, so this byte isn't in the buffer dictionary
  201. // Output it un-compressed
  202. outputUncompressed(dataB);
  203. }
  204. }
  205. }
  206. }