123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.hdgf;
-
- import static org.apache.poi.util.LZWDecompresser.DICT_MASK;
- import static org.apache.poi.util.LZWDecompresser.DICT_SIZE;
-
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.OutputStream;
-
- /**
- * Helper class to handle the Visio compatible streaming LZW compression.
- * Need our own class to handle keeping track of the code buffer, pending bytes to write out etc.
- * <p>
- * TODO Fix this, as it starts to go wrong on large streams
- */
- /* package */ final class HDGFLZWCompressor {
- // We use 12 bit codes:
- // * 0-255 are real bytes
- // * 256-4095 are the substring codes
- // Java handily initialises our buffer / dictionary
- // to all zeros
- private final byte[] dict = new byte[DICT_SIZE];
-
- // The next block of data to be written out, minus its mask byte
- private final byte[] buffer = new byte[16];
- // And how long it is
- // (Un-compressed codes are 1 byte each, compressed codes are two)
- private int bufferLen;
-
- // The raw length of a code is limited to 4 bits + 2
- private final byte[] rawCode = new byte[18];
- // And how much we're using
- private int rawCodeLen;
-
- // How far through the input and output streams we are
- private int posInp;
- private int posOut;
-
- // What the next mask byte to output will be
- private int nextMask;
- // And how many bits we've already set
- private int maskBitsSet;
-
- private final OutputStream res;
-
- public HDGFLZWCompressor(OutputStream res) {
- this.res = res;
- }
-
- /**
- * Returns the last place that the bytes from rawCode are found
- * at in the buffer, or -1 if they can't be found
- */
- private int findRawCodeInBuffer() {
- // Work our way through all the codes until we
- // find the right one. Visio starts from the end
- for (int i = rawCodeLen+1; i < DICT_SIZE; i++) {
- int pos = (posInp - i) & DICT_MASK;
- // in the example data it seems, that the compressor doesn't like to wrap beyond DICT_SIZE
- // if (pos + rawCodeLen > DICT_SIZE) continue;
- boolean matches = true;
- for (int j = 0; j < rawCodeLen; j++) {
- if (dict[(pos + j) & DICT_MASK] != rawCode[j]) {
- // Doesn't fit, can't be a match
- matches = false;
- break;
- }
- }
-
- // Was this position a match?
- if (matches) {
- return pos;
- }
- }
-
- // Not found
- return -1;
- }
-
- /**
- * Output the compressed representation for the bytes
- * found in rawCode
- */
- private void outputCompressed() throws IOException {
- // It's not worth compressing only 1 or two bytes, due to the overheads
- // So if asked, just output uncompressed
- if (rawCodeLen < 3) {
- final int rcl = rawCodeLen;
- for (int i = 0; i < rcl; i++) {
- outputUncompressed(rawCode[i]);
- }
- return;
- }
-
- // Grab where the data lives
- int codesAt = findRawCodeInBuffer();
- codesAt = (codesAt-18) & DICT_MASK;
-
- // Increment the mask bit count, we've done another code
- maskBitsSet++;
-
- // Add the length+code to the buffer
- // (The position is the first 12 bits, the length is the last 4 bits)
- int bp1 = (codesAt & 0xFF);
- int bp2 = (rawCodeLen - 3) + ((codesAt - bp1) >>> 4);
- buffer[bufferLen++] = (byte) bp1;
- buffer[bufferLen++] = (byte) bp2;
-
- assert(maskBitsSet <= 8);
-
- // If we're now at 8 codes, output
- if (maskBitsSet == 8) {
- output8Codes();
- }
-
- rawCodeLen = 0;
- }
-
- /**
- * Output the un-compressed byte
- */
- private void outputUncompressed(byte b) throws IOException {
- // Set the mask bit for us
- nextMask += (1 << maskBitsSet);
- maskBitsSet++;
-
- // And add us to the buffer + dictionary
- buffer[bufferLen++] = b;
-
- // If we're now at 8 codes, output
- if (maskBitsSet == 8) {
- output8Codes();
- }
-
- rawCodeLen = 0;
- }
-
- /**
- * We've got 8 code worth to write out, so
- * output along with the header
- */
- private void output8Codes() throws IOException {
- // Output the mask and the data
- res.write(nextMask);
- res.write(buffer, 0, bufferLen);
- posOut += 1 + bufferLen;
-
- // Reset things
- nextMask = 0;
- maskBitsSet = 0;
- bufferLen = 0;
- }
-
- /**
- * Does the compression
- */
- public void compress(InputStream src) throws IOException {
- int dataI = -1;
- while (true) {
- if (dataI > -1) {
- // copy the last read byte into the dictionary.
- // the example data compressor used self references, so we don't wait for filling the dictionary
- // until we know if it's a un-/compressed token.
- dict[(posInp++) & DICT_MASK] = (byte)dataI;
- }
- // This is an unsigned byte read from the stream
- // It needs to be unsigned, so that bit stuff works
- dataI = src.read();
-
- // If we've run out of data, output anything that's pending then finish
- if (dataI == -1) {
- if (rawCodeLen > 0) {
- outputCompressed();
- if (maskBitsSet > 0) {
- output8Codes();
- }
- }
- break;
- }
-
- // This is a byte as looked up in the dictionary
- // It needs to be signed, as it'll get passed on to the output stream
- byte dataB = (byte) dataI;
-
- // Try adding this new byte onto rawCode, and see if all of that is still found
- // in the buffer dictionary or not
- rawCode[rawCodeLen++] = dataB;
- int rawAt = findRawCodeInBuffer();
-
- if (rawAt > -1) {
- // If we found it and are now at 18 bytes, we need to output our pending code block
- if (rawCodeLen == 18) {
- outputCompressed();
- }
-
- // If we did find all of rawCode with our new byte added on,
- // we can wait to see what happens with the next byte
- continue;
- }
-
- // If we get here, then the rawCode + this byte weren't found in the dictionary
-
- // If there was something in rawCode before, then that was
- // found in the dictionary, so output that compressed
- rawCodeLen--;
- if (rawCodeLen > 0) {
- // Output the old rawCode
- outputCompressed();
-
- // Can this byte start a new rawCode, or does it need outputting itself?
- rawCode[0] = dataB;
- rawCodeLen = 1;
- if (findRawCodeInBuffer() > -1) {
- // Fits in, wait for next byte
- continue;
- }
- // Doesn't fit, output
- outputUncompressed(dataB);
- } else {
- // Nothing in rawCode before, so this byte isn't in the buffer dictionary
- // Output it un-compressed
- outputUncompressed(dataB);
- }
- }
- }
- }
|