poi-scratchpad/src/main/java/org/apache/poi/hdgf/HDGFLZWCompressor.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243

/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

package org.apache.poi.hdgf;

import static org.apache.poi.util.LZWDecompresser.DICT_MASK;
import static org.apache.poi.util.LZWDecompresser.DICT_SIZE;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

/**
 * Helper class to handle the Visio compatible streaming LZW compression.
 * Need our own class to handle keeping track of the code buffer, pending bytes to write out etc.
 * <p>
 * TODO Fix this, as it starts to go wrong on large streams
 */
/* package */ final class HDGFLZWCompressor {
    // We use 12 bit codes:
    // * 0-255 are real bytes
    // * 256-4095 are the substring codes
    // Java handily initialises our buffer / dictionary
    //  to all zeros
    private final byte[] dict = new byte[DICT_SIZE];

    // The next block of data to be written out, minus its mask byte
    private final byte[] buffer = new byte[16];
    // And how long it is
    // (Un-compressed codes are 1 byte each, compressed codes are two)
    private int bufferLen;

    // The raw length of a code is limited to 4 bits + 2
    private final byte[] rawCode = new byte[18];
    // And how much we're using
    private int rawCodeLen;

    // How far through the input and output streams we are
    private int posInp;
    private int posOut;

    // What the next mask byte to output will be
    private int nextMask;
    // And how many bits we've already set
    private int maskBitsSet;

    private final OutputStream res;

    public HDGFLZWCompressor(OutputStream res) {
        this.res = res;
    }

    /**
     * Returns the last place that the bytes from rawCode are found
     * at in the buffer, or -1 if they can't be found
     */
    private int findRawCodeInBuffer() {
        // Work our way through all the codes until we
        //  find the right one. Visio starts from the end
        for (int i = rawCodeLen+1; i < DICT_SIZE; i++) {
            int pos = (posInp - i) & DICT_MASK;
            // in the example data it seems, that the compressor doesn't like to wrap beyond DICT_SIZE
            // if (pos + rawCodeLen > DICT_SIZE) continue;
            boolean matches = true;
            for (int j = 0; j < rawCodeLen; j++) {
                if (dict[(pos + j) & DICT_MASK] != rawCode[j]) {
                    // Doesn't fit, can't be a match
                    matches = false;
                    break;
                }
            }

            // Was this position a match?
            if (matches) {
                return pos;
            }
        }

        // Not found
        return -1;
    }

    /**
     * Output the compressed representation for the bytes
     * found in rawCode
     */
    private void outputCompressed() throws IOException {
        // It's not worth compressing only 1 or two bytes, due to the overheads
        // So if asked, just output uncompressed
        if (rawCodeLen < 3) {
            final int rcl = rawCodeLen;
            for (int i = 0; i < rcl; i++) {
                outputUncompressed(rawCode[i]);
            }
            return;
        }

        // Grab where the data lives
        int codesAt = findRawCodeInBuffer();
        codesAt = (codesAt-18) & DICT_MASK;

        // Increment the mask bit count, we've done another code
        maskBitsSet++;

        // Add the length+code to the buffer
        // (The position is the first 12 bits, the length is the last 4 bits)
        int bp1 = (codesAt & 0xFF);
        int bp2 = (rawCodeLen - 3) + ((codesAt - bp1) >>> 4);
        buffer[bufferLen++] = (byte) bp1;
        buffer[bufferLen++] = (byte) bp2;

        assert(maskBitsSet <= 8);

        // If we're now at 8 codes, output
        if (maskBitsSet == 8) {
            output8Codes();
        }

        rawCodeLen = 0;
    }

    /**
     * Output the un-compressed byte
     */
    private void outputUncompressed(byte b) throws IOException {
        // Set the mask bit for us
        nextMask += (1 << maskBitsSet);
        maskBitsSet++;

        // And add us to the buffer + dictionary
        buffer[bufferLen++] = b;

        // If we're now at 8 codes, output
        if (maskBitsSet == 8) {
            output8Codes();
        }

        rawCodeLen = 0;
    }

    /**
     * We've got 8 code worth to write out, so
     * output along with the header
     */
    private void output8Codes() throws IOException {
        // Output the mask and the data
        res.write(nextMask);
        res.write(buffer, 0, bufferLen);
        posOut += 1 + bufferLen;

        // Reset things
        nextMask = 0;
        maskBitsSet = 0;
        bufferLen = 0;
    }

    /**
     * Does the compression
     */
    public void compress(InputStream src) throws IOException {
        int dataI = -1;
        while (true) {
            if (dataI > -1) {
                // copy the last read byte into the dictionary.
                // the example data compressor used self references, so we don't wait for filling the dictionary
                // until we know if it's an un-/compressed token.
                dict[(posInp++) & DICT_MASK] = (byte)dataI;
            }
            // This is an unsigned byte read from the stream
            // It needs to be unsigned, so that bit stuff works
            dataI = src.read();

            // If we've run out of data, output anything that's pending then finish
            if (dataI == -1) {
                if (rawCodeLen > 0) {
                    outputCompressed();
                    if (maskBitsSet > 0) {
                        output8Codes();
                    }
                }
                break;
            }

            // This is a byte as looked up in the dictionary
            // It needs to be signed, as it'll get passed on to the output stream
            byte dataB = (byte) dataI;

            // Try adding this new byte onto rawCode, and see if all of that is still found
            // in the buffer dictionary or not
            rawCode[rawCodeLen++] = dataB;
            int rawAt = findRawCodeInBuffer();

            if (rawAt > -1) {
                // If we found it and are now at 18 bytes, we need to output our pending code block
                if (rawCodeLen == 18) {
                    outputCompressed();
                }

                // If we did find all of rawCode with our new byte added on,
                // we can wait to see what happens with the next byte
                continue;
            }

            // If we get here, then the rawCode + this byte weren't found in the dictionary

            // If there was something in rawCode before, then that was
            // found in the dictionary, so output that compressed
            rawCodeLen--;
            if (rawCodeLen > 0) {
                // Output the old rawCode
                outputCompressed();

                // Can this byte start a new rawCode, or does it need outputting itself?
                rawCode[0] = dataB;
                rawCodeLen = 1;
                if (findRawCodeInBuffer() > -1) {
                    // Fits in, wait for next byte
                    continue;
                }
                // Doesn't fit, output
                outputUncompressed(dataB);
            } else {
                // Nothing in rawCode before, so this byte isn't in the buffer dictionary
                // Output it un-compressed
                outputUncompressed(dataB);
            }
        }
    }
}