/**
* Performs the Visio compatible streaming LZW compression.
- * Works by:
- * 1) ???
- * 2) ???
* TODO - Finish
*/
public void compress(InputStream src, OutputStream res) throws IOException {
+ Compressor c = new Compressor();
+ c.compress(src, res);
+}
+
+/**
+ * Helper class to handle the Visio compatible
+ * streaming LZW compression.
+ * Need our own class to handle keeping track of the
+ * code buffer, pending bytes to write out etc.
+ */
+private class Compressor {
// We use 12 bit codes:
// * 0-255 are real bytes
// * 256-4095 are the substring codes
// Java handily initialises our buffer / dictionary
// to all zeros
byte[] dict = new byte[4096];
+
// The next block of data to be written out, minus
// its mask byte
byte[] buffer = new byte[16];
// are two)
int bufferLen = 0;
+ // The raw length of a code is limited to 4 bits
+ byte[] rawCode = new byte[16];
+ // And how much we're using
+ int rawCodeLen = 0;
+
// How far through the input and output streams we are
int posInp = 0;
int posOut = 0;
// And how many bits we've already set
int maskBitsSet = 0;
+/**
+ * Returns the last place that the bytes from rawCode are found
+ * at in the buffer, or -1 if they can't be found
+ */
+private int findRawCodeInBuffer() {
+ // Work our way back from the end
+ // (Visio always seems to use the last possible code)
+ for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
+ boolean matches = true;
+ for(int j=0; matches && j<rawCodeLen; j++) {
+ if(buffer[i] == rawCode[j]) {
+ // Fits
+ } else {
+ // Doesn't fit, can't be a match
+ matches = false;
+ }
+ }
+
+ // Was this position a match?
+ if(matches) {
+ return i;
+ }
+ }
+
+ // Not found
+ return -1;
+}
+
+/**
+ * Output the compressed representation for the bytes
+ * found in rawCode
+ */
+private void outputCompressed(OutputStream res) throws IOException {
+ // It's not worth compressing only 1 or two bytes,
+ // due to the overheads
+ // So if asked, just output uncompressed
+ if(rawCodeLen < 3) {
+ for(int i=0; i<rawCodeLen; i++) {
+ outputUncompressed(rawCode[i], res);
+ }
+ return;
+ }
+
+ // Increment the mask bit count, we've done another code
+ maskBitsSet++;
+ // Add the length+code to the buffer
+ // TODO
+ posOut += 2;
+
+ // If we're now at 8 codes, output
+ if(maskBitsSet == 8) {
+ output8Codes(res);
+ }
+}
+/**
+ * Output the un-compressed byte
+ */
+private void outputUncompressed(byte b, OutputStream res) throws IOException {
+ // Set the mask bit for us
+ nextMask += (1<<maskBitsSet);
+
+ // And add us to the buffer + dictionary
+ buffer[bufferLen] = fromInt(b);
+ bufferLen++;
+ dict[(posOut&4095)] = fromInt(b);
+ posOut++;
+
+ // If we're now at 8 codes, output
+ if(maskBitsSet == 8) {
+ output8Codes(res);
+ }
+}
+
+/**
+ * We've got 8 code worth to write out, so
+ * output along with the header
+ */
+private void output8Codes(OutputStream res) throws IOException {
+ // Output the mask and the data
+ res.write(new byte[] { fromInt(nextMask) } );
+ res.write(buffer, 0, bufferLen);
+
+ // Reset things
+ nextMask = 0;
+ maskBitsSet = 0;
+ bufferLen = 0;
+}
+
+/**
+ * Does the compression
+ */
+private void compress(InputStream src, OutputStream res) throws IOException {
+ // Have we hit the end of the file yet?
+ boolean going = true;
+
// This is a byte as looked up in the dictionary
// It needs to be signed, as it'll get passed on to
// the output stream
// It needs to be unsigned, so that bit stuff works
int dataI;
- // Have we hit the end of the file yet?
- boolean going = true;
-
while( going ) {
dataI = src.read();
posInp++;
if(dataI == -1) { going = false; }
+ dataB = fromInt(dataI);
- // Decide if we're going to output uncompressed or compressed
- // for this byte
- // (It takes 2 bytes to hold a compressed code, so it's only
- // worth doing for 3+ byte long sequences)
- // TODO
+ // If we've run out of data, output anything that's
+ // pending then finish
+ if(!going && rawCodeLen > 0) {
+ outputCompressed(res);
+ break;
+ }
+
+ // Try adding this new byte onto rawCode, and
+ // see if all of that is still found in the
+ // buffer dictionary or not
+ rawCode[rawCodeLen] = dataB;
+ rawCodeLen++;
+ int rawAt = findRawCodeInBuffer();
- boolean compressThis = true;
- if(compressThis) {
- // Set the mask bit for us
- nextMask += (1<<maskBitsSet);
-
- // And add us to the buffer + dictionary
- buffer[bufferLen] = fromInt(dataI);
- bufferLen++;
- dict[(posOut&4095)] = fromInt(dataI);
- posOut++;
- } else {
- // ????
+ // If we found it and are now at 16 bytes,
+ // we need to output our pending code block
+ if(rawCodeLen == 16 && rawAt > -1) {
+ outputCompressed(res);
+ rawCodeLen = 0;
+ continue;
+ }
+
+ // If we did find all of rawCode with our new
+ // byte added on, we can wait to see what happens
+ // with the next byte
+ if(rawAt > -1) {
+ continue;
}
- // Increment the mask bit count, we've done another code
- maskBitsSet++;
- // If we've just done the 8th bit, or reached the end
- // of the stream, output our mask and data
- if(maskBitsSet == 8 || !going) {
- // Output
- res.write(new byte[] { fromInt(nextMask) } );
- res.write(buffer, 0, bufferLen);
+ // If there was something in rawCode before, then we
+ // need to output that
+ rawCodeLen--;
+ if(rawCodeLen > 0) {
+ // Output the old rawCode
+ outputCompressed(res);
- // Reset things
- nextMask = 0;
- maskBitsSet = 0;
- bufferLen = 0;
+ // Can this byte start a new rawCode, or does
+ // it need outputting itself?
+ rawCode[0] = dataB;
+ rawCodeLen = 1;
+ if(findRawCodeInBuffer() > -1) {
+ // Fits in, wait for next byte
+ continue;
+ } else {
+ // Doesn't fit, output
+ outputUncompressed(dataB,res);
+ rawCodeLen = 0;
+ }
+ } else {
+ // Nothing in rawCode before, so this byte
+ // isn't in the buffer dictionary
+ // Output it un-compressed
+ outputUncompressed(dataB,res);
}
}
}
+}
}
\ No newline at end of file