A bit more on HDGF LZW compression, but it's still not quite complete

author Nick Burch <nick@apache.org>

Sat, 27 Oct 2007 22:50:41 +0000 (22:50 +0000)

committer Nick Burch <nick@apache.org>

Sat, 27 Oct 2007 22:50:41 +0000 (22:50 +0000)
author Nick Burch <nick@apache.org>
Sat, 27 Oct 2007 22:50:41 +0000 (22:50 +0000)
committer Nick Burch <nick@apache.org>
Sat, 27 Oct 2007 22:50:41 +0000 (22:50 +0000)
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java

index 2b1280837631899506e00a5ec91ee4f1342ab3b7..049a768dcce7ba6bd4da1fdf047317711d4a5f27 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@@ -170,18 +170,27 @@ public void decode(InputStream src, OutputStream res) throws IOException {
  
  /**
   * Performs the Visio compatible streaming LZW compression.
- * Works by:
- * 1) ???
- * 2) ???
   * TODO - Finish
   */
  public void compress(InputStream src, OutputStream res) throws IOException {
+       Compressor c = new Compressor();
+       c.compress(src, res);
+}
+
+/**
+ * Helper class to handle the Visio compatible
+ *  streaming LZW compression.
+ * Need our own class to handle keeping track of the
+ *  code buffer, pending bytes to write out etc.
+ */
+private class Compressor {
         // We use 12 bit codes:
         // * 0-255 are real bytes
         // * 256-4095 are the substring codes
         // Java handily initialises our buffer / dictionary
         //  to all zeros
         byte[] dict = new byte[4096];
+       
         // The next block of data to be written out, minus
         //  its mask byte
         byte[] buffer = new byte[16];
@@ -190,6 +199,11 @@ public void compress(InputStream src, OutputStream res) throws IOException {
         //   are two)
         int bufferLen = 0;
         
+       // The raw length of a code is limited to 4 bits
+       byte[] rawCode = new byte[16];
+       // And how much we're using
+       int rawCodeLen = 0;
+       
         // How far through the input and output streams we are
         int posInp = 0;
         int posOut = 0;
@@ -199,6 +213,101 @@ public void compress(InputStream src, OutputStream res) throws IOException {
         // And how many bits we've already set
         int maskBitsSet = 0;
         
+/**
+ * Returns the last place that the bytes from rawCode are found
+ *  at in the buffer, or -1 if they can't be found
+ */
+private int findRawCodeInBuffer() {
+       // Work our way back from the end
+       // (Visio always seems to use the last possible code)
+       for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
+               boolean matches = true;
+               for(int j=0; matches && j<rawCodeLen; j++) {
+                       if(buffer[i] == rawCode[j]) {
+                               // Fits
+                       } else {
+                               // Doesn't fit, can't be a match
+                               matches = false;
+                       }
+               }
+               
+               // Was this position a match?
+               if(matches) {
+                       return i;
+               }
+       }
+
+       // Not found
+       return -1;
+}
+
+/**
+ * Output the compressed representation for the bytes
+ *  found in rawCode
+ */
+private void outputCompressed(OutputStream res) throws IOException {
+       // It's not worth compressing only 1 or two bytes,
+       //  due to the overheads
+       // So if asked, just output uncompressed
+       if(rawCodeLen < 3) {
+               for(int i=0; i<rawCodeLen; i++) {
+                       outputUncompressed(rawCode[i], res);
+               }
+               return;
+       }
+       
+       // Increment the mask bit count, we've done another code
+       maskBitsSet++;
+       // Add the length+code to the buffer
+       // TODO
+       posOut += 2;
+       
+       // If we're now at 8 codes, output
+       if(maskBitsSet == 8) {
+               output8Codes(res);
+       }
+}
+/**
+ * Output the un-compressed byte
+ */
+private void outputUncompressed(byte b, OutputStream res) throws IOException {
+       // Set the mask bit for us 
+       nextMask += (1<<maskBitsSet);
+       
+       // And add us to the buffer + dictionary
+       buffer[bufferLen] = fromInt(b);
+       bufferLen++;
+       dict[(posOut&4095)] = fromInt(b);
+       posOut++;
+       
+       // If we're now at 8 codes, output
+       if(maskBitsSet == 8) {
+               output8Codes(res);
+       }
+}
+
+/**
+ * We've got 8 code worth to write out, so
+ *  output along with the header
+ */
+private void output8Codes(OutputStream res) throws IOException {
+       // Output the mask and the data
+       res.write(new byte[] { fromInt(nextMask) } );
+       res.write(buffer, 0, bufferLen);
+       
+       // Reset things
+       nextMask = 0;
+       maskBitsSet = 0;
+       bufferLen = 0;
+}
+       
+/**
+ * Does the compression
+ */
+private void compress(InputStream src, OutputStream res) throws IOException {
+       // Have we hit the end of the file yet?
+       boolean going = true;
+       
         // This is a byte as looked up in the dictionary
         // It needs to be signed, as it'll get passed on to
         //  the output stream
@@ -207,49 +316,68 @@ public void compress(InputStream src, OutputStream res) throws IOException {
         // It needs to be unsigned, so that bit stuff works
         int dataI;
         
-       // Have we hit the end of the file yet?
-       boolean going = true;
-       
         while( going ) {
                 dataI = src.read();
                 posInp++;
                 if(dataI == -1) { going = false; }
+               dataB = fromInt(dataI);
                 
-               // Decide if we're going to output uncompressed or compressed
-               //  for this byte
-               // (It takes 2 bytes to hold a compressed code, so it's only
-               //  worth doing for 3+ byte long sequences)
-               // TODO
+               // If we've run out of data, output anything that's
+               //  pending then finish
+               if(!going && rawCodeLen > 0) {
+                       outputCompressed(res);
+                       break;
+               }
+       
+               // Try adding this new byte onto rawCode, and
+               //  see if all of that is still found in the
+               //  buffer dictionary or not
+               rawCode[rawCodeLen] = dataB;
+               rawCodeLen++;
+               int rawAt = findRawCodeInBuffer();
                 
-               boolean compressThis = true;
-               if(compressThis) {
-                       // Set the mask bit for us 
-                       nextMask += (1<<maskBitsSet);
-                       
-                       // And add us to the buffer + dictionary
-                       buffer[bufferLen] = fromInt(dataI);
-                       bufferLen++;
-                       dict[(posOut&4095)] = fromInt(dataI);
-                       posOut++;
-               } else {
-                       // ????
+               // If we found it and are now at 16 bytes,
+               //  we need to output our pending code block
+               if(rawCodeLen == 16 && rawAt > -1) {
+                       outputCompressed(res);
+                       rawCodeLen = 0;
+                       continue;
+               }
+               
+               // If we did find all of rawCode with our new
+               //  byte added on, we can wait to see what happens
+               //  with the next byte
+               if(rawAt > -1) {
+                       continue;
                 }
-               // Increment the mask bit count, we've done another code
-               maskBitsSet++;
                 
-               // If we've just done the 8th bit, or reached the end
-               //  of the stream, output our mask and data
-               if(maskBitsSet == 8 || !going) {
-                       // Output
-                       res.write(new byte[] { fromInt(nextMask) } );
-                       res.write(buffer, 0, bufferLen);
+               // If there was something in rawCode before, then we
+               //  need to output that
+               rawCodeLen--;
+               if(rawCodeLen > 0) {
+                       // Output the old rawCode
+                       outputCompressed(res);
                         
-                       // Reset things
-                       nextMask = 0;
-                       maskBitsSet = 0;
-                       bufferLen = 0;
+                       // Can this byte start a new rawCode, or does
+                       //  it need outputting itself?
+                       rawCode[0] = dataB;
+                       rawCodeLen = 1;
+                       if(findRawCodeInBuffer() > -1) {
+                               // Fits in, wait for next byte
+                               continue;
+                       } else {
+                               // Doesn't fit, output
+                               outputUncompressed(dataB,res);
+                               rawCodeLen = 0;
+                       }
+               } else {
+                       // Nothing in rawCode before, so this byte
+                       //  isn't in the buffer dictionary
+                       // Output it un-compressed
+                       outputUncompressed(dataB,res);
                 }
         }
  }
+}
  
  }
 \ No newline at end of file
author	Nick Burch <nick@apache.org>
	Sat, 27 Oct 2007 22:50:41 +0000 (22:50 +0000)
committer	Nick Burch <nick@apache.org>
	Sat, 27 Oct 2007 22:50:41 +0000 (22:50 +0000)