Improve the HDGF LZW code.

author Nick Burch <nick@apache.org>

Thu, 16 Dec 2010 07:41:41 +0000 (07:41 +0000)

committer Nick Burch <nick@apache.org>

Thu, 16 Dec 2010 07:41:41 +0000 (07:41 +0000)
author Nick Burch <nick@apache.org>
Thu, 16 Dec 2010 07:41:41 +0000 (07:41 +0000)
committer Nick Burch <nick@apache.org>
Thu, 16 Dec 2010 07:41:41 +0000 (07:41 +0000)
diff --git a/src/documentation/content/xdocs/hdgf/index.xml b/src/documentation/content/xdocs/hdgf/index.xml

index 9e4ca894cb66011e4c3d04450b31c08ad7ebd6a6..fc24c108d7c641dad6628bd415415ba1950db83e 100644 (file)
--- a/src/documentation/content/xdocs/hdgf/index.xml
+++ b/src/documentation/content/xdocs/hdgf/index.xml
@@ -72,7 +72,8 @@
                                   HDGFLZW, which will be much better documented, and also 
                                   under the ASL. <strong>Completed October 2007</strong></li>
                                  <li>Add compression support to HDGFLZW. 
-                                 <strong>In progress</strong></li>
+                                 <strong>In progress - works for small streams but encoding
+               goes wrong on larger ones</strong></li>
                                  <li>Have HDGF just write back the raw bytes it read in, and
                                   have a test to ensure the file is un-changed.</li>
                                  <li>Have HDGF generate the bytes to write out from the
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 589ce752cb0d36fd0a9a5c6565ee53a339e516d5..fed8713441d0a367f8c4821ed7f5674ab572b556 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,8 @@
  
      <changes>
          <release version="3.8-beta1" date="2010-??-??">
+           <action dev="poi-developers" type="add">Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them</action>
+           <action dev="poi-developers" type="add">Partial HDGF LZW compression support</action>
             <action dev="poi-developers" type="add">50244 - Support for continued NameRecords</action>
             <action dev="POI-DEVELOPERS" type="fix">50416 - Correct shifting of the first or last row in a sheet by multiple rows</action>
             <action dev="POI-DEVELOPERS" type="fix">50440 - Support evaluating formulas with newlines in them, which XSSF may have (but HSSF may not)</action>
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java

index 0595343d45241d204fc6128408ef2e7cb09c4727..d6d8d6b764df419fa3d4e7c38c5f90ec64534809 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@@ -35,358 +35,147 @@ import java.io.OutputStream;
   */
  public class HDGFLZW {
  
-/**
- * Given an integer, turn it into a java byte, handling
- *  the wrapping.
- * This is a convenience method
- */
-public static byte fromInt(int b) {
-       if(b < 128) return (byte)b;
-       return (byte)(b - 256);
-}
-/**
- * Given a java byte, turn it into an integer between 0
- *  and 255 (i.e. handle the unwrapping).
- * This is a convenience method
- */
-public static int fromByte(byte b) {
-       if(b >= 0) {
-               return b;
-       }
-       return b + 256;
-}
-
-/**
- * Compress the given input stream, returning the array of bytes
- *  of the compressed input
- */
-public byte[] compress(InputStream src) throws IOException {
-       ByteArrayOutputStream res = new ByteArrayOutputStream();
-       compress(src,res);
-    return res.toByteArray();
-}
-
-/**
- * Decompresses the given input stream, returning the array of bytes
- *  of the decompressed input.
- */
-public byte[] decode(InputStream src) throws IOException {
-       ByteArrayOutputStream res = new ByteArrayOutputStream();
-       decode(src,res);
-    return res.toByteArray();
-}
-/**
- * Perform a streaming decompression of the input.
- * Works by:
- * 1) Reading a flag byte, the 8 bits of which tell you if the
- *     following 8 codes are compressed our un-compressed
- * 2) Consider the 8 bits in turn
- * 3) If the bit is set, the next code is un-compressed, so
- *     add it to the dictionary and output it
- * 4) If the bit isn't set, then read in the length and start
- *     position in the dictionary, and output the bytes there
- * 5) Loop until we've done all 8 bits, then read in the next
- *     flag byte
- */
-public void decode(InputStream src, OutputStream res) throws IOException {
-       // We use 12 bit codes:
-       // * 0-255 are real bytes
-       // * 256-4095 are the substring codes
-       // Java handily initialises our buffer / dictionary
-       //  to all zeros
-       byte[] buffer = new byte[4096];
-
-       // How far through the output we've got
-       // (This is normally used &4095, so it nicely wraps)
-       int pos = 0;
-       // The flag byte is treated as its 8 individual
-       //  bits, which tell us if the following 8 codes
-       //  are compressed or un-compressed
-       int flag;
-       // The mask, between 1 and 255, which is used when
-       //  processing each bit of the flag byte in turn
-       int mask;
-
-       // This is a byte as looked up in the dictionary
-       // It needs to be signed, as it'll get passed on to
-       //  the output stream
-       byte dataB;
-       // This is an unsigned byte read from the stream
-       // It needs to be unsigned, so that bit stuff works
-       int dataI;
-       // The compressed code sequence is held over 2 bytes
-       int dataIPt1, dataIPt2;
-       // How long a code sequence is, and where in the
-       //  dictionary to start at
-       int len, pntr;
-
-       while( (flag = src.read()) != -1 ) {
-               // Compare each bit in our flag byte in turn:
-               for(mask = 1; mask < 256 ; mask <<= 1) {
-                       // Is this a new code (un-compressed), or
-                       //  the use of existing codes (compressed)?
-                       if( (flag & mask) > 0 ) {
-                               // Retrieve the un-compressed code
-                               if( (dataI = src.read()) != -1) {
-                                       // Save the byte into the dictionary
-                                       buffer[(pos&4095)] = fromInt(dataI);
-                                       pos++;
-                                       // And output the byte
-                                       res.write( new byte[] {fromInt(dataI)} );
-                               }
-                       } else {
-                               // We have a compressed sequence
-                               // Grab the next 16 bits of data
-                               dataIPt1 = src.read();
-                               dataIPt2 = src.read();
-                               if(dataIPt1 == -1 || dataIPt2 == -1) break;
-
-                               // Build up how long the code sequence is, and
-                               //  what position of the code to start at
-                               // (The position is the first 12 bits, the
-                               //  length is the last 4 bits)
-                               len = (dataIPt2 & 15) + 3;
-                               pntr = (dataIPt2 & 240)*16 + dataIPt1;
-
-                               // If the pointer happens to be passed the end
-                               //  of our buffer, then wrap around
-                               if(pntr > 4078) {
-                                       pntr = pntr - 4078;
-                               } else {
-                                       pntr = pntr + 18;
-                               }
-
-                               // Loop over the codes, outputting what they correspond to
-                               for(int i=0; i<len; i++) {
-                                       buffer [(pos + i) & 4095] = buffer [(pntr + i) & 4095];
-                                       dataB = buffer[(pntr + i) & 4095];
-                                       res.write(new byte[] {dataB});
-                               }
-
-                               // Record how far along the stream we have moved
-                               pos = pos + len;
-                       }
-               }
-    }
-}
-
-/**
- * Performs the Visio compatible streaming LZW compression.
- * TODO - Finish
- */
-public void compress(InputStream src, OutputStream res) throws IOException {
-       Compressor c = new Compressor();
-       c.compress(src, res);
-}
-
-/**
- * Helper class to handle the Visio compatible
- *  streaming LZW compression.
- * Need our own class to handle keeping track of the
- *  code buffer, pending bytes to write out etc.
- */
-private static final class Compressor {
-       // We use 12 bit codes:
-       // * 0-255 are real bytes
-       // * 256-4095 are the substring codes
-       // Java handily initialises our buffer / dictionary
-       //  to all zeros
-       byte[] dict = new byte[4096];
-
-       // The next block of data to be written out, minus
-       //  its mask byte
-       byte[] buffer = new byte[16];
-       // And how long it is
-       // (Un-compressed codes are 1 byte each, compressed codes
-       //   are two)
-       int bufferLen = 0;
-
-       // The raw length of a code is limited to 4 bits
-       byte[] rawCode = new byte[16];
-       // And how much we're using
-       int rawCodeLen = 0;
-
-       // How far through the input and output streams we are
-       int posInp = 0;
-       int posOut = 0;
-
-       // What the next mask byte to output will be
-       int nextMask = 0;
-       // And how many bits we've already set
-       int maskBitsSet = 0;
-
-       public Compressor() {
-               //
-       }
-/**
- * Returns the last place that the bytes from rawCode are found
- *  at in the buffer, or -1 if they can't be found
- */
-private int findRawCodeInBuffer() {
-       // Work our way back from the end
-       // (Visio always seems to use the last possible code)
-       for(int i=(buffer.length - rawCodeLen); i>=0; i--) {
-               boolean matches = true;
-               for(int j=0; matches && j<rawCodeLen; j++) {
-                       if(buffer[i] == rawCode[j]) {
-                               // Fits
-                       } else {
-                               // Doesn't fit, can't be a match
-                               matches = false;
-                       }
-               }
-
-               // Was this position a match?
-               if(matches) {
-                       return i;
-               }
-       }
-
-       // Not found
-       return -1;
-}
-
-/**
- * Output the compressed representation for the bytes
- *  found in rawCode
- */
-private void outputCompressed(OutputStream res) throws IOException {
-       // It's not worth compressing only 1 or two bytes,
-       //  due to the overheads
-       // So if asked, just output uncompressed
-       if(rawCodeLen < 3) {
-               for(int i=0; i<rawCodeLen; i++) {
-                       outputUncompressed(rawCode[i], res);
-               }
-               return;
-       }
-
-       // Increment the mask bit count, we've done another code
-       maskBitsSet++;
-       // Add the length+code to the buffer
-       // (The position is the first 12 bits, the
-       //  length is the last 4 bits)
-       // TODO
-       posOut += 2;
-
-       // If we're now at 8 codes, output
-       if(maskBitsSet == 8) {
-               output8Codes(res);
-       }
-}
-/**
- * Output the un-compressed byte
- */
-private void outputUncompressed(byte b, OutputStream res) throws IOException {
-       // Set the mask bit for us
-       nextMask += (1<<maskBitsSet);
-
-       // And add us to the buffer + dictionary
-       buffer[bufferLen] = fromInt(b);
-       bufferLen++;
-       dict[(posOut&4095)] = fromInt(b);
-       posOut++;
-
-       // If we're now at 8 codes, output
-       if(maskBitsSet == 8) {
-               output8Codes(res);
-       }
-}
-
-/**
- * We've got 8 code worth to write out, so
- *  output along with the header
- */
-private void output8Codes(OutputStream res) throws IOException {
-       // Output the mask and the data
-       res.write(new byte[] { fromInt(nextMask) } );
-       res.write(buffer, 0, bufferLen);
-
-       // Reset things
-       nextMask = 0;
-       maskBitsSet = 0;
-       bufferLen = 0;
-}
-
-/**
- * Does the compression
- */
-public void compress(InputStream src, OutputStream res) throws IOException {
-       // Have we hit the end of the file yet?
-       boolean going = true;
-
-       // This is a byte as looked up in the dictionary
-       // It needs to be signed, as it'll get passed on to
-       //  the output stream
-       byte dataB;
-       // This is an unsigned byte read from the stream
-       // It needs to be unsigned, so that bit stuff works
-       int dataI;
-
-       while( going ) {
-               dataI = src.read();
-               posInp++;
-               if(dataI == -1) { going = false; }
-               dataB = fromInt(dataI);
-
-               // If we've run out of data, output anything that's
-               //  pending then finish
-               if(!going && rawCodeLen > 0) {
-                       outputCompressed(res);
-                       break;
-               }
-
-               // Try adding this new byte onto rawCode, and
-               //  see if all of that is still found in the
-               //  buffer dictionary or not
-               rawCode[rawCodeLen] = dataB;
-               rawCodeLen++;
-               int rawAt = findRawCodeInBuffer();
-
-               // If we found it and are now at 16 bytes,
-               //  we need to output our pending code block
-               if(rawCodeLen == 16 && rawAt > -1) {
-                       outputCompressed(res);
-                       rawCodeLen = 0;
-                       continue;
-               }
-
-               // If we did find all of rawCode with our new
-               //  byte added on, we can wait to see what happens
-               //  with the next byte
-               if(rawAt > -1) {
-                       continue;
-               }
-
-               // If we get here, then the rawCode + this byte weren't
-               // found in the dictionary
-
-               // If there was something in rawCode before, then that was
-               // found in the dictionary, so output that compressed
-               rawCodeLen--;
-               if(rawCodeLen > 0) {
-                       // Output the old rawCode
-                       outputCompressed(res);
-
-                       // Can this byte start a new rawCode, or does
-                       //  it need outputting itself?
-                       rawCode[0] = dataB;
-                       rawCodeLen = 1;
-                       if(findRawCodeInBuffer() > -1) {
-                               // Fits in, wait for next byte
-                               continue;
-                       }
-                       // Doesn't fit, output
-                       outputUncompressed(dataB,res);
-                       rawCodeLen = 0;
-               } else {
-                       // Nothing in rawCode before, so this byte
-                       //  isn't in the buffer dictionary
-                       // Output it un-compressed
-                       outputUncompressed(dataB,res);
-               }
-       }
-}
-}
-
+   /**
+    * Given an integer, turn it into a java byte, handling
+    *  the wrapping.
+    * This is a convenience method
+    */
+   public static byte fromInt(int b) {
+      if(b < 128) return (byte)b;
+      return (byte)(b - 256);
+   }
+   /**
+    * Given a java byte, turn it into an integer between 0
+    *  and 255 (i.e. handle the unwrapping).
+    * This is a convenience method
+    */
+   public static int fromByte(byte b) {
+      if(b >= 0) {
+         return b;
+      }
+      return b + 256;
+   }
+
+   /**
+    * Compress the given input stream, returning the array of bytes
+    *  of the compressed input
+    */
+   public byte[] compress(InputStream src) throws IOException {
+      ByteArrayOutputStream res = new ByteArrayOutputStream();
+      compress(src,res);
+      return res.toByteArray();
+   }
+
+   /**
+    * Decompresses the given input stream, returning the array of bytes
+    *  of the decompressed input.
+    */
+   public byte[] decode(InputStream src) throws IOException {
+      ByteArrayOutputStream res = new ByteArrayOutputStream();
+      decode(src,res);
+      return res.toByteArray();
+   }
+   
+   /**
+    * Perform a streaming decompression of the input.
+    * Works by:
+    * 1) Reading a flag byte, the 8 bits of which tell you if the
+    *     following 8 codes are compressed or un-compressed
+    * 2) Consider the 8 bits in turn
+    * 3) If the bit is set, the next code is un-compressed, so
+    *     add it to the dictionary and output it
+    * 4) If the bit isn't set, then read in the length and start
+    *     position in the dictionary, and output the bytes there
+    * 5) Loop until we've done all 8 bits, then read in the next
+    *     flag byte
+    */
+   public void decode(InputStream src, OutputStream res) throws IOException {
+      // We use 12 bit codes:
+      // * 0-255 are real bytes
+      // * 256-4095 are the substring codes
+      // Java handily initialises our buffer / dictionary
+      //  to all zeros
+      byte[] buffer = new byte[4096];
+
+      // How far through the output we've got
+      // (This is normally used &4095, so it nicely wraps)
+      int pos = 0;
+      // The flag byte is treated as its 8 individual
+      //  bits, which tell us if the following 8 codes
+      //  are compressed or un-compressed
+      int flag;
+      // The mask, between 1 and 255, which is used when
+      //  processing each bit of the flag byte in turn
+      int mask;
+
+      // These are bytes as looked up in the dictionary
+      // It needs to be signed, as it'll get passed on to
+      //  the output stream
+      byte[] dataB = new byte[19];
+      // This is an unsigned byte read from the stream
+      // It needs to be unsigned, so that bit stuff works
+      int dataI;
+      // The compressed code sequence is held over 2 bytes
+      int dataIPt1, dataIPt2;
+      // How long a code sequence is, and where in the
+      //  dictionary to start at
+      int len, pntr;
+
+      while( (flag = src.read()) != -1 ) {
+         // Compare each bit in our flag byte in turn:
+         for(mask = 1; mask < 256 ; mask <<= 1) {
+            // Is this a new code (un-compressed), or
+            //  the use of existing codes (compressed)?
+            if( (flag & mask) > 0 ) {
+               // Retrieve the un-compressed code
+               if( (dataI = src.read()) != -1) {
+                  // Save the byte into the dictionary
+                  buffer[(pos&4095)] = fromInt(dataI);
+                  pos++;
+                  // And output the byte
+                  res.write( new byte[] {fromInt(dataI)} );
+               }
+            } else {
+               // We have a compressed sequence
+               // Grab the next 16 bits of data
+               dataIPt1 = src.read();
+               dataIPt2 = src.read();
+               if(dataIPt1 == -1 || dataIPt2 == -1) break;
+
+               // Build up how long the code sequence is, and
+               //  what position of the code to start at
+               // (The position is the first 12 bits, the
+               //  length is the last 4 bits)
+               len = (dataIPt2 & 15) + 3;
+               pntr = (dataIPt2 & 240)*16 + dataIPt1;
+
+               // If the pointer happens to be past the end
+               //  of our buffer, then wrap around
+               if(pntr > 4078) {
+                  pntr = pntr - 4078;
+               } else {
+                  pntr = pntr + 18;
+               }
+
+               // Loop over the codes, outputting what they correspond to
+               for(int i=0; i<len; i++) {
+                  dataB[i] = buffer[(pntr + i) & 4095];
+                  buffer[ (pos + i) & 4095 ] = dataB[i];
+               }
+               res.write(dataB, 0, len);
+
+               // Record how far along the stream we have moved
+               pos = pos + len;
+            }
+         }
+      }
+   }
+
+   /**
+    * Performs the Visio compatible streaming LZW compression.
+    */
+   public void compress(InputStream src, OutputStream res) throws IOException {
+      HDGFLZWCompressor c = new HDGFLZWCompressor();
+      c.compress(src, res);
+   }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZWCompressor.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZWCompressor.java

new file mode 100644 (file)

index 0000000..3391b7c
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZWCompressor.java
@@ -0,0 +1,241 @@
+package org.apache.poi.hdgf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * Helper class to handle the Visio compatible
+ *  streaming LZW compression.
+ * Need our own class to handle keeping track of the
+ *  code buffer, pending bytes to write out etc.
+ *  
+ * TODO Fix this, as it starts to go wrong on
+ *  large streams 
+ */
+final class HDGFLZWCompressor {
+       // We use 12 bit codes:
+       // * 0-255 are real bytes
+       // * 256-4095 are the substring codes
+       // Java handily initialises our buffer / dictionary
+       //  to all zeros
+       byte[] dict = new byte[4096];
+
+       // The next block of data to be written out, minus
+       //  its mask byte
+       byte[] buffer = new byte[16];
+       // And how long it is
+       // (Un-compressed codes are 1 byte each, compressed codes
+       //   are two)
+       int bufferLen = 0;
+
+       // A code's length is stored in 4 bits as (length - 3), so it is at most 18 bytes
+       byte[] rawCode = new byte[18];
+       // And how much we're using
+       int rawCodeLen = 0;
+
+       // How far through the input and output streams we are
+       int posInp = 0;
+       int posOut = 0;
+
+       // What the next mask byte to output will be
+       int nextMask = 0;
+       // And how many bits we've already set
+       int maskBitsSet = 0;
+
+       public HDGFLZWCompressor() {}
+       
+/**
+ * Returns the last place at which the bytes from rawCode are
+ *  found in the dictionary (dict), or -1 if they can't be found
+ */
+private int findRawCodeInBuffer() {
+       // Work our way through all the codes until we
+   //  find the right one. Visio starts from the end
+       for(int i=4096-rawCodeLen; i>0; i--) {
+               boolean matches = true;
+               for(int j=0; matches && j<rawCodeLen; j++) {
+                       if(dict[i+j] == rawCode[j]) {
+                               // Fits
+                       } else {
+                               // Doesn't fit, can't be a match
+                               matches = false;
+                       }
+               }
+
+               // Was this position a match?
+               if(matches) {
+                       return i;
+               }
+       }
+
+       // Not found
+       return -1;
+}
+
+/**
+ * Output the compressed representation for the bytes
+ *  found in rawCode
+ */
+private void outputCompressed(OutputStream res) throws IOException {
+       // It's not worth compressing only 1 or two bytes,
+       //  due to the overheads
+       // So if asked, just output uncompressed
+       if(rawCodeLen < 3) {
+               for(int i=0; i<rawCodeLen; i++) {
+                       outputUncompressed(rawCode[i], res);
+               }
+               return;
+       }
+       
+       // Grab where the data lives
+       int codesAt = findRawCodeInBuffer();
+   codesAt -= 18;
+       if(codesAt < 0) {
+          codesAt += 4096;
+       }
+
+       // Increment the mask bit count, we've done another code
+       maskBitsSet++;
+       
+       // Add the length+code to the buffer
+       // (The position is the first 12 bits, the
+       //  length is the last 4 bits)
+       int bp1 = (codesAt & 255);
+       int bp2 = (rawCodeLen-3) + ((codesAt-bp1) >> 4);
+       buffer[bufferLen] = HDGFLZW.fromInt(bp1);
+       bufferLen++;
+   buffer[bufferLen] = HDGFLZW.fromInt(bp2);
+   bufferLen++;
+   
+   // Copy the data to the dictionary in the new place
+   for(int i=0; i<rawCodeLen; i++) {
+      dict[(posOut&4095)] = rawCode[i];
+      posOut++; 
+   }
+
+       // If we're now at 8 codes, output
+       if(maskBitsSet == 8) {
+               output8Codes(res);
+       }
+}
+/**
+ * Output the un-compressed byte
+ */
+private void outputUncompressed(byte b, OutputStream res) throws IOException {
+       // Set the mask bit for us
+       nextMask += (1<<maskBitsSet);
+       maskBitsSet++;
+
+       // And add us to the buffer + dictionary
+       buffer[bufferLen] = b;
+       bufferLen++;
+       dict[(posOut&4095)] = b;
+       posOut++;
+
+       // If we're now at 8 codes, output
+       if(maskBitsSet == 8) {
+               output8Codes(res);
+       }
+}
+
+/**
+ * We've got 8 codes' worth to write out, so
+ *  output along with the header
+ */
+private void output8Codes(OutputStream res) throws IOException {
+       // Output the mask and the data
+       res.write(new byte[] { HDGFLZW.fromInt(nextMask) } );
+       res.write(buffer, 0, bufferLen);
+
+       // Reset things
+       nextMask = 0;
+       maskBitsSet = 0;
+       bufferLen = 0;
+}
+
+/**
+ * Does the compression
+ */
+public void compress(InputStream src, OutputStream res) throws IOException {
+       // Have we hit the end of the file yet?
+       boolean going = true;
+
+       // This is a byte as looked up in the dictionary
+       // It needs to be signed, as it'll get passed on to
+       //  the output stream
+       byte dataB;
+       // This is an unsigned byte read from the stream
+       // It needs to be unsigned, so that bit stuff works
+       int dataI;
+
+       while( going ) {
+               dataI = src.read();
+               posInp++;
+               if(dataI == -1) { going = false; }
+               dataB = HDGFLZW.fromInt(dataI);
+
+               // If we've run out of data, output anything that's
+               //  pending then finish
+               if(!going) {
+                  if(rawCodeLen > 0) {
+                outputCompressed(res);
+                if(maskBitsSet > 0) {
+                   output8Codes(res);
+                }
+                  }
+                       break;
+               }
+
+               // Try adding this new byte onto rawCode, and
+               //  see if all of that is still found in the
+               //  buffer dictionary or not
+               rawCode[rawCodeLen] = dataB;
+               rawCodeLen++;
+               int rawAt = findRawCodeInBuffer();
+               
+               // If we found it and are now at 18 bytes,
+               //  we need to output our pending code block
+               if(rawCodeLen == 18 && rawAt > -1) {
+                       outputCompressed(res);
+                       rawCodeLen = 0;
+                       continue;
+               }
+
+               // If we did find all of rawCode with our new
+               //  byte added on, we can wait to see what happens
+               //  with the next byte
+               if(rawAt > -1) {
+                       continue;
+               }
+
+               // If we get here, then the rawCode + this byte weren't
+               // found in the dictionary
+
+               // If there was something in rawCode before, then that was
+               // found in the dictionary, so output that compressed
+               rawCodeLen--;
+               if(rawCodeLen > 0) {
+                       // Output the old rawCode
+                       outputCompressed(res);
+
+                       // Can this byte start a new rawCode, or does
+                       //  it need outputting itself?
+                       rawCode[0] = dataB;
+                       rawCodeLen = 1;
+                       if(findRawCodeInBuffer() > -1) {
+                               // Fits in, wait for next byte
+                               continue;
+                       }
+                       // Doesn't fit, output
+                       outputUncompressed(dataB,res);
+                       rawCodeLen = 0;
+               } else {
+                       // Nothing in rawCode before, so this byte
+                       //  isn't in the buffer dictionary
+                       // Output it un-compressed
+                       outputUncompressed(dataB,res);
+               }
+       }
+}
+}
+\ No newline at end of file
diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java

index f3af7c375f2edf1066b6c985f008456e145cfeb7..b997ce5c4b8e1f14780264232947c11b4a62535e 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
+++ b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
@@ -28,17 +28,19 @@ public final class TestHDGFLZW extends TestCase {
                 -21, -16, // 3 @ 4093
                 1, 0, 0, -72,
                 -13, -16, // 3 @ 5
-               78,       // *mask bit*
+               78,       // *mask bit* 2,3,4,7
                 -32, -5,  // 14 @ 4082
                 1, 0, 3,
                 -21, -16, // 3 @ 4093
                 10, 5,    // 8 @ 28
                 4,
                 -21, -16, // 3 @ 4093
-               21,       // *mask bit*
+               21,       // *mask bit* 1,3,5
                 9,
                 -21, -16, // 3 @ 4093
-               103, -21, -16, 34,
+               103, 
+               -21, -16, // 3 @ 4093
+               34,
                 -36, -1,  // 18 @ 4078
                 52, 15,   // 18 @ 70
                 70, 15,   // 18 @ 88
@@ -169,19 +171,98 @@ public final class TestHDGFLZW extends TestCase {
                 }
         }
  
-       public void DISABLEDtestCompress() throws Exception {
-               assertEquals(339, testTrailerComp.length);
-               assertEquals(632, testTrailerDecomp.length);
+       /**
+        * Test that we can round-trip a little bit.
+        * Uses a part short enough that we agree with visio
+        *  on the best way to compress it
+        */
+       public void testCompressMini() throws Exception {
+          // first 11 bytes compressed = 12 bytes uncompressed
+          byte[] sourceComp = new byte[11];
+          byte[] sourceDecomp = new byte[12];
+          System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
+      System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
  
                 // Compress it using our engine
                 HDGFLZW lzw = new HDGFLZW();
-               byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
+               byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
+               
+               // Now decompress it again
+               byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
  
-               // Now check it's the right data
-               assertEquals(339, comp.length);
-               for(int i=0; i<comp.length; i++) {
-                       if(comp[i] != testTrailerComp[i])
-                               System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
-               }
+               // First up, check the round tripping
+               assertEquals(12, decomp.length);
+      for(int i=0; i<decomp.length; i++) {
+         assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
+      }
+
+               // Now check the compressed intermediate version
+      assertEquals(11, comp.length);
+      for(int i=0; i<comp.length; i++) {
+         assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
+      }
         }
+
+       /**
+        * Tests that we can do several mask pages
+        */
+   public void testCompressMidi() throws Exception {
+      // First 12 -> 11
+      // Next 32 -> 13
+      byte[] sourceComp = new byte[24];
+      byte[] sourceDecomp = new byte[44];
+      System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length);
+      System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length);
+
+      // Compress it using our engine
+      HDGFLZW lzw = new HDGFLZW();
+      byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
+      
+      // We should be 3 characters bigger, as
+      //  we split one compressed bit into two
+      assertEquals(27, comp.length);
+      
+      // Now decompress it again
+      byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+
+      // We can only check the round-tripping, as for now
+      //  visio cheats on re-using a block
+      assertEquals(44, decomp.length);
+      for(int i=0; i<decomp.length; i++) {
+         assertEquals("Wrong at " + i, decomp[i], sourceDecomp[i]);
+      }
+   }
+
+   /**
+    * Gets 160 bytes through then starts going wrong...
+    * TODO Fix this
+    */
+   public void DISABLEDtestCompressFull() throws Exception {
+      assertEquals(339, testTrailerComp.length);
+      assertEquals(632, testTrailerDecomp.length);
+
+      // Compress it using our engine
+      HDGFLZW lzw = new HDGFLZW();
+      byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
+      
+      // Now decompress it again
+      byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+
+//      for(int i=0; i<comp.length; i++) {
+//         System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
+//      }
+      
+      // First up, check the round tripping
+//    assertEquals(632, decomp.length);
+      for(int i=0; i<decomp.length; i++) {
+         assertEquals("Wrong at " + i, decomp[i], testTrailerDecomp[i]);
+      }
+
+      
+      // Now check the compressed intermediate version
+      assertEquals(339, comp.length);
+      for(int i=0; i<comp.length; i++) {
+         assertEquals("Wrong at " + i, comp[i], testTrailerComp[i]);
+      }
+   }
  }
author	Nick Burch <nick@apache.org>
	Thu, 16 Dec 2010 07:41:41 +0000 (07:41 +0000)
committer	Nick Burch <nick@apache.org>
	Thu, 16 Dec 2010 07:41:41 +0000 (07:41 +0000)
src/documentation/content/xdocs/hdgf/index.xml		patch \| blob \| history
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hdgf/HDGFLZWCompressor.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java		patch \| blob \| history