Refactor the common LZW decompression code out into utils

author Nick Burch <nick@apache.org>

Tue, 21 Dec 2010 05:18:34 +0000 (05:18 +0000)

committer Nick Burch <nick@apache.org>

Tue, 21 Dec 2010 05:18:34 +0000 (05:18 +0000)
author Nick Burch <nick@apache.org>
Tue, 21 Dec 2010 05:18:34 +0000 (05:18 +0000)
committer Nick Burch <nick@apache.org>
Tue, 21 Dec 2010 05:18:34 +0000 (05:18 +0000)
diff --git a/src/java/org/apache/poi/util/LZWDecompresser.java b/src/java/org/apache/poi/util/LZWDecompresser.java

new file mode 100644 (file)

index 0000000..f172a01
--- /dev/null
+++ b/src/java/org/apache/poi/util/LZWDecompresser.java
@@ -0,0 +1,178 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * This class provides common functionality for the
+ *  various LZW implementations in the different file
+ *  formats.
+ * It's currently used by HDGF and HMEF.
+ *
+ * Two good resources on LZW are:
+ *  http://en.wikipedia.org/wiki/LZW
+ *  http://marknelson.us/1989/10/01/lzw-data-compression/
+ */
+public abstract class LZWDecompresser {
+   /**
+    * Does the mask bit mean it's compressed or uncompressed?
+    */
+   private boolean maskMeansCompressed;
+   
+   protected LZWDecompresser(boolean maskMeansCompressed) {
+      this.maskMeansCompressed = maskMeansCompressed;
+   }
+   
+   /**
+    * Populates the dictionary. May not need
+    *  to do anything if all zeros is fine.
+    */
+   protected abstract void populateDictionary(byte[] dict);
+   
+   /**
+    * Adjusts the position offset if needed when looking
+    *  something up in the dictionary.
+    */
+   protected abstract int adjustDictionaryOffset(int offset); 
+   
+   /**
+    * Decompresses the given input stream, returning the array of bytes
+    *  of the decompressed input.
+    */
+   public byte[] decompress(InputStream src) throws IOException {
+      ByteArrayOutputStream res = new ByteArrayOutputStream();
+      decompress(src,res);
+      return res.toByteArray();
+   }
+   
+   /**
+    * Perform a streaming decompression of the input.
+    * Works by:
+    * 1) Reading a flag byte, the 8 bits of which tell you if the
+    *     following 8 codes are compressed or un-compressed
+    * 2) Consider the 8 bits in turn
+    * 3) If the bit is set and a set bit means un-compressed, the next
+    *     code is a literal byte, so add it to the dictionary and output it
+    * 4) Otherwise, read in the length and start position in the
+    *     dictionary, and output the bytes there
+    * 5) Loop until we've done all 8 bits, then read in the next
+    *     flag byte
+    */
+   public void decompress(InputStream src, OutputStream res) throws IOException {
+      // We use 12 bit codes:
+      // * 0-255 are real bytes
+      // * 256-4095 are the substring codes
+      // Java handily initialises our buffer / dictionary
+      //  to all zeros
+      byte[] buffer = new byte[4096];
+      populateDictionary(buffer);
+
+      // How far through the output we've got
+      // (This is normally used &4095, so it nicely wraps)
+      int pos = 0;
+      // The flag byte is treated as its 8 individual
+      //  bits, which tell us if the following 8 codes
+      //  are compressed or un-compressed
+      int flag;
+      // The mask, a single bit between 1 and 128, which is used when
+      //  processing each bit of the flag byte in turn
+      int mask;
+
+      // These are bytes as looked up in the dictionary
+      // It needs to be signed, as it'll get passed on to
+      //  the output stream
+      byte[] dataB = new byte[19];
+      // This is an unsigned byte read from the stream
+      // It needs to be unsigned, so that bit stuff works
+      int dataI;
+      // The compressed code sequence is held over 2 bytes
+      int dataIPt1, dataIPt2;
+      // How long a code sequence is, and where in the
+      //  dictionary to start at
+      int len, pntr;
+
+      while( (flag = src.read()) != -1 ) {
+         // Compare each bit in our flag byte in turn:
+         for(mask = 1; mask < 256 ; mask <<= 1) {
+            // Is this a new code (un-compressed), or the use of existing codes
+            //  (compressed)? NOTE: if maskMeansCompressed, always taken as compressed
+            boolean isMaskSet = (flag & mask) > 0;
+            if( isMaskSet && !maskMeansCompressed ) {
+               // Retrieve the un-compressed code
+               if( (dataI = src.read()) != -1) {
+                  // Save the byte into the dictionary
+                  buffer[(pos&4095)] = fromInt(dataI);
+                  pos++;
+                  // And output the byte
+                  res.write( new byte[] {fromInt(dataI)} );
+               }
+            } else {
+               // We have a compressed sequence
+               // Grab the next 16 bits of data
+               dataIPt1 = src.read();
+               dataIPt2 = src.read();
+               if(dataIPt1 == -1 || dataIPt2 == -1) break;
+
+               // Build up how long the code sequence is, and
+               //  what position of the code to start at
+               // (The position is the first 12 bits, the
+               //  length is the last 4 bits)
+               len = (dataIPt2 & 15) + 3;
+               pntr = (dataIPt2 & 240)*16 + dataIPt1;
+
+               // Adjust the pointer as needed
+               pntr = adjustDictionaryOffset(pntr);
+
+               // Loop over the codes, outputting what they correspond to
+               for(int i=0; i<len; i++) {
+                  dataB[i] = buffer[(pntr + i) & 4095];
+                  buffer[ (pos + i) & 4095 ] = dataB[i];
+               }
+               res.write(dataB, 0, len);
+
+               // Record how far along the stream we have moved
+               pos = pos + len;
+            }
+         }
+      }
+   }
+
+   /**
+    * Given an integer, turn it into a java byte, handling
+    *  the wrapping.
+    * This is a convenience method
+    */
+   public static byte fromInt(int b) {
+      if(b < 128) return (byte)b;
+      return (byte)(b - 256);
+   }
+   /**
+    * Given a java byte, turn it into an integer between 0
+    *  and 255 (i.e. handle the unwrapping).
+    * This is a convenience method
+    */
+   public static int fromByte(byte b) {
+      if(b >= 0) {
+         return b;
+      }
+      return b + 256;
+   }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java

index d6d8d6b764df419fa3d4e7c38c5f90ec64534809..290c14799b7866f1e94a70f23e8911eadf343c52 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java
@@ -21,6 +21,8 @@ import java.io.IOException;
  import java.io.InputStream;
  import java.io.OutputStream;
  
+import org.apache.poi.util.LZWDecompresser;
+
  /**
   * A decoder for the crazy LZW implementation used
   *  in Visio.
@@ -33,27 +35,10 @@ import java.io.OutputStream;
   *  http://en.wikipedia.org/wiki/LZW
   *  http://marknelson.us/1989/10/01/lzw-data-compression/
   */
-public class HDGFLZW {
-
-   /**
-    * Given an integer, turn it into a java byte, handling
-    *  the wrapping.
-    * This is a convenience method
-    */
-   public static byte fromInt(int b) {
-      if(b < 128) return (byte)b;
-      return (byte)(b - 256);
-   }
-   /**
-    * Given a java byte, turn it into an integer between 0
-    *  and 255 (i.e. handle the unwrapping).
-    * This is a convenience method
-    */
-   public static int fromByte(byte b) {
-      if(b >= 0) {
-         return b;
-      }
-      return b + 256;
+public class HDGFLZW extends LZWDecompresser {
+   public HDGFLZW() {
+      // We're the wrong way round!
+      super(false);
     }
  
     /**
@@ -67,108 +52,23 @@ public class HDGFLZW {
     }
  
     /**
-    * Decompresses the given input stream, returning the array of bytes
-    *  of the decompressed input.
+    * We have a slight shift by 18 bytes
      */
-   public byte[] decode(InputStream src) throws IOException {
-      ByteArrayOutputStream res = new ByteArrayOutputStream();
-      decode(src,res);
-      return res.toByteArray();
+   @Override
+   protected int adjustDictionaryOffset(int pntr) {
+      if(pntr > 4078) {
+         pntr = pntr - 4078;
+      } else {
+         pntr = pntr + 18;
+      }
+      return pntr;
     }
-   
+
     /**
-    * Perform a streaming decompression of the input.
-    * Works by:
-    * 1) Reading a flag byte, the 8 bits of which tell you if the
-    *     following 8 codes are compressed our un-compressed
-    * 2) Consider the 8 bits in turn
-    * 3) If the bit is set, the next code is un-compressed, so
-    *     add it to the dictionary and output it
-    * 4) If the bit isn't set, then read in the length and start
-    *     position in the dictionary, and output the bytes there
-    * 5) Loop until we've done all 8 bits, then read in the next
-    *     flag byte
+    * We want an empty dictionary, so do nothing
      */
-   public void decode(InputStream src, OutputStream res) throws IOException {
-      // We use 12 bit codes:
-      // * 0-255 are real bytes
-      // * 256-4095 are the substring codes
-      // Java handily initialises our buffer / dictionary
-      //  to all zeros
-      byte[] buffer = new byte[4096];
-
-      // How far through the output we've got
-      // (This is normally used &4095, so it nicely wraps)
-      int pos = 0;
-      // The flag byte is treated as its 8 individual
-      //  bits, which tell us if the following 8 codes
-      //  are compressed or un-compressed
-      int flag;
-      // The mask, between 1 and 255, which is used when
-      //  processing each bit of the flag byte in turn
-      int mask;
-
-      // These are bytes as looked up in the dictionary
-      // It needs to be signed, as it'll get passed on to
-      //  the output stream
-      byte[] dataB = new byte[19];
-      // This is an unsigned byte read from the stream
-      // It needs to be unsigned, so that bit stuff works
-      int dataI;
-      // The compressed code sequence is held over 2 bytes
-      int dataIPt1, dataIPt2;
-      // How long a code sequence is, and where in the
-      //  dictionary to start at
-      int len, pntr;
-
-      while( (flag = src.read()) != -1 ) {
-         // Compare each bit in our flag byte in turn:
-         for(mask = 1; mask < 256 ; mask <<= 1) {
-            // Is this a new code (un-compressed), or
-            //  the use of existing codes (compressed)?
-            if( (flag & mask) > 0 ) {
-               // Retrieve the un-compressed code
-               if( (dataI = src.read()) != -1) {
-                  // Save the byte into the dictionary
-                  buffer[(pos&4095)] = fromInt(dataI);
-                  pos++;
-                  // And output the byte
-                  res.write( new byte[] {fromInt(dataI)} );
-               }
-            } else {
-               // We have a compressed sequence
-               // Grab the next 16 bits of data
-               dataIPt1 = src.read();
-               dataIPt2 = src.read();
-               if(dataIPt1 == -1 || dataIPt2 == -1) break;
-
-               // Build up how long the code sequence is, and
-               //  what position of the code to start at
-               // (The position is the first 12 bits, the
-               //  length is the last 4 bits)
-               len = (dataIPt2 & 15) + 3;
-               pntr = (dataIPt2 & 240)*16 + dataIPt1;
-
-               // If the pointer happens to be passed the end
-               //  of our buffer, then wrap around
-               if(pntr > 4078) {
-                  pntr = pntr - 4078;
-               } else {
-                  pntr = pntr + 18;
-               }
-
-               // Loop over the codes, outputting what they correspond to
-               for(int i=0; i<len; i++) {
-                  dataB[i] = buffer[(pntr + i) & 4095];
-                  buffer[ (pos + i) & 4095 ] = dataB[i];
-               }
-               res.write(dataB, 0, len);
-
-               // Record how far along the stream we have moved
-               pos = pos + len;
-            }
-         }
-      }
+   @Override
+   protected void populateDictionary(byte[] dict) {
     }
  
     /**
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java b/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java

index da6f3c11a59aed63661a3b7bf5a3781d5c8ace90..2fae8888d7da125a260b3d954076e7f9eb8abd7d 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java
@@ -78,7 +78,7 @@ public final class CompressedStreamStore extends StreamStore {
  
                 // Decompress
                 HDGFLZW lzw = new HDGFLZW();
-               byte[] decompressed = lzw.decode(bais);
+               byte[] decompressed = lzw.decompress(bais);
  
                 // Split into header and contents
                 byte[][] ret = new byte[2][];
diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java

index b997ce5c4b8e1f14780264232947c11b4a62535e..7f89a050fb0ce7951bb5c6a3485cef9758e05ee5 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
+++ b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java
@@ -139,9 +139,9 @@ public final class TestHDGFLZW extends TestCase {
                 assertEquals(339, testTrailerComp.length);
                 assertEquals(632, testTrailerDecomp.length);
  
-               // Decode it using our engine
+               // Decompress it using our engine
                 HDGFLZW lzw = new HDGFLZW();
-               byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
+               byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
  
                 // Check it's of the right size
                 assertEquals(632, dec.length);
@@ -159,9 +159,9 @@ public final class TestHDGFLZW extends TestCase {
                 assertEquals(339, testTrailerComp.length);
                 assertEquals(632, testTrailerDecomp.length);
  
-               // Decode it using our engine
+               // Decompress it using our engine
                 HDGFLZW lzw = new HDGFLZW();
-               byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
+               byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
  
                 // Now check it's the right data
                 assertEquals(632, dec.length);
@@ -188,7 +188,7 @@ public final class TestHDGFLZW extends TestCase {
                 byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
                 
                 // Now decompress it again
-               byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+               byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
  
                 // First up, check the round tripping
                 assertEquals(12, decomp.length);
@@ -223,7 +223,7 @@ public final class TestHDGFLZW extends TestCase {
        assertEquals(27, comp.length);
        
        // Now decompress it again
-      byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+      byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
  
        // We can only check the round-tripping, as for now
        //  visio cheats on re-using a block
@@ -246,7 +246,7 @@ public final class TestHDGFLZW extends TestCase {
        byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
        
        // Now decompress it again
-      byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+      byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
  
  //      for(int i=0; i<comp.length; i++) {
  //         System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);
author	Nick Burch <nick@apache.org>
	Tue, 21 Dec 2010 05:18:34 +0000 (05:18 +0000)
committer	Nick Burch <nick@apache.org>
	Tue, 21 Dec 2010 05:18:34 +0000 (05:18 +0000)
src/java/org/apache/poi/util/LZWDecompresser.java	[new file with mode: 0644]	patch \| blob
src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hdgf/streams/CompressedStreamStore.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java		patch \| blob \| history