Fix Visio compression

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1872223 13f79535-47bb-0310-9956-ffa450edef68
author: Andreas Beeker <kiwiwings@apache.org> 2020-01-01 22:44:42 +0000
committer: Andreas Beeker <kiwiwings@apache.org> 2020-01-01 22:44:42 +0000
commit: adb8424bc1a1c9a502d2cd07757615b711d32c50 (patch)
tree: c6097e1f80c499176f20b3c29c523e7c348342ae /src/java/org/apache/poi
parent: 07b5bc667c33f5fbab0f2b070a139b087328dd60 (diff)
download: poi-adb8424bc1a1c9a502d2cd07757615b711d32c50.tar.gz
poi-adb8424bc1a1c9a502d2cd07757615b711d32c50.zip
1 files changed, 157 insertions, 174 deletions
diff --git a/src/java/org/apache/poi/util/LZWDecompresser.java b/src/java/org/apache/poi/util/LZWDecompresser.java
index ab24bf0f25..22007b4f72 100644
--- a/src/java/org/apache/poi/util/LZWDecompresser.java
+++ b/src/java/org/apache/poi/util/LZWDecompresser.java
@@ -23,184 +23,167 @@ import java.io.OutputStream;
 
 /**
  * This class provides common functionality for the
- *  various LZW implementations in the different file
- *  formats.
+ * various LZW implementations in the different file
+ * formats.
  * It's currently used by HDGF and HMEF.
- *
+ * <p>
  * Two good resources on LZW are:
- *  http://en.wikipedia.org/wiki/LZW
- *  http://marknelson.us/1989/10/01/lzw-data-compression/
+ * http://en.wikipedia.org/wiki/LZW
+ * http://marknelson.us/1989/10/01/lzw-data-compression/
  */
 public abstract class LZWDecompresser {
 
-   //arbitrarily selected; may need to increase
-   private static final int MAX_RECORD_LENGTH = 1_000_000;
-
-   /**
-    * Does the mask bit mean it's compressed or uncompressed?
-    */
-   private final boolean maskMeansCompressed;
-   /**
-    * How much to append to the code length in the stream
-    *  to get the real code length? Normally 2 or 3
-    */
-   private final int codeLengthIncrease;
-   /**
-    * Does the 12 bits of the position get stored in
-    *  Little Endian or Big Endian form?
-    * This controls whether a pos+length of 0x12 0x34
-    *  becomes a position of 0x123 or 0x312
-    */
-   private final boolean positionIsBigEndian;
-   
-   protected LZWDecompresser(boolean maskMeansCompressed, 
-            int codeLengthIncrease, boolean positionIsBigEndian) {
-      this.maskMeansCompressed = maskMeansCompressed;
-      this.codeLengthIncrease = codeLengthIncrease;
-      this.positionIsBigEndian = positionIsBigEndian;
-   }
-   
-   /**
-    * Populates the dictionary, and returns where in it
-    *  to begin writing new codes.
-    * Generally, if the dictionary is pre-populated, then new
-    *  codes should be placed at the end of that block.
-    * Equally, if the dictionary is left with all zeros, then
-    *  usually the new codes can go in at the start.
-    */
-   protected abstract int populateDictionary(byte[] dict);
-   
-   /**
-    * Adjusts the position offset if needed when looking
-    *  something up in the dictionary.
-    */
-   protected abstract int adjustDictionaryOffset(int offset);
-   
-   /**
-    * Decompresses the given input stream, returning the array of bytes
-    *  of the decompressed input.
-    */
-   public byte[] decompress(InputStream src) throws IOException {
-      ByteArrayOutputStream res = new ByteArrayOutputStream();
-      decompress(src,res);
-      return res.toByteArray();
-   }
-   
-   /**
-    * Perform a streaming decompression of the input.
-    * Works by:
-    * 1) Reading a flag byte, the 8 bits of which tell you if the
-    *     following 8 codes are compressed our un-compressed
-    * 2) Consider the 8 bits in turn
-    * 3) If the bit is set, the next code is un-compressed, so
-    *     add it to the dictionary and output it
-    * 4) If the bit isn't set, then read in the length and start
-    *     position in the dictionary, and output the bytes there
-    * 5) Loop until we've done all 8 bits, then read in the next
-    *     flag byte
-    */
-   public void decompress(InputStream src, OutputStream res) throws IOException {
-      // How far through the output we've got
-      // (This is normally used &4095, so it nicely wraps)
-      // The initial value is set when populating the dictionary
-      int pos;
-      // The flag byte is treated as its 8 individual
-      //  bits, which tell us if the following 8 codes
-      //  are compressed or un-compressed
-      int flag;
-      // The mask, between 1 and 255, which is used when
-      //  processing each bit of the flag byte in turn
-      int mask;
-
-      // We use 12 bit codes:
-      // * 0-255 are real bytes
-      // * 256-4095 are the substring codes
-      // Java handily initialises our buffer / dictionary
-      //  to all zeros
-      byte[] buffer = new byte[4096];
-      pos = populateDictionary(buffer);
-
-      // These are bytes as looked up in the dictionary
-      // It needs to be signed, as it'll get passed on to
-      //  the output stream
-      byte[] dataB = IOUtils.safelyAllocate(16+codeLengthIncrease, MAX_RECORD_LENGTH);
-      // This is an unsigned byte read from the stream
-      // It needs to be unsigned, so that bit stuff works
-      int dataI;
-      // The compressed code sequence is held over 2 bytes
-      int dataIPt1, dataIPt2;
-      // How long a code sequence is, and where in the
-      //  dictionary to start at
-      int len, pntr;
-
-      while( (flag = src.read()) != -1 ) {
-         // Compare each bit in our flag byte in turn:
-         for(mask = 1; mask < 256 ; mask <<= 1) {
-            // Is this a new code (un-compressed), or
-            //  the use of existing codes (compressed)?
-            boolean isMaskSet = (flag & mask) > 0;
-            if( isMaskSet ^ maskMeansCompressed ) {
-               // Retrieve the un-compressed code
-               if( (dataI = src.read()) != -1) {
-                  // Save the byte into the dictionary
-                  buffer[(pos&4095)] = fromInt(dataI);
-                  pos++;
-                  // And output the byte
-                  res.write( new byte[] {fromInt(dataI)} );
-               }
-            } else {
-               // We have a compressed sequence
-               // Grab the next 16 bits of data
-               dataIPt1 = src.read();
-               dataIPt2 = src.read();
-               if(dataIPt1 == -1 || dataIPt2 == -1) break;
-
-               // Build up how long the code sequence is, and
-               //  what position of the code to start at
-               // (The position is the usually the first 12 bits, 
-               //  and the length is usually the last 4 bits)
-               len = (dataIPt2 & 15) + codeLengthIncrease;
-               if(positionIsBigEndian) {
-                  pntr = (dataIPt1<<4) + (dataIPt2>>4);
-               } else {
-                  pntr = dataIPt1 + ((dataIPt2&0xF0)<<4);
-               }
-               
-               // Adjust the pointer as needed
-               pntr = adjustDictionaryOffset(pntr);
-
-               // Loop over the codes, outputting what they correspond to
-               for(int i=0; i<len; i++) {
-                  dataB[i] = buffer[(pntr + i) & 4095];
-                  buffer[ (pos + i) & 4095 ] = dataB[i];
-               }
-               res.write(dataB, 0, len);
-
-               // Record how far along the stream we have moved
-               pos = pos + len;
+    /** the size of our dictionary */
+    public static final int DICT_SIZE = 0x1000;
+    /** the mask for calculating / wrapping dictionary offsets */
+    public static final int DICT_MASK = 0xFFF;
+
+    //arbitrarily selected; may need to increase
+    private static final int MAX_RECORD_LENGTH = 1_000_000;
+
+    /**
+     * Does the mask bit mean it's compressed or uncompressed?
+     */
+    private final boolean maskMeansCompressed;
+    /**
+     * How much to add to the code length read from the stream
+     * to get the real code length? Normally 2 or 3
+     */
+    private final int codeLengthIncrease;
+    /**
+     * Do the 12 bits of the position get stored in
+     * Little Endian or Big Endian form?
+     * This controls whether a pos+length of 0x12 0x34
+     * becomes a position of 0x123 or 0x312
+     */
+    private final boolean positionIsBigEndian;
+
+    protected LZWDecompresser(boolean maskMeansCompressed,
+                              int codeLengthIncrease, boolean positionIsBigEndian) {
+        this.maskMeansCompressed = maskMeansCompressed;
+        this.codeLengthIncrease = codeLengthIncrease;
+        this.positionIsBigEndian = positionIsBigEndian;
+    }
+
+    /**
+     * Populates the dictionary, and returns where in it
+     * to begin writing new codes.
+     * Generally, if the dictionary is pre-populated, then new
+     * codes should be placed at the end of that block.
+     * Equally, if the dictionary is left with all zeros, then
+     * usually the new codes can go in at the start.
+     */
+    protected abstract int populateDictionary(byte[] dict);
+
+    /**
+     * Adjusts the position offset if needed when looking
+     * something up in the dictionary.
+     */
+    protected abstract int adjustDictionaryOffset(int offset);
+
+    /**
+     * Decompresses the given input stream, returning the array of bytes
+     * of the decompressed input.
+     */
+    public byte[] decompress(InputStream src) throws IOException {
+        ByteArrayOutputStream res = new ByteArrayOutputStream();
+        decompress(src, res);
+        return res.toByteArray();
+    }
+
+    /**
+     * Perform a streaming decompression of the input.
+     * Works by:
+     * 1) Reading a flag byte, the 8 bits of which tell you if the
+     * following 8 codes are compressed or un-compressed
+     * 2) Consider the 8 bits in turn
+     * 3) If the bit marks the next code as un-compressed (bit set, or bit
+     * clear when maskMeansCompressed is true), add it to the dictionary and output it
+     * 4) Otherwise, read in the length and start position in the
+     * dictionary, and output the bytes there (also adding them to the dictionary)
+     * 5) Loop until we've done all 8 bits, then read in the next
+     * flag byte
+     */
+    public void decompress(InputStream src, OutputStream res) throws IOException {
+        // How far through the output we've got
+        // (This is normally used &4095, so it nicely wraps)
+        // The initial value is set when populating the dictionary
+        int pos;
+        // The flag byte is treated as its 8 individual
+        //  bits, which tell us if the following 8 codes
+        //  are compressed or un-compressed
+        int flag;
+        // The mask, a single bit from 1 up to 128, which is used when
+        //  processing each bit of the flag byte in turn
+        int mask;
+
+        // We use 12 bit codes:
+        // * 0-255 are real bytes
+        // * 256-4095 are the substring codes
+        // Java handily initialises our buffer / dictionary
+        //  to all zeros
+        final byte[] buffer = new byte[DICT_SIZE];
+        pos = populateDictionary(buffer);
+
+        // These are bytes as looked up in the dictionary
+        // It needs to be signed, as it'll get passed on to
+        //  the output stream
+        final byte[] dataB = IOUtils.safelyAllocate(16 + codeLengthIncrease, MAX_RECORD_LENGTH);
+        // This is an unsigned byte read from the stream
+        // It needs to be unsigned, so that bit stuff works
+        int dataI;
+        // The compressed code sequence is held over 2 bytes
+        int dataIPt1, dataIPt2;
+        // How long a code sequence is, and where in the
+        //  dictionary to start at
+        int len, pntr;
+
+        while ((flag = src.read()) != -1) {
+            // Compare each bit in our flag byte in turn:
+            for (mask = 1; mask < 0x100; mask <<= 1) {
+                // Is this a new code (un-compressed), or
+                //  the use of existing codes (compressed)?
+                boolean isMaskSet = (flag & mask) > 0;
+                if (isMaskSet ^ maskMeansCompressed) {
+                    // Retrieve the un-compressed code
+                    if ((dataI = src.read()) != -1) {
+                        // Save the byte into the dictionary
+                        buffer[pos++ & DICT_MASK] = (byte) dataI;
+                        // And output the byte
+                        res.write(dataI);
+                    }
+                } else {
+                    // We have a compressed sequence
+                    // Grab the next 16 bits of data
+                    dataIPt1 = src.read();
+                    dataIPt2 = src.read();
+                    if (dataIPt1 == -1 || dataIPt2 == -1) break;
+
+                    // Build up how long the code sequence is, and
+                    //  what position of the code to start at
+                    // (The position is usually the first 12 bits,
+                    //  and the length is usually the last 4 bits)
+                    len = (dataIPt2 & 0x0F) + codeLengthIncrease;
+                    if (positionIsBigEndian) {
+                        pntr = (dataIPt1 << 4) + (dataIPt2 >>> 4);
+                    } else {
+                        pntr = dataIPt1 + ((dataIPt2 & 0xF0) << 4);
+                    }
+
+                    // Adjust the pointer as needed
+                    pntr = adjustDictionaryOffset(pntr);
+
+                    // Loop over the codes, outputting what they correspond to
+                    for (int i = 0; i < len; i++) {
+                        dataB[i] = buffer[(pntr + i) & DICT_MASK];
+                        buffer[(pos + i) & DICT_MASK] = dataB[i];
+                    }
+                    res.write(dataB, 0, len);
+
+                    // Record how far along the stream we have moved
+                    pos += len;
+                }
             }
-         }
-      }
-   }
-
-   /**
-    * Given an integer, turn it into a java byte, handling
-    *  the wrapping.
-    * This is a convenience method
-    */
-   public static byte fromInt(int b) {
-      if(b < 128) return (byte)b;
-      return (byte)(b - 256);
-   }
-   /**
-    * Given a java byte, turn it into an integer between 0
-    *  and 255 (i.e. handle the unwrapping).
-    * This is a convenience method
-    */
-   public static int fromByte(byte b) {
-      if(b >= 0) {
-         return b;
-      }
-      return b + 256;
-   }
+        }
+    }
 }
author	Andreas Beeker <kiwiwings@apache.org>	2020-01-01 22:44:42 +0000
committer	Andreas Beeker <kiwiwings@apache.org>	2020-01-01 22:44:42 +0000
commit	adb8424bc1a1c9a502d2cd07757615b711d32c50 (patch)
tree	c6097e1f80c499176f20b3c29c523e7c348342ae /src/java/org/apache/poi
parent	07b5bc667c33f5fbab0f2b070a139b087328dd60 (diff)
download	poi-adb8424bc1a1c9a502d2cd07757615b711d32c50.tar.gz poi-adb8424bc1a1c9a502d2cd07757615b711d32c50.zip