diff options
author | Andreas Beeker <kiwiwings@apache.org> | 2020-01-01 22:44:42 +0000 |
---|---|---|
committer | Andreas Beeker <kiwiwings@apache.org> | 2020-01-01 22:44:42 +0000 |
commit | adb8424bc1a1c9a502d2cd07757615b711d32c50 (patch) | |
tree | c6097e1f80c499176f20b3c29c523e7c348342ae /src/java/org/apache/poi/util/LZWDecompresser.java | |
parent | 07b5bc667c33f5fbab0f2b070a139b087328dd60 (diff) | |
download | poi-adb8424bc1a1c9a502d2cd07757615b711d32c50.tar.gz poi-adb8424bc1a1c9a502d2cd07757615b711d32c50.zip |
Fix Visio compression
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1872223 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/java/org/apache/poi/util/LZWDecompresser.java')
-rw-r--r-- | src/java/org/apache/poi/util/LZWDecompresser.java | 331 |
1 files changed, 157 insertions, 174 deletions
diff --git a/src/java/org/apache/poi/util/LZWDecompresser.java b/src/java/org/apache/poi/util/LZWDecompresser.java index ab24bf0f25..22007b4f72 100644 --- a/src/java/org/apache/poi/util/LZWDecompresser.java +++ b/src/java/org/apache/poi/util/LZWDecompresser.java @@ -23,184 +23,167 @@ import java.io.OutputStream; /** * This class provides common functionality for the - * various LZW implementations in the different file - * formats. + * various LZW implementations in the different file + * formats. * It's currently used by HDGF and HMEF. - * + * <p> * Two good resources on LZW are: - * http://en.wikipedia.org/wiki/LZW - * http://marknelson.us/1989/10/01/lzw-data-compression/ + * http://en.wikipedia.org/wiki/LZW + * http://marknelson.us/1989/10/01/lzw-data-compression/ */ public abstract class LZWDecompresser { - //arbitrarily selected; may need to increase - private static final int MAX_RECORD_LENGTH = 1_000_000; - - /** - * Does the mask bit mean it's compressed or uncompressed? - */ - private final boolean maskMeansCompressed; - /** - * How much to append to the code length in the stream - * to get the real code length? Normally 2 or 3 - */ - private final int codeLengthIncrease; - /** - * Does the 12 bits of the position get stored in - * Little Endian or Big Endian form? - * This controls whether a pos+length of 0x12 0x34 - * becomes a position of 0x123 or 0x312 - */ - private final boolean positionIsBigEndian; - - protected LZWDecompresser(boolean maskMeansCompressed, - int codeLengthIncrease, boolean positionIsBigEndian) { - this.maskMeansCompressed = maskMeansCompressed; - this.codeLengthIncrease = codeLengthIncrease; - this.positionIsBigEndian = positionIsBigEndian; - } - - /** - * Populates the dictionary, and returns where in it - * to begin writing new codes. - * Generally, if the dictionary is pre-populated, then new - * codes should be placed at the end of that block. - * Equally, if the dictionary is left with all zeros, then - * usually the new codes can go in at the start. - */ - protected abstract int populateDictionary(byte[] dict); - - /** - * Adjusts the position offset if needed when looking - * something up in the dictionary. - */ - protected abstract int adjustDictionaryOffset(int offset); - - /** - * Decompresses the given input stream, returning the array of bytes - * of the decompressed input. - */ - public byte[] decompress(InputStream src) throws IOException { - ByteArrayOutputStream res = new ByteArrayOutputStream(); - decompress(src,res); - return res.toByteArray(); - } - - /** - * Perform a streaming decompression of the input. - * Works by: - * 1) Reading a flag byte, the 8 bits of which tell you if the - * following 8 codes are compressed our un-compressed - * 2) Consider the 8 bits in turn - * 3) If the bit is set, the next code is un-compressed, so - * add it to the dictionary and output it - * 4) If the bit isn't set, then read in the length and start - * position in the dictionary, and output the bytes there - * 5) Loop until we've done all 8 bits, then read in the next - * flag byte - */ - public void decompress(InputStream src, OutputStream res) throws IOException { - // How far through the output we've got - // (This is normally used &4095, so it nicely wraps) - // The initial value is set when populating the dictionary - int pos; - // The flag byte is treated as its 8 individual - // bits, which tell us if the following 8 codes - // are compressed or un-compressed - int flag; - // The mask, between 1 and 255, which is used when - // processing each bit of the flag byte in turn - int mask; - - // We use 12 bit codes: - // * 0-255 are real bytes - // * 256-4095 are the substring codes - // Java handily initialises our buffer / dictionary - // to all zeros - byte[] buffer = new byte[4096]; - pos = populateDictionary(buffer); - - // These are bytes as looked up in the dictionary - // It needs to be signed, as it'll get passed on to - // the output stream - byte[] dataB = IOUtils.safelyAllocate(16+codeLengthIncrease, MAX_RECORD_LENGTH); - // This is an unsigned byte read from the stream - // It needs to be unsigned, so that bit stuff works - int dataI; - // The compressed code sequence is held over 2 bytes - int dataIPt1, dataIPt2; - // How long a code sequence is, and where in the - // dictionary to start at - int len, pntr; - - while( (flag = src.read()) != -1 ) { - // Compare each bit in our flag byte in turn: - for(mask = 1; mask < 256 ; mask <<= 1) { - // Is this a new code (un-compressed), or - // the use of existing codes (compressed)? - boolean isMaskSet = (flag & mask) > 0; - if( isMaskSet ^ maskMeansCompressed ) { - // Retrieve the un-compressed code - if( (dataI = src.read()) != -1) { - // Save the byte into the dictionary - buffer[(pos&4095)] = fromInt(dataI); - pos++; - // And output the byte - res.write( new byte[] {fromInt(dataI)} ); - } - } else { - // We have a compressed sequence - // Grab the next 16 bits of data - dataIPt1 = src.read(); - dataIPt2 = src.read(); - if(dataIPt1 == -1 || dataIPt2 == -1) break; - - // Build up how long the code sequence is, and - // what position of the code to start at - // (The position is the usually the first 12 bits, - // and the length is usually the last 4 bits) - len = (dataIPt2 & 15) + codeLengthIncrease; - if(positionIsBigEndian) { - pntr = (dataIPt1<<4) + (dataIPt2>>4); - } else { - pntr = dataIPt1 + ((dataIPt2&0xF0)<<4); - } - - // Adjust the pointer as needed - pntr = adjustDictionaryOffset(pntr); - - // Loop over the codes, outputting what they correspond to - for(int i=0; i<len; i++) { - dataB[i] = buffer[(pntr + i) & 4095]; - buffer[ (pos + i) & 4095 ] = dataB[i]; - } - res.write(dataB, 0, len); - - // Record how far along the stream we have moved - pos = pos + len; + /** the size of our dictionary */ + public static final int DICT_SIZE = 0x1000; + /** the mask for calculating / wrapping dictionary offsets */ + public static final int DICT_MASK = 0xFFF; + + //arbitrarily selected; may need to increase + private static final int MAX_RECORD_LENGTH = 1_000_000; + + /** + * Does the mask bit mean it's compressed or uncompressed? + */ + private final boolean maskMeansCompressed; + /** + * How much to append to the code length in the stream + * to get the real code length? Normally 2 or 3 + */ + private final int codeLengthIncrease; + /** + * Does the 12 bits of the position get stored in + * Little Endian or Big Endian form? + * This controls whether a pos+length of 0x12 0x34 + * becomes a position of 0x123 or 0x312 + */ + private final boolean positionIsBigEndian; + + protected LZWDecompresser(boolean maskMeansCompressed, + int codeLengthIncrease, boolean positionIsBigEndian) { + this.maskMeansCompressed = maskMeansCompressed; + this.codeLengthIncrease = codeLengthIncrease; + this.positionIsBigEndian = positionIsBigEndian; + } + + /** + * Populates the dictionary, and returns where in it + * to begin writing new codes. + * Generally, if the dictionary is pre-populated, then new + * codes should be placed at the end of that block. + * Equally, if the dictionary is left with all zeros, then + * usually the new codes can go in at the start. + */ + protected abstract int populateDictionary(byte[] dict); + + /** + * Adjusts the position offset if needed when looking + * something up in the dictionary. + */ + protected abstract int adjustDictionaryOffset(int offset); + + /** + * Decompresses the given input stream, returning the array of bytes + * of the decompressed input. + */ + public byte[] decompress(InputStream src) throws IOException { + ByteArrayOutputStream res = new ByteArrayOutputStream(); + decompress(src, res); + return res.toByteArray(); + } + + /** + * Perform a streaming decompression of the input. + * Works by: + * 1) Reading a flag byte, the 8 bits of which tell you if the + * following 8 codes are compressed our un-compressed + * 2) Consider the 8 bits in turn + * 3) If the bit is set, the next code is un-compressed, so + * add it to the dictionary and output it + * 4) If the bit isn't set, then read in the length and start + * position in the dictionary, and output the bytes there + * 5) Loop until we've done all 8 bits, then read in the next + * flag byte + */ + public void decompress(InputStream src, OutputStream res) throws IOException { + // How far through the output we've got + // (This is normally used &4095, so it nicely wraps) + // The initial value is set when populating the dictionary + int pos; + // The flag byte is treated as its 8 individual + // bits, which tell us if the following 8 codes + // are compressed or un-compressed + int flag; + // The mask, between 1 and 255, which is used when + // processing each bit of the flag byte in turn + int mask; + + // We use 12 bit codes: + // * 0-255 are real bytes + // * 256-4095 are the substring codes + // Java handily initialises our buffer / dictionary + // to all zeros + final byte[] buffer = new byte[DICT_SIZE]; + pos = populateDictionary(buffer); + + // These are bytes as looked up in the dictionary + // It needs to be signed, as it'll get passed on to + // the output stream + final byte[] dataB = IOUtils.safelyAllocate(16 + codeLengthIncrease, MAX_RECORD_LENGTH); + // This is an unsigned byte read from the stream + // It needs to be unsigned, so that bit stuff works + int dataI; + // The compressed code sequence is held over 2 bytes + int dataIPt1, dataIPt2; + // How long a code sequence is, and where in the + // dictionary to start at + int len, pntr; + + while ((flag = src.read()) != -1) { + // Compare each bit in our flag byte in turn: + for (mask = 1; mask < 0x100; mask <<= 1) { + // Is this a new code (un-compressed), or + // the use of existing codes (compressed)? + boolean isMaskSet = (flag & mask) > 0; + if (isMaskSet ^ maskMeansCompressed) { + // Retrieve the un-compressed code + if ((dataI = src.read()) != -1) { + // Save the byte into the dictionary + buffer[pos++ & DICT_MASK] = (byte) dataI; + // And output the byte + res.write(dataI); + } + } else { + // We have a compressed sequence + // Grab the next 16 bits of data + dataIPt1 = src.read(); + dataIPt2 = src.read(); + if (dataIPt1 == -1 || dataIPt2 == -1) break; + + // Build up how long the code sequence is, and + // what position of the code to start at + // (The position is the usually the first 12 bits, + // and the length is usually the last 4 bits) + len = (dataIPt2 & 0x0F) + codeLengthIncrease; + if (positionIsBigEndian) { + pntr = (dataIPt1 << 4) + (dataIPt2 >>> 4); + } else { + pntr = dataIPt1 + ((dataIPt2 & 0xF0) << 4); + } + + // Adjust the pointer as needed + pntr = adjustDictionaryOffset(pntr); + + // Loop over the codes, outputting what they correspond to + for (int i = 0; i < len; i++) { + dataB[i] = buffer[(pntr + i) & DICT_MASK]; + buffer[(pos + i) & DICT_MASK] = dataB[i]; + } + res.write(dataB, 0, len); + + // Record how far along the stream we have moved + pos += len; + } } - } - } - } - - /** - * Given an integer, turn it into a java byte, handling - * the wrapping. - * This is a convenience method - */ - public static byte fromInt(int b) { - if(b < 128) return (byte)b; - return (byte)(b - 256); - } - /** - * Given a java byte, turn it into an integer between 0 - * and 255 (i.e. handle the unwrapping). - * This is a convenience method - */ - public static int fromByte(byte b) { - if(b >= 0) { - return b; - } - return b + 256; - } + } + } } |