From: Nick Burch Date: Tue, 21 Dec 2010 05:18:34 +0000 (+0000) Subject: Refactor the common LZW decompression code out into utils X-Git-Tag: REL_3_8_BETA1~82 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=bd17bf45a966eae21fe716fe028c2d357f7b1ec4;p=poi.git Refactor the common LZW decompression code out into utils git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1051377 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/java/org/apache/poi/util/LZWDecompresser.java b/src/java/org/apache/poi/util/LZWDecompresser.java new file mode 100644 index 0000000000..f172a01db4 --- /dev/null +++ b/src/java/org/apache/poi/util/LZWDecompresser.java @@ -0,0 +1,178 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +/** + * This class provides common functionality for the + * various LZW implementations in the different file + * formats. + * It's currently used by HDGF and HMEF. 
+ * + * Two good resources on LZW are: + * http://en.wikipedia.org/wiki/LZW + * http://marknelson.us/1989/10/01/lzw-data-compression/ + */ +public abstract class LZWDecompresser { + /** + * Does the mask bit mean it's compressed or uncompressed? + */ + private boolean maskMeansCompressed; + + protected LZWDecompresser(boolean maskMeansCompressed) { + this.maskMeansCompressed = maskMeansCompressed; + } + + /** + * Populates the dictionary. May not need + * to do anything if all zeros is fine. + */ + protected abstract void populateDictionary(byte[] dict); + + /** + * Adjusts the position offset if needed when looking + * something up in the dictionary. + */ + protected abstract int adjustDictionaryOffset(int offset); + + /** + * Decompresses the given input stream, returning the array of bytes + * of the decompressed input. + */ + public byte[] decompress(InputStream src) throws IOException { + ByteArrayOutputStream res = new ByteArrayOutputStream(); + decompress(src,res); + return res.toByteArray(); + } + + /** + * Perform a streaming decompression of the input. 
+ * Works by: + * 1) Reading a flag byte, the 8 bits of which tell you if the + * following 8 codes are compressed or un-compressed + * 2) Consider the 8 bits in turn + * 3) If the bit is set, the next code is un-compressed, so + * add it to the dictionary and output it + * 4) If the bit isn't set, then read in the length and start + * position in the dictionary, and output the bytes there + * 5) Loop until we've done all 8 bits, then read in the next + * flag byte + */ + public void decompress(InputStream src, OutputStream res) throws IOException { + // We use 12 bit codes: + // * 0-255 are real bytes + // * 256-4095 are the substring codes + // Java handily initialises our buffer / dictionary + // to all zeros + byte[] buffer = new byte[4096]; + populateDictionary(buffer); + + // How far through the output we've got + // (This is normally used &4095, so it nicely wraps) + int pos = 0; + // The flag byte is treated as its 8 individual + // bits, which tell us if the following 8 codes + // are compressed or un-compressed + int flag; + // The mask, between 1 and 255, which is used when + // processing each bit of the flag byte in turn + int mask; + + // These are bytes as looked up in the dictionary + // It needs to be signed, as it'll get passed on to + // the output stream + byte[] dataB = new byte[19]; + // This is an unsigned byte read from the stream + // It needs to be unsigned, so that bit stuff works + int dataI; + // The compressed code sequence is held over 2 bytes + int dataIPt1, dataIPt2; + // How long a code sequence is, and where in the + // dictionary to start at + int len, pntr; + + while( (flag = src.read()) != -1 ) { + // Compare each bit in our flag byte in turn: + for(mask = 1; mask < 256 ; mask <<= 1) { + // Is this a new code (un-compressed), or + // the use of existing codes (compressed)? 
+ boolean isMaskSet = (flag & mask) > 0; + if( isMaskSet && !maskMeansCompressed ) { + // Retrieve the un-compressed code + if( (dataI = src.read()) != -1) { + // Save the byte into the dictionary + buffer[(pos&4095)] = fromInt(dataI); + pos++; + // And output the byte + res.write( new byte[] {fromInt(dataI)} ); + } + } else { + // We have a compressed sequence + // Grab the next 16 bits of data + dataIPt1 = src.read(); + dataIPt2 = src.read(); + if(dataIPt1 == -1 || dataIPt2 == -1) break; + + // Build up how long the code sequence is, and + // what position of the code to start at + // (The position is the first 12 bits, the + // length is the last 4 bits) + len = (dataIPt2 & 15) + 3; + pntr = (dataIPt2 & 240)*16 + dataIPt1; + + // Adjust the pointer as needed + pntr = adjustDictionaryOffset(pntr); + + // Loop over the codes, outputting what they correspond to + for(int i=0; i= 0) { + return b; + } + return b + 256; + } +} diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java index d6d8d6b764..290c14799b 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java @@ -21,6 +21,8 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import org.apache.poi.util.LZWDecompresser; + /** * A decoder for the crazy LZW implementation used * in Visio. @@ -33,27 +35,10 @@ import java.io.OutputStream; * http://en.wikipedia.org/wiki/LZW * http://marknelson.us/1989/10/01/lzw-data-compression/ */ -public class HDGFLZW { - - /** - * Given an integer, turn it into a java byte, handling - * the wrapping. - * This is a convenience method - */ - public static byte fromInt(int b) { - if(b < 128) return (byte)b; - return (byte)(b - 256); - } - /** - * Given a java byte, turn it into an integer between 0 - * and 255 (i.e. handle the unwrapping). 
- * This is a convenience method - */ - public static int fromByte(byte b) { - if(b >= 0) { - return b; - } - return b + 256; +public class HDGFLZW extends LZWDecompresser { + public HDGFLZW() { + // We're the wrong way round! + super(false); } /** @@ -67,108 +52,23 @@ public class HDGFLZW { } /** - * Decompresses the given input stream, returning the array of bytes - * of the decompressed input. + * We have a slight shift by 18 bytes */ - public byte[] decode(InputStream src) throws IOException { - ByteArrayOutputStream res = new ByteArrayOutputStream(); - decode(src,res); - return res.toByteArray(); + @Override + protected int adjustDictionaryOffset(int pntr) { + if(pntr > 4078) { + pntr = pntr - 4078; + } else { + pntr = pntr + 18; + } + return pntr; } - + /** - * Perform a streaming decompression of the input. - * Works by: - * 1) Reading a flag byte, the 8 bits of which tell you if the - * following 8 codes are compressed our un-compressed - * 2) Consider the 8 bits in turn - * 3) If the bit is set, the next code is un-compressed, so - * add it to the dictionary and output it - * 4) If the bit isn't set, then read in the length and start - * position in the dictionary, and output the bytes there - * 5) Loop until we've done all 8 bits, then read in the next - * flag byte + * We want an empty dictionary, so do nothing */ - public void decode(InputStream src, OutputStream res) throws IOException { - // We use 12 bit codes: - // * 0-255 are real bytes - // * 256-4095 are the substring codes - // Java handily initialises our buffer / dictionary - // to all zeros - byte[] buffer = new byte[4096]; - - // How far through the output we've got - // (This is normally used &4095, so it nicely wraps) - int pos = 0; - // The flag byte is treated as its 8 individual - // bits, which tell us if the following 8 codes - // are compressed or un-compressed - int flag; - // The mask, between 1 and 255, which is used when - // processing each bit of the flag byte in turn - int 
mask; - - // These are bytes as looked up in the dictionary - // It needs to be signed, as it'll get passed on to - // the output stream - byte[] dataB = new byte[19]; - // This is an unsigned byte read from the stream - // It needs to be unsigned, so that bit stuff works - int dataI; - // The compressed code sequence is held over 2 bytes - int dataIPt1, dataIPt2; - // How long a code sequence is, and where in the - // dictionary to start at - int len, pntr; - - while( (flag = src.read()) != -1 ) { - // Compare each bit in our flag byte in turn: - for(mask = 1; mask < 256 ; mask <<= 1) { - // Is this a new code (un-compressed), or - // the use of existing codes (compressed)? - if( (flag & mask) > 0 ) { - // Retrieve the un-compressed code - if( (dataI = src.read()) != -1) { - // Save the byte into the dictionary - buffer[(pos&4095)] = fromInt(dataI); - pos++; - // And output the byte - res.write( new byte[] {fromInt(dataI)} ); - } - } else { - // We have a compressed sequence - // Grab the next 16 bits of data - dataIPt1 = src.read(); - dataIPt2 = src.read(); - if(dataIPt1 == -1 || dataIPt2 == -1) break; - - // Build up how long the code sequence is, and - // what position of the code to start at - // (The position is the first 12 bits, the - // length is the last 4 bits) - len = (dataIPt2 & 15) + 3; - pntr = (dataIPt2 & 240)*16 + dataIPt1; - - // If the pointer happens to be passed the end - // of our buffer, then wrap around - if(pntr > 4078) { - pntr = pntr - 4078; - } else { - pntr = pntr + 18; - } - - // Loop over the codes, outputting what they correspond to - for(int i=0; i