--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * This class provides common functionality for the
+ * various LZW implementations in the different file
+ * formats.
+ * It's currently used by HDGF and HMEF.
+ *
+ * Two good resources on LZW are:
+ * http://en.wikipedia.org/wiki/LZW
+ * http://marknelson.us/1989/10/01/lzw-data-compression/
+ */
+public abstract class LZWDecompresser {
+    /** Size of the dictionary; a 12 bit pointer addresses exactly this many bytes. */
+    private static final int DICT_SIZE = 4096;
+
+    /** ANDing a position with this wraps it back into the dictionary. */
+    private static final int DICT_MASK = DICT_SIZE - 1;
+
+    /** Longest run one compressed code can describe: a 4 bit length (0-15) plus 3. */
+    private static final int MAX_RUN_LENGTH = 15 + 3;
+
+    /**
+     * Does a set bit in the flag byte mean the matching code is
+     * compressed ({@code true}) or un-compressed ({@code false})?
+     */
+    private final boolean maskMeansCompressed;
+
+    /**
+     * @param maskMeansCompressed {@code true} if a set flag bit marks a
+     *  compressed code, {@code false} if a set flag bit marks an
+     *  un-compressed (literal) byte
+     */
+    protected LZWDecompresser(boolean maskMeansCompressed) {
+        this.maskMeansCompressed = maskMeansCompressed;
+    }
+
+    /**
+     * Populates the dictionary. May not need
+     *  to do anything if all zeros is fine.
+     *
+     * @param dict the freshly allocated, zero filled dictionary
+     */
+    protected abstract void populateDictionary(byte[] dict);
+
+    /**
+     * Adjusts the position offset if needed when looking
+     *  something up in the dictionary.
+     *
+     * @param offset the 12 bit offset read from the compressed stream
+     * @return the dictionary position to start copying from; it is
+     *  wrapped to the dictionary size by the caller
+     */
+    protected abstract int adjustDictionaryOffset(int offset);
+
+    /**
+     * Decompresses the given input stream, returning the array of bytes
+     *  of the decompressed input.
+     *
+     * @param src the compressed data, read until the end of the stream
+     * @return the decompressed bytes
+     * @throws IOException if reading from {@code src} fails
+     */
+    public byte[] decompress(InputStream src) throws IOException {
+        ByteArrayOutputStream res = new ByteArrayOutputStream();
+        decompress(src, res);
+        return res.toByteArray();
+    }
+
+    /**
+     * Perform a streaming decompression of the input.
+     * Works by:
+     * 1) Reading a flag byte, the 8 bits of which tell you if the
+     *     following 8 codes are compressed or un-compressed
+     * 2) Consider the 8 bits in turn, lowest bit first
+     * 3) If the bit marks an un-compressed code, read one byte,
+     *     add it to the dictionary and output it
+     * 4) If the bit marks a compressed code, read in the length and
+     *     start position in the dictionary, and output the bytes there
+     *     (adding them to the dictionary as well)
+     * 5) Loop until we've done all 8 bits, then read in the next
+     *     flag byte
+     * Which bit value marks which kind of code is decided by the
+     *  flag passed to the constructor.
+     *
+     * @param src the compressed data, read until the end of the stream
+     * @param res where the decompressed bytes are written to
+     * @throws IOException if reading from {@code src} or writing
+     *  to {@code res} fails
+     */
+    public void decompress(InputStream src, OutputStream res) throws IOException {
+        // Java handily initialises our buffer / dictionary
+        //  to all zeros, the subclass may then pre-fill it
+        byte[] buffer = new byte[DICT_SIZE];
+        populateDictionary(buffer);
+
+        // Bytes looked up in the dictionary for one compressed code
+        byte[] run = new byte[MAX_RUN_LENGTH];
+
+        // How far through the output we've got
+        // (This is always used masked, so it nicely wraps)
+        int pos = 0;
+
+        // The flag byte is treated as its 8 individual bits
+        int flag;
+        while ((flag = src.read()) != -1) {
+            // Compare each bit in our flag byte in turn:
+            for (int mask = 1; mask < 256; mask <<= 1) {
+                boolean isMaskSet = (flag & mask) != 0;
+
+                // The code is un-compressed exactly when the bit is the
+                //  opposite of what "compressed" looks like. (Testing
+                //  "set and not maskMeansCompressed" would never find an
+                //  un-compressed code when maskMeansCompressed is true.)
+                if (isMaskSet != maskMeansCompressed) {
+                    // Retrieve the un-compressed code
+                    int literal = src.read();
+                    if (literal != -1) {
+                        // Save the byte into the dictionary
+                        buffer[pos & DICT_MASK] = fromInt(literal);
+                        pos++;
+                        // And output the byte (only the low 8 bits are written)
+                        res.write(literal);
+                    }
+                } else {
+                    // We have a compressed sequence
+                    // Grab the next 16 bits of data
+                    int firstByte = src.read();
+                    int secondByte = src.read();
+                    if (firstByte == -1 || secondByte == -1) {
+                        // Ran out of data; the outer read then sees the end too
+                        break;
+                    }
+
+                    // The length is the low 4 bits of the second byte, the
+                    //  position is its high 4 bits followed by the first byte
+                    int len = (secondByte & 15) + 3;
+                    int pntr = adjustDictionaryOffset((secondByte & 240) * 16 + firstByte);
+
+                    // Copy byte by byte, as the source and destination
+                    //  ranges are allowed to overlap
+                    for (int i = 0; i < len; i++) {
+                        run[i] = buffer[(pntr + i) & DICT_MASK];
+                        buffer[(pos + i) & DICT_MASK] = run[i];
+                    }
+                    res.write(run, 0, len);
+
+                    // Record how far along the stream we have moved
+                    pos += len;
+                }
+            }
+        }
+    }
+
+    /**
+     * Given an integer, turn it into a java byte, handling
+     *  the wrapping.
+     * This is a convenience method
+     *
+     * @param b the value, normally between 0 and 255
+     * @return the byte holding the same low 8 bits
+     */
+    public static byte fromInt(int b) {
+        // The narrowing cast keeps the low 8 bits, which is the same as
+        //  subtracting 256 from values of 128 and above
+        return (byte) b;
+    }
+
+    /**
+     * Given a java byte, turn it into an integer between 0
+     *  and 255 (i.e. handle the unwrapping).
+     * This is a convenience method
+     *
+     * @param b the signed byte
+     * @return the unsigned value, between 0 and 255
+     */
+    public static int fromByte(byte b) {
+        return b & 0xFF;
+    }
+}
import java.io.InputStream;
import java.io.OutputStream;
+import org.apache.poi.util.LZWDecompresser;
+
/**
* A decoder for the crazy LZW implementation used
* in Visio.
* http://en.wikipedia.org/wiki/LZW
* http://marknelson.us/1989/10/01/lzw-data-compression/
*/
-public class HDGFLZW {
-
- /**
- * Given an integer, turn it into a java byte, handling
- * the wrapping.
- * This is a convenience method
- */
- public static byte fromInt(int b) {
- if(b < 128) return (byte)b;
- return (byte)(b - 256);
- }
- /**
- * Given a java byte, turn it into an integer between 0
- * and 255 (i.e. handle the unwrapping).
- * This is a convenience method
- */
- public static int fromByte(byte b) {
- if(b >= 0) {
- return b;
- }
- return b + 256;
+public class HDGFLZW extends LZWDecompresser {
+ public HDGFLZW() {
+ // We're the wrong way round!
+ super(false);
}
/**
}
/**
- * Decompresses the given input stream, returning the array of bytes
- * of the decompressed input.
+ * We have a slight shift by 18 bytes
*/
- public byte[] decode(InputStream src) throws IOException {
- ByteArrayOutputStream res = new ByteArrayOutputStream();
- decode(src,res);
- return res.toByteArray();
+ @Override
+ protected int adjustDictionaryOffset(int pntr) {
+ if(pntr > 4078) {
+ pntr = pntr - 4078;
+ } else {
+ pntr = pntr + 18;
+ }
+ return pntr;
}
-
+
/**
- * Perform a streaming decompression of the input.
- * Works by:
- * 1) Reading a flag byte, the 8 bits of which tell you if the
- * following 8 codes are compressed our un-compressed
- * 2) Consider the 8 bits in turn
- * 3) If the bit is set, the next code is un-compressed, so
- * add it to the dictionary and output it
- * 4) If the bit isn't set, then read in the length and start
- * position in the dictionary, and output the bytes there
- * 5) Loop until we've done all 8 bits, then read in the next
- * flag byte
+ * We want an empty dictionary, so do nothing
*/
- public void decode(InputStream src, OutputStream res) throws IOException {
- // We use 12 bit codes:
- // * 0-255 are real bytes
- // * 256-4095 are the substring codes
- // Java handily initialises our buffer / dictionary
- // to all zeros
- byte[] buffer = new byte[4096];
-
- // How far through the output we've got
- // (This is normally used &4095, so it nicely wraps)
- int pos = 0;
- // The flag byte is treated as its 8 individual
- // bits, which tell us if the following 8 codes
- // are compressed or un-compressed
- int flag;
- // The mask, between 1 and 255, which is used when
- // processing each bit of the flag byte in turn
- int mask;
-
- // These are bytes as looked up in the dictionary
- // It needs to be signed, as it'll get passed on to
- // the output stream
- byte[] dataB = new byte[19];
- // This is an unsigned byte read from the stream
- // It needs to be unsigned, so that bit stuff works
- int dataI;
- // The compressed code sequence is held over 2 bytes
- int dataIPt1, dataIPt2;
- // How long a code sequence is, and where in the
- // dictionary to start at
- int len, pntr;
-
- while( (flag = src.read()) != -1 ) {
- // Compare each bit in our flag byte in turn:
- for(mask = 1; mask < 256 ; mask <<= 1) {
- // Is this a new code (un-compressed), or
- // the use of existing codes (compressed)?
- if( (flag & mask) > 0 ) {
- // Retrieve the un-compressed code
- if( (dataI = src.read()) != -1) {
- // Save the byte into the dictionary
- buffer[(pos&4095)] = fromInt(dataI);
- pos++;
- // And output the byte
- res.write( new byte[] {fromInt(dataI)} );
- }
- } else {
- // We have a compressed sequence
- // Grab the next 16 bits of data
- dataIPt1 = src.read();
- dataIPt2 = src.read();
- if(dataIPt1 == -1 || dataIPt2 == -1) break;
-
- // Build up how long the code sequence is, and
- // what position of the code to start at
- // (The position is the first 12 bits, the
- // length is the last 4 bits)
- len = (dataIPt2 & 15) + 3;
- pntr = (dataIPt2 & 240)*16 + dataIPt1;
-
- // If the pointer happens to be passed the end
- // of our buffer, then wrap around
- if(pntr > 4078) {
- pntr = pntr - 4078;
- } else {
- pntr = pntr + 18;
- }
-
- // Loop over the codes, outputting what they correspond to
- for(int i=0; i<len; i++) {
- dataB[i] = buffer[(pntr + i) & 4095];
- buffer[ (pos + i) & 4095 ] = dataB[i];
- }
- res.write(dataB, 0, len);
-
- // Record how far along the stream we have moved
- pos = pos + len;
- }
- }
- }
+ @Override
+ protected void populateDictionary(byte[] dict) {
}
/**
// Decompress
HDGFLZW lzw = new HDGFLZW();
- byte[] decompressed = lzw.decode(bais);
+ byte[] decompressed = lzw.decompress(bais);
// Split into header and contents
byte[][] ret = new byte[2][];
assertEquals(339, testTrailerComp.length);
assertEquals(632, testTrailerDecomp.length);
- // Decode it using our engine
+ // decompress it using our engine
HDGFLZW lzw = new HDGFLZW();
- byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
+ byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
// Check it's of the right size
assertEquals(632, dec.length);
assertEquals(339, testTrailerComp.length);
assertEquals(632, testTrailerDecomp.length);
- // Decode it using our engine
+ // decompress it using our engine
HDGFLZW lzw = new HDGFLZW();
- byte[] dec = lzw.decode(new ByteArrayInputStream(testTrailerComp));
+ byte[] dec = lzw.decompress(new ByteArrayInputStream(testTrailerComp));
// Now check it's the right data
assertEquals(632, dec.length);
byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp));
// Now decompress it again
- byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+ byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
// First up, check the round tripping
assertEquals(12, decomp.length);
assertEquals(27, comp.length);
// Now decompress it again
- byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+ byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
// We can only check the round-tripping, as for now
// visio cheats on re-using a block
byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp));
// Now decompress it again
- byte[] decomp = lzw.decode(new ByteArrayInputStream(comp));
+ byte[] decomp = lzw.decompress(new ByteArrayInputStream(comp));
// for(int i=0; i<comp.length; i++) {
// System.err.println(i + "\t" + comp[i] + "\t" + testTrailerComp[i]);