aboutsummaryrefslogtreecommitdiffstats
path: root/src/java/org/apache/poi
diff options
context:
space:
mode:
author Andreas Beeker <kiwiwings@apache.org> 2020-01-01 22:44:42 +0000
committer Andreas Beeker <kiwiwings@apache.org> 2020-01-01 22:44:42 +0000
commit adb8424bc1a1c9a502d2cd07757615b711d32c50 (patch)
tree c6097e1f80c499176f20b3c29c523e7c348342ae /src/java/org/apache/poi
parent 07b5bc667c33f5fbab0f2b070a139b087328dd60 (diff)
download poi-adb8424bc1a1c9a502d2cd07757615b711d32c50.tar.gz
poi-adb8424bc1a1c9a502d2cd07757615b711d32c50.zip
Fix Visio compression
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1872223 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/java/org/apache/poi')
-rw-r--r-- src/java/org/apache/poi/util/LZWDecompresser.java 331
1 files changed, 157 insertions, 174 deletions
diff --git a/src/java/org/apache/poi/util/LZWDecompresser.java b/src/java/org/apache/poi/util/LZWDecompresser.java
index ab24bf0f25..22007b4f72 100644
--- a/src/java/org/apache/poi/util/LZWDecompresser.java
+++ b/src/java/org/apache/poi/util/LZWDecompresser.java
@@ -23,184 +23,167 @@ import java.io.OutputStream;
/**
* This class provides common functionality for the
- * various LZW implementations in the different file
- * formats.
+ * various LZW implementations in the different file
+ * formats.
* It's currently used by HDGF and HMEF.
- *
+ * <p>
* Two good resources on LZW are:
- * http://en.wikipedia.org/wiki/LZW
- * http://marknelson.us/1989/10/01/lzw-data-compression/
+ * http://en.wikipedia.org/wiki/LZW
+ * http://marknelson.us/1989/10/01/lzw-data-compression/
*/
public abstract class LZWDecompresser {
- //arbitrarily selected; may need to increase
- private static final int MAX_RECORD_LENGTH = 1_000_000;
-
- /**
- * Does the mask bit mean it's compressed or uncompressed?
- */
- private final boolean maskMeansCompressed;
- /**
- * How much to append to the code length in the stream
- * to get the real code length? Normally 2 or 3
- */
- private final int codeLengthIncrease;
- /**
- * Does the 12 bits of the position get stored in
- * Little Endian or Big Endian form?
- * This controls whether a pos+length of 0x12 0x34
- * becomes a position of 0x123 or 0x312
- */
- private final boolean positionIsBigEndian;
-
- protected LZWDecompresser(boolean maskMeansCompressed,
- int codeLengthIncrease, boolean positionIsBigEndian) {
- this.maskMeansCompressed = maskMeansCompressed;
- this.codeLengthIncrease = codeLengthIncrease;
- this.positionIsBigEndian = positionIsBigEndian;
- }
-
- /**
- * Populates the dictionary, and returns where in it
- * to begin writing new codes.
- * Generally, if the dictionary is pre-populated, then new
- * codes should be placed at the end of that block.
- * Equally, if the dictionary is left with all zeros, then
- * usually the new codes can go in at the start.
- */
- protected abstract int populateDictionary(byte[] dict);
-
- /**
- * Adjusts the position offset if needed when looking
- * something up in the dictionary.
- */
- protected abstract int adjustDictionaryOffset(int offset);
-
- /**
- * Decompresses the given input stream, returning the array of bytes
- * of the decompressed input.
- */
- public byte[] decompress(InputStream src) throws IOException {
- ByteArrayOutputStream res = new ByteArrayOutputStream();
- decompress(src,res);
- return res.toByteArray();
- }
-
- /**
- * Perform a streaming decompression of the input.
- * Works by:
- * 1) Reading a flag byte, the 8 bits of which tell you if the
- * following 8 codes are compressed our un-compressed
- * 2) Consider the 8 bits in turn
- * 3) If the bit is set, the next code is un-compressed, so
- * add it to the dictionary and output it
- * 4) If the bit isn't set, then read in the length and start
- * position in the dictionary, and output the bytes there
- * 5) Loop until we've done all 8 bits, then read in the next
- * flag byte
- */
- public void decompress(InputStream src, OutputStream res) throws IOException {
- // How far through the output we've got
- // (This is normally used &4095, so it nicely wraps)
- // The initial value is set when populating the dictionary
- int pos;
- // The flag byte is treated as its 8 individual
- // bits, which tell us if the following 8 codes
- // are compressed or un-compressed
- int flag;
- // The mask, between 1 and 255, which is used when
- // processing each bit of the flag byte in turn
- int mask;
-
- // We use 12 bit codes:
- // * 0-255 are real bytes
- // * 256-4095 are the substring codes
- // Java handily initialises our buffer / dictionary
- // to all zeros
- byte[] buffer = new byte[4096];
- pos = populateDictionary(buffer);
-
- // These are bytes as looked up in the dictionary
- // It needs to be signed, as it'll get passed on to
- // the output stream
- byte[] dataB = IOUtils.safelyAllocate(16+codeLengthIncrease, MAX_RECORD_LENGTH);
- // This is an unsigned byte read from the stream
- // It needs to be unsigned, so that bit stuff works
- int dataI;
- // The compressed code sequence is held over 2 bytes
- int dataIPt1, dataIPt2;
- // How long a code sequence is, and where in the
- // dictionary to start at
- int len, pntr;
-
- while( (flag = src.read()) != -1 ) {
- // Compare each bit in our flag byte in turn:
- for(mask = 1; mask < 256 ; mask <<= 1) {
- // Is this a new code (un-compressed), or
- // the use of existing codes (compressed)?
- boolean isMaskSet = (flag & mask) > 0;
- if( isMaskSet ^ maskMeansCompressed ) {
- // Retrieve the un-compressed code
- if( (dataI = src.read()) != -1) {
- // Save the byte into the dictionary
- buffer[(pos&4095)] = fromInt(dataI);
- pos++;
- // And output the byte
- res.write( new byte[] {fromInt(dataI)} );
- }
- } else {
- // We have a compressed sequence
- // Grab the next 16 bits of data
- dataIPt1 = src.read();
- dataIPt2 = src.read();
- if(dataIPt1 == -1 || dataIPt2 == -1) break;
-
- // Build up how long the code sequence is, and
- // what position of the code to start at
- // (The position is the usually the first 12 bits,
- // and the length is usually the last 4 bits)
- len = (dataIPt2 & 15) + codeLengthIncrease;
- if(positionIsBigEndian) {
- pntr = (dataIPt1<<4) + (dataIPt2>>4);
- } else {
- pntr = dataIPt1 + ((dataIPt2&0xF0)<<4);
- }
-
- // Adjust the pointer as needed
- pntr = adjustDictionaryOffset(pntr);
-
- // Loop over the codes, outputting what they correspond to
- for(int i=0; i<len; i++) {
- dataB[i] = buffer[(pntr + i) & 4095];
- buffer[ (pos + i) & 4095 ] = dataB[i];
- }
- res.write(dataB, 0, len);
-
- // Record how far along the stream we have moved
- pos = pos + len;
+ /** the size of our dictionary */
+ public static final int DICT_SIZE = 0x1000;
+ /** the mask for calculating / wrapping dictionary offsets */
+ public static final int DICT_MASK = 0xFFF;
+
+ //arbitrarily selected; may need to increase
+ private static final int MAX_RECORD_LENGTH = 1_000_000;
+
+ /**
+ * Does the mask bit mean it's compressed or uncompressed?
+ */
+ private final boolean maskMeansCompressed;
+ /**
+ * How much to append to the code length in the stream
+ * to get the real code length? Normally 2 or 3
+ */
+ private final int codeLengthIncrease;
+ /**
+ * Does the 12 bits of the position get stored in
+ * Little Endian or Big Endian form?
+ * This controls whether a pos+length of 0x12 0x34
+ * becomes a position of 0x123 or 0x312
+ */
+ private final boolean positionIsBigEndian;
+
+ protected LZWDecompresser(boolean maskMeansCompressed,
+ int codeLengthIncrease, boolean positionIsBigEndian) {
+ this.maskMeansCompressed = maskMeansCompressed;
+ this.codeLengthIncrease = codeLengthIncrease;
+ this.positionIsBigEndian = positionIsBigEndian;
+ }
+
+ /**
+ * Populates the dictionary, and returns where in it
+ * to begin writing new codes.
+ * Generally, if the dictionary is pre-populated, then new
+ * codes should be placed at the end of that block.
+ * Equally, if the dictionary is left with all zeros, then
+ * usually the new codes can go in at the start.
+ */
+ protected abstract int populateDictionary(byte[] dict);
+
+ /**
+ * Adjusts the position offset if needed when looking
+ * something up in the dictionary.
+ */
+ protected abstract int adjustDictionaryOffset(int offset);
+
+ /**
+ * Decompresses the given input stream, returning the array of bytes
+ * of the decompressed input.
+ */
+ public byte[] decompress(InputStream src) throws IOException {
+ ByteArrayOutputStream res = new ByteArrayOutputStream();
+ decompress(src, res);
+ return res.toByteArray();
+ }
+
+ /**
+ * Perform a streaming decompression of the input.
+ * Works by:
+ * 1) Reading a flag byte, the 8 bits of which tell you if the
+ * following 8 codes are compressed or un-compressed
+ * 2) Consider the 8 bits in turn
+ * 3) If the bit is set, the next code is un-compressed, so
+ * add it to the dictionary and output it
+ * 4) If the bit isn't set, then read in the length and start
+ * position in the dictionary, and output the bytes there
+ * 5) Loop until we've done all 8 bits, then read in the next
+ * flag byte
+ */
+ public void decompress(InputStream src, OutputStream res) throws IOException {
+ // How far through the output we've got
+ // (This is normally used &4095, so it nicely wraps)
+ // The initial value is set when populating the dictionary
+ int pos;
+ // The flag byte is treated as its 8 individual
+ // bits, which tell us if the following 8 codes
+ // are compressed or un-compressed
+ int flag;
+ // The mask, between 1 and 255, which is used when
+ // processing each bit of the flag byte in turn
+ int mask;
+
+ // We use 12 bit codes:
+ // * 0-255 are real bytes
+ // * 256-4095 are the substring codes
+ // Java handily initialises our buffer / dictionary
+ // to all zeros
+ final byte[] buffer = new byte[DICT_SIZE];
+ pos = populateDictionary(buffer);
+
+ // These are bytes as looked up in the dictionary
+ // It needs to be signed, as it'll get passed on to
+ // the output stream
+ final byte[] dataB = IOUtils.safelyAllocate(16 + codeLengthIncrease, MAX_RECORD_LENGTH);
+ // This is an unsigned byte read from the stream
+ // It needs to be unsigned, so that bit stuff works
+ int dataI;
+ // The compressed code sequence is held over 2 bytes
+ int dataIPt1, dataIPt2;
+ // How long a code sequence is, and where in the
+ // dictionary to start at
+ int len, pntr;
+
+ while ((flag = src.read()) != -1) {
+ // Compare each bit in our flag byte in turn:
+ for (mask = 1; mask < 0x100; mask <<= 1) {
+ // Is this a new code (un-compressed), or
+ // the use of existing codes (compressed)?
+ boolean isMaskSet = (flag & mask) > 0;
+ if (isMaskSet ^ maskMeansCompressed) {
+ // Retrieve the un-compressed code
+ if ((dataI = src.read()) != -1) {
+ // Save the byte into the dictionary
+ buffer[pos++ & DICT_MASK] = (byte) dataI;
+ // And output the byte
+ res.write(dataI);
+ }
+ } else {
+ // We have a compressed sequence
+ // Grab the next 16 bits of data
+ dataIPt1 = src.read();
+ dataIPt2 = src.read();
+ if (dataIPt1 == -1 || dataIPt2 == -1) break;
+
+ // Build up how long the code sequence is, and
+ // what position of the code to start at
+ // (The position is usually the first 12 bits,
+ // and the length is usually the last 4 bits)
+ len = (dataIPt2 & 0x0F) + codeLengthIncrease;
+ if (positionIsBigEndian) {
+ pntr = (dataIPt1 << 4) + (dataIPt2 >>> 4);
+ } else {
+ pntr = dataIPt1 + ((dataIPt2 & 0xF0) << 4);
+ }
+
+ // Adjust the pointer as needed
+ pntr = adjustDictionaryOffset(pntr);
+
+ // Loop over the codes, outputting what they correspond to
+ for (int i = 0; i < len; i++) {
+ dataB[i] = buffer[(pntr + i) & DICT_MASK];
+ buffer[(pos + i) & DICT_MASK] = dataB[i];
+ }
+ res.write(dataB, 0, len);
+
+ // Record how far along the stream we have moved
+ pos += len;
+ }
}
- }
- }
- }
-
- /**
- * Given an integer, turn it into a java byte, handling
- * the wrapping.
- * This is a convenience method
- */
- public static byte fromInt(int b) {
- if(b < 128) return (byte)b;
- return (byte)(b - 256);
- }
- /**
- * Given a java byte, turn it into an integer between 0
- * and 255 (i.e. handle the unwrapping).
- * This is a convenience method
- */
- public static int fromByte(byte b) {
- if(b >= 0) {
- return b;
- }
- return b + 256;
- }
+ }
+ }
}