From: Nick Burch Date: Thu, 16 Dec 2010 07:41:41 +0000 (+0000) Subject: Improve the HDGF LZW code. X-Git-Tag: REL_3_8_BETA1~95 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=0b4b029e2a0a4631d66972ea87e855a4515011d8;p=poi.git Improve the HDGF LZW code. Some tweaks to the decompression, and more tests, but mostly work on the compression side. We can now compress small streams properly, and these round-trip fine. However, some longer streams don't compress correctly, and more work on that is still needed (see the disabled unit test) git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1049805 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/hdgf/index.xml b/src/documentation/content/xdocs/hdgf/index.xml index 9e4ca894cb..fc24c108d7 100644 --- a/src/documentation/content/xdocs/hdgf/index.xml +++ b/src/documentation/content/xdocs/hdgf/index.xml @@ -72,7 +72,8 @@ HDGFLZW, which will be much better documented, and also under the ASL. Completed October 2007
  • Add compression support to HDGFLZW. - In progress
  • + In progress - works for small streams but encoding + goes wrong on larger ones
  • Have HDGF just write back the raw bytes it read in, and have a test to ensure the file is un-changed.
  • Have HDGF generate the bytes to write out from the diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 589ce752cb..fed8713441 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,8 @@ + Inside ExtractorFactory, support finding embedded OOXML documents and providing extractors for them + Partial HDGF LZW compression support 50244 - Support for continued NameRecords 50416 - Correct shifting of the first or last row in a sheet by multiple rows 50440 - Support evaluating formulas with newlines in them, which XSSF may have (but HSSF may not) diff --git a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java index 0595343d45..d6d8d6b764 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZW.java @@ -35,358 +35,147 @@ import java.io.OutputStream; */ public class HDGFLZW { -/** - * Given an integer, turn it into a java byte, handling - * the wrapping. - * This is a convenience method - */ -public static byte fromInt(int b) { - if(b < 128) return (byte)b; - return (byte)(b - 256); -} -/** - * Given a java byte, turn it into an integer between 0 - * and 255 (i.e. handle the unwrapping). - * This is a convenience method - */ -public static int fromByte(byte b) { - if(b >= 0) { - return b; - } - return b + 256; -} - -/** - * Compress the given input stream, returning the array of bytes - * of the compressed input - */ -public byte[] compress(InputStream src) throws IOException { - ByteArrayOutputStream res = new ByteArrayOutputStream(); - compress(src,res); - return res.toByteArray(); -} - -/** - * Decompresses the given input stream, returning the array of bytes - * of the decompressed input. - */ -public byte[] decode(InputStream src) throws IOException { - ByteArrayOutputStream res = new ByteArrayOutputStream(); - decode(src,res); - return res.toByteArray(); -} -/** - * Perform a streaming decompression of the input. - * Works by: - * 1) Reading a flag byte, the 8 bits of which tell you if the - * following 8 codes are compressed our un-compressed - * 2) Consider the 8 bits in turn - * 3) If the bit is set, the next code is un-compressed, so - * add it to the dictionary and output it - * 4) If the bit isn't set, then read in the length and start - * position in the dictionary, and output the bytes there - * 5) Loop until we've done all 8 bits, then read in the next - * flag byte - */ -public void decode(InputStream src, OutputStream res) throws IOException { - // We use 12 bit codes: - // * 0-255 are real bytes - // * 256-4095 are the substring codes - // Java handily initialises our buffer / dictionary - // to all zeros - byte[] buffer = new byte[4096]; - - // How far through the output we've got - // (This is normally used &4095, so it nicely wraps) - int pos = 0; - // The flag byte is treated as its 8 individual - // bits, which tell us if the following 8 codes - // are compressed or un-compressed - int flag; - // The mask, between 1 and 255, which is used when - // processing each bit of the flag byte in turn - int mask; - - // This is a byte as looked up in the dictionary - // It needs to be signed, as it'll get passed on to - // the output stream - byte dataB; - // This is an unsigned byte read from the stream - // It needs to be unsigned, so that bit stuff works - int dataI; - // The compressed code sequence is held over 2 bytes - int dataIPt1, dataIPt2; - // How long a code sequence is, and where in the - // dictionary to start at - int len, pntr; - - while( (flag = src.read()) != -1 ) { - // Compare each bit in our flag byte in turn: - for(mask = 1; mask < 256 ; mask <<= 1) { - // Is this a new code (un-compressed), or - // the use of existing codes (compressed)? - if( (flag & mask) > 0 ) { - // Retrieve the un-compressed code - if( (dataI = src.read()) != -1) { - // Save the byte into the dictionary - buffer[(pos&4095)] = fromInt(dataI); - pos++; - // And output the byte - res.write( new byte[] {fromInt(dataI)} ); - } - } else { - // We have a compressed sequence - // Grab the next 16 bits of data - dataIPt1 = src.read(); - dataIPt2 = src.read(); - if(dataIPt1 == -1 || dataIPt2 == -1) break; - - // Build up how long the code sequence is, and - // what position of the code to start at - // (The position is the first 12 bits, the - // length is the last 4 bits) - len = (dataIPt2 & 15) + 3; - pntr = (dataIPt2 & 240)*16 + dataIPt1; - - // If the pointer happens to be passed the end - // of our buffer, then wrap around - if(pntr > 4078) { - pntr = pntr - 4078; - } else { - pntr = pntr + 18; - } - - // Loop over the codes, outputting what they correspond to - for(int i=0; i=0; i--) { - boolean matches = true; - for(int j=0; matches && j 0) { - outputCompressed(res); - break; - } - - // Try adding this new byte onto rawCode, and - // see if all of that is still found in the - // buffer dictionary or not - rawCode[rawCodeLen] = dataB; - rawCodeLen++; - int rawAt = findRawCodeInBuffer(); - - // If we found it and are now at 16 bytes, - // we need to output our pending code block - if(rawCodeLen == 16 && rawAt > -1) { - outputCompressed(res); - rawCodeLen = 0; - continue; - } - - // If we did find all of rawCode with our new - // byte added on, we can wait to see what happens - // with the next byte - if(rawAt > -1) { - continue; - } - - // If we get here, then the rawCode + this byte weren't - // found in the dictionary - - // If there was something in rawCode before, then that was - // found in the dictionary, so output that compressed - rawCodeLen--; - if(rawCodeLen > 0) { - // Output the old rawCode - outputCompressed(res); - - // Can this byte start a new rawCode, or does - // it need outputting itself? - rawCode[0] = dataB; - rawCodeLen = 1; - if(findRawCodeInBuffer() > -1) { - // Fits in, wait for next byte - continue; - } - // Doesn't fit, output - outputUncompressed(dataB,res); - rawCodeLen = 0; - } else { - // Nothing in rawCode before, so this byte - // isn't in the buffer dictionary - // Output it un-compressed - outputUncompressed(dataB,res); - } - } -} -} - + /** + * Given an integer, turn it into a java byte, handling + * the wrapping. + * This is a convenience method + */ + public static byte fromInt(int b) { + if(b < 128) return (byte)b; + return (byte)(b - 256); + } + /** + * Given a java byte, turn it into an integer between 0 + * and 255 (i.e. handle the unwrapping). + * This is a convenience method + */ + public static int fromByte(byte b) { + if(b >= 0) { + return b; + } + return b + 256; + } + + /** + * Compress the given input stream, returning the array of bytes + * of the compressed input + */ + public byte[] compress(InputStream src) throws IOException { + ByteArrayOutputStream res = new ByteArrayOutputStream(); + compress(src,res); + return res.toByteArray(); + } + + /** + * Decompresses the given input stream, returning the array of bytes + * of the decompressed input. + */ + public byte[] decode(InputStream src) throws IOException { + ByteArrayOutputStream res = new ByteArrayOutputStream(); + decode(src,res); + return res.toByteArray(); + } + + /** + * Perform a streaming decompression of the input. + * Works by: + * 1) Reading a flag byte, the 8 bits of which tell you if the + * following 8 codes are compressed our un-compressed + * 2) Consider the 8 bits in turn + * 3) If the bit is set, the next code is un-compressed, so + * add it to the dictionary and output it + * 4) If the bit isn't set, then read in the length and start + * position in the dictionary, and output the bytes there + * 5) Loop until we've done all 8 bits, then read in the next + * flag byte + */ + public void decode(InputStream src, OutputStream res) throws IOException { + // We use 12 bit codes: + // * 0-255 are real bytes + // * 256-4095 are the substring codes + // Java handily initialises our buffer / dictionary + // to all zeros + byte[] buffer = new byte[4096]; + + // How far through the output we've got + // (This is normally used &4095, so it nicely wraps) + int pos = 0; + // The flag byte is treated as its 8 individual + // bits, which tell us if the following 8 codes + // are compressed or un-compressed + int flag; + // The mask, between 1 and 255, which is used when + // processing each bit of the flag byte in turn + int mask; + + // These are bytes as looked up in the dictionary + // It needs to be signed, as it'll get passed on to + // the output stream + byte[] dataB = new byte[19]; + // This is an unsigned byte read from the stream + // It needs to be unsigned, so that bit stuff works + int dataI; + // The compressed code sequence is held over 2 bytes + int dataIPt1, dataIPt2; + // How long a code sequence is, and where in the + // dictionary to start at + int len, pntr; + + while( (flag = src.read()) != -1 ) { + // Compare each bit in our flag byte in turn: + for(mask = 1; mask < 256 ; mask <<= 1) { + // Is this a new code (un-compressed), or + // the use of existing codes (compressed)? + if( (flag & mask) > 0 ) { + // Retrieve the un-compressed code + if( (dataI = src.read()) != -1) { + // Save the byte into the dictionary + buffer[(pos&4095)] = fromInt(dataI); + pos++; + // And output the byte + res.write( new byte[] {fromInt(dataI)} ); + } + } else { + // We have a compressed sequence + // Grab the next 16 bits of data + dataIPt1 = src.read(); + dataIPt2 = src.read(); + if(dataIPt1 == -1 || dataIPt2 == -1) break; + + // Build up how long the code sequence is, and + // what position of the code to start at + // (The position is the first 12 bits, the + // length is the last 4 bits) + len = (dataIPt2 & 15) + 3; + pntr = (dataIPt2 & 240)*16 + dataIPt1; + + // If the pointer happens to be passed the end + // of our buffer, then wrap around + if(pntr > 4078) { + pntr = pntr - 4078; + } else { + pntr = pntr + 18; + } + + // Loop over the codes, outputting what they correspond to + for(int i=0; i0; i--) { + boolean matches = true; + for(int j=0; matches && j> 4); + buffer[bufferLen] = HDGFLZW.fromInt(bp1); + bufferLen++; + buffer[bufferLen] = HDGFLZW.fromInt(bp2); + bufferLen++; + + // Copy the data to the dictionary in the new place + for(int i=0; i 0) { + outputCompressed(res); + if(maskBitsSet > 0) { + output8Codes(res); + } + } + break; + } + + // Try adding this new byte onto rawCode, and + // see if all of that is still found in the + // buffer dictionary or not + rawCode[rawCodeLen] = dataB; + rawCodeLen++; + int rawAt = findRawCodeInBuffer(); + + // If we found it and are now at 18 bytes, + // we need to output our pending code block + if(rawCodeLen == 18 && rawAt > -1) { + outputCompressed(res); + rawCodeLen = 0; + continue; + } + + // If we did find all of rawCode with our new + // byte added on, we can wait to see what happens + // with the next byte + if(rawAt > -1) { + continue; + } + + // If we get here, then the rawCode + this byte weren't + // found in the dictionary + + // If there was something in rawCode before, then that was + // found in the dictionary, so output that compressed + rawCodeLen--; + if(rawCodeLen > 0) { + // Output the old rawCode + outputCompressed(res); + + // Can this byte start a new rawCode, or does + // it need outputting itself? + rawCode[0] = dataB; + rawCodeLen = 1; + if(findRawCodeInBuffer() > -1) { + // Fits in, wait for next byte + continue; + } + // Doesn't fit, output + outputUncompressed(dataB,res); + rawCodeLen = 0; + } else { + // Nothing in rawCode before, so this byte + // isn't in the buffer dictionary + // Output it un-compressed + outputUncompressed(dataB,res); + } + } +} +} \ No newline at end of file diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java index f3af7c375f..b997ce5c4b 100644 --- a/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java +++ b/src/scratchpad/testcases/org/apache/poi/hdgf/TestHDGFLZW.java @@ -28,17 +28,19 @@ public final class TestHDGFLZW extends TestCase { -21, -16, // 3 @ 4093 1, 0, 0, -72, -13, -16, // 3 @ 5 - 78, // *mask bit* + 78, // *mask bit* 2,3,4,7 -32, -5, // 14 @ 4082 1, 0, 3, -21, -16, // 3 @ 4093 10, 5, // 8 @ 28 4, -21, -16, // 3 @ 4093 - 21, // *mask bit* + 21, // *mask bit* 1,3,5 9, -21, -16, // 3 @ 4093 - 103, -21, -16, 34, + 103, + -21, -16, // 3 @ 4093 + 34, -36, -1, // 18 @ 4078 52, 15, // 18 @ 70 70, 15, // 18 @ 88 @@ -169,19 +171,98 @@ public final class TestHDGFLZW extends TestCase { } } - public void DISABLEDtestCompress() throws Exception { - assertEquals(339, testTrailerComp.length); - assertEquals(632, testTrailerDecomp.length); + /** + * Test that we can round-trip a little bit. + * Uses a part short enough that we agree with visio + * on the best way to compress it + */ + public void testCompressMini() throws Exception { + // first 11 bytes compressed = 12 bytes uncompressed + byte[] sourceComp = new byte[11]; + byte[] sourceDecomp = new byte[12]; + System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length); + System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length); // Compress it using our engine HDGFLZW lzw = new HDGFLZW(); - byte[] comp = lzw.compress(new ByteArrayInputStream(testTrailerDecomp)); + byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp)); + + // Now decompress it again + byte[] decomp = lzw.decode(new ByteArrayInputStream(comp)); - // Now check it's the right data - assertEquals(339, comp.length); - for(int i=0; i 11 + // Next 32 -> 13 + byte[] sourceComp = new byte[24]; + byte[] sourceDecomp = new byte[44]; + System.arraycopy(testTrailerComp, 0, sourceComp, 0, sourceComp.length); + System.arraycopy(testTrailerDecomp, 0, sourceDecomp, 0, sourceDecomp.length); + + // Compress it using our engine + HDGFLZW lzw = new HDGFLZW(); + byte[] comp = lzw.compress(new ByteArrayInputStream(sourceDecomp)); + + // We should be 3 characters bigger, as + // we split one compressed bit into two + assertEquals(27, comp.length); + + // Now decompress it again + byte[] decomp = lzw.decode(new ByteArrayInputStream(comp)); + + // We can only check the round-tripping, as for now + // visio cheats on re-using a block + assertEquals(44, decomp.length); + for(int i=0; i