diff options
author | James Ahlborn <jtahlborn@yahoo.com> | 2008-07-22 19:00:25 +0000 |
---|---|---|
committer | James Ahlborn <jtahlborn@yahoo.com> | 2008-07-22 19:00:25 +0000 |
commit | 4af4fe44512c2c885498794c7a5a6e878318b2ef (patch) | |
tree | 3eb2a1966bd1fdf7777741492a256f645b1e07cf /src/java/com/healthmarketscience/jackcess/scsu | |
parent | d1359adfbf6199a305b062c0a8da0d05e49e84a7 (diff) | |
download | jackcess-4af4fe44512c2c885498794c7a5a6e878318b2ef.tar.gz jackcess-4af4fe44512c2c885498794c7a5a6e878318b2ef.zip |
Add compression code for possible future use; add compression unit
tests.
git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@365 f203690c-595d-4dc9-a70b-905162fa7fd2
Diffstat (limited to 'src/java/com/healthmarketscience/jackcess/scsu')
3 files changed, 677 insertions, 0 deletions
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Compress.java b/src/java/com/healthmarketscience/jackcess/scsu/Compress.java new file mode 100644 index 0000000..c5f7360 --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/Compress.java @@ -0,0 +1,628 @@ +package com.healthmarketscience.jackcess.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/ + +/** + This class implements a simple compression algorithm + **/ +/* + Note on exception handling + This compressor is designed so that it can be restarted after + an exception. All operations advancing input and/or output cursor + (iIn and iOut) either complete an action, or set a state (fUnicodeMode) + before updating the cursors. +*/ +public class Compress extends SCSU +{ + + /** next input character to be read **/ + private int iIn; + + /** next output byte to be written **/ + private int iOut; + + /** start index of Unicode mode in output array, or -1 if in single byte mode **/ + private int iSCU = -1; + + /** true if the next command byte is of the Uxx family */ + private boolean fUnicodeMode = false; + + /** locate a window for a character given a table of offsets + @param ch - character + @param offsetTable - table of window offsets + @return true if the character fits a window from the table of windows */ + private boolean locateWindow(int ch, int[] offsetTable) + { + // always try the current window first + int iWin = getCurrentWindow(); + + // if the character fits the current window + // just use the current window + if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) + { + return true; + } + + // try all windows in order + for (iWin = 0; iWin < offsetTable.length; iWin++) + { + if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80) + { + selectWindow(iWin); + return true; + } + } + // none found + return false; + } + + /** returns true if the character is ASCII, but not a control other than CR, LF and TAB */ + public static boolean isAsciiCrLfOrTab(int ch) + { + return (ch >= 0x20 && ch <= 0x7F) // ASCII + || ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB + + } + + /** output a run of characters in single byte mode + In single byte mode pass through characters in the ASCII range, but + quote characters overlapping with compression command codes. Runs + of characters fitting the current window are output as runs of bytes + in the range 0x80-0xFF. Checks for and validates Surrogate Pairs. + Uses and updates the current input and output cursors store in + the instance variables <i>iIn</i> and <i>iOut</i>. + @param in - input character array + @param out - output byte array + @return the next chaacter to be processed. This may be an extended character. + **/ + @SuppressWarnings("fallthrough") + public int outputSingleByteRun(char [] in, byte [] out) + throws EndOfOutputException, EndOfInputException, IllegalInputException + { + int iWin = getCurrentWindow(); + while(iIn < in.length) + { + int outlen = 0; + byte byte1 = 0; + byte byte2 = 0; + + // get the input character + int ch = in[iIn]; + + int inlen = 1; + + // Check input for Surrogate pair + if ( (ch & 0xF800) == 0xD800 ) + { + if ( (ch & 0xFC00) == 0xDC00 ) + { + // low surrogate out of order + throw new IllegalInputException("Unpaired low surrogate: "+iIn); + } + else + { + // have high surrogate now get low surrogate + if ( iIn >= in.length-1) + { + // premature end of input + throw new EndOfInputException(); + } + // get the char + int ch2 = in[iIn+1]; + + // make sure it's a low surrogate + if ( (ch2 & 0xFC00) != 0xDC00 ) + { + // a low surrogate was required + throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1)); + } + + // combine the two values + ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; + // ch = ch<<10 + ch2 - 0x36F0000; + + inlen = 2; + } + } + + // ASCII Letter, NUL, CR, LF and TAB are always passed through + if (isAsciiCrLfOrTab(ch) || ch == 0) + { + // pass through directcly + byte2 = (byte)(ch & 0x7F); + outlen = 1; + } + + // All other control codes must be quoted + else if (ch < 0x20) + { + byte1 = SQ0; + byte2 = (byte)(ch); + outlen = 2; + } + + // Letters that fit the current dynamic window + else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) + { + ch -= dynamicOffset[iWin]; + byte2 = (byte)(ch | 0x80); + outlen = 1; + } + + // check for room in the output array + if (iOut + outlen >= out.length) + { + throw new EndOfOutputException(); + } + + switch(outlen) + { + default: + // need to use some other compression mode for this + // character so we terminate this loop + + return ch; // input not finished + + // output the characters + case 2: + out[iOut++] = byte1; + // fall through + case 1: + out[iOut++] = byte2; + break; + } + // advance input pointer + iIn += inlen; + } + return 0; // input all used up + } + + /** quote a single character in single byte mode + Quoting a character (aka 'non-locking shift') gives efficient access + to characters that occur in isolation--usually punctuation characters. + When quoting a character from a dynamic window use 0x80 - 0xFF, when + quoting a character from a static window use 0x00-0x7f. + @param ch - character to be quoted + @param out - output byte array + **/ + + private void quoteSingleByte(int ch, byte [] out) + throws EndOfOutputException + { + Debug.out("Quoting SingleByte ", ch); + int iWin = getCurrentWindow(); + + // check for room in the output array + if (iOut >= out.length -2) + { + throw new EndOfOutputException(); + } + + // Output command byte followed by + out[iOut++] = (byte)(SQ0 + iWin); + + // Letter that fits the current dynamic window + if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80) + { + ch -= dynamicOffset[iWin]; + out[iOut++] = (byte)(ch | 0x80); + } + + // Letter that fits the current static window + else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80) + { + ch -= staticOffset[iWin]; + out[iOut++] = (byte)ch; + } + else + { + throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error"); + } + // advance input pointer + iIn ++; + Debug.out("New input: ", iIn); + } + + /** output a run of characters in Unicode mode + A run of Unicode mode consists of characters which are all in the + range of non-compressible characters or isolated occurrence + of any other characters. Characters in the range 0xE00-0xF2FF must + be quoted to avoid overlap with the Unicode mode compression command codes. + Uses and updates the current input and output cursors store in + the instance variables <i>iIn</i> and <i>iOut</i>. + NOTE: Characters from surrogate pairs are passed through and unlike single + byte mode no checks are made for unpaired surrogate characters. + @param in - input character array + @param out - output byte array + @return the next input character to be processed + **/ + public char outputUnicodeRun(char [] in, byte [] out) + throws EndOfOutputException + { + // current character + char ch = 0; + + while(iIn < in.length) + { + // get current input and set default output length + ch = in[iIn]; + int outlen = 2; + + // Characters in these ranges could potentially be compressed. + // We require 2 or more compressible characters to break the run + if (isCompressible(ch)) + { + // check whether we can look ahead + if( iIn < in.length - 1) + { + // DEBUG + Debug.out("is-comp: ",ch); + char ch2 = in[iIn + 1]; + if (isCompressible(ch2)) + { + // at least 2 characters are compressible + // break the run + break; + } + //DEBUG + Debug.out("no-comp: ",ch2); + } + // If we get here, the current character is only character + // left in the input or it is followed by a non-compressible + // character. In neither case do we gain by breaking the + // run, so we proceed to output the character. + if (ch >= 0xE000 && ch <= 0xF2FF) + { + // Characters in this range need to be escaped + outlen = 3; + } + + } + // check that there is enough room to output the character + if(iOut >= out.length - outlen) + { + // DEBUG + Debug.out("End of Output @", iOut); + // if we got here, we ran out of space in the output array + throw new EndOfOutputException(); + } + + // output any characters that cannot be compressed, + if (outlen == 3) + { + // output the quote character + out[iOut++] = UQU; + } + // pass the Unicode character in MSB,LSB order + out[iOut++] = (byte)(ch >>> 8); + out[iOut++] = (byte)(ch & 0xFF); + + // advance input cursor + iIn++; + } + + // return the last character + return ch; + } + + static int iNextWindow = 3; + + /** redefine a window so it surrounds a given character value + For now, this function uses window 3 exclusively (window 4 + for extended windows); + @return true if a window was successfully defined + @param ch - character around which window is positioned + @param out - output byte array + @param fCurUnicodeMode - type of window + **/ + private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode) + throws IllegalInputException, EndOfOutputException + { + int iWin = iNextWindow % 8; // simple LRU + int iPosition = 0; + + // iPosition 0 is a reserved value + if (ch < 0x80) + { + throw new IllegalStateException("ch < 0x80"); + //return false; + } + + // Check the fixed offsets + for (int i = 0; i < fixedOffset.length; i++) + { + if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80) + { + iPosition = i; + break; + } + } + + if (iPosition != 0) + { + // DEBUG + Debug.out("FIXED position is ", iPosition + 0xF9); + + // ch fits in a fixed offset window position + dynamicOffset[iWin] = fixedOffset[iPosition]; + iPosition += 0xF9; + } + else if (ch < 0x3400) + { + // calculate a window position command and set the offset + iPosition = ch >>> 7; + dynamicOffset[iWin] = ch & 0xFF80; + + Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); + } + else if (ch < 0xE000) + { + // attempt to place a window where none can go + return false; + } + else if (ch <= 0xFFFF) + { + // calculate a window position command, accounting + // for the gap in position values, and set the offset + iPosition = ((ch - gapOffset)>>> 7); + + dynamicOffset[iWin] = ch & 0xFF80; + + Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch); + } + else + { + // if we get here, the character is in the extended range. + // Always use Window 4 to define an extended window + + iPosition = (ch - 0x10000) >>> 7; + // DEBUG + Debug.out("Try position Window at ", iPosition); + + iPosition |= iWin << 13; + dynamicOffset[iWin] = ch & 0x1FFF80; + } + + // Outputting window defintion command for the general cases + if ( iPosition < 0x100 && iOut < out.length-1) + { + out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin); + out[iOut++] = (byte) (iPosition & 0xFF); + } + // Output an extended window definiton command + else if ( iPosition >= 0x100 && iOut < out.length - 2) + { + + Debug.out("Setting extended window at ", iPosition); + out[iOut++] = (fCurUnicodeMode ? UDX : SDX); + out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF); + out[iOut++] = (byte) (iPosition & 0xFF); + } + else + { + throw new EndOfOutputException(); + } + selectWindow(iWin); + iNextWindow++; + return true; + } + + /** + compress a Unicode character array with some simplifying assumptions + **/ + public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut) + throws IllegalInputException, EndOfInputException, EndOfOutputException + { + iIn = iStartIn; + iOut = iStartOut; + + + while (iIn < in.length) + { + int ch; + + // previously we switched to a Unicode run + if (iSCU != -1) + { + + Debug.out("Remaining", in, iIn); + Debug.out("Output until ["+iOut+"]: ", out); + + // output characters as Unicode + ch = outputUnicodeRun(in, out); + + // for single character Unicode runs (3 bytes) use quote + if (iOut - iSCU == 3 ) + { + // go back and fix up the SCU to an SQU instead + out[iSCU] = SQU; + iSCU = -1; + continue; + } + else + { + iSCU = -1; + fUnicodeMode = true; + } + } + // next, try to output characters as single byte run + else + { + ch = outputSingleByteRun(in, out); + } + + // check whether we still have input + if (iIn == in.length) + { + break; // no more input + } + + // if we get here, we have a consistent value for ch, whether or + // not it is an regular or extended character. Locate or define a + // Window for the current character + + Debug.out("Output so far: ", out); + Debug.out("Routing ch="+ch+" for Input", in, iIn); + + // Check that we have enough room to output the command byte + if (iOut >= out.length - 1) + { + throw new EndOfOutputException(); + } + + // In order to switch away from Unicode mode, it is necessary + // to select (or define) a window. If the characters that follow + // the Unicode range are ASCII characters, we can't use them + // to decide which window to select, since ASCII characters don't + // influence window settings. This loop looks ahead until it finds + // one compressible character that isn't in the ASCII range. + for (int ich = iIn; ch < 0x80; ich++) + { + if (ich == in.length || !isCompressible(in[ich])) + { + // if there are only ASCII characters left, + ch = in[iIn]; + break; + } + ch = in[ich]; // lookahead for next non-ASCII char + } + // The character value contained in ch here will only be used to select + // output modes. Actual output of characters starts with in[iIn] and + // only takes place near the top of the loop. + + int iprevWindow = getCurrentWindow(); + + // try to locate a dynamic window + if (ch < 0x80 || locateWindow(ch, dynamicOffset)) + { + Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1); + // lookahead to use SQn instead of SCn for single + // character interruptions of runs in current window + if(!fUnicodeMode && iIn < in.length -1) + { + char ch2 = in[iIn+1]; + if (ch2 >= dynamicOffset[iprevWindow] && + ch2 < dynamicOffset[iprevWindow] + 0x80) + { + quoteSingleByte(ch, out); + selectWindow(iprevWindow); + continue; + } + } + + out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow()); + fUnicodeMode = false; + } + // try to locate a static window + else if (!fUnicodeMode && locateWindow(ch, staticOffset)) + { + // static windows are not accessible from Unicode mode + Debug.out("located a static window", getCurrentWindow()); + quoteSingleByte(ch, out); + selectWindow(iprevWindow); // restore current Window settings + continue; + } + // try to define a window around ch + else if (positionWindow(ch, out, fUnicodeMode) ) + { + fUnicodeMode = false; + } + // If all else fails, start a Unicode run + else + { + iSCU = iOut; + out[iOut++] = SCU; + continue; + } + } + + return iOut - iStartOut; + } + + public byte[] compress(String inStr) + throws IllegalInputException, EndOfInputException + { + // Running out of room for output can cause non-optimal + // compression. In order to not slow down compression too + // much, not all intermediate state is constantly saved. + + byte [] out = new byte[inStr.length() * 2]; + char [] in = inStr.toCharArray(); + //DEBUG + Debug.out("compress input: ",in); + reset(); + while(true) + { + try + { + simpleCompress(in, charsRead(), out, bytesWritten()); + // if we get here things went fine. + break; + } + catch (EndOfOutputException e) + { + // create a larger output buffer and continue + byte [] largerOut = new byte[out.length * 2]; + System.arraycopy(out, 0, largerOut, 0, out.length); + out = largerOut; + } + } + byte [] trimmedOut = new byte[bytesWritten()]; + System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length); + out = trimmedOut; + + Debug.out("compress output: ", out); + return out; + } + + /** reset is only needed to bail out after an exception and + restart with new input */ + @Override + public void reset() + { + super.reset(); + fUnicodeMode = false; + iSCU = - 1; + } + + /** returns the number of bytes written **/ + public int bytesWritten() + { + return iOut; + } + + /** returns the number of bytes written **/ + public int charsRead() + { + return iIn; + } + +} diff --git a/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java b/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java new file mode 100644 index 0000000..501d195 --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java @@ -0,0 +1,48 @@ +package com.healthmarketscience.jackcess.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/ +/** + * The input string or input byte array ended prematurely + */ +public class EndOfOutputException + extends java.lang.Exception + +{ + + private static final long serialVersionUID = 1L; + + public EndOfOutputException(){ + super("The input string or input byte array ended prematurely"); + } + + public EndOfOutputException(String s) { + super(s); + } +} diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java index 9605554..4858044 100644 --- a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java +++ b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java @@ -411,6 +411,7 @@ public class Expand extends SCSU /** reset is called to start with new input, w/o creating a new instance */ + @Override public void reset() { iOut = 0; |