123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628 |
- package com.healthmarketscience.jackcess.scsu;
-
- /**
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- * @version 004 Aug 25 1997
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
- /**
- This class implements a simple compression algorithm
- **/
- /*
- Note on exception handling
- This compressor is designed so that it can be restarted after
- an exception. All operations advancing input and/or output cursor
- (iIn and iOut) either complete an action, or set a state (fUnicodeMode)
- before updating the cursors.
- */
- public class Compress extends SCSU
- {
-
- /** next input character to be read **/
- private int iIn;
-
- /** next output byte to be written **/
- private int iOut;
-
- /** start index of Unicode mode in output array, or -1 if in single byte mode **/
- private int iSCU = -1;
-
- /** true if the next command byte is of the Uxx family */
- private boolean fUnicodeMode = false;
-
- /** locate a window for a character given a table of offsets
- @param ch - character
- @param offsetTable - table of window offsets
- @return true if the character fits a window from the table of windows */
- private boolean locateWindow(int ch, int[] offsetTable)
- {
- // always try the current window first
- int iWin = getCurrentWindow();
-
- // if the character fits the current window
- // just use the current window
- if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
- {
- return true;
- }
-
- // try all windows in order
- for (iWin = 0; iWin < offsetTable.length; iWin++)
- {
- if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
- {
- selectWindow(iWin);
- return true;
- }
- }
- // none found
- return false;
- }
-
- /** returns true if the character is ASCII, but not a control other than CR, LF and TAB */
- public static boolean isAsciiCrLfOrTab(int ch)
- {
- return (ch >= 0x20 && ch <= 0x7F) // ASCII
- || ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB
-
- }
-
- /** output a run of characters in single byte mode
- In single byte mode pass through characters in the ASCII range, but
- quote characters overlapping with compression command codes. Runs
- of characters fitting the current window are output as runs of bytes
- in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
- Uses and updates the current input and output cursors store in
- the instance variables <i>iIn</i> and <i>iOut</i>.
- @param in - input character array
- @param out - output byte array
- @return the next chaacter to be processed. This may be an extended character.
- **/
- @SuppressWarnings("fallthrough")
- public int outputSingleByteRun(char [] in, byte [] out)
- throws EndOfOutputException, EndOfInputException, IllegalInputException
- {
- int iWin = getCurrentWindow();
- while(iIn < in.length)
- {
- int outlen = 0;
- byte byte1 = 0;
- byte byte2 = 0;
-
- // get the input character
- int ch = in[iIn];
-
- int inlen = 1;
-
- // Check input for Surrogate pair
- if ( (ch & 0xF800) == 0xD800 )
- {
- if ( (ch & 0xFC00) == 0xDC00 )
- {
- // low surrogate out of order
- throw new IllegalInputException("Unpaired low surrogate: "+iIn);
- }
- else
- {
- // have high surrogate now get low surrogate
- if ( iIn >= in.length-1)
- {
- // premature end of input
- throw new EndOfInputException();
- }
- // get the char
- int ch2 = in[iIn+1];
-
- // make sure it's a low surrogate
- if ( (ch2 & 0xFC00) != 0xDC00 )
- {
- // a low surrogate was required
- throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1));
- }
-
- // combine the two values
- ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
- // ch = ch<<10 + ch2 - 0x36F0000;
-
- inlen = 2;
- }
- }
-
- // ASCII Letter, NUL, CR, LF and TAB are always passed through
- if (isAsciiCrLfOrTab(ch) || ch == 0)
- {
- // pass through directcly
- byte2 = (byte)(ch & 0x7F);
- outlen = 1;
- }
-
- // All other control codes must be quoted
- else if (ch < 0x20)
- {
- byte1 = SQ0;
- byte2 = (byte)(ch);
- outlen = 2;
- }
-
- // Letters that fit the current dynamic window
- else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
- {
- ch -= dynamicOffset[iWin];
- byte2 = (byte)(ch | 0x80);
- outlen = 1;
- }
-
- // check for room in the output array
- if (iOut + outlen >= out.length)
- {
- throw new EndOfOutputException();
- }
-
- switch(outlen)
- {
- default:
- // need to use some other compression mode for this
- // character so we terminate this loop
-
- return ch; // input not finished
-
- // output the characters
- case 2:
- out[iOut++] = byte1;
- // fall through
- case 1:
- out[iOut++] = byte2;
- break;
- }
- // advance input pointer
- iIn += inlen;
- }
- return 0; // input all used up
- }
-
- /** quote a single character in single byte mode
- Quoting a character (aka 'non-locking shift') gives efficient access
- to characters that occur in isolation--usually punctuation characters.
- When quoting a character from a dynamic window use 0x80 - 0xFF, when
- quoting a character from a static window use 0x00-0x7f.
- @param ch - character to be quoted
- @param out - output byte array
- **/
-
- private void quoteSingleByte(int ch, byte [] out)
- throws EndOfOutputException
- {
- Debug.out("Quoting SingleByte ", ch);
- int iWin = getCurrentWindow();
-
- // check for room in the output array
- if (iOut >= out.length -2)
- {
- throw new EndOfOutputException();
- }
-
- // Output command byte followed by
- out[iOut++] = (byte)(SQ0 + iWin);
-
- // Letter that fits the current dynamic window
- if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
- {
- ch -= dynamicOffset[iWin];
- out[iOut++] = (byte)(ch | 0x80);
- }
-
- // Letter that fits the current static window
- else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80)
- {
- ch -= staticOffset[iWin];
- out[iOut++] = (byte)ch;
- }
- else
- {
- throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error");
- }
- // advance input pointer
- iIn ++;
- Debug.out("New input: ", iIn);
- }
-
- /** output a run of characters in Unicode mode
- A run of Unicode mode consists of characters which are all in the
- range of non-compressible characters or isolated occurrence
- of any other characters. Characters in the range 0xE00-0xF2FF must
- be quoted to avoid overlap with the Unicode mode compression command codes.
- Uses and updates the current input and output cursors store in
- the instance variables <i>iIn</i> and <i>iOut</i>.
- NOTE: Characters from surrogate pairs are passed through and unlike single
- byte mode no checks are made for unpaired surrogate characters.
- @param in - input character array
- @param out - output byte array
- @return the next input character to be processed
- **/
- public char outputUnicodeRun(char [] in, byte [] out)
- throws EndOfOutputException
- {
- // current character
- char ch = 0;
-
- while(iIn < in.length)
- {
- // get current input and set default output length
- ch = in[iIn];
- int outlen = 2;
-
- // Characters in these ranges could potentially be compressed.
- // We require 2 or more compressible characters to break the run
- if (isCompressible(ch))
- {
- // check whether we can look ahead
- if( iIn < in.length - 1)
- {
- // DEBUG
- Debug.out("is-comp: ",ch);
- char ch2 = in[iIn + 1];
- if (isCompressible(ch2))
- {
- // at least 2 characters are compressible
- // break the run
- break;
- }
- //DEBUG
- Debug.out("no-comp: ",ch2);
- }
- // If we get here, the current character is only character
- // left in the input or it is followed by a non-compressible
- // character. In neither case do we gain by breaking the
- // run, so we proceed to output the character.
- if (ch >= 0xE000 && ch <= 0xF2FF)
- {
- // Characters in this range need to be escaped
- outlen = 3;
- }
-
- }
- // check that there is enough room to output the character
- if(iOut >= out.length - outlen)
- {
- // DEBUG
- Debug.out("End of Output @", iOut);
- // if we got here, we ran out of space in the output array
- throw new EndOfOutputException();
- }
-
- // output any characters that cannot be compressed,
- if (outlen == 3)
- {
- // output the quote character
- out[iOut++] = UQU;
- }
- // pass the Unicode character in MSB,LSB order
- out[iOut++] = (byte)(ch >>> 8);
- out[iOut++] = (byte)(ch & 0xFF);
-
- // advance input cursor
- iIn++;
- }
-
- // return the last character
- return ch;
- }
-
- static int iNextWindow = 3;
-
- /** redefine a window so it surrounds a given character value
- For now, this function uses window 3 exclusively (window 4
- for extended windows);
- @return true if a window was successfully defined
- @param ch - character around which window is positioned
- @param out - output byte array
- @param fCurUnicodeMode - type of window
- **/
- private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode)
- throws IllegalInputException, EndOfOutputException
- {
- int iWin = iNextWindow % 8; // simple LRU
- int iPosition = 0;
-
- // iPosition 0 is a reserved value
- if (ch < 0x80)
- {
- throw new IllegalStateException("ch < 0x80");
- //return false;
- }
-
- // Check the fixed offsets
- for (int i = 0; i < fixedOffset.length; i++)
- {
- if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80)
- {
- iPosition = i;
- break;
- }
- }
-
- if (iPosition != 0)
- {
- // DEBUG
- Debug.out("FIXED position is ", iPosition + 0xF9);
-
- // ch fits in a fixed offset window position
- dynamicOffset[iWin] = fixedOffset[iPosition];
- iPosition += 0xF9;
- }
- else if (ch < 0x3400)
- {
- // calculate a window position command and set the offset
- iPosition = ch >>> 7;
- dynamicOffset[iWin] = ch & 0xFF80;
-
- Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
- }
- else if (ch < 0xE000)
- {
- // attempt to place a window where none can go
- return false;
- }
- else if (ch <= 0xFFFF)
- {
- // calculate a window position command, accounting
- // for the gap in position values, and set the offset
- iPosition = ((ch - gapOffset)>>> 7);
-
- dynamicOffset[iWin] = ch & 0xFF80;
-
- Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
- }
- else
- {
- // if we get here, the character is in the extended range.
- // Always use Window 4 to define an extended window
-
- iPosition = (ch - 0x10000) >>> 7;
- // DEBUG
- Debug.out("Try position Window at ", iPosition);
-
- iPosition |= iWin << 13;
- dynamicOffset[iWin] = ch & 0x1FFF80;
- }
-
- // Outputting window defintion command for the general cases
- if ( iPosition < 0x100 && iOut < out.length-1)
- {
- out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin);
- out[iOut++] = (byte) (iPosition & 0xFF);
- }
- // Output an extended window definiton command
- else if ( iPosition >= 0x100 && iOut < out.length - 2)
- {
-
- Debug.out("Setting extended window at ", iPosition);
- out[iOut++] = (fCurUnicodeMode ? UDX : SDX);
- out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
- out[iOut++] = (byte) (iPosition & 0xFF);
- }
- else
- {
- throw new EndOfOutputException();
- }
- selectWindow(iWin);
- iNextWindow++;
- return true;
- }
-
- /**
- compress a Unicode character array with some simplifying assumptions
- **/
- public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut)
- throws IllegalInputException, EndOfInputException, EndOfOutputException
- {
- iIn = iStartIn;
- iOut = iStartOut;
-
-
- while (iIn < in.length)
- {
- int ch;
-
- // previously we switched to a Unicode run
- if (iSCU != -1)
- {
-
- Debug.out("Remaining", in, iIn);
- Debug.out("Output until ["+iOut+"]: ", out);
-
- // output characters as Unicode
- ch = outputUnicodeRun(in, out);
-
- // for single character Unicode runs (3 bytes) use quote
- if (iOut - iSCU == 3 )
- {
- // go back and fix up the SCU to an SQU instead
- out[iSCU] = SQU;
- iSCU = -1;
- continue;
- }
- else
- {
- iSCU = -1;
- fUnicodeMode = true;
- }
- }
- // next, try to output characters as single byte run
- else
- {
- ch = outputSingleByteRun(in, out);
- }
-
- // check whether we still have input
- if (iIn == in.length)
- {
- break; // no more input
- }
-
- // if we get here, we have a consistent value for ch, whether or
- // not it is an regular or extended character. Locate or define a
- // Window for the current character
-
- Debug.out("Output so far: ", out);
- Debug.out("Routing ch="+ch+" for Input", in, iIn);
-
- // Check that we have enough room to output the command byte
- if (iOut >= out.length - 1)
- {
- throw new EndOfOutputException();
- }
-
- // In order to switch away from Unicode mode, it is necessary
- // to select (or define) a window. If the characters that follow
- // the Unicode range are ASCII characters, we can't use them
- // to decide which window to select, since ASCII characters don't
- // influence window settings. This loop looks ahead until it finds
- // one compressible character that isn't in the ASCII range.
- for (int ich = iIn; ch < 0x80; ich++)
- {
- if (ich == in.length || !isCompressible(in[ich]))
- {
- // if there are only ASCII characters left,
- ch = in[iIn];
- break;
- }
- ch = in[ich]; // lookahead for next non-ASCII char
- }
- // The character value contained in ch here will only be used to select
- // output modes. Actual output of characters starts with in[iIn] and
- // only takes place near the top of the loop.
-
- int iprevWindow = getCurrentWindow();
-
- // try to locate a dynamic window
- if (ch < 0x80 || locateWindow(ch, dynamicOffset))
- {
- Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1);
- // lookahead to use SQn instead of SCn for single
- // character interruptions of runs in current window
- if(!fUnicodeMode && iIn < in.length -1)
- {
- char ch2 = in[iIn+1];
- if (ch2 >= dynamicOffset[iprevWindow] &&
- ch2 < dynamicOffset[iprevWindow] + 0x80)
- {
- quoteSingleByte(ch, out);
- selectWindow(iprevWindow);
- continue;
- }
- }
-
- out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
- fUnicodeMode = false;
- }
- // try to locate a static window
- else if (!fUnicodeMode && locateWindow(ch, staticOffset))
- {
- // static windows are not accessible from Unicode mode
- Debug.out("located a static window", getCurrentWindow());
- quoteSingleByte(ch, out);
- selectWindow(iprevWindow); // restore current Window settings
- continue;
- }
- // try to define a window around ch
- else if (positionWindow(ch, out, fUnicodeMode) )
- {
- fUnicodeMode = false;
- }
- // If all else fails, start a Unicode run
- else
- {
- iSCU = iOut;
- out[iOut++] = SCU;
- continue;
- }
- }
-
- return iOut - iStartOut;
- }
-
- public byte[] compress(String inStr)
- throws IllegalInputException, EndOfInputException
- {
- // Running out of room for output can cause non-optimal
- // compression. In order to not slow down compression too
- // much, not all intermediate state is constantly saved.
-
- byte [] out = new byte[inStr.length() * 2];
- char [] in = inStr.toCharArray();
- //DEBUG
- Debug.out("compress input: ",in);
- reset();
- while(true)
- {
- try
- {
- simpleCompress(in, charsRead(), out, bytesWritten());
- // if we get here things went fine.
- break;
- }
- catch (EndOfOutputException e)
- {
- // create a larger output buffer and continue
- byte [] largerOut = new byte[out.length * 2];
- System.arraycopy(out, 0, largerOut, 0, out.length);
- out = largerOut;
- }
- }
- byte [] trimmedOut = new byte[bytesWritten()];
- System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
- out = trimmedOut;
-
- Debug.out("compress output: ", out);
- return out;
- }
-
- /** reset is only needed to bail out after an exception and
- restart with new input */
- @Override
- public void reset()
- {
- super.reset();
- fUnicodeMode = false;
- iSCU = - 1;
- }
-
- /** returns the number of bytes written **/
- public int bytesWritten()
- {
- return iOut;
- }
-
- /** returns the number of bytes written **/
- public int charsRead()
- {
- return iIn;
- }
-
- }
|