tests. git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@365 f203690c-595d-4dc9-a70b-905162fa7fd2

16 年前 · 4af4fe4451
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -25,6 +25,10 @@
      <action dev="jahlborn" type="add">
        Add primitive support for writing unicode compressed text columns.
      </action>
      <action dev="jahlborn" type="add">
        Add compression code for possible future use; add compression unit
        tests.
      </action>
    </release>
    <release version="1.1.15" date="2008-06-27">
      <action dev="jahlborn" type="fix" issue="1998225">
--- a/src/java/com/healthmarketscience/jackcess/Column.java
+++ b/src/java/com/healthmarketscience/jackcess/Column.java
@@ -41,6 +41,7 @@ import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import com.healthmarketscience.jackcess.scsu.Compress;
 import com.healthmarketscience.jackcess.scsu.EndOfInputException;
 import com.healthmarketscience.jackcess.scsu.Expand;
 import com.healthmarketscience.jackcess.scsu.IllegalInputException;
@@ -1196,23 +1197,13 @@ public class Column implements Comparable<Column> {
    // now, see if it is all printable ASCII
    for(int i = 0; i < text.length(); ++i) {
      char c = text.charAt(i);
      if(!isAsciiCrLfOrTab(c)) {
      if(!Compress.isAsciiCrLfOrTab(c)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tests whether a character value is TAB, LF, CR, or lies in the range
   * 0x20 through 0x7F inclusive.
   *
   * @param ch the character value to test
   * @return true for 0x09, 0x0A, 0x0D and every value in 0x20..0x7F
   */
  private static boolean isAsciiCrLfOrTab(int ch)
  {
    switch(ch) {
    case 0x09: // TAB
    case 0x0A: // LF
    case 0x0D: // CR
      return true;
    default:
      // everything else must fall inside 0x20..0x7F
      return (ch >= 0x20) && (ch <= 0x7F);
    }
  }

  @Override
  public String toString() {
    StringBuilder rtn = new StringBuilder();
--- a/src/java/com/healthmarketscience/jackcess/scsu/Compress.java
+++ b/src/java/com/healthmarketscience/jackcess/scsu/Compress.java
@@ -0,0 +1,628 @@
 package com.healthmarketscience.jackcess.scsu;

 /**
 * This sample software accompanies Unicode Technical Report #6 and
 * distributed as is by Unicode, Inc., subject to the following:
 *
 * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
 *
 * Permission to use, copy, modify, and distribute this software
 * without fee is hereby granted provided that this copyright notice
 * appears in all copies.
 *
 * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
 * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
 * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
 * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
 * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
 * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
 *
 *  @author Asmus Freytag
 *
 *  @version 001 Dec 25 1996
 *  @version 002 Jun 25 1997
 *  @version 003 Jul 25 1997
 *  @version 004 Aug 25 1997
 *
 * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
 * and are registered in some jurisdictions.
 **/

 /**
    This class implements a simple compression algorithm
 **/
 /*
    Note on exception handling
        This compressor is designed so that it can be restarted after
        an exception. All operations advancing input and/or output cursor
        (iIn and iOut) either complete an action, or set a state (fUnicodeMode)
        before updating the cursors.
 */
 public class Compress extends SCSU
 {

    /** next input character to be read **/
    private int iIn;

    /** next output byte to be written **/
    private int iOut;

    /** start index of Unicode mode in output array, or -1 if in single byte mode **/
    private int iSCU = -1;

    /** true if the next command byte is of the Uxx family */
    private boolean fUnicodeMode = false;

    /** find a window that contains a character, given a table of window offsets.
    The currently selected window is tested first; if it does not contain
    the character, the table is scanned from index 0 upwards and the first
    matching window is made current through <i>selectWindow</i>.
    @param ch - character to look up
    @param offsetTable - table of window start offsets (each window covers 0x80 values)
    @return true if a window from the table contains the character */
    private boolean locateWindow(int ch, int[] offsetTable)
    {
        // the active window wins if it already covers the character;
        // the selection is left untouched in that case
        final int current = getCurrentWindow();
        if (current != -1)
        {
            final int base = offsetTable[current];
            if (ch >= base && ch < base + 0x80)
            {
                return true;
            }
        }

        // otherwise pick the lowest-numbered window covering the character
        int candidate = 0;
        while (candidate < offsetTable.length)
        {
            final int base = offsetTable[candidate];
            if (ch >= base && ch < base + 0x80)
            {
                selectWindow(candidate);
                return true;
            }
            candidate++;
        }

        // no window in this table contains the character
        return false;
    }

    /** checks whether a character value is TAB, LF, CR, or lies in the
        range 0x20 through 0x7F inclusive (the upper bound admits 0x7F).
        @param ch - character value to test
        @return true if ch is 0x09, 0x0A, 0x0D or within 0x20-0x7F */
    public static boolean isAsciiCrLfOrTab(int ch)
    {
        // the three whitespace controls that are accepted
        if (ch == 0x09 || ch == 0x0A || ch == 0x0D)
        {
            return true;
        }
        // everything else must fall inside 0x20..0x7F
        return ch >= 0x20 && ch <= 0x7F;
    }

    /** emit as many input characters as possible in single byte mode.
        Characters accepted by <i>isAsciiCrLfOrTab</i> and NUL are written
        as one byte; any other value below 0x20 is written as SQ0 followed
        by the value; characters inside the current dynamic window are
        written as one byte in the range 0x80-0xFF. Surrogate pairs are
        validated and combined into one extended character first.
        Reads and advances the instance cursors <i>iIn</i> and <i>iOut</i>.
        The run stops at the first character that fits none of the cases above.
        @param in - input character array
        @param out - output byte array
        @return the character that stopped the run (possibly an extended
                character), or 0 if all input was consumed
        @throws IllegalInputException on an unpaired low or high surrogate
        @throws EndOfInputException if the input ends after a high surrogate
        @throws EndOfOutputException if iOut plus the bytes needed reaches
                or exceeds the length of the output array
    **/
    public int outputSingleByteRun(char [] in, byte [] out)
        throws EndOfOutputException, EndOfInputException, IllegalInputException
    {
        // the window cannot change while this run is in progress
        final int window = getCurrentWindow();

        while (iIn < in.length)
        {
            int ch = in[iIn];
            int consumed = 1;

            // any surrogate code unit needs validation
            if ((ch & 0xF800) == 0xD800)
            {
                if ((ch & 0xFC00) == 0xDC00)
                {
                    // a low surrogate may only follow a high surrogate
                    throw new IllegalInputException("Unpaired low surrogate: "+iIn);
                }
                if (iIn >= in.length-1)
                {
                    // high surrogate is the last input character
                    throw new EndOfInputException();
                }
                int low = in[iIn+1];
                if ((low & 0xFC00) != 0xDC00)
                {
                    // high surrogate not followed by a low surrogate
                    throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1));
                }
                // fold the pair into a single extended character value
                ch = ((ch - 0xD800)<<10 | (low-0xDC00))+0x10000;
                consumed = 2;
            }

            // decide how many bytes (0, 1 or 2) this character takes
            int needed;
            byte lead = 0;
            byte payload = 0;
            if (isAsciiCrLfOrTab(ch) || ch == 0)
            {
                // passed through unchanged
                payload = (byte)(ch & 0x7F);
                needed = 1;
            }
            else if (ch < 0x20)
            {
                // remaining control codes are quoted with SQ0
                lead = SQ0;
                payload = (byte)(ch);
                needed = 2;
            }
            else if (ch >= dynamicOffset[window] && ch < dynamicOffset[window] + 0x80)
            {
                // offset into the current dynamic window, high bit set
                payload = (byte)((ch - dynamicOffset[window]) | 0x80);
                needed = 1;
            }
            else
            {
                // cannot be expressed in this mode
                needed = 0;
            }

            // the room check runs before anything is written or returned
            if (iOut + needed >= out.length)
            {
                throw new EndOfOutputException();
            }

            if (needed == 0)
            {
                // the caller has to pick another mode for this character;
                // the input cursor still points at it
                return ch;
            }
            if (needed == 2)
            {
                out[iOut++] = lead;
            }
            out[iOut++] = payload;

            // only now is the character counted as consumed
            iIn += consumed;
        }
        return 0; // nothing left to read
    }

    /** quote one character in single byte mode (a 'non-locking shift').
    Writes the quote command for the current window (SQ0 plus the window
    index) followed by one data byte: 0x80-0xFF when the character lies in
    the current dynamic window, 0x00-0x7F when it lies in the static window
    with the same index. Advances <i>iOut</i> by two and <i>iIn</i> by one.
    @param ch - character to be quoted
    @param out - output byte array
    @throws EndOfOutputException if iOut is at or beyond out.length - 2
    @throws IllegalStateException if the character fits neither window
            (the command byte has already been written at that point)
    **/

    private void quoteSingleByte(int ch, byte [] out)
        throws EndOfOutputException
    {
        Debug.out("Quoting SingleByte ", ch);
        final int window = getCurrentWindow();

        // refuse to start unless the output array has room
        if (iOut >= out.length -2)
        {
            throw new EndOfOutputException();
        }

        // command byte first
        out[iOut++] = (byte)(SQ0 + window);

        final int dynBase = dynamicOffset[window];
        if (ch >= dynBase && ch < dynBase + 0x80)
        {
            // dynamic window: offset with the high bit set
            out[iOut++] = (byte)((ch - dynBase) | 0x80);
        }
        else
        {
            final int statBase = staticOffset[window];
            if (ch >= statBase && ch < statBase + 0x80)
            {
                // static window: plain offset
                out[iOut++] = (byte)(ch - statBase);
            }
            else
            {
                throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error");
            }
        }

        // exactly one input character has been consumed
        iIn ++;
        Debug.out("New input: ", iIn);
    }

    /** emit a run of characters in Unicode mode, two bytes per character
    in MSB,LSB order.
    The run continues until the input is exhausted or until two adjacent
    characters are both compressible (per <i>isCompressible</i>); a single
    compressible character is kept inside the run. A compressible character
    in the range 0xE000-0xF2FF that is kept in the run is prefixed with UQU.
    Reads and advances the instance cursors <i>iIn</i> and <i>iOut</i>.
    NOTE: surrogate code units are passed through unchecked here, unlike
    in single byte mode.
    @param in - input character array
    @param out - output byte array
    @return the character that ended the run, or the last character written
            if the input was exhausted (0 if no character was read at all)
    @throws EndOfOutputException if iOut is at or beyond out.length minus
            the number of bytes needed for the next character
    **/
    public char outputUnicodeRun(char [] in, byte [] out)
        throws EndOfOutputException
    {
        char current = 0;

        // iIn only advances after a character has been written completely
        for (; iIn < in.length; iIn++)
        {
            current = in[iIn];
            boolean quoted = false;

            if (isCompressible(current))
            {
                // a second compressible character right behind this one
                // ends the run; that requires a character to look at
                if (iIn < in.length - 1)
                {
                    // DEBUG
                    Debug.out("is-comp: ",current);
                    char following = in[iIn + 1];
                    if (isCompressible(following))
                    {
                        break;
                    }
                    //DEBUG
                    Debug.out("no-comp: ",following);
                }
                // the character stays in the run; escape it when it lies
                // in the 0xE000-0xF2FF range
                quoted = (current >= 0xE000 && current <= 0xF2FF);
            }

            final int needed = quoted ? 3 : 2;
            if (iOut >= out.length - needed)
            {
                // DEBUG
                Debug.out("End of Output @", iOut);
                // not enough space left in the output array
                throw new EndOfOutputException();
            }

            if (quoted)
            {
                out[iOut++] = UQU;
            }
            // high byte first, then low byte
            out[iOut++] = (byte)(current >>> 8);
            out[iOut++] = (byte)(current & 0xFF);
        }

        return current;
    }

    /** counter selecting the next dynamic window to redefine; the window
        index is taken from its low three bits, so windows are reused in
        rotation.
        NOTE(review): the counter is static and unsynchronized, so it is
        shared by every Compress instance. */
    static int iNextWindow = 3;

    /** redefine a dynamic window so that it contains a given character
        value, write the matching window definition command, and make that
        window current.
        The window to redefine is chosen in rotation from <i>iNextWindow</i>.
        Characters up to 0xFFFF use the one-byte-position commands
        (SD0/UD0 + window); characters above 0xFFFF always use the extended
        commands (SDX/UDX), whose two data bytes carry the window index in
        the top three bits and the position in the remaining thirteen.
        No state is changed unless the command has been written, so the
        call can be repeated after an EndOfOutputException.
        @return true if a window was successfully defined, false if no
                window can be placed around the character (0x3400-0xDFFF
                outside the fixed offsets)
        @param ch - character around which window is positioned
        @param out - output byte array
        @param fCurUnicodeMode - true to emit the Unicode mode (UDn/UDX)
               form of the command, false for the single byte mode form
        @throws IllegalStateException if ch is below 0x80
        @throws EndOfOutputException if the command does not fit in out
     **/
    private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode)
        throws IllegalInputException, EndOfOutputException
    {
        // rotate through the 8 windows; masking (instead of % 8) keeps the
        // index non-negative even if the counter ever wraps around
        int iWin = iNextWindow & 7;
        int iPosition = 0;
        // offset the window will get once the command has been written
        int iOffset;
        // whether the extended (SDX/UDX) command form is required
        boolean fExtended = false;

        // iPosition 0 is a reserved value
        if (ch < 0x80)
        {
            throw new IllegalStateException("ch < 0x80");
        }

        // Check the fixed offsets
        for (int i = 0; i < fixedOffset.length; i++)
        {
            if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80)
            {
                iPosition = i;
                break;
            }
        }

        if (iPosition != 0)
        {
            // DEBUG
            Debug.out("FIXED position is ", iPosition + 0xF9);

            // ch fits in a fixed offset window position
            iOffset = fixedOffset[iPosition];
            iPosition += 0xF9;
        }
        else if (ch < 0x3400)
        {
            // calculate a window position command and the offset
            iPosition = ch >>> 7;
            iOffset = ch & 0xFF80;

            Debug.out("Offset="+iOffset+", iPosition="+iPosition+" for char", ch);
        }
        else if (ch < 0xE000)
        {
            // attempt to place a window where none can go
            return false;
        }
        else if (ch <= 0xFFFF)
        {
            // calculate a window position command, accounting
            // for the gap in position values, and the offset
            iPosition =  ((ch - gapOffset)>>> 7);
            iOffset = ch & 0xFF80;

            Debug.out("Offset="+iOffset+", iPosition="+iPosition+" for char", ch);
        }
        else
        {
            // the character is in the extended range: the position is
            // counted from 0x10000 and combined with the window index
            iPosition = (ch - 0x10000) >>> 7;
            // DEBUG
            Debug.out("Try position Window at ", iPosition);

            iPosition |= iWin << 13;
            iOffset = ch & 0x1FFF80;
            // the combined value can be below 0x100 (window 0, low
            // position), so the command form must not be inferred from it
            fExtended = true;
        }

        if (!fExtended)
        {
            // general window definition: command byte + position byte
            if (iOut >= out.length - 1)
            {
                throw new EndOfOutputException();
            }
            out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin);
            out[iOut++] = (byte) (iPosition & 0xFF);
        }
        else
        {
            // extended window definition: command byte + two data bytes
            if (iOut >= out.length - 2)
            {
                throw new EndOfOutputException();
            }
            Debug.out("Setting extended window at ", iPosition);
            out[iOut++] = (fCurUnicodeMode ? UDX : SDX);
            out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
            out[iOut++] = (byte) (iPosition & 0xFF);
        }

        // commit the new window only after the command is in the output
        dynamicOffset[iWin] = iOffset;
        selectWindow(iWin);
        iNextWindow++;
        return true;
    }

    /**
    compress a Unicode character array with some simplifying assumptions.
    Starting at the given cursors, the loop alternates between emitting a
    run (a Unicode run when a pending SCU was written, otherwise a single
    byte run) and choosing how to continue for the character that stopped
    the run: switch to an existing dynamic window, quote the character,
    define a new window, or start a Unicode run.
    The mode state (<i>iSCU</i>, <i>fUnicodeMode</i>) lives in instance
    fields and is not reinitialized here, so a call can pick up where an
    earlier call stopped; <i>compress</i> uses this to retry with a larger
    output array after an EndOfOutputException.
    @param in - input character array
    @param iStartIn - index of the first input character to process
    @param out - output byte array
    @param iStartOut - index of the first output byte to write
    @return the number of bytes written by this call (iOut - iStartOut)
    @throws IllegalInputException propagated from the run/window helpers
    @throws EndOfInputException propagated from outputSingleByteRun
    @throws EndOfOutputException if the output array has no room left
    **/
    public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut)
        throws IllegalInputException, EndOfInputException, EndOfOutputException
    {
        // position the instance cursors; the helpers below advance them
        iIn = iStartIn;
        iOut = iStartOut;


        while (iIn < in.length)
        {
            int ch;

            // previously we switched to a Unicode run
            // (iSCU holds the output index of the SCU command byte)
            if (iSCU != -1)
            {

                Debug.out("Remaining", in, iIn);
                Debug.out("Output until ["+iOut+"]: ", out);

                // output characters as Unicode
                ch = outputUnicodeRun(in, out);

                // for single character Unicode runs (3 bytes) use quote
                if (iOut - iSCU == 3 )
                {
                    // go back and fix up the SCU to an SQU instead
                    out[iSCU] = SQU;
                    iSCU = -1;
                    // fUnicodeMode is left unchanged: a quoted character
                    // does not switch modes
                    continue;
                }
                else
                {
                    // a real Unicode run was written; the next command
                    // byte must come from the Uxx family
                    iSCU = -1;
                    fUnicodeMode = true;
                }
            }
            // next, try to output characters as single byte run
            else
            {
                ch = outputSingleByteRun(in, out);
            }

            // check whether we still have input
            if (iIn == in.length)
            {
                break; // no more input
            }

            // if we get here, we have a consistent value for ch, whether or
            // not it is an regular or extended character. Locate or define a
            // Window for the current character

            Debug.out("Output so far: ", out);
            Debug.out("Routing ch="+ch+" for Input", in, iIn);

            // Check that we have enough room to output the command byte
            if (iOut >= out.length - 1)
            {
                throw new EndOfOutputException();
            }

            // In order to switch away from Unicode mode, it is necessary
            // to select (or define) a window. If the characters that follow
            // the Unicode range are ASCII characters, we can't use them
            // to decide which window to select, since ASCII characters don't
            // influence window settings. This loop looks ahead until it finds
            // one compressible character that isn't in the ASCII range.
            for (int ich = iIn; ch < 0x80; ich++)
            {
                if (ich == in.length || !isCompressible(in[ich]))
                {
                    // if there are only ASCII characters left,
                    // fall back to the character at the input cursor
                    ch = in[iIn];
                    break;
                }
                ch = in[ich]; // lookahead for next non-ASCII char
            }
            // The character value contained in ch here will only be used to select
            // output modes. Actual output of characters starts with in[iIn] and
            // only takes place near the top of the loop.

            // remember the window so it can be restored after a quote
            int iprevWindow = getCurrentWindow();

            // try to locate a dynamic window
            // (locateWindow also selects the window it finds)
            if (ch < 0x80 || locateWindow(ch, dynamicOffset))
            {
                Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1);
                // lookahead to use SQn instead of SCn for single
                // character interruptions of runs in current window
                if(!fUnicodeMode && iIn < in.length -1)
                {
                    char ch2 = in[iIn+1];
                    if (ch2 >= dynamicOffset[iprevWindow] &&
                        ch2 <  dynamicOffset[iprevWindow] + 0x80)
                    {
                        // the following character is back in the previous
                        // window: quote this one and keep that window
                        quoteSingleByte(ch, out);
                        selectWindow(iprevWindow);
                        continue;
                    }
                }

                // emit the change-window command for the current mode
                out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
                fUnicodeMode = false;
            }
            // try to locate a static window
            else if (!fUnicodeMode && locateWindow(ch, staticOffset))
            {
                // static windows are not accessible from Unicode mode
                Debug.out("located a static window", getCurrentWindow());
                quoteSingleByte(ch, out);
                selectWindow(iprevWindow); // restore current Window settings
                continue;
            }
            // try to define a window around ch
            else if (positionWindow(ch, out, fUnicodeMode) )
            {
                fUnicodeMode = false;
            }
            // If all else fails, start a Unicode run
            else
            {
                // record where the SCU goes so it can be turned into an
                // SQU if the run ends up being a single character
                iSCU = iOut;
                out[iOut++] = SCU;
                continue;
            }
        }

        return iOut - iStartOut;
    }

    /** compress a whole string into a freshly allocated byte array.
        The output array starts at twice the input length and is doubled
        each time <i>simpleCompress</i> reports an EndOfOutputException;
        compression then continues from the cursors reported by
        <i>charsRead</i> and <i>bytesWritten</i>. The result is trimmed to
        <i>bytesWritten()</i> bytes.
        @param inStr - string to compress
        @return the compressed bytes
        @throws IllegalInputException propagated from simpleCompress
        @throws EndOfInputException propagated from simpleCompress
    **/
    public byte[] compress(String inStr)
        throws IllegalInputException, EndOfInputException
    {
        // Running out of output room can make the result less compact,
        // because not all intermediate state is saved between attempts.

        byte [] buffer = new byte[inStr.length() * 2];
        char [] chars = inStr.toCharArray();
        //DEBUG
        Debug.out("compress input: ",chars);
        reset();

        boolean finished = false;
        while (!finished)
        {
            try
            {
                simpleCompress(chars, charsRead(), buffer, bytesWritten());
                // no exception: all input has been processed
                finished = true;
            }
            catch (EndOfOutputException e)
            {
                // double the output array, keep what was written, go again
                byte [] grown = new byte[buffer.length * 2];
                System.arraycopy(buffer, 0, grown, 0, buffer.length);
                buffer = grown;
            }
        }

        // cut the array down to the bytes actually produced
        byte [] result = new byte[bytesWritten()];
        System.arraycopy(buffer, 0, result, 0, result.length);

        Debug.out("compress output: ", result);
        return result;
    }

    /** reset returns the compressor to its starting state so that it can
        be used with new input, e.g. to bail out after an exception.
        Besides the superclass state, this clears the Unicode mode flag,
        the pending SCU marker, and the input and output cursors.
        <i>compress</i> calls this and then starts from <i>charsRead</i>
        and <i>bytesWritten</i>, so both must read 0 afterwards; otherwise
        a reused instance would start from the cursors of its previous run. */
    @Override
    public void reset()
    {
        super.reset();
        fUnicodeMode = false;
        iSCU = - 1;
        // rewind the cursors reported by charsRead() and bytesWritten()
        iIn = 0;
        iOut = 0;
    }

    /** reports the output cursor, i.e. the index of the next output byte
        to be written
        @return current value of <i>iOut</i> **/
    public int bytesWritten()
    {
        return this.iOut;
    }

    /** reports the input cursor, i.e. the index of the next input
        character to be read
        @return current value of <i>iIn</i> **/
    public int charsRead()
    {
        return this.iIn;
    }

 }
--- a/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java
+++ b/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java
@@ -0,0 +1,48 @@
 package com.healthmarketscience.jackcess.scsu;

 /**
 * This sample software accompanies Unicode Technical Report #6 and
 * distributed as is by Unicode, Inc., subject to the following:
 *
 * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
 *
 * Permission to use, copy, modify, and distribute this software
 * without fee is hereby granted provided that this copyright notice
 * appears in all copies.
 *
 * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
 * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
 * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
 * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
 * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
 * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
 *
 *  @author Asmus Freytag
 *
 *  @version 001 Dec 25 1996
 *  @version 002 Jun 25 1997
 *  @version 003 Jul 25 1997
 *
 * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
 * and are registered in some jurisdictions.
 **/
 /**
  * Signals that the output byte array has no room left for the next piece
  * of output. Callers can supply a larger array and continue.
  */
 public class EndOfOutputException
    extends java.lang.Exception
 {

    private static final long serialVersionUID = 1L;

    /**
     * Creates the exception with a default message describing the
     * exhausted output array.
     */
    public EndOfOutputException() {
        super("The output byte array ran out of space");
    }

    /**
     * Creates the exception with a caller-supplied message.
     *
     * @param s the detail message
     */
    public EndOfOutputException(String s) {
        super(s);
    }
 }
--- a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
+++ b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
@@ -411,6 +411,7 @@ public class Expand extends SCSU

    /** reset is called to start with new input, w/o creating a new
        instance */
    @Override
    public void reset()
    {
        iOut = 0;
--- a/test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java
+++ b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java
@@ -0,0 +1,574 @@
 package com.healthmarketscience.jackcess.scsu;

 import java.io.*;
 import java.util.*;

 /**
 * This sample software accompanies Unicode Technical Report #6 and
 * distributed as is by Unicode, Inc., subject to the following:
 *
 * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
 *
 * Permission to use, copy, modify, and distribute this software
 * without fee is hereby granted provided that this copyright notice
 * appears in all copies.
 *
 * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
 * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
 * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
 * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
 * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
 * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
 *
 *  @author Asmus Freytag
 *
 *  @version 001 Dec 25 1996
 *  @version 002 Jun 25 1997
 *  @version 003 Jul 25 1997
 *  @version 004 Aug 25 1997
 *  @version 005 Sep 30 1998
 *
 * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
 * and are registered in some jurisdictions.
 **/

 /**
 	Class CompressMain

 	A small command-line driver for the compression routines.
 	Run it with /? to print the usage information.
 */
 public class CompressMain
 {
 	/**
 	 * Writes the command line synopsis to standard error, one option per
 	 * println call (each entry already ends in a newline, so the options
 	 * appear separated by blank lines).
 	 */
 	static void usage()
 	{
 		final String[] synopsis = {
 			"java CompressMain /?               : this usage information\n",
 			"java CompressMain /random    	   : random test\n",
 			"java CompressMain /suite           : suite test\n",
 			"java CompressMain /suite <file>    : file test (file data may include \\uXXXX)\n",
 			"java CompressMain <string>    	   : string test (string may include \\uXXXX)\n",
 			"java CompressMain /roundtrip <file>: check Unicode file for roundtrip\n",
 			"java CompressMain /compress <file> : compresses Unicode files (no \\uXXXX)\n",
 			"java CompressMain /expand <file>   : expands into Unicode files\n",
 			"java CompressMain /byteswap <files>: swaps byte order of Unicode files\n",
 			"java CompressMain /display <files> : like expand, but creates a dump instead\n",
 			"java CompressMain /parse <files>   : parses \\uXXXX into binary Unicode\n"
 		};
 		for (int iLine = 0; iLine < synopsis.length; iLine++)
 		{
 			System.err.println(synopsis[iLine]);
 		}
 	}

    /**
     * Reports the outcome of one compress/expand round trip through Debug.out.
     * <p>
     * When the expanded text differs from the original, the first differing
     * position is printed; if the two strings agree over their common length,
     * the length difference is printed instead. Otherwise (success, or no
     * result at all) the compression statistics are printed.
     *
     * @param text      the original text handed to the compressor
     * @param inlength  number of chars the compressor reported as read
     * @param result    the expanded text, or null if expansion produced nothing
     * @param outlength number of bytes the compressor reported as written
     */
    static void analyze(String text, int inlength, String result, int outlength)
    {
        boolean fSuccess = text.equals(result);
        Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED");
        if (!fSuccess && result != null)
        {
            int iLim = Math.min(text.length(), result.length());
            int iDiff = 0;
            while (iDiff < iLim && text.charAt(iDiff) == result.charAt(iDiff))
            {
                iDiff++;
            }
            if (iDiff < iLim)
            {
                Debug.out("First Mismatch at  "+ iDiff +"=", result.charAt(iDiff) );
                Debug.out("Original character "+ iDiff +"=", text.charAt(iDiff) );
            }
            else
            {
                // one string is a strict prefix of the other, so there is no
                // differing character to show - report the lengths instead
                Debug.out("Length mismatch: original has " + text.length()
                          + " chars, result has " + result.length() + " chars.");
            }
        }
        else
        {
            Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes.");
            // guard both operands: inlength may be 0 even when bytes were written
            int ratio = (outlength == 0 || inlength == 0) ? 0 : (outlength * 50 / inlength);
            Debug.out(" Ratio: "+ratio+"%.");
        }
    }

    /**
     * Verbose round trip used for strings given on the command line:
     * compresses the text, expands it once to dump input and result, expands
     * it a second time with a fresh expander and hands the outcome to
     * analyze. Any exception is printed to standard output and the analysis
     * still runs with whatever result was obtained so far.
     *
     * @param text the text to round-trip
     */
    static void test2(String text)
    {
        Debug.out("SCSU:\n");
        final Compress scsu = new Compress();
        String roundTripped = null;
        try
        {
            final byte[] packed = scsu.compress(text);

            // first expansion is only used for the dump
            roundTripped = new Expand().expand(packed);
            Debug.out("Input:  ", text.toCharArray());
            Debug.out("Result: ", roundTripped.toCharArray());
            Debug.out("");

            // second expansion, with a fresh expander, is the one analyzed
            roundTripped = new Expand().expand(packed);
        }
        catch (Exception problem)
        {
            System.out.println(problem);
        }
        analyze(text, scsu.charsRead(), roundTripped, scsu.bytesWritten());
    }

    /**
     * Round-trips the text, treating a compression failure as an error.
     *
     * @param text the text to round-trip
     * @throws Exception if compression or expansion fails
     */
    static void test(String text)
        throws Exception
    {
        final boolean expectFailure = false;
        test(text, expectFailure);
    }

  /**
   * Compresses the text and, unless a failure is expected, expands it again
   * and passes the outcome to analyze.
   *
   * @param text       the text to round-trip
   * @param shouldFail true if compression is expected to throw; in that case
   *                   a successful compression raises a RuntimeException and
   *                   a failing one simply returns
   * @throws Exception the compression failure when none was expected, or any
   *                   failure raised while expanding
   */
  static void test(String text, boolean shouldFail)
      throws Exception
    {
        final Compress scsu = new Compress();

        byte[] packed = null;
        try
        {
            packed = scsu.compress(text);
        }
        catch (Exception compressFailure)
        {
            if (shouldFail)
            {
                // the failure is exactly what the caller asked for
                return;
            }
            throw compressFailure;
        }

        if (shouldFail)
        {
            throw new RuntimeException("Did not fail");
        }

        final String unpacked = new Expand().expand(packed);

        // report mismatch position or compression statistics
        analyze(text, scsu.charsRead(), unpacked, scsu.bytesWritten());
    }

    /**
     * Expands the compressed bytes and dumps the resulting characters through
     * Debug.out. Any exception is printed to standard output.
     *
     * @param input the compressed data
     */
    public static void display(byte [] input)
    {
        try
        {
            final char[] expanded = new Expand().expand(input).toCharArray();
            Debug.out(expanded);
        }
        catch (Exception problem)
        {
            System.out.println(problem);
        }
    }

    /**
     * Converts backslash-u escapes in the input (a backslash, the letter u
     * and up to four hex digits) into the characters they denote.
     * <p>
     * The input is split at backslashes. A token following a backslash that
     * starts with 'u' has its next (at most four) characters parsed as a hex
     * number; any other token following a backslash is copied without the
     * backslash. A trailing lone backslash is copied as is.
     *
     * @param input text that may contain escapes
     * @return the text with the escapes replaced
     * @throws NumberFormatException if the characters after the 'u' are not a
     *         hex number
     */
    public static String parse(String input)
    {
        final StringTokenizer tokens = new StringTokenizer(input, "\\", true);
        Debug.out("Input: ", input);

        final StringBuffer out = new StringBuffer();

        while (tokens.hasMoreTokens())
        {
            String piece = tokens.nextToken();
            Debug.out("Token: ", piece);

            // delimiter tokens are always exactly one backslash
            if (!"\\".equals(piece))
            {
                out.append(piece);
                continue;
            }

            // look at whatever follows the backslash, if anything
            if (tokens.hasMoreTokens())
            {
                piece = tokens.nextToken();
            }
            if (piece.charAt(0) == 'u')
            {
                Debug.out("Token: "+ piece+ " ", out.toString());
                final boolean hasTail = piece.length() > 5;
                final String hexnum = hasTail ? piece.substring(1,5) : piece.substring(1);
                piece = hasTail ? piece.substring(5) : "";
                out.append((char)Integer.parseInt(hexnum, 16));
            }
            out.append(piece);
        }
        return out.toString();
    }

    /**
     * Round-trips randomly generated strings of 0 to 19 UTF-16 code units.
     * <p>
     * Surrogate code units are never generated: a random string would almost
     * always contain them unpaired, and the suite treats unpaired surrogates
     * as input the compressor must reject, so including them would make this
     * test fail at random.
     *
     * @param nTest number of random strings to test
     * @throws Exception if any round trip throws
     */
    public static void randomTest(int nTest)
      throws Exception
    {
        Random random = new Random();

        // number of code units in the surrogate range that must be skipped
        final int nSurrogates = Character.MAX_SURROGATE - Character.MIN_SURROGATE + 1;
        final int nUsable = 0x10000 - nSurrogates;

        for(int n=0; n < nTest; n++)
        {
            int iLen = random.nextInt(20);
            StringBuffer sb = new StringBuffer(iLen);

            for(int i = 0; i < iLen; i++)
            {
                int unit = random.nextInt(nUsable);
                if (unit >= Character.MIN_SURROGATE)
                {
                    // shift past the surrogate block so the whole rest of the
                    // BMP, up to and including U+FFFF, stays reachable
                    unit += nSurrogates;
                }
                sb.append((char) unit);
            }

            test(sb.toString());
        }
    }

    /**
     * Runs the round trip test on every line of a text file; each line may
     * contain backslash-u escapes, which are resolved by parse first.
     * <p>
     * The deprecated DataInputStream.readLine is kept on purpose: it turns
     * each byte into one char, which is what parse expects here.
     *
     * @param name path of the file to read
     * @throws Exception if reading fails or any round trip throws
     */
    @SuppressWarnings("deprecation")
    public static void fileTest(String name)
        throws Exception
    {
        DataInputStream dis = new DataInputStream(new FileInputStream(name));
        try
        {
            int iLine = 0;

            while(dis.available() != 0)
            {
                String line = dis.readLine();
                if (line == null)
                {
                    // end of file reached even though available() said otherwise
                    break;
                }
                Debug.out("Line "+ iLine++ +" "+line);
                test(parse(line), false );
            }
        }
        finally
        {
            // release the file handle even when a round trip throws
            dis.close();
        }
    }

    /**
     * Reads a whole compressed file into memory and dumps its expansion via
     * display.
     *
     * @param name path of the compressed file
     * @throws IOException if the file cannot be opened or read completely
     */
    public static void displayFile(String name)
            throws IOException
    {
        byte bytes[];
        DataInputStream dis = new DataInputStream(new FileInputStream(name));
        try
        {
            bytes = new byte[dis.available()];
            // readFully keeps reading until the buffer is full; a single
            // read() call is allowed to return fewer bytes
            dis.readFully(bytes);
        }
        finally
        {
            dis.close();
        }
        display(bytes);
    }

    /**
     * Expands a compressed file and writes the result as a Unicode file next
     * to it, using the same name with the extension replaced by "txt" (or
     * ".txt" appended when the file name has no extension).
     * <p>
     * An expansion failure is printed to standard output; the statistics are
     * still reported but no output file is written.
     *
     * @param name path of the compressed file
     * @throws IOException if the input cannot be read or the output written
     */
    public static void decodeTest(String name)
           throws IOException
    {
        byte bytes[];
        DataInputStream dis = new DataInputStream(new FileInputStream(name));
        try
        {
            bytes = new byte[dis.available()];
            // a single read() may stop short; readFully fills the buffer
            dis.readFully(bytes);
        }
        finally
        {
            dis.close();
        }

        Expand expand = new Expand();

        char [] chars = null;
        try
        {
            String text = expand.expand(bytes);
            chars = text.toCharArray();
        }
        catch (Exception e)
        {
            System.out.println(e);
        }
        int inlength = expand.bytesRead();

        // only a dot inside the last path element marks an extension
        int iDot = name.lastIndexOf('.');
        int iSep = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
        String stem = (iDot > iSep) ? name.substring(0, iDot) : name;
        String outName = stem + ".txt";

        int outlength = expand.charsWritten();

        int ratio = (outlength == 0 || inlength == 0) ? 0 : (outlength * 200 / inlength);
        Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+ratio+"%.");

        if (chars == null)
            return;

        writeUnicodeFile(outName, chars);
    }

    /**
     * most of the next 3 functions should not be needed by JDK11 and later
     * <p>
     * Byte order state shared by the Unicode file helpers: readUnicodeFile
     * resets it to 1 and increments it when the file starts with the swapped
     * byte order mark U+FFFE; writeUnicodeFile tests its low bit to decide
     * which byte order mark and byte order to write.
     * <p>
     * NOTE(review): "JDK11" presumably means JDK 1.1 - confirm.
     */
    private static int iMSB = 1;

    /**
     * Reads a file of 16-bit Unicode code units into a string.
     * <p>
     * A leading byte order mark is dropped. The first byte of each pair is
     * taken as the high byte unless the file starts with the swapped mark
     * U+FFFE, in which case the byte order is flipped for the rest of the
     * file. The detected order is left in iMSB for writeUnicodeFile. A
     * trailing odd byte is ignored.
     *
     * @param name path of the file to read
     * @return the file content, or the empty string if an I/O error occurred
     *         (the error is printed to standard error)
     */
    public static String readUnicodeFile(String name)
    {
        try
        {
            FileInputStream dis = new FileInputStream(name);
            try
            {
                byte b[] = new byte[2];
                StringBuffer sb = new StringBuffer();
                char ch = 0;

                iMSB = 1;
                for(int i = 0; (dis.available() != 0); i++)
                {
                    b[i%2] = (byte) dis.read();

                    if ((i & 1) == 1)
                    {
                        // i is odd here: with iMSB == 1 the arguments are
                        // (b[0], b[1]), with iMSB == 2 they are (b[1], b[0])
                        ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]);
                    }
                    else
                    {
                        // only half of the code unit has been read so far
                        continue;
                    }
                    if (i == 1 && ch == '\uFEFF')
                        continue; // throw away byte order mark

                    if (i == 1 && ch == '\uFFFE')
                    {
                        iMSB ++;  // flip byte order
                        continue; // throw away byte order mark
                    }
                    sb.append(ch);
                }

                return sb.toString();
            }
            finally
            {
                // release the file handle on every path
                dis.close();
            }
        }
        catch (IOException e)
        {
            System.err.println(e);
            return "";
        }
    }

    /**
     * Writes the characters as 16-bit code units preceded by a byte order
     * mark.
     * <p>
     * The byte order depends on the low bit of iMSB: when it is set, the
     * bytes FF FE are written followed by each character low byte first;
     * otherwise FE FF is written followed by each character high byte first.
     *
     * @param outName path of the file to create or overwrite
     * @param chars   the characters to write
     * @throws IOException if the file cannot be created or written
     */
    public static void writeUnicodeFile(String outName, char [] chars)
            throws IOException
    {
        DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
        try
        {
            if ((iMSB & 1) == 1)
            {
                dos.writeByte(0xFF);
                dos.writeByte(0xFE);
            }
            else
            {
                dos.writeByte(0xFE);
                dos.writeByte(0xFF);
            }
            byte b[] = new byte[2];
            for (int ich = 0; ich < chars.length; ich++)
            {
                // iMSB parity selects which slot receives the high byte
                b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8);
                b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF);
                dos.write(b, 0, 2);
            }
        }
        finally
        {
            // flush and release the file handle even if a write fails
            dos.close();
        }
    }

    /**
     * Rewrites a Unicode file in place by reading it with readUnicodeFile and
     * writing the characters back with writeUnicodeFile.
     *
     * @param name path of the file to rewrite
     * @throws IOException if writing the file fails
     */
    static void byteswap(String name)
        throws IOException
    {
        final char[] content = readUnicodeFile(name).toCharArray();
        writeUnicodeFile(name, content);
    }

    /**
     * Converts a single-byte text file containing backslash-u escapes into a
     * Unicode file, in place.
     * <p>
     * A file that already starts with a byte order mark (FE FF or FF FE) is
     * reported and left untouched. Otherwise every byte is widened to one
     * char, the escapes are resolved by parse and the result is written back
     * under the same name.
     *
     * @param name path of the file to convert
     * @throws IOException if the file cannot be read or rewritten
     */
    public static void parseFile(String name)
        throws IOException
    {
        byte bytes[];
        DataInputStream dis = new DataInputStream(new FileInputStream(name));
        try
        {
            bytes = new byte[dis.available()];
            // a single read() may stop short; readFully fills the buffer
            dis.readFully(bytes);
        }
        finally
        {
            // close before the same file is reopened for writing below
            dis.close();
        }

        // simplistic test; files shorter than two bytes cannot carry a mark
        if (bytes.length >= 2)
        {
            int bom = ((bytes[0] & 0xFF) << 8) | (bytes[1] & 0xFF);
            if (bom == 0xFEFF || bom == 0xFFFE)
            {
                Debug.out(name + " is already in Unicode!");
                return;
            }
        }

        // definitely assumes an ASCII file at this point; ISO-8859-1 widens
        // each byte to the char with the same value
        String text = new String(bytes, "ISO-8859-1");

        char chars[] = parse(text).toCharArray();
        writeUnicodeFile(name, chars);
    }

    /**
     * Compresses a Unicode file and writes the bytes next to it, using the
     * same name with the extension replaced by "csu" (or ".csu" appended
     * when the file name has no extension), then reports the statistics.
     *
     * @param name path of the Unicode file to compress
     * @throws Exception if compression fails or the output cannot be written
     */
    public static void encodeTest(String name)
        throws Exception
    {
        String text = readUnicodeFile(name);

        // Create an instance of the compressor
        Compress compressor = new Compress();

        // perform compression
        byte [] bytes = compressor.compress(text);

        int inlength = compressor.charsRead();

        // only a dot inside the last path element marks an extension
        int iDot = name.lastIndexOf('.');
        int iSep = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\'));
        String stem = (iDot > iSep) ? name.substring(0, iDot) : name;
        String outName = stem + ".csu";

        DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
        try
        {
            dos.write(bytes, 0, bytes.length);
        }
        finally
        {
            // flush and release the file handle even if the write fails
            dos.close();
        }

        int outlength = compressor.bytesWritten();

        int ratio = (outlength == 0 || inlength == 0) ? 0 : (outlength * 50 / inlength);
        Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+ratio+"%.");
    }

    /**
     * Reads a Unicode file and runs the round trip test on its content,
     * expecting compression to succeed.
     *
     * @param name path of the Unicode file
     * @throws Exception if the round trip throws
     */
    public static void roundtripTest(String name)
      throws Exception
    {
        final String content = readUnicodeFile(name);
        test(content, false);
    }

    /**
     * Command line entry point.
     * <p>
     * The first argument selects the operation (compared ignoring case) and
     * the remaining arguments are processed in order. When the first argument
     * is not a known option, every argument is treated as a test string,
     * last one first. I/O errors are printed to standard error; afterwards
     * the program waits for a key press.
     *
     * @param args the command line arguments
     * @throws Exception any non-I/O failure of the selected operation
     */
    public static void main(String args[])
      throws Exception
    {
        final int nArgs = args.length;

        try
        {
            if (nArgs == 0)
            {
                usage();
            }
            else
            {
                final String option = args[0];

                if (option.equalsIgnoreCase("/compress"))
                {
                    for (int i = 1; i < nArgs; i++)
                    {
                        encodeTest(args[i]);
                    }
                }
                else if (option.equalsIgnoreCase("/parse"))
                {
                    for (int i = 1; i < nArgs; i++)
                    {
                        parseFile(args[i]);
                    }
                }
                else if (option.equalsIgnoreCase("/expand"))
                {
                    for (int i = 1; i < nArgs; i++)
                    {
                        decodeTest(args[i]);
                    }
                }
                else if (option.equalsIgnoreCase("/display"))
                {
                    for (int i = 1; i < nArgs; i++)
                    {
                        displayFile(args[i]);
                    }
                }
                else if (option.equalsIgnoreCase("/roundtrip"))
                {
                    for (int i = 1; i < nArgs; i++)
                    {
                        roundtripTest(args[i]);
                    }
                }
                else if (option.equalsIgnoreCase("/byteswap"))
                {
                    for (int i = 1; i < nArgs; i++)
                    {
                        byteswap(args[i]);
                    }
                }
                else if (option.equalsIgnoreCase("/random"))
                {
                    randomTest(8);
                }
                else if (option.equalsIgnoreCase("/suite"))
                {
                    if (nArgs == 1)
                    {
                        // no file given: run the built-in suite
                        suiteTest();
                    }
                    else
                    {
                        for (int i = 1; i < nArgs; i++)
                        {
                            fileTest(args[i]);
                        }
                    }
                }
                else if (option.equalsIgnoreCase("/?"))
                {
                    usage();
                }
                else
                {
                    // plain strings: walk the arguments from last to first
                    for (int i = nArgs - 1; i >= 0; i--)
                    {
                        test2(parse(args[i]));
                    }
                }
            }
        }
        catch (IOException ioFailure)
        {
            System.err.println(ioFailure);
        }

        try
        {
            System.err.println("Done. Press enter to exit");
            System.in.read();
        }
        catch (IOException ignored)
        {
            // nothing useful can be done if the console cannot be read
        }
    }

    /**
     * Runs the fixed set of round trip cases: a list of strings that must
     * compress and expand without an exception, followed by unpaired
     * surrogate inputs for which compression must fail.
     *
     * @throws Exception if a case that should succeed throws, or a case that
     *         should fail does not
     */
    static void suiteTest()
      throws Exception
    {
        Debug.out("Standard Compression test suite:");
        test("Hello \u9292 \u9192 World!");
        test("Hell\u0429o \u9292 \u9192 W\u00e4rld!");
        test("Hell\u0429o \u9292 \u9292W\u00e4rld!");

        test("\u0648\u06c8"); // catch missing reset
        test("\u0648\u06c8");

        test("\u4444\uE001"); // lowest quotable
        test("\u4444\uf2FF"); // highest quotable
        test("\u4444\uf188\u4444");
        test("\u4444\uf188\uf288");
        // the escape before 0429 used to lack its 'u', which made it an octal
        // escape (a double quote followed by '9') instead of U+0429
        test("\u4444\uf188abc\u0429\uf288");
        test("\u9292\u2222");
        test("Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!");
        test("Hell\u0429o \u9292 \u9292W\u00e4rld!");
        test("Hello World!123456");
        test("Hello W\u0081\u011f\u0082!"); // Latin 1 run

        test("abc\u0301\u0302");  // uses SQn for u301 u302
        test("abc\u4411d");      // uses SQU
        test("abc\u4411\u4412d");// uses SCU
        test("abc\u0401\u0402\u047f\u00a5\u0405"); // uses SQn for ua5
        test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); // SJIS like data
        test("\u9292\u2222");
        test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000");
        test("\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c");
        test("\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002");

        test(""); // empty input
        test("\u0000"); // smallest BMP character
        test("\uFFFF"); // largest BMP character

        test("\ud800\udc00"); // smallest surrogate
        test("\ud8ff\udcff"); // largest surrogate pair


        Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:");
        test("\ud800 \udc00", true); // unpaired surrogate (1)
        test("\udc00", true); // unpaired surrogate (2)
        test("\ud800", true); // unpaired surrogate (3)
   }
 }
--- a/test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java
+++ b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java
@@ -0,0 +1,47 @@
 /*
 Copyright (c) 2007 Health Market Science, Inc.

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 USA

 You can contact Health Market Science at info@healthmarketscience.com
 or at the following address:

 Health Market Science
 2700 Horizon Drive
 Suite 200
 King of Prussia, PA 19406
 */

 package com.healthmarketscience.jackcess.scsu;

 import junit.framework.TestCase;

 /**
  * JUnit wrapper that runs the standard compression suite of CompressMain.
  *
  * @author James Ahlborn
  */
 public class CompressTest extends TestCase
 {

   /**
    * @param name the test name, passed on to TestCase
    */
   public CompressTest(final String name)
     throws Exception
   {
     super(name);
   }

   /**
    * Fails if any case of the suite throws.
    */
   public void testCompression()
     throws Exception
   {
     CompressMain.suiteTest();
   }

 }