Add compression code for possible future use; add compression unit

tests. git-svn-id: https://svn.code.sf.net/p/jackcess/code/jackcess/trunk@365 f203690c-595d-4dc9-a70b-905162fa7fd2
author: James Ahlborn <jtahlborn@yahoo.com> 2008-07-22 19:00:25 +0000
committer: James Ahlborn <jtahlborn@yahoo.com> 2008-07-22 19:00:25 +0000
commit: 4af4fe44512c2c885498794c7a5a6e878318b2ef (patch)
tree: 3eb2a1966bd1fdf7777741492a256f645b1e07cf
parent: d1359adfbf6199a305b062c0a8da0d05e49e84a7 (diff)
download: jackcess-4af4fe44512c2c885498794c7a5a6e878318b2ef.tar.gz
jackcess-4af4fe44512c2c885498794c7a5a6e878318b2ef.zip
7 files changed, 1304 insertions, 11 deletions
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index cef026e..be4bedf 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -25,6 +25,10 @@
       <action dev="jahlborn" type="add">
         Add primitive support for writing unicode compressed text columns.
       </action>
+      <action dev="jahlborn" type="add">
+        Add compression code for possible future use; add compression unit
+        tests.
+      </action>
     </release>
     <release version="1.1.15" date="2008-06-27">
       <action dev="jahlborn" type="fix" issue="1998225">
diff --git a/src/java/com/healthmarketscience/jackcess/Column.java b/src/java/com/healthmarketscience/jackcess/Column.java
index 8d27ae4..570d104 100644
--- a/src/java/com/healthmarketscience/jackcess/Column.java
+++ b/src/java/com/healthmarketscience/jackcess/Column.java
@@ -41,6 +41,7 @@ import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import com.healthmarketscience.jackcess.scsu.Compress;
 import com.healthmarketscience.jackcess.scsu.EndOfInputException;
 import com.healthmarketscience.jackcess.scsu.Expand;
 import com.healthmarketscience.jackcess.scsu.IllegalInputException;
@@ -1196,23 +1197,13 @@ public class Column implements Comparable<Column> {
     // now, see if it is all printable ASCII
     for(int i = 0; i < text.length(); ++i) {
       char c = text.charAt(i);
-      if(!isAsciiCrLfOrTab(c)) {
+      if(!Compress.isAsciiCrLfOrTab(c)) {
         return false;
       }
     }
     return true;
   }
 
-  /**
-   * Returns true if the character is ASCII, but not a control other than
-   * CR, LF and TAB
-   */
-  private static boolean isAsciiCrLfOrTab(int ch)
-  {
-    return ((ch >= 0x20 && ch <= 0x7F)                  // ASCII (non control)
-            || ch == 0x09 || ch == 0x0A || ch == 0x0D); // CR/LF or TAB
-  }
-
   @Override
   public String toString() {
     StringBuilder rtn = new StringBuilder();
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Compress.java b/src/java/com/healthmarketscience/jackcess/scsu/Compress.java
new file mode 100644
index 0000000..c5f7360
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/Compress.java
@@ -0,0 +1,628 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+    This class implements a simple compression algorithm
+ **/
+/*
+    Note on exception handling
+        This compressor is designed so that it can be restarted after
+        an exception. All operations advancing input and/or output cursor
+        (iIn and iOut) either complete an action, or set a state (fUnicodeMode)
+        before updating the cursors.
+*/
+public class Compress extends SCSU
+{
+
+    /** next input character to be read **/
+    private int iIn;
+
+    /** next output byte to be written **/
+    private int iOut;
+
+    /** start index of Unicode mode in output array, or -1 if in single byte mode **/
+    private int iSCU = -1;
+
+    /** true if the next command byte is of the Uxx family */
+    private boolean fUnicodeMode = false;
+
+    /** locate a window for a character given a table of offsets
+    @param ch - character
+    @param offsetTable - table of window offsets
+    @return true if the character fits a window from the table of windows */
+    private boolean locateWindow(int ch, int[] offsetTable)
+    {
+        // always try the current window first
+        int iWin = getCurrentWindow();
+
+        // if the character fits the current window
+        // just use the current window
+        if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
+        {
+            return true;
+        }
+
+        // try all windows in order
+        for (iWin = 0; iWin < offsetTable.length; iWin++)
+        {
+            if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
+            {
+                selectWindow(iWin);
+                return true;
+            }
+        }
+        // none found
+        return false;
+    }
+
+    /** returns true if the character is ASCII, but not a control other than CR, LF and TAB */
+    public static boolean isAsciiCrLfOrTab(int ch)
+    {
+        return    (ch >= 0x20 && ch <= 0x7F)                 // ASCII
+                || ch == 0x09 || ch == 0x0A || ch == 0x0D;   // CR/LF or TAB
+
+    }
+
+    /** output a run of characters in single byte mode
+        In single byte mode pass through characters in the ASCII range, but
+        quote characters overlapping with compression command codes. Runs
+        of characters fitting the current window are output as runs of bytes
+        in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
+        Uses and updates the current input and output cursors store in
+        the instance variables <i>iIn</i> and <i>iOut</i>.
+        @param in - input character array
+        @param out - output byte array
+        @return the next chaacter to be processed. This may be an extended character.
+    **/
+    @SuppressWarnings("fallthrough")
+    public int outputSingleByteRun(char [] in, byte [] out)
+        throws EndOfOutputException, EndOfInputException, IllegalInputException
+    {
+        int iWin = getCurrentWindow();
+        while(iIn < in.length)
+        {
+            int outlen = 0;
+            byte byte1 = 0;
+            byte byte2 = 0;
+
+            // get the input character
+            int ch = in[iIn];
+
+            int inlen = 1;
+
+            // Check input for Surrogate pair
+            if ( (ch & 0xF800) == 0xD800 )
+            {
+                if ( (ch & 0xFC00) == 0xDC00 )
+                {
+                    // low surrogate out of order
+                    throw new IllegalInputException("Unpaired low surrogate: "+iIn);
+                }
+                else
+                {
+                    // have high surrogate now get low surrogate
+                    if ( iIn >= in.length-1)
+                    {
+                        // premature end of input
+                        throw new EndOfInputException();
+                    }
+                    // get the char
+                    int ch2 = in[iIn+1];
+
+                    // make sure it's a low surrogate
+                    if ( (ch2 & 0xFC00) != 0xDC00 )
+                    {
+                        // a low surrogate was required
+                        throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1));
+                    }
+
+                    // combine the two values
+                    ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
+                    // ch = ch<<10 + ch2 - 0x36F0000;
+
+                    inlen = 2;
+                 }
+            }
+
+            // ASCII Letter, NUL, CR, LF and TAB are always passed through
+            if (isAsciiCrLfOrTab(ch) || ch == 0)
+            {
+                // pass through directcly
+                byte2 = (byte)(ch & 0x7F);
+                outlen = 1;
+            }
+
+            // All other control codes must be quoted
+            else if (ch < 0x20)
+            {
+                byte1 = SQ0;
+                byte2 = (byte)(ch);
+                outlen = 2;
+            }
+
+            // Letters that fit the current dynamic window
+            else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
+            {
+                ch -= dynamicOffset[iWin];
+                byte2 = (byte)(ch | 0x80);
+                outlen = 1;
+            }
+
+            // check for room in the output array
+            if (iOut + outlen >= out.length)
+            {
+                throw new EndOfOutputException();
+            }
+
+            switch(outlen)
+            {
+                default:
+                    // need to use some other compression mode for this
+                    // character so we terminate this loop
+
+                    return ch; // input not finished
+
+                    // output the characters
+                case 2:
+                    out[iOut++] = byte1;
+                    // fall through
+                case 1:
+                    out[iOut++] = byte2;
+                    break;
+            }
+            // advance input pointer
+            iIn += inlen;
+        }
+        return 0; // input all used up
+    }
+
+    /** quote a single character in single byte mode
+    Quoting a character (aka 'non-locking shift') gives efficient access
+    to characters that occur in isolation--usually punctuation characters.
+    When quoting a character from a dynamic window use 0x80 - 0xFF, when
+    quoting a character from a static window use 0x00-0x7f.
+    @param ch - character to be quoted
+    @param out - output byte array
+    **/
+
+    private void quoteSingleByte(int ch, byte [] out)
+        throws EndOfOutputException
+    {
+        Debug.out("Quoting SingleByte ", ch);
+        int iWin = getCurrentWindow();
+
+        // check for room in the output array
+        if (iOut >= out.length -2)
+        {
+            throw new EndOfOutputException();
+        }
+
+        // Output command byte followed by
+        out[iOut++] = (byte)(SQ0 + iWin);
+
+        // Letter that fits the current dynamic window
+        if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
+        {
+            ch -= dynamicOffset[iWin];
+            out[iOut++] = (byte)(ch | 0x80);
+        }
+
+        // Letter that fits the current static window
+        else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80)
+        {
+            ch -= staticOffset[iWin];
+            out[iOut++] = (byte)ch;
+        }
+        else
+        {
+            throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error");
+        }
+        // advance input pointer
+        iIn ++;
+        Debug.out("New input: ", iIn);
+    }
+
+    /** output a run of characters in Unicode mode
+    A run of Unicode mode consists of characters which are all in the
+    range of non-compressible characters or isolated occurrence
+    of any other characters. Characters in the range 0xE00-0xF2FF must
+    be quoted to avoid overlap with the Unicode mode compression command codes.
+    Uses and updates the current input and output cursors store in
+    the instance variables <i>iIn</i> and <i>iOut</i>.
+    NOTE: Characters from surrogate pairs are passed through and unlike single
+    byte mode no checks are made for unpaired surrogate characters.
+    @param in - input character array
+    @param out - output byte array
+    @return the next input character to be processed
+    **/
+    public char outputUnicodeRun(char [] in, byte [] out)
+        throws EndOfOutputException
+    {
+        // current character
+        char ch = 0;
+
+        while(iIn < in.length)
+        {
+            // get current input and set default output length
+            ch = in[iIn];
+            int outlen = 2;
+
+            // Characters in these ranges could potentially be compressed.
+            // We require 2 or more compressible characters to break the run
+            if (isCompressible(ch))
+            {
+                // check whether we can look ahead
+                if( iIn < in.length - 1)
+                {
+                    // DEBUG
+                    Debug.out("is-comp: ",ch);
+                    char ch2 = in[iIn + 1];
+                    if (isCompressible(ch2))
+                    {
+                        // at least 2 characters are compressible
+                        // break the run
+                        break;
+                    }
+                    //DEBUG
+                    Debug.out("no-comp: ",ch2);
+                }
+                // If we get here, the current character is only character
+                // left in the input or it is followed by a non-compressible
+                // character. In neither case do we gain by breaking the
+                // run, so we proceed to output the character.
+                if (ch >= 0xE000 && ch <= 0xF2FF)
+                {
+                    // Characters in this range need to be escaped
+                    outlen = 3;
+                }
+
+            }
+            // check that there is enough room to output the character
+            if(iOut >= out.length - outlen)
+            {
+                // DEBUG
+                Debug.out("End of Output @", iOut);
+                // if we got here, we ran out of space in the output array
+                throw new EndOfOutputException();
+            }
+
+            // output any characters that cannot be compressed,
+            if (outlen == 3)
+            {
+                // output the quote character
+                out[iOut++] = UQU;
+            }
+            // pass the Unicode character in MSB,LSB order
+            out[iOut++] = (byte)(ch >>> 8);
+            out[iOut++] = (byte)(ch & 0xFF);
+
+            // advance input cursor
+            iIn++;
+        }
+
+        // return the last character
+        return ch;
+    }
+
+    static int iNextWindow = 3;
+
+    /** redefine a window so it surrounds a given character value
+        For now, this function uses window 3 exclusively (window 4
+        for extended windows);
+        @return true if a window was successfully defined
+        @param ch - character around which window is positioned
+        @param out - output byte array
+        @param fCurUnicodeMode - type of window
+     **/
+    private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode)
+        throws IllegalInputException, EndOfOutputException
+    {
+        int iWin = iNextWindow % 8; // simple LRU
+        int iPosition = 0;
+
+        // iPosition 0 is a reserved value
+        if (ch < 0x80)
+        {
+            throw new IllegalStateException("ch < 0x80");
+            //return false;
+        }
+
+        // Check the fixed offsets
+        for (int i = 0; i < fixedOffset.length; i++)
+        {
+            if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80)
+            {
+                iPosition = i;
+                break;
+            }
+        }
+
+        if (iPosition != 0)
+        {
+            // DEBUG
+            Debug.out("FIXED position is ", iPosition + 0xF9);
+
+            // ch fits in a fixed offset window position
+            dynamicOffset[iWin] = fixedOffset[iPosition];
+            iPosition += 0xF9;
+        }
+        else if (ch < 0x3400)
+        {
+            // calculate a window position command and set the offset
+            iPosition = ch >>> 7;
+            dynamicOffset[iWin] = ch & 0xFF80;
+
+            Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
+        }
+        else if (ch < 0xE000)
+        {
+            // attempt to place a window where none can go
+            return false;
+        }
+        else if (ch <= 0xFFFF)
+        {
+            // calculate a window position command, accounting
+            // for the gap in position values, and set the offset
+            iPosition =  ((ch - gapOffset)>>> 7);
+
+            dynamicOffset[iWin] = ch & 0xFF80;
+
+            Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
+        }
+        else
+        {
+            // if we get here, the character is in the extended range.
+            // Always use Window 4 to define an extended window
+
+            iPosition = (ch - 0x10000) >>> 7;
+            // DEBUG
+            Debug.out("Try position Window at ", iPosition);
+
+            iPosition |= iWin << 13;
+            dynamicOffset[iWin] = ch & 0x1FFF80;
+        }
+
+        // Outputting window defintion command for the general cases
+        if ( iPosition < 0x100 && iOut < out.length-1)
+        {
+            out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin);
+            out[iOut++] = (byte) (iPosition & 0xFF);
+        }
+        // Output an extended window definiton command
+        else if ( iPosition >= 0x100 && iOut < out.length - 2)
+        {
+
+            Debug.out("Setting extended window at ", iPosition);
+            out[iOut++] = (fCurUnicodeMode ? UDX : SDX);
+            out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
+            out[iOut++] = (byte) (iPosition & 0xFF);
+        }
+        else
+        {
+            throw new EndOfOutputException();
+        }
+        selectWindow(iWin);
+        iNextWindow++;
+        return true;
+    }
+
+    /**
+    compress a Unicode character array with some simplifying assumptions
+    **/
+    public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut)
+        throws IllegalInputException, EndOfInputException, EndOfOutputException
+    {
+        iIn = iStartIn;
+        iOut = iStartOut;
+
+
+        while (iIn < in.length)
+        {
+            int ch;
+
+            // previously we switched to a Unicode run
+            if (iSCU != -1)
+            {
+
+                Debug.out("Remaining", in, iIn);
+                Debug.out("Output until ["+iOut+"]: ", out);
+
+                // output characters as Unicode
+                ch = outputUnicodeRun(in, out);
+
+                // for single character Unicode runs (3 bytes) use quote
+                if (iOut - iSCU == 3 )
+                {
+                    // go back and fix up the SCU to an SQU instead
+                    out[iSCU] = SQU;
+                    iSCU = -1;
+                    continue;
+                }
+                else
+                {
+                    iSCU = -1;
+                    fUnicodeMode = true;
+                }
+            }
+            // next, try to output characters as single byte run
+            else
+            {
+                ch = outputSingleByteRun(in, out);
+            }
+
+            // check whether we still have input
+            if (iIn == in.length)
+            {
+                break; // no more input
+            }
+
+            // if we get here, we have a consistent value for ch, whether or
+            // not it is an regular or extended character. Locate or define a
+            // Window for the current character
+
+            Debug.out("Output so far: ", out);
+            Debug.out("Routing ch="+ch+" for Input", in, iIn);
+
+            // Check that we have enough room to output the command byte
+            if (iOut >= out.length - 1)
+            {
+                throw new EndOfOutputException();
+            }
+
+            // In order to switch away from Unicode mode, it is necessary
+            // to select (or define) a window. If the characters that follow
+            // the Unicode range are ASCII characters, we can't use them
+            // to decide which window to select, since ASCII characters don't
+            // influence window settings. This loop looks ahead until it finds
+            // one compressible character that isn't in the ASCII range.
+            for (int ich = iIn; ch < 0x80; ich++)
+            {
+                if (ich == in.length || !isCompressible(in[ich]))
+                {
+                    // if there are only ASCII characters left,
+                    ch = in[iIn];
+                    break;
+                }
+                ch = in[ich]; // lookahead for next non-ASCII char
+            }
+            // The character value contained in ch here will only be used to select
+            // output modes. Actual output of characters starts with in[iIn] and
+            // only takes place near the top of the loop.
+
+            int iprevWindow = getCurrentWindow();
+
+            // try to locate a dynamic window
+            if (ch < 0x80 || locateWindow(ch, dynamicOffset))
+            {
+                Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1);
+                // lookahead to use SQn instead of SCn for single
+                // character interruptions of runs in current window
+                if(!fUnicodeMode && iIn < in.length -1)
+                {
+                    char ch2 = in[iIn+1];
+                    if (ch2 >= dynamicOffset[iprevWindow] &&
+                        ch2 <  dynamicOffset[iprevWindow] + 0x80)
+                    {
+                        quoteSingleByte(ch, out);
+                        selectWindow(iprevWindow);
+                        continue;
+                    }
+                }
+
+                out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
+                fUnicodeMode = false;
+            }
+            // try to locate a static window
+            else if (!fUnicodeMode && locateWindow(ch, staticOffset))
+            {
+                // static windows are not accessible from Unicode mode
+                Debug.out("located a static window", getCurrentWindow());
+                quoteSingleByte(ch, out);
+                selectWindow(iprevWindow); // restore current Window settings
+                continue;
+            }
+            // try to define a window around ch
+            else if (positionWindow(ch, out, fUnicodeMode) )
+            {
+                fUnicodeMode = false;
+            }
+            // If all else fails, start a Unicode run
+            else
+            {
+                iSCU = iOut;
+                out[iOut++] = SCU;
+                continue;
+            }
+        }
+
+        return iOut - iStartOut;
+    }
+
+    public byte[] compress(String inStr)
+        throws IllegalInputException, EndOfInputException
+    {
+        // Running out of room for output can cause non-optimal
+        // compression. In order to not slow down compression too
+        // much, not all intermediate state is constantly saved.
+
+        byte [] out = new byte[inStr.length() * 2];
+        char [] in = inStr.toCharArray();
+        //DEBUG
+        Debug.out("compress input: ",in);
+        reset();
+        while(true)
+        {
+            try
+            {
+                simpleCompress(in, charsRead(), out, bytesWritten());
+                // if we get here things went fine.
+                break;
+            }
+            catch (EndOfOutputException e)
+            {
+                // create a larger output buffer and continue
+                byte [] largerOut = new byte[out.length * 2];
+                System.arraycopy(out, 0, largerOut, 0, out.length);
+                out = largerOut;
+            }
+        }
+        byte [] trimmedOut = new byte[bytesWritten()];
+        System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
+        out = trimmedOut;
+
+        Debug.out("compress output: ", out);
+        return out;
+    }
+
+    /** reset is only needed to bail out after an exception and
+        restart with new input */
+    @Override
+    public void reset()
+    {
+        super.reset();
+        fUnicodeMode = false;
+        iSCU = - 1;
+    }
+
+    /** returns the number of bytes written **/
+    public int bytesWritten()
+    {
+        return iOut;
+    }
+
+    /** returns the number of bytes written **/
+    public int charsRead()
+    {
+        return iIn;
+    }
+
+}
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java b/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java
new file mode 100644
index 0000000..501d195
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/EndOfOutputException.java
@@ -0,0 +1,48 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+/**
+ * The input string or input byte array ended prematurely
+ */
+public class EndOfOutputException
+    extends java.lang.Exception
+
+{
+
+   private static final long serialVersionUID = 1L;
+  
+   public EndOfOutputException(){
+    super("The input string or input byte array ended prematurely");
+    }
+
+    public EndOfOutputException(String s) {
+	super(s);
+    }
+}
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
index 9605554..4858044 100644
--- a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
+++ b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
@@ -411,6 +411,7 @@ public class Expand extends SCSU
 
     /** reset is called to start with new input, w/o creating a new
         instance */
+    @Override
     public void reset()
     {
         iOut = 0;
diff --git a/test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java
new file mode 100644
index 0000000..af3063d
--- /dev/null
+++ b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressMain.java
@@ -0,0 +1,574 @@
+package com.healthmarketscience.jackcess.scsu;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *  @version 005 Sep 30 1998
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+	Class CompressMain
+
+	A small commandline driver interface for the compression routines
+	Use the /? to get usage
+*/
+public class CompressMain
+{
+	static void usage()
+	{
+		System.err.println("java CompressMain /?               : this usage information\n");
+		System.err.println("java CompressMain /random    	   : random test\n");
+		System.err.println("java CompressMain /suite           : suite test\n");
+		System.err.println("java CompressMain /suite <file>    : file test (file data may include \\uXXXX)\n");
+		System.err.println("java CompressMain <string>    	   : string test (string may include \\uXXXX)\n");
+		System.err.println("java CompressMain /roundtrip <file>: check Unicode file for roundtrip\n");
+		System.err.println("java CompressMain /compress <file> : compresses Unicode files (no \\uXXXX)\n");
+		System.err.println("java CompressMain /expand <file>   : expands into Unicode files\n");
+		System.err.println("java CompressMain /byteswap <files>: swaps byte order of Unicode files\n");
+		System.err.println("java CompressMain /display <files> : like expand, but creates a dump instead\n");
+		System.err.println("java CompressMain /parse <files>   : parses \\uXXXX into binary Unicode\n");
+	}
+
+    static void analyze(String text, int inlength, String result, int outlength)
+    {
+        boolean fSuccess = text.equals(result);
+        Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED");
+        if (!fSuccess && result != null)
+        {
+            int iLim = Math.min(text.length(), result.length());
+            for (int i = 0; i < iLim; i++)
+            {
+                if (text.charAt(i) != result.charAt(i))
+                {
+                    Debug.out("First Mismatch at  "+ i +"=", result.charAt(i) );
+                    Debug.out("Original character "+ i +"=", text.charAt(i) );
+                    break;
+                }
+            }
+        }
+        else
+        {
+            Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes.");
+            Debug.out(" Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%.");
+        }
+    }
+
+    static void test2(String text)
+    {
+        byte bytes[] = null;
+        String result = null;
+        Debug.out("SCSU:\n");
+        Compress compressor = new Compress();
+        try
+        {
+            bytes = compressor.compress(text);
+            Expand display = new Expand();
+            result = display.expand(bytes);
+            Debug.out("Input:  ", text.toCharArray());
+            Debug.out("Result: ", result.toCharArray());
+            Debug.out("");
+            Expand expander = new Expand();
+            result = expander.expand(bytes);
+        }
+        catch (Exception e)
+        {
+            System.out.println(e);
+        }
+        int inlength = compressor.charsRead();
+        int outlength = compressor.bytesWritten();
+        analyze(text, inlength, result, outlength);
+    }
+
+    static void test(String text) throws Exception
+    {
+      test(text, false);
+    }
+
+  static void test(String text, boolean shouldFail)
+      throws Exception
+    {
+        // Create an instance of the compressor
+        Compress compressor = new Compress();
+
+        byte [] bytes = null;
+        String result = null;
+        Exception failure = null;
+        try {
+            // perform compression
+            bytes = compressor.compress(text);
+        }
+        catch(Exception e)
+        {
+            failure = e;
+        }
+
+        if(shouldFail) {
+          if(failure == null) {
+            throw new RuntimeException("Did not fail");
+          }
+          return;
+        }
+
+        if(failure != null) {
+          throw failure;
+        }
+
+        Expand expander = new Expand();
+        // perform expansion
+        result = expander.expand(bytes);
+
+        // analyze the results
+        int inlength = compressor.charsRead();
+        int outlength = compressor.bytesWritten();
+        analyze(text, inlength, result, outlength);
+
+    }
+
+    public static void display(byte [] input)
+    {
+        try
+        {
+            Expand expand = new Expand();
+            String text = expand.expand(input);
+            Debug.out(text.toCharArray());
+        }
+        catch (Exception e)
+        {
+            System.out.println(e);
+        }
+    }
+
+    public static String parse(String input)
+    {
+        StringTokenizer st = new StringTokenizer(input, "\\", true);
+        Debug.out("Input: ", input);
+
+        StringBuffer sb = new StringBuffer();
+
+        while(st.hasMoreTokens())
+        {
+            String token = st.nextToken();
+                    Debug.out("Token: ", token);
+            if (token.charAt(0) == '\\' && token.length() == 1)
+            {
+                if(st.hasMoreTokens())
+                {
+                    token = st.nextToken();
+                }
+                if(token.charAt(0) == 'u')
+                {
+                    Debug.out("Token: "+ token+ " ", sb.toString());
+                    String hexnum;
+                    if (token.length() > 5)
+                    {
+                        hexnum = token.substring(1,5);
+                        token = token.substring(5);
+                    }
+                    else
+                    {
+                        hexnum = token.substring(1);
+                        token = "";
+                    }
+                    sb.append((char)Integer.parseInt(hexnum, 16));
+                }
+            }
+            sb.append(token);
+        }
+        return sb.toString();
+    }
+
+    public static void randomTest(int nTest)
+      throws Exception
+    {
+        Random random = new Random();
+
+        for(int n=0; n < nTest; n++)
+        {
+            int iLen = (int) (20 * random.nextFloat());
+            StringBuffer sb = new StringBuffer(iLen);
+
+            for(int i = 0; i < iLen; i++)
+            {
+                sb.append((char) (0xFFFF * random.nextFloat()));
+            }
+
+            test(sb.toString());
+        }
+    }
+
+    @SuppressWarnings("deprecation")
+    public static void fileTest(String name)
+        throws Exception
+    {
+        DataInputStream dis = new DataInputStream(new FileInputStream(name));
+
+        int iLine = 0;
+
+        while(dis.available() != 0)
+        {
+            String line = dis.readLine();
+            Debug.out("Line "+ iLine++ +" "+line);
+            test(parse(line), false ); //false);// initially no debug info
+        }
+    }
+
+    public static void displayFile(String name)
+            throws IOException
+    {
+        DataInputStream dis = new DataInputStream(new FileInputStream(name));
+
+        byte bytes[] = new byte[dis.available()];
+        dis.read(bytes);
+        display(bytes);
+    }
+
+    public static void decodeTest(String name)
+           throws IOException
+    {
+        DataInputStream dis = new DataInputStream(new FileInputStream(name));
+
+        byte bytes[] = new byte[dis.available()];
+        dis.read(bytes);
+
+        Expand expand = new Expand();
+
+        char [] chars = null;
+        try
+        {
+            String text = expand.expand(bytes);
+            chars = text.toCharArray();
+        }
+        catch (Exception e)
+        {
+            System.out.println(e);
+        }
+        int inlength = expand.bytesRead();
+        int iDot = name.lastIndexOf('.');
+        StringBuffer sb = new StringBuffer(name);
+        sb.setLength(iDot + 1);
+        sb.append("txt");
+        String outName = sb.toString();
+
+        int outlength = expand.charsWritten();
+
+        Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 200 / inlength))+"%.");
+
+        if (chars == null)
+            return;
+
+        writeUnicodeFile(outName, chars);
+    }
+
+    /** most of the next 3 functions should not be needed by JDK11 and later */
+    private static int iMSB = 1;
+
+    public static String readUnicodeFile(String name)
+    {
+        try
+        {
+            FileInputStream dis = new FileInputStream(name);
+
+            byte b[] = new byte[2];
+            StringBuffer sb = new StringBuffer();
+            char ch = 0;
+
+            iMSB = 1;
+            int i = 0;
+            for(i = 0; (dis.available() != 0); i++)
+            {
+                b[i%2] = (byte) dis.read();
+
+                if ((i & 1) == 1)
+                {
+                    ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]);
+                }
+                else
+                {
+                    continue;
+                }
+                if (i == 1 && ch == '\uFEFF')
+                    continue; // throw away byte order mark
+
+                if (i == 1 && ch == '\uFFFE')
+                {
+                    iMSB ++;  // flip byte order
+                    continue; // throw away byte order mark
+                }
+                sb.append(ch);
+             }
+
+            return sb.toString();
+        }
+        catch (IOException e)
+        {
+            System.err.println(e);
+            return "";
+        }
+    }
+
+    public static void writeUnicodeFile(String outName, char [] chars)
+            throws IOException
+    {
+        DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
+        if ((iMSB & 1) == 1)
+        {
+            dos.writeByte(0xFF);
+            dos.writeByte(0xFE);
+        }
+        else
+        {
+            dos.writeByte(0xFE);
+            dos.writeByte(0xFF);
+        }
+        byte b[] = new byte[2];
+        for (int ich = 0; ich < chars.length; ich++)
+        {
+            b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8);
+            b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF);
+            dos.write(b, 0, 2);
+        }
+    }
+
+    static void byteswap(String name)
+        throws IOException
+    {
+        String text = readUnicodeFile(name);
+        char chars[] = text.toCharArray();
+        writeUnicodeFile(name, chars);
+    }
+
+    @SuppressWarnings("deprecation")
+    public static void parseFile(String name)
+        throws IOException
+    {
+        DataInputStream dis = new DataInputStream(new FileInputStream(name));
+
+        byte bytes[] = new byte[dis.available()];
+        dis.read(bytes);
+
+        // simplistic test
+        int bom = (char) bytes[0] + (char) bytes[1];
+        if (bom == 131069)
+        {
+            // FEFF or FFFE detected (either one sums to 131069)
+            Debug.out(name + " is already in Unicode!");
+            return;
+        }
+
+        // definitely assumes an ASCII file at this point
+        String text = new String(bytes, 0);
+
+        char chars[] = parse(text).toCharArray();
+        writeUnicodeFile(name, chars);
+        return;
+    }
+
+    public static void encodeTest(String name)
+        throws Exception
+    {
+        String text = readUnicodeFile(name);
+
+        // Create an instance of the compressor
+        Compress compressor = new Compress();
+
+        byte [] bytes = null;
+
+        // perform compression
+        bytes = compressor.compress(text);
+
+        int inlength = compressor.charsRead();
+        int iDot = name.lastIndexOf('.');
+        StringBuffer sb = new StringBuffer(name);
+        sb.setLength(iDot + 1);
+        sb.append("csu");
+        String outName = sb.toString();
+
+        DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
+        dos.write(bytes, 0, bytes.length);
+
+        int outlength = compressor.bytesWritten();
+
+        Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%.");
+    }
+
+    public static void roundtripTest(String name)
+      throws Exception
+    {
+      test(readUnicodeFile(name), false);// no debug info
+    }
+
+    /** The Main function */
+    public static void main(String args[])
+      throws Exception
+    {
+        int iArg = args.length;
+
+        try
+        {
+            if (iArg != 0)
+            {
+                if (args[0].equalsIgnoreCase("/compress"))
+                {
+                    while (--iArg > 0)
+                    {
+                        encodeTest(args[args.length - iArg]);
+                    }
+                }
+                else if (args[0].equalsIgnoreCase("/parse"))
+                {
+                    while (--iArg > 0)
+                    {
+                        parseFile(args[args.length - iArg]);
+                    }
+                }
+                else if (args[0].equalsIgnoreCase("/expand"))
+                {
+                    while (--iArg > 0)
+                    {
+                        decodeTest(args[args.length - iArg]);
+                    }
+                }
+                else if (args[0].equalsIgnoreCase("/display"))
+                {
+                    while (--iArg > 0)
+                    {
+                        displayFile(args[args.length - iArg]);
+                    }
+                }
+                else if (args[0].equalsIgnoreCase("/roundtrip"))
+                {
+                    while (--iArg > 0)
+                    {
+                        roundtripTest(args[args.length - iArg]);
+                    }
+                }
+                else if (args[0].equalsIgnoreCase("/byteswap"))
+                {
+                    while (--iArg > 0)
+                    {
+                        byteswap(args[args.length - iArg]);
+                    }
+                }else if (args[0].equalsIgnoreCase("/random"))
+                {
+                    randomTest(8);
+                }
+                else if (args[0].equalsIgnoreCase("/suite"))
+                {
+                    if (iArg == 1)
+                    {
+                        suiteTest();
+                    }
+                    else
+                    {
+                        while (--iArg > 0)
+                        {
+                            fileTest(args[args.length - iArg]);
+                        }
+                    }
+                }
+    			else if (args[0].equalsIgnoreCase("/?"))
+    			{
+    				usage();
+    			}
+                else
+                {
+                    while (iArg > 0)
+                    {
+                        test2(parse(args[--iArg]));
+                    }
+                }
+            }
+            else
+            {
+                usage();
+            }
+        }
+        catch (IOException e)
+        {
+            System.err.println(e);
+        }
+        try
+        {
+            System.err.println("Done. Press enter to exit");
+            System.in.read();
+        }
+        catch (IOException e)
+        {
+
+        }
+    }
+
+    static void suiteTest()
+      throws Exception
+    {
+        Debug.out("Standard Compression test suite:");
+        test("Hello \u9292 \u9192 World!");
+        test("Hell\u0429o \u9292 \u9192 W\u00e4rld!");
+        test("Hell\u0429o \u9292 \u9292W\u00e4rld!");
+
+        test("\u0648\u06c8"); // catch missing reset
+        test("\u0648\u06c8");
+
+        test("\u4444\uE001"); // lowest quotable
+        test("\u4444\uf2FF"); // highest quotable
+        test("\u4444\uf188\u4444");
+        test("\u4444\uf188\uf288");
+        test("\u4444\uf188abc\0429\uf288");
+        test("\u9292\u2222");
+        test("Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!");
+        test("Hell\u0429o \u9292 \u9292W\u00e4rld!");
+        test("Hello World!123456");
+        test("Hello W\u0081\u011f\u0082!"); // Latin 1 run
+
+        test("abc\u0301\u0302");  // uses SQn for u301 u302
+        test("abc\u4411d");      // uses SQU
+        test("abc\u4411\u4412d");// uses SCU
+        test("abc\u0401\u0402\u047f\u00a5\u0405"); // uses SQn for ua5
+        test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); // SJIS like data
+        test("\u9292\u2222");
+        test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000");
+        test("\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c");
+        test("\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002");
+
+        test(""); // empty input
+        test("\u0000"); // smallest BMP character
+        test("\uFFFF"); // largest BMP character
+
+        test("\ud800\udc00"); // smallest surrogate
+        test("\ud8ff\udcff"); // largest surrogate pair
+
+
+        Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:");
+        test("\ud800 \udc00", true); // unpaired surrogate (1)
+        test("\udc00", true); // unpaired surrogate (2)
+        test("\ud800", true); // unpaired surrogate (3)
+   }
+}
diff --git a/test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java
new file mode 100644
index 0000000..0f17e6c
--- /dev/null
+++ b/test/src/java/com/healthmarketscience/jackcess/scsu/CompressTest.java
@@ -0,0 +1,47 @@
+/*
+Copyright (c) 2007 Health Market Science, Inc.
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+USA
+
+You can contact Health Market Science at info@healthmarketscience.com
+or at the following address:
+
+Health Market Science
+2700 Horizon Drive
+Suite 200
+King of Prussia, PA 19406
+*/
+
+package com.healthmarketscience.jackcess.scsu;
+
+import junit.framework.TestCase;
+
+/**
+ * @author James Ahlborn
+ */
+public class CompressTest extends TestCase
+{
+
+  public CompressTest(String name) throws Exception {
+    super(name);
+  }
+
+  public void testCompression() throws Exception
+  {
+    CompressMain.suiteTest();
+  }
+
+}
author	James Ahlborn <jtahlborn@yahoo.com>	2008-07-22 19:00:25 +0000
committer	James Ahlborn <jtahlborn@yahoo.com>	2008-07-22 19:00:25 +0000
commit	4af4fe44512c2c885498794c7a5a6e878318b2ef (patch)
tree	3eb2a1966bd1fdf7777741492a256f645b1e07cf
parent	d1359adfbf6199a305b062c0a8da0d05e49e84a7 (diff)
download	jackcess-4af4fe44512c2c885498794c7a5a6e878318b2ef.tar.gz jackcess-4af4fe44512c2c885498794c7a5a6e878318b2ef.zip