5 files changed, 923 insertions, 0 deletions
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Debug.java b/src/java/com/healthmarketscience/jackcess/scsu/Debug.java
new file mode 100644
index 0000000..16a9a42
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/Debug.java
@@ -0,0 +1,151 @@
+package com.healthmarketscience.jackcess.scsu;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/*
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * A number of helpful output routines for debugging. Output is
+ * enabled or disabled through the debug level of this class's logger.
+ * All methods are static.
+ */
+
+public class Debug
+{
+    /** Receives all output; nothing is logged unless its debug level is on. */
+    private static final Log LOG = LogFactory.getLog(Debug.class);
+    /** Format ch as exactly four lower case hex digits, zero padded. */
+    private static String hex4(char ch)
+    {
+        String digits = Integer.toString(ch, 16);
+        return "0000".substring(digits.length()) + digits;
+    }
+    /** Log all of chars; same as out(chars, 0). */
+    public static void out(char [] chars)
+    {
+        out(chars, 0);
+    }
+    /** Log chars[iStart..]: codes 0-26 in caret notation (^@, ^A-^Z), other codes
+     *  up to 255 as is, the rest as backslash-u escapes with four hex digits. */
+    public static void out(char [] chars, int iStart)
+    {
+        if (!LOG.isDebugEnabled()) return;
+        StringBuffer msg = new StringBuffer();
+        for (int i = iStart; i < chars.length; i++)
+        {
+            char ch = chars[i];
+            if (ch <= 26) // char is unsigned: no lower bound test needed
+            {
+                msg.append('^').append((char)(ch + 0x40));
+            }
+            else if (ch <= 255)
+            {
+                msg.append(ch);
+            }
+            else
+            {
+                // zero padded, so U+0100 shows four digits instead of three
+                msg.append("\\u").append(hex4(ch));
+            }
+        }
+        LOG.debug(msg.toString());
+    }
+    /** Log all of bytes; same as out(bytes, 0). */
+    public static void out(byte [] bytes)
+    {
+        out(bytes, 0);
+    }
+    /** Log bytes[iStart..] as signed decimals, each followed by a comma. */
+    public static void out(byte [] bytes, int iStart)
+    {
+        if (!LOG.isDebugEnabled()) return;
+        StringBuffer msg = new StringBuffer();
+        for (int i = iStart; i < bytes.length; i++)
+        {
+            msg.append(bytes[i]).append(',');
+        }
+        LOG.debug(msg.toString());
+    }
+    /** Log str as is. */
+    public static void out(String str)
+    {
+        if (LOG.isDebugEnabled()) LOG.debug(str);
+    }
+    /** Log msg followed by iData in decimal. */
+    public static void out(String msg, int iData)
+    {
+        if (LOG.isDebugEnabled()) LOG.debug(msg + iData);
+    }
+    /** Log msg, then ch as [U+xxxx] (four hex digits), then ch itself. */
+    public static void out(String msg, char ch)
+    {
+        if (LOG.isDebugEnabled()) LOG.debug(msg + "[U+" + hex4(ch) + "]" + ch);
+    }
+    /** Log msg followed by bData as a signed decimal. */
+    public static void out(String msg, byte bData)
+    {
+        if (LOG.isDebugEnabled()) LOG.debug(msg + bData);
+    }
+    /** Log msg immediately followed by str. */
+    public static void out(String msg, String str)
+    {
+        if (LOG.isDebugEnabled()) LOG.debug(msg + str);
+    }
+    /** Log msg, then all of data as a second message. */
+    public static void out(String msg, char [] data)
+    {
+        if (!LOG.isDebugEnabled()) return;
+        LOG.debug(msg);
+        out(data);
+    }
+    /** Log msg, then all of data as a second message. */
+    public static void out(String msg, byte [] data)
+    {
+        if (!LOG.isDebugEnabled()) return;
+        LOG.debug(msg);
+        out(data);
+    }
+    /** Log msg with "(iStart): " appended, then data[iStart..] as a second message. */
+    public static void out(String msg, char [] data, int iStart)
+    {
+        if (!LOG.isDebugEnabled()) return;
+        LOG.debug(msg +"("+iStart+"): ");
+        out(data, iStart);
+    }
+    /** Log msg with "(iStart): " appended, then data[iStart..] as a second message. */
+    public static void out(String msg, byte [] data, int iStart)
+    {
+        if (!LOG.isDebugEnabled()) return;
+        LOG.debug(msg+"("+iStart+"): ");
+        out(data, iStart);
+    }
+}
+\ No newline at end of file
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java b/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java
new file mode 100644
index 0000000..7d79d4b
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java
@@ -0,0 +1,46 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+/**
+ * The input string or input byte array ended prematurely
+ *
+ */
+public class EndOfInputException extends Exception
+{
+    /** Message used when the caller does not supply one. */
+    private static final String DEFAULT_MESSAGE =
+        "The input string or input byte array ended prematurely";
+
+    /** Creates the exception with the default message. */
+    public EndOfInputException() { this(DEFAULT_MESSAGE); }
+    /** Creates the exception with the message s. */
+    public EndOfInputException(String s) { super(s); }
+}
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
new file mode 100644
index 0000000..a6e44b1
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
@@ -0,0 +1,429 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/*
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *  @version 005 Sep 30 1998  
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+ /**
+    Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
+
+    <H2>Notes on the Java implementation</H2>
+
+    A limitation of Java is the exclusive use of a signed byte data type.
+    The following work arounds are required:
+
+    Copying a byte to an integer variable and adding 256 for 'negative'
+    bytes gives an integer in the range 0-255.
+
+    Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
+    char values is unsigned.
+
+    Extended characters require an int to store them. The sign is not an
+    issue because only 1024*1024 + 65536 extended characters exist.
+
+**/
+public class Expand extends SCSU
+{
+    /** (re-)define (and select) a dynamic window
+    A sliding window position cannot start at any Unicode value,
+    so rather than providing an absolute offset, this function takes
+    an index value which selects among the possible starting values.
+
+    Most scripts in Unicode start on or near a half-block boundary
+    so the default behaviour is to multiply the index by 0x80. Han,
+    Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
+    show very poor locality--therefore no sliding window can be set
+    there. A jumpOffset is added to the index value to skip that region,
+    and only 167 index values total are required to select all eligible
+    half-blocks.
+
+    Finally, a few scripts straddle half block boundaries. For them, a
+    table of fixed offsets is used, and the index values from 0xF9 to
+    0xFF are used to select these special offsets.
+
+    After (re-)defining a windows location it is selected so it is ready
+    for use.
+
+    Recall that all Windows are of the same length (128 code positions).
+
+    @param iWindow - index of the window to be (re-)defined
+    @param bOffset - index for the new offset value
+    **/
+	// @005 protected <-- private here and elsewhere
+    protected void defineWindow(int iWindow, byte bOffset)
+        throws IllegalInputException
+    {
+        // widen the signed byte to its unsigned value 0..255
+        final int index = bOffset & 0xFF;
+
+        // index 0 is a reserved value
+        if (index == 0)
+        {
+            throw new IllegalInputException();
+        }
+        // reservedStart up to fixedThreshold - 1 is reserved as well
+        if (index >= reservedStart && index < fixedThreshold)
+        {
+            throw new IllegalInputException("iOffset == "+index);
+        }
+
+        int newOffset;
+        if (index >= fixedThreshold)
+        {
+            // the topmost index values pick an entry of the fixed table
+            newOffset = fixedOffset[index - fixedThreshold];
+        }
+        else
+        {
+            // half-block position; from gapThreshold on, gapOffset is added
+            newOffset = (index << 7) + (index < gapThreshold ? 0 : gapOffset);
+        }
+        dynamicOffset[iWindow] = newOffset;
+        selectWindow(iWindow); // the redefined window becomes the active one
+    }
+
+    /** (re-)define (and select) a window as an extended dynamic window
+    The surrogate area in Unicode allows access to 2**20 codes beyond the
+    first 64K codes by combining one of 1024 characters from the High
+    Surrogate Area with one of 1024 characters from the Low Surrogate
+    Area (see Unicode 2.0 for the details).
+
+    The tags SDX and UDX set the window such that each subsequent byte in
+    the range 80 to FF represents a surrogate pair. The following diagram
+    shows how the bits in the two bytes following the SDX or UDX, and a
+    subsequent data byte, map onto the bits in the resulting surrogate pair.
+
+     hbyte         lbyte          data
+    nnnwwwww      zzzzzyyy      1xxxxxxx
+
+     high-surrogate     low-surrogate
+    110110wwwwwzzzzz   110111yyyxxxxxxx
+
+    @param chOffset - Since the three top bits of chOffset are not needed to
+    set the location of the extended Window, they are used instead
+    to select the window, thereby reducing the number of needed command codes.
+    The bottom 13 bits of chOffset are used to calculate the offset relative to
+    a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
+    **/
+    protected void defineExtendedWindow(char chOffset)
+    {
+        // bits 15..13 of chOffset carry the window index
+        final int window = chOffset >>> 13;
+        // bits 12..0 count blocks of 128 codes above the first 64K codes
+        final int blockIndex = chOffset & 0x1FFF;
+
+        dynamicOffset[window] = 0x10000 + (blockIndex << 7);
+        // the redefined window becomes the active one
+        selectWindow(window);
+    }
+
+    /** number of chars appended to the output by the following functions */
+    protected int iOut = 0;
+
+    /** input position recorded by expandSingleByte when it succeeds */
+    protected int iIn = 0;
+
+    /** expand input that is in Unicode mode
+    @param in input byte array to be expanded
+    @param iCur starting index
+    @param sb string buffer to which to append expanded input
+    @return index of the last byte processed; in.length if the input was used up
+    **/
+    protected int expandUnicode(byte []in, int iCur, StringBuffer sb)
+        throws IllegalInputException, EndOfInputException
+    {
+        for( ; iCur < in.length-1; iCur+=2 ) // each character is a byte pair
+        {
+            byte b = in[iCur];
+            // tags UC0..UC7, UD0..UD7 and UDX end this method with that window active
+            if (b >= UC0 && b <= UC7)
+            {
+                Debug.out("SelectWindow: ", b);
+                selectWindow(b - UC0);
+                return iCur;
+            }
+            else if (b >= UD0 && b <= UD7)
+            {
+                defineWindow( b - UD0, in[iCur+1]);
+                return iCur + 1;
+            }
+            else if (b == UDX)
+            {
+                if( iCur >= in.length - 2)
+                {
+                    break; // argument bytes missing, reported after the loop
+                }
+                defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2]));
+                return iCur + 2;
+            }
+            else if (b == UQU)
+            {
+                if( iCur >= in.length - 2)
+                {
+                    break; // quoted character incomplete, reported after the loop
+                }
+                // Skip command byte and output Unicode character
+                iCur++;
+            }
+            // NOTE(review): the reserved tag Urs is not rejected; it is read as data
+            // output a Unicode character
+            char ch = charFromTwoBytes(in[iCur], in[iCur+1]);
+            sb.append((char)ch);
+            iOut++;
+        }
+        // the cursor equals in.length only if every byte was consumed above
+        if( iCur == in.length)
+        {
+            return iCur;
+        }
+        // otherwise a single byte is left over or a tag argument was cut off
+        // Error condition
+        throw new EndOfInputException();
+    }
+
+    /** assemble a char from two bytes
+    In Java bytes are signed quantities, while chars are unsigned
+    @return the character
+    @param hi most significant byte
+    @param lo least significant byte
+    */
+    public static char charFromTwoBytes(byte hi, byte lo)
+    {
+        // mask each signed byte to 0..255, then place hi above lo
+        return (char)(((hi & 0xFF) << 8) | (lo & 0xFF));
+    }
+
+    /** expand portion of the input that is in single byte mode **/
+    protected String expandSingleByte(byte []in)
+        throws IllegalInputException, EndOfInputException
+    {
+        /* A command cut off by the end of the input raises EndOfInputException
+        and the reserved tag Srs raises IllegalInputException.
+
+        Because of control codes, each byte of input generally results in at
+        most one character of output, so in.length is used as the initial
+        capacity of the output buffer. Only surrogate pairs can exceed it. */
+        StringBuffer sb = new StringBuffer(in.length);
+        iOut = 0;
+
+        // Loop until all input is exhausted or an error occurred
+        int iCur;
+        // (errors leave the loop by exception, so no loop label is needed)
+        for( iCur = 0; iCur < in.length; iCur++ )
+        {
+            // DEBUG Debug.out("Expanding: ", iCur);
+
+            // Default behaviour is that ASCII characters are passed through
+            // (staticOffset[0] == 0) and characters with the high bit on are
+            // offset by the current dynamic (or sliding) window (this.iWindow)
+            int iStaticWindow = 0;
+            int iDynamicWindow = getCurrentWindow();
+
+            switch(in[iCur])
+            {
+                // Quote from a static Window
+            case SQ0:
+            case SQ1:
+            case SQ2:
+            case SQ3:
+            case SQ4:
+            case SQ5:
+            case SQ6:
+            case SQ7:
+                Debug.out("SQn:", in[iCur] - SQ0);
+                // the quoted byte has to follow the command byte
+                if( iCur >= in.length - 1)
+                {
+                    Debug.out("SQn missing argument: ", in, iCur);
+                    throw new EndOfInputException();
+                }
+                // Select window pair to quote from
+                iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
+                iCur ++;
+
+                // FALL THROUGH
+
+            default:
+                // output as character
+                if(in[iCur] >= 0)
+                {
+                    // use static window
+                    int ch = in[iCur] + staticOffset[iStaticWindow];
+                    sb.append((char)ch);
+                    iOut++;
+                }
+                else
+                {
+                    // use dynamic window
+                    int ch = (in[iCur] + 256); // adjust for signed bytes
+                    ch -= 0x80;                // reduce to range 00..7F
+                    ch += dynamicOffset[iDynamicWindow];
+
+                    //DEBUG
+                    Debug.out("Dynamic: ", (char) ch);
+
+                    if (ch < 1<<16)
+                    {
+                        // in Unicode range, output directly
+                        sb.append((char)ch);
+                        iOut++;
+                    }
+                    else
+                    {
+                        // this is an extension character
+                        Debug.out("Extension character: ", ch);
+
+                        // compute and append the two surrogates:
+                        // translate from 10000..10FFFF to 0..FFFFF
+                        ch -= 0x10000;
+
+                        // high surrogate = top 10 bits added to D800
+                        sb.append((char)(0xD800 + (ch>>10)));
+                        iOut++;
+
+                        // low surrogate = bottom 10 bits added to DC00
+                        sb.append((char)(0xDC00 + (ch & 0x3FF)));
+                        iOut++;
+                    }
+                }
+                break;
+
+                // define a dynamic window as extended
+            case SDX:
+                iCur += 2;
+                if( iCur >= in.length)
+                {
+                    Debug.out("SDX missing argument: ", in, iCur - 2);
+                    throw new EndOfInputException();
+                }
+                defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur]));
+                break;
+
+                // Position a dynamic Window
+            case SD0:
+            case SD1:
+            case SD2:
+            case SD3:
+            case SD4:
+            case SD5:
+            case SD6:
+            case SD7:
+                iCur ++;
+                if( iCur >= in.length)
+                {
+                    Debug.out("SDn missing argument: ", in, iCur -1);
+                    throw new EndOfInputException();
+                }
+                defineWindow(in[iCur-1] - SD0, in[iCur]);
+                break;
+
+                // Select a new dynamic Window
+            case SC0:
+            case SC1:
+            case SC2:
+            case SC3:
+            case SC4:
+            case SC5:
+            case SC6:
+            case SC7:
+                selectWindow(in[iCur] - SC0);
+                break;
+            case SCU:
+                // switch to Unicode mode and continue parsing
+                iCur = expandUnicode(in, iCur+1, sb);
+                // DEBUG Debug.out("Expanded Unicode range until: ", iCur);
+                break;
+
+            case SQU:
+                // directly extract one Unicode character
+                iCur += 2;
+                if( iCur >= in.length)
+                {
+                     Debug.out("SQU missing argument: ", in, iCur - 2);
+                     throw new EndOfInputException();
+                }
+                else
+                {
+                    char ch = charFromTwoBytes(in[iCur-1], in[iCur]);
+
+                    Debug.out("Quoted: ", ch);
+                    sb.append((char)ch);
+                    iOut++;
+                }
+                break;
+
+             case Srs:
+                throw new IllegalInputException();
+                // break;
+            }
+        }
+
+        // SUCCESS: the loop only ends normally when all input is used up,
+        // because every truncated command throws EndOfInputException where
+        // it is detected. Breaking out of the loop instead would not work
+        // for SDX, SDn and SQU: they move iCur to or past in.length before
+        // they notice the truncation, which looks like regular completion.
+        // iCur may be in.length + 1 here (expandUnicode returns in.length at
+        // the end of the input and the loop adds one more), so the number
+        // of bytes read is taken from the array length.
+        sb.setLength(iOut);
+        iIn = in.length;
+        return sb.toString();
+    }
+
+    /** expand a byte array containing compressed Unicode */
+    public String expand (byte []in)
+        throws IllegalInputException, EndOfInputException
+    {
+        final String expanded = expandSingleByte(in); // parsing starts in single byte mode
+        Debug.out("expand output: ", expanded.toCharArray());
+        return expanded;
+    }
+
+
+    /** reset is called to start with new input, w/o creating a new
+        instance */
+    public void reset()
+    {
+        iOut = 0; // forget the counters of the previous expansion
+        iIn = 0;
+        super.reset(); // let the superclass restore its own state as well
+    }
+
+    public int charsWritten()
+    {
+        return iOut; // chars appended to the output; zeroed by reset and by each expansion
+    }
+
+    public int bytesRead()
+    {
+        return iIn; // recorded when an expansion succeeds; zeroed by reset
+    }
+}
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java b/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java
new file mode 100644
index 0000000..358e8bc
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java
@@ -0,0 +1,45 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+/**
+ * The input character array or input byte array contained
+ * illegal sequences of bytes or characters
+ */
+public class IllegalInputException extends Exception
+{
+    /** Message used when the caller does not supply one. */
+    private static final String DEFAULT_MESSAGE = "The input character array or input byte array contained illegal sequences of bytes or characters";
+
+    /** Creates the exception with the default message. */
+    public IllegalInputException() { this(DEFAULT_MESSAGE); }
+    /** Creates the exception with the message s. */
+    public IllegalInputException(String s) { super(s); }
+}
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java b/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java
new file mode 100644
index 0000000..da3af58
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java
@@ -0,0 +1,252 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/*
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ *  @author Asmus Freytag
+ *
+ *  @version 001 Dec 25 1996
+ *  @version 002 Jun 25 1997
+ *  @version 003 Jul 25 1997
+ *  @version 004 Aug 25 1997
+ *  @version 005 Sep 30 1998
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+ /**
+    Encoding text data in Unicode often requires more storage than using
+    an existing 8-bit character set that is limited to the subset of characters
+    actually found in the text. The Unicode Compression Algorithm reduces
+    the necessary storage while retaining the universality of Unicode.
+    A full description of the algorithm can be found in document
+    http://www.unicode.org/unicode/reports/tr6.html
+
+    Summary
+
+    The goal of the Unicode Compression Algorithm is the ability to
+    * Express all code points in Unicode
+    * Approximate storage size for traditional character sets
+    * Work well for short strings
+    * Provide transparency for Latin-1 data
+    * Support very simple decoders
+    * Support simple as well as sophisticated encoders
+
+    If needed, further compression can be achieved by layering standard
+    file or disk-block based compression algorithms on top.
+
+    <H2>Features</H2>
+
+    Languages using small alphabets would contain runs of characters that
+    are coded close together in Unicode. These runs are interrupted only
+    by punctuation characters, which are themselves coded in proximity to
+    each other in Unicode (usually in the ASCII range).
+
+    Two basic mechanisms in the compression algorithm account for these two
+    cases, sliding windows and static windows. A window is an area of 128
+    consecutive characters in Unicode. In the compressed data stream, each
+    character from a sliding window would be represented as a byte between
+    0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
+    TAB) would always mean an ASCII character (or control).
+
+    <H2>Notes on the Java implementation</H2>
+
+    A limitation of Java is the exclusive use of a signed byte data type.
+    The following work arounds are required:
+
+    Copying a byte to an integer variable and adding 256 for 'negative'
+    bytes gives an integer in the range 0-255.
+
+    Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
+    char values is unsigned.
+
+    Extended characters require an int to store them. The sign is not an
+    issue because only 1024*1024 + 65536 extended characters exist.
+
+**/
+public abstract class SCSU
+{
+    // ---- tags of single byte mode ----
+
+    /** SQ<i>n</i>: quote from window pair <i>n</i>. <p>
+    A following byte below 0x80 is quoted from static window <i>n</i>,
+    any other byte from dynamic window <i>n</i>.
+    */
+
+    static final byte SQ0 = 0x01; // Quote from window pair 0
+    static final byte SQ1 = 0x02; // Quote from window pair 1
+    static final byte SQ2 = 0x03; // Quote from window pair 2
+    static final byte SQ3 = 0x04; // Quote from window pair 3
+    static final byte SQ4 = 0x05; // Quote from window pair 4
+    static final byte SQ5 = 0x06; // Quote from window pair 5
+    static final byte SQ6 = 0x07; // Quote from window pair 6
+    static final byte SQ7 = 0x08; // Quote from window pair 7
+
+    static final byte SDX = 0x0B; // Define a window as extended
+    static final byte Srs = 0x0C; // reserved
+
+    static final byte SQU = 0x0E; // Quote a single Unicode character
+    static final byte SCU = 0x0F; // Change to Unicode mode
+
+    /** SC<i>n</i>: change to window <i>n</i>. <p>
+    Following bytes below 0x80 are interpreted as command bytes or
+    passed through; to all others the offset of dynamic window
+    <i>n</i> is added. */
+    static final byte SC0 = 0x10; // Select window 0
+    static final byte SC1 = 0x11; // Select window 1
+    static final byte SC2 = 0x12; // Select window 2
+    static final byte SC3 = 0x13; // Select window 3
+    static final byte SC4 = 0x14; // Select window 4
+    static final byte SC5 = 0x15; // Select window 5
+    static final byte SC6 = 0x16; // Select window 6
+    static final byte SC7 = 0x17; // Select window 7
+    static final byte SD0 = 0x18; // Define and select window 0
+    static final byte SD1 = 0x19; // Define and select window 1
+    static final byte SD2 = 0x1A; // Define and select window 2
+    static final byte SD3 = 0x1B; // Define and select window 3
+    static final byte SD4 = 0x1C; // Define and select window 4
+    static final byte SD5 = 0x1D; // Define and select window 5
+    static final byte SD6 = 0x1E; // Define and select window 6
+    static final byte SD7 = 0x1F; // Define and select window 7
+
+    static final byte UC0 = (byte) 0xE0; // Select window 0
+    static final byte UC1 = (byte) 0xE1; // Select window 1
+    static final byte UC2 = (byte) 0xE2; // Select window 2
+    static final byte UC3 = (byte) 0xE3; // Select window 3
+    static final byte UC4 = (byte) 0xE4; // Select window 4
+    static final byte UC5 = (byte) 0xE5; // Select window 5
+    static final byte UC6 = (byte) 0xE6; // Select window 6
+    static final byte UC7 = (byte) 0xE7; // Select window 7
+    static final byte UD0 = (byte) 0xE8; // Define and select window 0
+    static final byte UD1 = (byte) 0xE9; // Define and select window 1
+    static final byte UD2 = (byte) 0xEA; // Define and select window 2
+    static final byte UD3 = (byte) 0xEB; // Define and select window 3
+    static final byte UD4 = (byte) 0xEC; // Define and select window 4
+    static final byte UD5 = (byte) 0xED; // Define and select window 5
+    static final byte UD6 = (byte) 0xEE; // Define and select window 6
+    static final byte UD7 = (byte) 0xEF; // Define and select window 7
+
+    static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
+    static final byte UDX = (byte) 0xF1; // Define a Window as extended
+    static final byte Urs = (byte) 0xF2; // reserved
+
+    /** constant offsets of the 8 static windows */
+    static final int[] staticOffset =
+    {
+        0x0000, // ASCII for quoted tags
+        0x0080, // Latin - 1 Supplement (for access to punctuation)
+        0x0100, // Latin Extended-A
+        0x0300, // Combining Diacritical Marks
+        0x2000, // General Punctuation
+        0x2080, // Currency Symbols
+        0x2100, // Letterlike Symbols and Number Forms
+        0x3000  // CJK Symbols and punctuation
+    };
+
+    /** offsets the 8 dynamic (sliding) windows start out with */
+    static final int[] initialDynamicOffset =
+    {
+        0x0080, // Latin-1
+        0x00C0, // Latin Extended A   //@005 fixed from 0x0100
+        0x0400, // Cyrillic
+        0x0600, // Arabic
+        0x0900, // Devanagari
+        0x3040, // Hiragana
+        0x30A0, // Katakana
+        0xFF00  // Fullwidth ASCII
+    };
+
+    /** Current offsets of the 8 dynamic windows. Every instance owns a
+        copy that starts out with the initial values; entries are replaced
+        whenever a window is (re-)defined. */
+    int[] dynamicOffset = (int[]) initialDynamicOffset.clone();
+
+    // The window selection below is common to encoder and decoder
+
+    /** index of the currently active dynamic window */
+    private int iWindow = 0;
+
+    /** Make a dynamic window the active one.
+        @param iWindow index of the window to select; it is stored without
+        a range check, so the caller has to pass a valid index */
+    protected void selectWindow(int iWindow)
+    {
+        this.iWindow = iWindow;
+    }
+
+    /** Report the active dynamic window.
+        @return the index most recently passed to selectWindow, or 0 if
+        none was selected since construction or the last reset */
+    protected int getCurrentWindow()
+    {
+        return iWindow;
+    }
+
+    // ---- values used to turn an offset index into a window position ----
+
+    /**
+     * Unicode code points from 3400 to E000 are not addressable by a
+     * dynamic window, since in these areas no short run alphabets are
+     * found. Therefore gapOffset is added to all values from gapThreshold. */
+    static final int gapThreshold = 0x68;
+    /** amount added to the position of windows at or above gapThreshold */
+    static final int gapOffset = 0xAC00;
+
+    /** index values from reservedStart up to fixedThreshold - 1 are reserved */
+    static final int reservedStart = 0xA8;
+
+    /** index values from fixedThreshold on select an entry of fixedOffset */
+    static final int fixedThreshold = 0xF9;
+
+    /** Predefined offsets; the comment in front of each entry is the
+        index byte that selects it. **/
+    static final int[] fixedOffset =
+    {
+        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
+        /* 0xFA */ 0x0250, // IPA extensions
+        /* 0xFB */ 0x0370, // Greek
+        /* 0xFC */ 0x0530, // Armenian
+        /* 0xFD */ 0x3040, // Hiragana
+        /* 0xFE */ 0x30A0, // Katakana
+        /* 0xFF */ 0xFF60  // Halfwidth Katakana
+    };
+
+    /** Tell whether a character is compressible.
+        @param ch the character to test
+        @return false for characters from 0x3400 up to 0xDFFF, else true */
+    public static boolean isCompressible(char ch)
+    {
+        return !(ch >= 0x3400 && ch < 0xE000);
+    }
+
+    /** Bring the window state back to its initial condition: the dynamic
+        windows get their initial offsets again and window 0 becomes the
+        active one. Only needed to bail out after an exception and restart
+        with new input. */
+    public void reset()
+    {
+        // copy the defaults into the existing array, so that the array
+        // object referenced by dynamicOffset stays the same
+        System.arraycopy(initialDynamicOffset, 0,
+                         dynamicOffset, 0, dynamicOffset.length);
+
+        iWindow = 0;
+    }
+}
+\ No newline at end of file