summaryrefslogtreecommitdiffstats
path: root/src/java/com/healthmarketscience/jackcess/scsu
diff options
context:
space:
mode:
Diffstat (limited to 'src/java/com/healthmarketscience/jackcess/scsu')
-rw-r--r--src/java/com/healthmarketscience/jackcess/scsu/Debug.java151
-rw-r--r--src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java46
-rw-r--r--src/java/com/healthmarketscience/jackcess/scsu/Expand.java429
-rw-r--r--src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java45
-rw-r--r--src/java/com/healthmarketscience/jackcess/scsu/SCSU.java252
5 files changed, 923 insertions, 0 deletions
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Debug.java b/src/java/com/healthmarketscience/jackcess/scsu/Debug.java
new file mode 100644
index 0000000..16a9a42
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/Debug.java
@@ -0,0 +1,151 @@
+package com.healthmarketscience.jackcess.scsu;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/*
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * @author Asmus Freytag
+ *
+ * @version 001 Dec 25 1996
+ * @version 002 Jun 25 1997
+ * @version 003 Jul 25 1997
+ * @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+/**
+ * A number of helpful output routines for debugging. Output is written
+ * through the commons-logging Log for this class and is only produced
+ * when that log has debug level enabled. All methods are static.
+ */
+
+public class Debug
+{
+
+ private static final Log LOG = LogFactory.getLog(Debug.class);
+
+ // debugging helper
+ public static void out(char [] chars)
+ {
+ out(chars, 0);
+ }
+
+ public static void out(char [] chars, int iStart)
+ {
+ if (!LOG.isDebugEnabled()) return;
+ StringBuffer msg = new StringBuffer();
+
+ for (int i = iStart; i < chars.length; i++)
+ {
+ if (chars[i] >= 0 && chars[i] <= 26)
+ {
+ msg.append("^"+(char)(chars[i]+0x40));
+ }
+ else if (chars[i] <= 255)
+ {
+ msg.append(chars[i]);
+ }
+ else
+ {
+ msg.append("\\u"+Integer.toString(chars[i],16));
+ }
+ }
+ LOG.debug(msg.toString());
+ }
+
+ public static void out(byte [] bytes)
+ {
+ out(bytes, 0);
+ }
+ public static void out(byte [] bytes, int iStart)
+ {
+ if (!LOG.isDebugEnabled()) return;
+ StringBuffer msg = new StringBuffer();
+
+ for (int i = iStart; i < bytes.length; i++)
+ {
+ msg.append(bytes[i]+",");
+ }
+ LOG.debug(msg.toString());
+ }
+
+ public static void out(String str)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(str);
+ }
+
+ public static void out(String msg, int iData)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(msg + iData);
+ }
+ public static void out(String msg, char ch)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(msg + "[U+"+Integer.toString(ch,16)+"]" + ch);
+ }
+ public static void out(String msg, byte bData)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(msg + bData);
+ }
+ public static void out(String msg, String str)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(msg + str);
+ }
+ public static void out(String msg, char [] data)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(msg);
+ out(data);
+ }
+ public static void out(String msg, byte [] data)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(msg);
+ out(data);
+ }
+ public static void out(String msg, char [] data, int iStart)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(msg +"("+iStart+"): ");
+ out(data, iStart);
+ }
+ public static void out(String msg, byte [] data, int iStart)
+ {
+ if (!LOG.isDebugEnabled()) return;
+
+ LOG.debug(msg+"("+iStart+"): ");
+ out(data, iStart);
+ }
+} \ No newline at end of file
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java b/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java
new file mode 100644
index 0000000..7d79d4b
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java
@@ -0,0 +1,46 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * @author Asmus Freytag
+ *
+ * @version 001 Dec 25 1996
+ * @version 002 Jun 25 1997
+ * @version 003 Jul 25 1997
+ * @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+/**
+ * The input string or input byte array ended prematurely
+ *
+ */
+public class EndOfInputException
+    extends Exception
+{
+    /** Detail message used when the caller supplies none. */
+    private static final String DEFAULT_MESSAGE =
+        "The input string or input byte array ended prematurely";
+
+    /** Create the exception with the default detail message. */
+    public EndOfInputException() {
+        this(DEFAULT_MESSAGE);
+    }
+
+    /**
+     * Create the exception with a caller-supplied detail message.
+     * @param s the detail message
+     */
+    public EndOfInputException(String s) {
+        super(s);
+    }
+}
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
new file mode 100644
index 0000000..a6e44b1
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java
@@ -0,0 +1,429 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/*
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * @author Asmus Freytag
+ *
+ * @version 001 Dec 25 1996
+ * @version 002 Jun 25 1997
+ * @version 003 Jul 25 1997
+ * @version 004 Aug 25 1997
+ * @version 005 Sep 30 1998
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+ /**
+ Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
+
+ <H2>Notes on the Java implementation</H2>
+
+ A limitation of Java is the exclusive use of a signed byte data type.
+ The following work arounds are required:
+
+ Copying a byte to an integer variable and adding 256 for 'negative'
+ bytes gives an integer in the range 0-255.
+
+ Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
+ char values is unsigned.
+
+ Extended characters require an int to store them. The sign is not an
+ issue because only 1024*1024 + 65536 extended characters exist.
+
+**/
+public class Expand extends SCSU
+{
+    /** (re-)define (and select) a dynamic window
+    A sliding window position cannot start at any Unicode value,
+    so rather than providing an absolute offset, this function takes
+    an index value which selects among the possible starting values.
+
+    Most scripts in Unicode start on or near a half-block boundary
+    so the default behaviour is to multiply the index by 0x80. Han,
+    Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
+    show very poor locality--therefore no sliding window can be set
+    there. A jumpOffset is added to the index value to skip that region,
+    and only 167 index values total are required to select all eligible
+    half-blocks.
+
+    Finally, a few scripts straddle half block boundaries. For them, a
+    table of fixed offsets is used, and the index values from 0xF9 to
+    0xFF are used to select these special offsets.
+
+    After (re-)defining a window's location it is selected so it is ready
+    for use.
+
+    Recall that all Windows are of the same length (128 code positions).
+
+    @param iWindow - index of the window to be (re-)defined
+    @param bOffset - index for the new offset value
+    @throws IllegalInputException if the index is 0 or lies in the reserved
+            range from reservedStart up to (not including) fixedThreshold
+    **/
+    // @005 protected <-- private here and elsewhere
+    protected void defineWindow(int iWindow, byte bOffset)
+        throws IllegalInputException
+    {
+        // bytes are signed in Java: recover the unsigned value 0..255
+        int iOffset = bOffset & 0xFF;
+
+        // 0 is a reserved value
+        if (iOffset == 0)
+        {
+            throw new IllegalInputException();
+        }
+        else if (iOffset < gapThreshold)
+        {
+            dynamicOffset[iWindow] = iOffset << 7;
+        }
+        else if (iOffset < reservedStart)
+        {
+            dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
+        }
+        else if (iOffset < fixedThreshold)
+        {
+            // more reserved values
+            throw new IllegalInputException("iOffset == "+iOffset);
+        }
+        else
+        {
+            dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold];
+        }
+
+        // make the redefined window the active one
+        selectWindow(iWindow);
+    }
+
+    /** (re-)define (and select) a window as an extended dynamic window
+    The surrogate area in Unicode allows access to 2**20 codes beyond the
+    first 64K codes by combining one of 1024 characters from the High
+    Surrogate Area with one of 1024 characters from the Low Surrogate
+    Area (see Unicode 2.0 for the details).
+
+    The tags SDX and UDX set the window such that each subsequent byte in
+    the range 80 to FF represents a surrogate pair. The following diagram
+    shows how the bits in the two bytes following the SDX or UDX, and a
+    subsequent data byte, map onto the bits in the resulting surrogate pair.
+
+     hbyte         lbyte          data
+     nnnwwwww      zzzzzyyy      1xxxxxxx
+
+     high-surrogate     low-surrogate
+     110110wwwwwzzzzz   110111yyyxxxxxxx
+
+    @param chOffset - Since the three top bits of chOffset are not needed to
+    set the location of the extended Window, they are used instead
+    to select the window, thereby reducing the number of needed command codes.
+    The bottom 13 bits of chOffset are used to calculate the offset relative to
+    a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
+    **/
+    protected void defineExtendedWindow(char chOffset)
+    {
+        // The top 3 bits of chOffset are the window index
+        int iWindow = chOffset >>> 13;
+
+        // Calculate the new offset: the low 13 bits select a 128-code block
+        // above the first 64K codes
+        dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);
+
+        // make the redefined window the active one
+        selectWindow(iWindow);
+    }
+
+    /** string buffer length used by the following functions */
+    protected int iOut = 0;
+
+    /** input cursor used by the following functions */
+    protected int iIn = 0;
+
+    /** expand input that is in Unicode mode
+    Characters are read as byte pairs until a tag switches back to single
+    byte mode or the input is used up.
+
+    @param in input byte array to be expanded
+    @param iCur starting index
+    @param sb string buffer to which to append expanded input
+    @return the index of the last byte processed, or in.length when all
+            input was consumed
+    @throws IllegalInputException on a reserved tag or window offset
+    @throws EndOfInputException if the input ends inside a character or
+            inside the arguments of a tag
+    **/
+    protected int expandUnicode(byte []in, int iCur, StringBuffer sb)
+        throws IllegalInputException, EndOfInputException
+    {
+        for( ; iCur < in.length-1; iCur+=2 ) // step by 2:
+        {
+            byte b = in[iCur];
+
+            // the tag constants are negative bytes, but they form a
+            // contiguous range so signed comparison is still correct
+            if (b >= UC0 && b <= UC7)
+            {
+                Debug.out("SelectWindow: ", b);
+                selectWindow(b - UC0);
+                return iCur;
+            }
+            else if (b >= UD0 && b <= UD7)
+            {
+                defineWindow( b - UD0, in[iCur+1]);
+                return iCur + 1;
+            }
+            else if (b == UDX)
+            {
+                if( iCur >= in.length - 2)
+                {
+                    break; // buffer error
+                }
+                defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2]));
+                return iCur + 2;
+            }
+            else if (b == UQU)
+            {
+                if( iCur >= in.length - 2)
+                {
+                    break; // error
+                }
+                // Skip command byte and output Unicode character
+                iCur++;
+            }
+            else if (b == Urs)
+            {
+                // reserved tag: reject it, as single byte mode does for Srs
+                throw new IllegalInputException("reserved tag Urs in Unicode mode");
+            }
+
+            // output a Unicode character
+            char ch = charFromTwoBytes(in[iCur], in[iCur+1]);
+            sb.append(ch);
+            iOut++;
+        }
+
+        if( iCur == in.length)
+        {
+            return iCur;
+        }
+
+        // A window selection tag takes no argument, so it is complete even
+        // when it is the very last byte of the input.
+        if (iCur == in.length - 1 && in[iCur] >= UC0 && in[iCur] <= UC7)
+        {
+            Debug.out("SelectWindow: ", in[iCur]);
+            selectWindow(in[iCur] - UC0);
+            return iCur;
+        }
+
+        // Error condition
+        throw new EndOfInputException();
+    }
+
+    /** assemble a char from two bytes
+    In Java bytes are signed quantities, while chars are unsigned
+    @return the character
+    @param hi most significant byte
+    @param lo least significant byte
+    */
+    public static char charFromTwoBytes(byte hi, byte lo)
+    {
+        // mask each byte to its unsigned value before combining
+        return (char)(((hi & 0xFF) << 8) | (lo & 0xFF));
+    }
+
+    /** append the character(s) selected by one data byte
+    Bytes below 0x80 are taken from the given static window, bytes from
+    0x80 up from the given dynamic window. Code points above 0xFFFF are
+    appended as a surrogate pair. iOut is advanced by the number of chars
+    appended.
+    */
+    private void appendFromWindow(byte b, int iStaticWindow,
+                                  int iDynamicWindow, StringBuffer sb)
+    {
+        if (b >= 0)
+        {
+            // use static window
+            int ch = b + staticOffset[iStaticWindow];
+            sb.append((char)ch);
+            iOut++;
+            return;
+        }
+
+        // use dynamic window
+        int ch = (b & 0xFF);   // adjust for signed bytes
+        ch -= 0x80;            // reduce to range 00..7F
+        ch += dynamicOffset[iDynamicWindow];
+
+        //DEBUG
+        Debug.out("Dynamic: ", (char) ch);
+
+        if (ch < 1<<16)
+        {
+            // in Unicode range, output directly
+            sb.append((char)ch);
+            iOut++;
+        }
+        else
+        {
+            // this is an extension character
+            Debug.out("Extension character: ", ch);
+
+            // compute and append the two surrogates:
+            // translate from 10000..10FFFF to 0..FFFFF
+            ch -= 0x10000;
+
+            // high surrogate = top 10 bits added to D800
+            sb.append((char)(0xD800 + (ch>>10)));
+            iOut++;
+
+            // low surrogate = bottom 10 bits added to DC00
+            sb.append((char)(0xDC00 + (ch & 0x3FF)));
+            iOut++;
+        }
+    }
+
+    /** expand portion of the input that is in single byte mode
+    @param in input byte array to be expanded
+    @return the expanded text
+    @throws IllegalInputException on a reserved tag or window offset
+    @throws EndOfInputException if the input ends inside the arguments
+            of a tag
+    **/
+    protected String expandSingleByte(byte []in)
+        throws IllegalInputException, EndOfInputException
+    {
+
+        /* Allocate the output buffer. Because of control codes, generally
+        each byte of input results in fewer than one character of
+        output. Using in.length as an initial allocation length should avoid
+        the need to reallocate in mid-stream. The exception to this rule are
+        surrogates. */
+        StringBuffer sb = new StringBuffer(in.length);
+        iOut = 0;
+
+        // Loop until all input is exhausted or an error occurred.
+        // Every "break Loop" below leaves iCur on the offending tag, i.e.
+        // below in.length, so the check after the loop reports the error.
+        int iCur;
+        Loop:
+        for( iCur = 0; iCur < in.length; iCur++ )
+        {
+            // DEBUG Debug.out("Expanding: ", iCur);
+
+            byte b = in[iCur];
+
+            switch(b)
+            {
+                // Quote from a static Window
+            case SQ0:
+            case SQ1:
+            case SQ2:
+            case SQ3:
+            case SQ4:
+            case SQ5:
+            case SQ6:
+            case SQ7:
+                // Select window pair to quote from
+                int iQuoted = b - SQ0;
+                Debug.out("SQn:", iQuoted);
+                // check for length before skipping the command byte
+                if( iCur >= in.length - 1)
+                {
+                    Debug.out("SQn missing argument: ", in, iCur);
+                    break Loop; // buffer length error
+                }
+                iCur ++;
+                appendFromWindow(in[iCur], iQuoted, iQuoted, sb);
+                break;
+
+                // define a dynamic window as extended
+            case SDX:
+                if( iCur + 2 >= in.length)
+                {
+                    Debug.out("SDX missing argument: ", in, iCur);
+                    break Loop; // buffer length error
+                }
+                iCur += 2;
+                defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur]));
+                break;
+
+                // Position a dynamic Window
+            case SD0:
+            case SD1:
+            case SD2:
+            case SD3:
+            case SD4:
+            case SD5:
+            case SD6:
+            case SD7:
+                if( iCur + 1 >= in.length)
+                {
+                    Debug.out("SDn missing argument: ", in, iCur);
+                    break Loop; // buffer length error
+                }
+                iCur ++;
+                defineWindow(b - SD0, in[iCur]);
+                break;
+
+                // Select a new dynamic Window
+            case SC0:
+            case SC1:
+            case SC2:
+            case SC3:
+            case SC4:
+            case SC5:
+            case SC6:
+            case SC7:
+                selectWindow(b - SC0);
+                break;
+
+            case SCU:
+                // switch to Unicode mode and continue parsing
+                iCur = expandUnicode(in, iCur+1, sb);
+                // DEBUG Debug.out("Expanded Unicode range until: ", iCur);
+                break;
+
+            case SQU:
+                // directly extract one Unicode character
+                if( iCur + 2 >= in.length)
+                {
+                    Debug.out("SQU missing argument: ", in, iCur);
+                    break Loop; // buffer length error
+                }
+                iCur += 2;
+                char chQuoted = charFromTwoBytes(in[iCur-1], in[iCur]);
+                Debug.out("Quoted: ", chQuoted);
+                sb.append(chQuoted);
+                iOut++;
+                break;
+
+            case Srs:
+                throw new IllegalInputException();
+
+            default:
+                // Default behaviour is that ASCII characters are passed
+                // through (staticOffset[0] == 0) and characters with the
+                // high bit on are offset by the current dynamic (or
+                // sliding) window
+                appendFromWindow(b, 0, getCurrentWindow(), sb);
+                break;
+            }
+        }
+
+        if( iCur >= in.length)
+        {
+            //SUCCESS: all input used up
+            sb.setLength(iOut);
+            // iCur may be in.length + 1 after a Unicode mode run consumed
+            // the rest of the input; never report more than was supplied
+            iIn = in.length;
+            return sb.toString();
+        }
+
+        Debug.out("Length ==" + in.length+" iCur =", iCur);
+        //ERROR: premature end of input
+        throw new EndOfInputException();
+    }
+
+    /** expand a byte array containing compressed Unicode
+    @param in the compressed input
+    @return the expanded text
+    @throws IllegalInputException on a reserved tag or window offset
+    @throws EndOfInputException if the input ends prematurely
+    */
+    public String expand (byte []in)
+        throws IllegalInputException, EndOfInputException
+    {
+        String str = expandSingleByte(in);
+        Debug.out("expand output: ", str.toCharArray());
+        return str;
+    }
+
+
+    /** reset is called to start with new input, w/o creating a new
+    instance */
+    public void reset()
+    {
+        iOut = 0;
+        iIn = 0;
+        super.reset();
+    }
+
+    /** @return the number of chars produced by the last expansion */
+    public int charsWritten()
+    {
+        return iOut;
+    }
+
+    /** @return the number of input bytes consumed by the last successful
+    expansion */
+    public int bytesRead()
+    {
+        return iIn;
+    }
+}
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java b/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java
new file mode 100644
index 0000000..358e8bc
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java
@@ -0,0 +1,45 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/**
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * @author Asmus Freytag
+ *
+ * @version 001 Dec 25 1996
+ * @version 002 Jun 25 1997
+ * @version 003 Jul 25 1997
+ * @version 004 Aug 25 1997
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+/**
+ * The input character array or input byte array contained
+ * illegal sequences of bytes or characters
+ */
+public class IllegalInputException extends Exception
+{
+    /** Detail message used when the caller supplies none. */
+    private static final String DEFAULT_MESSAGE =
+        "The input character array or input byte array contained illegal sequences of bytes or characters";
+
+    /** Create the exception with the default detail message. */
+    public IllegalInputException() {
+        this(DEFAULT_MESSAGE);
+    }
+
+    /**
+     * Create the exception with a caller-supplied detail message.
+     * @param s the detail message
+     */
+    public IllegalInputException(String s) {
+        super(s);
+    }
+}
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java b/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java
new file mode 100644
index 0000000..da3af58
--- /dev/null
+++ b/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java
@@ -0,0 +1,252 @@
+package com.healthmarketscience.jackcess.scsu;
+
+/*
+ * This sample software accompanies Unicode Technical Report #6 and
+ * distributed as is by Unicode, Inc., subject to the following:
+ *
+ * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software
+ * without fee is hereby granted provided that this copyright notice
+ * appears in all copies.
+ *
+ * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
+ * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
+ * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
+ * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
+ * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
+ * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
+ *
+ * @author Asmus Freytag
+ *
+ * @version 001 Dec 25 1996
+ * @version 002 Jun 25 1997
+ * @version 003 Jul 25 1997
+ * @version 004 Aug 25 1997
+ * @version 005 Sep 30 1998
+ *
+ * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
+ * and are registered in some jurisdictions.
+ **/
+
+ /**
+ Encoding text data in Unicode often requires more storage than using
+ an existing 8-bit character set that is limited to the subset of characters
+ actually found in the text. The Unicode Compression Algorithm reduces
+ the necessary storage while retaining the universality of Unicode.
+ A full description of the algorithm can be found in document
+ http://www.unicode.org/unicode/reports/tr6.html
+
+ Summary
+
+ The goal of the Unicode Compression Algorithm is the ability to
+ * Express all code points in Unicode
+ * Approximate storage size for traditional character sets
+ * Work well for short strings
+ * Provide transparency for Latin-1 data
+ * Support very simple decoders
+ * Support simple as well as sophisticated encoders
+
+ If needed, further compression can be achieved by layering standard
+ file or disk-block based compression algorithms on top.
+
+ <H2>Features</H2>
+
+ Languages using small alphabets would contain runs of characters that
+ are coded close together in Unicode. These runs are interrupted only
+ by punctuation characters, which are themselves coded in proximity to
+ each other in Unicode (usually in the ASCII range).
+
+ Two basic mechanisms in the compression algorithm account for these two
+ cases, sliding windows and static windows. A window is an area of 128
+ consecutive characters in Unicode. In the compressed data stream, each
+ character from a sliding window would be represented as a byte between
+ 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
+ TAB) would always mean an ASCII character (or control).
+
+ <H2>Notes on the Java implementation</H2>
+
+ A limitation of Java is the exclusive use of a signed byte data type.
+ The following work arounds are required:
+
+ Copying a byte to an integer variable and adding 256 for 'negative'
+ bytes gives an integer in the range 0-255.
+
+ Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
+ char values is unsigned.
+
+ Extended characters require an int to store them. The sign is not an
+ issue because only 1024*1024 + 65536 extended characters exist.
+
+**/
+public abstract class SCSU
+{
+ /** Single Byte mode command values */
+
+ /** SQ<i>n</i> Quote from Window . <p>
+ If the following byte is less than 0x80, quote from
+ static window <i>n</i>, else quote from dynamic window <i>n</i>.
+ */
+
+ static final byte SQ0 = 0x01; // Quote from window pair 0
+ static final byte SQ1 = 0x02; // Quote from window pair 1
+ static final byte SQ2 = 0x03; // Quote from window pair 2
+ static final byte SQ3 = 0x04; // Quote from window pair 3
+ static final byte SQ4 = 0x05; // Quote from window pair 4
+ static final byte SQ5 = 0x06; // Quote from window pair 5
+ static final byte SQ6 = 0x07; // Quote from window pair 6
+ static final byte SQ7 = 0x08; // Quote from window pair 7
+
+ static final byte SDX = 0x0B; // Define a window as extended
+ static final byte Srs = 0x0C; // reserved
+
+ static final byte SQU = 0x0E; // Quote a single Unicode character
+ static final byte SCU = 0x0F; // Change to Unicode mode
+
+ /** SC<i>n</i> Change to Window <i>n</i>. <p>
+ If the following bytes are less than 0x80, interpret them
+ as command bytes or pass them through, else add the offset
+ for dynamic window <i>n</i>. */
+ static final byte SC0 = 0x10; // Select window 0
+ static final byte SC1 = 0x11; // Select window 1
+ static final byte SC2 = 0x12; // Select window 2
+ static final byte SC3 = 0x13; // Select window 3
+ static final byte SC4 = 0x14; // Select window 4
+ static final byte SC5 = 0x15; // Select window 5
+ static final byte SC6 = 0x16; // Select window 6
+ static final byte SC7 = 0x17; // Select window 7
+ static final byte SD0 = 0x18; // Define and select window 0
+ static final byte SD1 = 0x19; // Define and select window 1
+ static final byte SD2 = 0x1A; // Define and select window 2
+ static final byte SD3 = 0x1B; // Define and select window 3
+ static final byte SD4 = 0x1C; // Define and select window 4
+ static final byte SD5 = 0x1D; // Define and select window 5
+ static final byte SD6 = 0x1E; // Define and select window 6
+ static final byte SD7 = 0x1F; // Define and select window 7
+
+ static final byte UC0 = (byte) 0xE0; // Select window 0
+ static final byte UC1 = (byte) 0xE1; // Select window 1
+ static final byte UC2 = (byte) 0xE2; // Select window 2
+ static final byte UC3 = (byte) 0xE3; // Select window 3
+ static final byte UC4 = (byte) 0xE4; // Select window 4
+ static final byte UC5 = (byte) 0xE5; // Select window 5
+ static final byte UC6 = (byte) 0xE6; // Select window 6
+ static final byte UC7 = (byte) 0xE7; // Select window 7
+ static final byte UD0 = (byte) 0xE8; // Define and select window 0
+ static final byte UD1 = (byte) 0xE9; // Define and select window 1
+ static final byte UD2 = (byte) 0xEA; // Define and select window 2
+ static final byte UD3 = (byte) 0xEB; // Define and select window 3
+ static final byte UD4 = (byte) 0xEC; // Define and select window 4
+ static final byte UD5 = (byte) 0xED; // Define and select window 5
+ static final byte UD6 = (byte) 0xEE; // Define and select window 6
+ static final byte UD7 = (byte) 0xEF; // Define and select window 7
+
+ static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
+ static final byte UDX = (byte) 0xF1; // Define a Window as extended
+ static final byte Urs = (byte) 0xF2; // reserved
+
+ /** constant offsets for the 8 static windows */
+ static final int staticOffset[] =
+ {
+ 0x0000, // ASCII for quoted tags
+ 0x0080, // Latin - 1 Supplement (for access to punctuation)
+ 0x0100, // Latin Extended-A
+ 0x0300, // Combining Diacritical Marks
+ 0x2000, // General Punctuation
+ 0x2080, // Currency Symbols
+ 0x2100, // Letterlike Symbols and Number Forms
+ 0x3000 // CJK Symbols and punctuation
+ };
+
+ /** initial offsets for the 8 dynamic (sliding) windows */
+ static final int initialDynamicOffset[] =
+ {
+ 0x0080, // Latin-1
+ 0x00C0, // Latin Extended A //@005 fixed from 0x0100
+ 0x0400, // Cyrillic
+ 0x0600, // Arabic
+ 0x0900, // Devanagari
+ 0x3040, // Hiragana
+ 0x30A0, // Katakana
+ 0xFF00 // Fullwidth ASCII
+ };
+
+ /** dynamic window offsets, intitialize to default values. */
+ int dynamicOffset[] =
+ {
+ initialDynamicOffset[0],
+ initialDynamicOffset[1],
+ initialDynamicOffset[2],
+ initialDynamicOffset[3],
+ initialDynamicOffset[4],
+ initialDynamicOffset[5],
+ initialDynamicOffset[6],
+ initialDynamicOffset[7]
+ };
+
+ // The following method is common to encoder and decoder
+
+ private int iWindow = 0; // current active window
+
+ /** select the active dynamic window **/
+ protected void selectWindow(int iWindow)
+ {
+ this.iWindow = iWindow;
+ }
+
+ /** select the active dynamic window **/
+ protected int getCurrentWindow()
+ {
+ return this.iWindow;
+ }
+
+ /**
+ These values are used in defineWindow
+ **/
+
+ /**
+ * Unicode code points from 3400 to E000 are not adressible by
+ * dynamic window, since in these areas no short run alphabets are
+ * found. Therefore add gapOffset to all values from gapThreshold */
+ static final int gapThreshold = 0x68;
+ static final int gapOffset = 0xAC00;
+
+ /* values between reservedStart and fixedThreshold are reserved */
+ static final int reservedStart = 0xA8;
+
+ /* use table of predefined fixed offsets for values from fixedThreshold */
+ static final int fixedThreshold = 0xF9;
+
+ /** Table of fixed predefined Offsets, and byte values that index into **/
+ static final int fixedOffset[] =
+ {
+ /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
+ /* 0xFA */ 0x0250, // IPA extensions
+ /* 0xFB */ 0x0370, // Greek
+ /* 0xFC */ 0x0530, // Armenian
+ /* 0xFD */ 0x3040, // Hiragana
+ /* 0xFE */ 0x30A0, // Katakana
+ /* 0xFF */ 0xFF60 // Halfwidth Katakana
+ };
+
+ /** whether a character is compressible */
+ public static boolean isCompressible(char ch)
+ {
+ return (ch < 0x3400 || ch >= 0xE000);
+ }
+
+ /** reset is only needed to bail out after an exception and
+ restart with new input */
+ public void reset()
+ {
+
+ // reset the dynamic windows
+ for (int i = 0; i < dynamicOffset.length; i++)
+ {
+ dynamicOffset[i] = initialDynamicOffset[i];
+ }
+ this.iWindow = 0;
+ }
+} \ No newline at end of file