diff options
Diffstat (limited to 'src/java/com/healthmarketscience/jackcess/scsu')
5 files changed, 923 insertions, 0 deletions
diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Debug.java b/src/java/com/healthmarketscience/jackcess/scsu/Debug.java new file mode 100644 index 0000000..16a9a42 --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/Debug.java @@ -0,0 +1,151 @@ +package com.healthmarketscience.jackcess.scsu; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/* + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/ + +/** + * A number of helpful output routines for debugging. 
Output can be + * centrally enabled or disabled by calling Debug.set(true/false); + * All methods are statics; + */ + +public class Debug +{ + + private static final Log LOG = LogFactory.getLog(Debug.class); + + // debugging helper + public static void out(char [] chars) + { + out(chars, 0); + } + + public static void out(char [] chars, int iStart) + { + if (!LOG.isDebugEnabled()) return; + StringBuffer msg = new StringBuffer(); + + for (int i = iStart; i < chars.length; i++) + { + if (chars[i] >= 0 && chars[i] <= 26) + { + msg.append("^"+(char)(chars[i]+0x40)); + } + else if (chars[i] <= 255) + { + msg.append(chars[i]); + } + else + { + msg.append("\\u"+Integer.toString(chars[i],16)); + } + } + LOG.debug(msg.toString()); + } + + public static void out(byte [] bytes) + { + out(bytes, 0); + } + public static void out(byte [] bytes, int iStart) + { + if (!LOG.isDebugEnabled()) return; + StringBuffer msg = new StringBuffer(); + + for (int i = iStart; i < bytes.length; i++) + { + msg.append(bytes[i]+","); + } + LOG.debug(msg.toString()); + } + + public static void out(String str) + { + if (!LOG.isDebugEnabled()) return; + + LOG.debug(str); + } + + public static void out(String msg, int iData) + { + if (!LOG.isDebugEnabled()) return; + + LOG.debug(msg + iData); + } + public static void out(String msg, char ch) + { + if (!LOG.isDebugEnabled()) return; + + LOG.debug(msg + "[U+"+Integer.toString(ch,16)+"]" + ch); + } + public static void out(String msg, byte bData) + { + if (!LOG.isDebugEnabled()) return; + + LOG.debug(msg + bData); + } + public static void out(String msg, String str) + { + if (!LOG.isDebugEnabled()) return; + + LOG.debug(msg + str); + } + public static void out(String msg, char [] data) + { + if (!LOG.isDebugEnabled()) return; + + LOG.debug(msg); + out(data); + } + public static void out(String msg, byte [] data) + { + if (!LOG.isDebugEnabled()) return; + + LOG.debug(msg); + out(data); + } + public static void out(String msg, char [] data, int iStart) + { 
+ if (!LOG.isDebugEnabled()) return; + + LOG.debug(msg +"("+iStart+"): "); + out(data, iStart); + } + public static void out(String msg, byte [] data, int iStart) + { + if (!LOG.isDebugEnabled()) return; + + LOG.debug(msg+"("+iStart+"): "); + out(data, iStart); + } +}
\ No newline at end of file diff --git a/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java b/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java new file mode 100644 index 0000000..7d79d4b --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/EndOfInputException.java @@ -0,0 +1,46 @@ +package com.healthmarketscience.jackcess.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. 
// + **/
/**
 * Signals that the input string or input byte array stopped before a
 * complete item could be read from it.
 */
public class EndOfInputException extends Exception
{
    /** message used when the caller does not supply one */
    private static final String DEFAULT_MESSAGE =
        "The input string or input byte array ended prematurely";

    /** create the exception with the standard message */
    public EndOfInputException()
    {
        this(DEFAULT_MESSAGE);
    }

    /**
     * create the exception with a caller supplied message
     * @param s the detail message
     */
    public EndOfInputException(String s)
    {
        super(s);
    }
}
// diff --git a/src/java/com/healthmarketscience/jackcess/scsu/Expand.java b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java new file mode 100644 index 0000000..a6e44b1 --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/Expand.java @@ -0,0 +1,429 @@ +package com.healthmarketscience.jackcess.scsu; + +/* + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * @version 005 Sep 30 1998 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions.
+ **/ + + /** + Reference decoder for the Standard Compression Scheme for Unicode (SCSU) + + <H2>Notes on the Java implementation</H2> + + A limitation of Java is the exclusive use of a signed byte data type. + The following work arounds are required: + + Copying a byte to an integer variable and adding 256 for 'negative' + bytes gives an integer in the range 0-255. + + Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on + char values is unsigned. + + Extended characters require an int to store them. The sign is not an + issue because only 1024*1024 + 65536 extended characters exist. + +**/ +public class Expand extends SCSU +{ + /** (re-)define (and select) a dynamic window + A sliding window position cannot start at any Unicode value, + so rather than providing an absolute offset, this function takes + an index value which selects among the possible starting values. + + Most scripts in Unicode start on or near a half-block boundary + so the default behaviour is to multiply the index by 0x80. Han, + Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF + show very poor locality--therefore no sliding window can be set + there. A jumpOffset is added to the index value to skip that region, + and only 167 index values total are required to select all eligible + half-blocks. + + Finally, a few scripts straddle half block boundaries. For them, a + table of fixed offsets is used, and the index values from 0xF9 to + 0xFF are used to select these special offsets. + + After (re-)defining a windows location it is selected so it is ready + for use. + + Recall that all Windows are of the same length (128 code positions). + + @param iWindow - index of the window to be (re-)defined + @param bOffset - index for the new offset value + **/ + // @005 protected <-- private here and elsewhere + protected void defineWindow(int iWindow, byte bOffset) + throws IllegalInputException + { + int iOffset = (bOffset < 0 ? 
bOffset + 256 : bOffset); + + // 0 is a reserved value + if (iOffset == 0) + { + throw new IllegalInputException(); + } + else if (iOffset < gapThreshold) + { + dynamicOffset[iWindow] = iOffset << 7; + } + else if (iOffset < reservedStart) + { + dynamicOffset[iWindow] = (iOffset << 7) + gapOffset; + } + else if (iOffset < fixedThreshold) + { + // more reserved values + throw new IllegalInputException("iOffset == "+iOffset); + } + else + { + dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold]; + } + + // make the redefined window the active one + selectWindow(iWindow); + } + + /** (re-)define (and select) a window as an extended dynamic window + The surrogate area in Unicode allows access to 2**20 codes beyond the + first 64K codes by combining one of 1024 characters from the High + Surrogate Area with one of 1024 characters from the Low Surrogate + Area (see Unicode 2.0 for the details). + + The tags SDX and UDX set the window such that each subsequent byte in + the range 80 to FF represents a surrogate pair. The following diagram + shows how the bits in the two bytes following the SDX or UDX, and a + subsequent data byte, map onto the bits in the resulting surrogate pair. + + hbyte lbyte data + nnnwwwww zzzzzyyy 1xxxxxxx + + high-surrogate low-surrogate + 110110wwwwwzzzzz 110111yyyxxxxxxx + + @param chOffset - Since the three top bits of chOffset are not needed to + set the location of the extended Window, they are used instead + to select the window, thereby reducing the number of needed command codes. + The bottom 13 bits of chOffset are used to calculate the offset relative to + a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair. 
+ **/ + protected void defineExtendedWindow(char chOffset) + { + // The top 3 bits of iOffsetHi are the window index + int iWindow = chOffset >>> 13; + + // Calculate the new offset + dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16); + + // make the redefined window the active one + selectWindow(iWindow); + } + + /** string buffer length used by the following functions */ + protected int iOut = 0; + + /** input cursor used by the following functions */ + protected int iIn = 0; + + /** expand input that is in Unicode mode + @param in input byte array to be expanded + @param iCur starting index + @param sb string buffer to which to append expanded input + @return the index for the lastc byte processed + **/ + protected int expandUnicode(byte []in, int iCur, StringBuffer sb) + throws IllegalInputException, EndOfInputException + { + for( ; iCur < in.length-1; iCur+=2 ) // step by 2: + { + byte b = in[iCur]; + + if (b >= UC0 && b <= UC7) + { + Debug.out("SelectWindow: ", b); + selectWindow(b - UC0); + return iCur; + } + else if (b >= UD0 && b <= UD7) + { + defineWindow( b - UD0, in[iCur+1]); + return iCur + 1; + } + else if (b == UDX) + { + if( iCur >= in.length - 2) + { + break; // buffer error + } + defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2])); + return iCur + 2; + } + else if (b == UQU) + { + if( iCur >= in.length - 2) + { + break; // error + } + // Skip command byte and output Unicode character + iCur++; + } + + // output a Unicode character + char ch = charFromTwoBytes(in[iCur], in[iCur+1]); + sb.append((char)ch); + iOut++; + } + + if( iCur == in.length) + { + return iCur; + } + + // Error condition + throw new EndOfInputException(); + } + + /** assemble a char from two bytes + In Java bytes are signed quantities, while chars are unsigned + @return the character + @param hi most significant byte + @param lo least significant byte + */ + public static char charFromTwoBytes(byte hi, byte lo) + { + char ch = (char)(lo >= 0 ? 
lo : 256 + lo); + return (char)(ch + (char)((hi >= 0 ? hi : 256 + hi)<<8)); + } + + /** expand portion of the input that is in single byte mode **/ + protected String expandSingleByte(byte []in) + throws IllegalInputException, EndOfInputException + { + + /* Allocate the output buffer. Because of control codes, generally + each byte of input results in fewer than one character of + output. Using in.length as an intial allocation length should avoid + the need to reallocate in mid-stream. The exception to this rule are + surrogates. */ + StringBuffer sb = new StringBuffer(in.length); + iOut = 0; + + // Loop until all input is exhausted or an error occurred + int iCur; + Loop: + for( iCur = 0; iCur < in.length; iCur++ ) + { + // DEBUG Debug.out("Expanding: ", iCur); + + // Default behaviour is that ASCII characters are passed through + // (staticOffset[0] == 0) and characters with the high bit on are + // offset by the current dynamic (or sliding) window (this.iWindow) + int iStaticWindow = 0; + int iDynamicWindow = getCurrentWindow(); + + switch(in[iCur]) + { + // Quote from a static Window + case SQ0: + case SQ1: + case SQ2: + case SQ3: + case SQ4: + case SQ5: + case SQ6: + case SQ7: + Debug.out("SQn:", iStaticWindow); + // skip the command byte and check for length + if( iCur >= in.length - 1) + { + Debug.out("SQn missing argument: ", in, iCur); + break Loop; // buffer length error + } + // Select window pair to quote from + iDynamicWindow = iStaticWindow = in[iCur] - SQ0; + iCur ++; + + // FALL THROUGH + + default: + // output as character + if(in[iCur] >= 0) + { + // use static window + int ch = in[iCur] + staticOffset[iStaticWindow]; + sb.append((char)ch); + iOut++; + } + else + { + // use dynamic window + int ch = (in[iCur] + 256); // adjust for signed bytes + ch -= 0x80; // reduce to range 00..7F + ch += dynamicOffset[iDynamicWindow]; + + //DEBUG + Debug.out("Dynamic: ", (char) ch); + + if (ch < 1<<16) + { + // in Unicode range, output directly + 
sb.append((char)ch); + iOut++; + } + else + { + // this is an extension character + Debug.out("Extension character: ", ch); + + // compute and append the two surrogates: + // translate from 10000..10FFFF to 0..FFFFF + ch -= 0x10000; + + // high surrogate = top 10 bits added to D800 + sb.append((char)(0xD800 + (ch>>10))); + iOut++; + + // low surrogate = bottom 10 bits added to DC00 + sb.append((char)(0xDC00 + (ch & ~0xFC00))); + iOut++; + } + } + break; + + // define a dynamic window as extended + case SDX: + iCur += 2; + if( iCur >= in.length) + { + Debug.out("SDn missing argument: ", in, iCur -1); + break Loop; // buffer length error + } + defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur])); + break; + + // Position a dynamic Window + case SD0: + case SD1: + case SD2: + case SD3: + case SD4: + case SD5: + case SD6: + case SD7: + iCur ++; + if( iCur >= in.length) + { + Debug.out("SDn missing argument: ", in, iCur -1); + break Loop; // buffer length error + } + defineWindow(in[iCur-1] - SD0, in[iCur]); + break; + + // Select a new dynamic Window + case SC0: + case SC1: + case SC2: + case SC3: + case SC4: + case SC5: + case SC6: + case SC7: + selectWindow(in[iCur] - SC0); + break; + case SCU: + // switch to Unicode mode and continue parsing + iCur = expandUnicode(in, iCur+1, sb); + // DEBUG Debug.out("Expanded Unicode range until: ", iCur); + break; + + case SQU: + // directly extract one Unicode character + iCur += 2; + if( iCur >= in.length) + { + Debug.out("SQU missing argument: ", in, iCur - 2); + break Loop; // buffer length error + } + else + { + char ch = charFromTwoBytes(in[iCur-1], in[iCur]); + + Debug.out("Quoted: ", ch); + sb.append((char)ch); + iOut++; + } + break; + + case Srs: + throw new IllegalInputException(); + // break; + } + } + + if( iCur >= in.length) + { + //SUCCESS: all input used up + sb.setLength(iOut); + iIn = iCur; + return sb.toString(); + } + + Debug.out("Length ==" + in.length+" iCur =", iCur); + //ERROR: premature end of 
input + throw new EndOfInputException(); + } + + /** expand a byte array containing compressed Unicode */ + public String expand (byte []in) + throws IllegalInputException, EndOfInputException + { + String str = expandSingleByte(in); + Debug.out("expand output: ", str.toCharArray()); + return str; + } + + + /** reset is called to start with new input, w/o creating a new + instance */ + public void reset() + { + iOut = 0; + iIn = 0; + super.reset(); + } + + public int charsWritten() + { + return iOut; + } + + public int bytesRead() + { + return iIn; + } +} diff --git a/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java b/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java new file mode 100644 index 0000000..358e8bc --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/IllegalInputException.java @@ -0,0 +1,45 @@ +package com.healthmarketscience.jackcess.scsu; + +/** + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1997 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. 
// + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/
/**
 * Signals that the input character array or input byte array held a
 * sequence of bytes or characters that is not legal.
 */
public class IllegalInputException extends Exception
{
    /** message used when the caller does not supply one */
    private static final String DEFAULT_MESSAGE =
        "The input character array or input byte array contained illegal sequences of bytes or characters";

    /** create the exception with the standard message */
    public IllegalInputException()
    {
        this(DEFAULT_MESSAGE);
    }

    /**
     * create the exception with a caller supplied message
     * @param s the detail message
     */
    public IllegalInputException(String s)
    {
        super(s);
    }
}
// diff --git a/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java b/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java new file mode 100644 index 0000000..da3af58 --- /dev/null +++ b/src/java/com/healthmarketscience/jackcess/scsu/SCSU.java @@ -0,0 +1,252 @@ +package com.healthmarketscience.jackcess.scsu; + +/* + * This sample software accompanies Unicode Technical Report #6 and + * distributed as is by Unicode, Inc., subject to the following: + * + * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software + * without fee is hereby granted provided that this copyright notice + * appears in all copies. + * + * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE + * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND + * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND + * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING + * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
// + * + * @author Asmus Freytag + * + * @version 001 Dec 25 1996 + * @version 002 Jun 25 1997 + * @version 003 Jul 25 1997 + * @version 004 Aug 25 1997 + * @version 005 Sep 30 1998 + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., + * and are registered in some jurisdictions. + **/

/**
    Encoding text data in Unicode often requires more storage than using
    an existing 8-bit character set and limited to the subset of characters
    actually found in the text. The Unicode Compression Algorithm reduces
    the necessary storage while retaining the universality of Unicode.
    A full description of the algorithm can be found in document
    http://www.unicode.org/unicode/reports/tr6.html

    Summary

    The goal of the Unicode Compression Algorithm is the ability to
    * Express all code points in Unicode
    * Approximate storage size for traditional character sets
    * Work well for short strings
    * Provide transparency for Latin-1 data
    * Support very simple decoders
    * Support simple as well as sophisticated encoders

    If needed, further compression can be achieved by layering standard
    file or disk-block based compression algorithms on top.

    <H2>Features</H2>

    Languages using small alphabets would contain runs of characters that
    are coded close together in Unicode. These runs are interrupted only
    by punctuation characters, which are themselves coded in proximity to
    each other in Unicode (usually in the ASCII range).

    Two basic mechanisms in the compression algorithm account for these two
    cases, sliding windows and static windows. A window is an area of 128
    consecutive characters in Unicode. In the compressed data stream, each
    character from a sliding window would be represented as a byte between
    0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
    TAB) would always mean an ASCII character (or control).

    <H2>Notes on the Java implementation</H2>

    A limitation of Java is the exclusive use of a signed byte data type.
    The following work arounds are required:

    Copying a byte to an integer variable and adding 256 for 'negative'
    bytes gives an integer in the range 0-255.

    Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
    char values is unsigned.

    Extended characters require an int to store them. The sign is not an
    issue because only 1024*1024 + 65536 extended characters exist.

**/
public abstract class SCSU
{
    /* ---------------- Single Byte mode command values ---------------- */

    /** SQ<i>n</i> Quote from Window . <p>
    If the following byte is less than 0x80, quote from
    static window <i>n</i>, else quote from dynamic window <i>n</i>.
    */
    static final byte SQ0 = 0x01; // Quote from window pair 0
    static final byte SQ1 = 0x02; // Quote from window pair 1
    static final byte SQ2 = 0x03; // Quote from window pair 2
    static final byte SQ3 = 0x04; // Quote from window pair 3
    static final byte SQ4 = 0x05; // Quote from window pair 4
    static final byte SQ5 = 0x06; // Quote from window pair 5
    static final byte SQ6 = 0x07; // Quote from window pair 6
    static final byte SQ7 = 0x08; // Quote from window pair 7

    static final byte SDX = 0x0B; // Define a window as extended
    static final byte Srs = 0x0C; // reserved

    static final byte SQU = 0x0E; // Quote a single Unicode character
    static final byte SCU = 0x0F; // Change to Unicode mode

    /** SC<i>n</i> Change to Window <i>n</i>. <p>
    If the following bytes are less than 0x80, interpret them
    as command bytes or pass them through, else add the offset
    for dynamic window <i>n</i>. */
    static final byte SC0 = 0x10; // Select window 0
    static final byte SC1 = 0x11; // Select window 1
    static final byte SC2 = 0x12; // Select window 2
    static final byte SC3 = 0x13; // Select window 3
    static final byte SC4 = 0x14; // Select window 4
    static final byte SC5 = 0x15; // Select window 5
    static final byte SC6 = 0x16; // Select window 6
    static final byte SC7 = 0x17; // Select window 7
    static final byte SD0 = 0x18; // Define and select window 0
    static final byte SD1 = 0x19; // Define and select window 1
    static final byte SD2 = 0x1A; // Define and select window 2
    static final byte SD3 = 0x1B; // Define and select window 3
    static final byte SD4 = 0x1C; // Define and select window 4
    static final byte SD5 = 0x1D; // Define and select window 5
    static final byte SD6 = 0x1E; // Define and select window 6
    static final byte SD7 = 0x1F; // Define and select window 7

    /* ------------------ Unicode mode command values ------------------ */

    static final byte UC0 = (byte) 0xE0; // Select window 0
    static final byte UC1 = (byte) 0xE1; // Select window 1
    static final byte UC2 = (byte) 0xE2; // Select window 2
    static final byte UC3 = (byte) 0xE3; // Select window 3
    static final byte UC4 = (byte) 0xE4; // Select window 4
    static final byte UC5 = (byte) 0xE5; // Select window 5
    static final byte UC6 = (byte) 0xE6; // Select window 6
    static final byte UC7 = (byte) 0xE7; // Select window 7
    static final byte UD0 = (byte) 0xE8; // Define and select window 0
    static final byte UD1 = (byte) 0xE9; // Define and select window 1
    static final byte UD2 = (byte) 0xEA; // Define and select window 2
    static final byte UD3 = (byte) 0xEB; // Define and select window 3
    static final byte UD4 = (byte) 0xEC; // Define and select window 4
    static final byte UD5 = (byte) 0xED; // Define and select window 5
    static final byte UD6 = (byte) 0xEE; // Define and select window 6
    static final byte UD7 = (byte) 0xEF; // Define and select window 7

    static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
    static final byte UDX = (byte) 0xF1; // Define a Window as extended
    static final byte Urs = (byte) 0xF2; // reserved

    /** constant offsets for the 8 static windows */
    static final int[] staticOffset =
    {
        0x0000, // ASCII for quoted tags
        0x0080, // Latin - 1 Supplement (for access to punctuation)
        0x0100, // Latin Extended-A
        0x0300, // Combining Diacritical Marks
        0x2000, // General Punctuation
        0x2080, // Currency Symbols
        0x2100, // Letterlike Symbols and Number Forms
        0x3000  // CJK Symbols and punctuation
    };

    /** initial offsets for the 8 dynamic (sliding) windows */
    static final int[] initialDynamicOffset =
    {
        0x0080, // Latin-1
        0x00C0, // Latin Extended A   //@005 fixed from 0x0100
        0x0400, // Cyrillic
        0x0600, // Arabic
        0x0900, // Devanagari
        0x3040, // Hiragana
        0x30A0, // Katakana
        0xFF00  // Fullwidth ASCII
    };

    /** dynamic window offsets; every instance starts from its own copy of
        the default values */
    int[] dynamicOffset = (int[]) initialDynamicOffset.clone();

    // The window selection below is common to encoder and decoder

    /** index of the currently active dynamic window */
    private int activeWindow = 0;

    /** make the given dynamic window the active one **/
    protected void selectWindow(int iWindow)
    {
        activeWindow = iWindow;
    }

    /** report which dynamic window is currently active **/
    protected int getCurrentWindow()
    {
        return activeWindow;
    }

    /* ------------- values used when a window is (re-)defined ------------- */

    /**
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold */
    static final int gapThreshold = 0x68;
    static final int gapOffset = 0xAC00;

    /* values between reservedStart and fixedThreshold are reserved */
    static final int reservedStart = 0xA8;

    /* use table of predefined fixed offsets for values from fixedThreshold */
    static final int fixedThreshold = 0xF9;

    /** Table of fixed predefined Offsets, and byte values that index into **/
    static final int[] fixedOffset =
    {
        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
        /* 0xFA */ 0x0250, // IPA extensions
        /* 0xFB */ 0x0370, // Greek
        /* 0xFC */ 0x0530, // Armenian
        /* 0xFD */ 0x3040, // Hiragana
        /* 0xFE */ 0x30A0, // Katakana
        /* 0xFF */ 0xFF60  // Halfwidth Katakana
    };

    /** whether a character is compressible, i.e. lies outside the range
        3400 (inclusive) to E000 (exclusive) */
    public static boolean isCompressible(char ch)
    {
        boolean inGap = (ch >= 0x3400 && ch < 0xE000);
        return !inGap;
    }

    /** reset is only needed to bail out after an exception and
        restart with new input */
    public void reset()
    {
        // put every dynamic window back at its default position
        int slot = 0;
        while (slot < dynamicOffset.length)
        {
            dynamicOffset[slot] = initialDynamicOffset[slot];
            slot++;
        }
        activeWindow = 0;
    }
}
\ No newline at end of file |