123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252 |
- package com.healthmarketscience.jackcess.scsu;
-
- /*
- * This sample software accompanies Unicode Technical Report #6 and
- * distributed as is by Unicode, Inc., subject to the following:
- *
- * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
- *
- * Permission to use, copy, modify, and distribute this software
- * without fee is hereby granted provided that this copyright notice
- * appears in all copies.
- *
- * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
- * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
- * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
- * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
- * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
- * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
- * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
- *
- * @author Asmus Freytag
- *
- * @version 001 Dec 25 1996
- * @version 002 Jun 25 1997
- * @version 003 Jul 25 1997
- * @version 004 Aug 25 1997
- * @version 005 Sep 30 1998
- *
- * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
- * and are registered in some jurisdictions.
- **/
-
- /**
- Encoding text data in Unicode often requires more storage than using
- an existing 8-bit character set and limited to the subset of characters
- actually found in the text. The Unicode Compression Algorithm reduces
- the necessary storage while retaining the universality of Unicode.
- A full description of the algorithm can be found in document
- http://www.unicode.org/unicode/reports/tr6.html
-
- Summary
-
- The goal of the Unicode Compression Algorithm is the ability to
- * Express all code points in Unicode
- * Approximate storage size for traditional character sets
- * Work well for short strings
- * Provide transparency for Latin-1 data
- * Support very simple decoders
- * Support simple as well as sophisticated encoders
-
- If needed, further compression can be achieved by layering standard
- file or disk-block based compression algorithms on top.
-
- <H2>Features</H2>
-
- Languages using small alphabets would contain runs of characters that
- are coded close together in Unicode. These runs are interrupted only
- by punctuation characters, which are themselves coded in proximity to
- each other in Unicode (usually in the ASCII range).
-
- Two basic mechanisms in the compression algorithm account for these two
- cases, sliding windows and static windows. A window is an area of 128
- consecutive characters in Unicode. In the compressed data stream, each
- character from a sliding window would be represented as a byte between
- 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
- TAB) would always mean an ASCII character (or control).
-
- <H2>Notes on the Java implementation</H2>
-
- A limitation of Java is the exclusive use of a signed byte data type.
- The following workarounds are required:
-
- Copying a byte to an integer variable and adding 256 for 'negative'
- bytes gives an integer in the range 0-255.
-
- Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
- char values is unsigned.
-
- Extended characters require an int to store them. The sign is not an
- issue because only 1024*1024 + 65536 extended characters exist.
-
- **/
/**
 * Shared state and constants for the SCSU (Standard Compression Scheme for
 * Unicode) encoder and decoder: the command byte values, the static and
 * dynamic window offset tables, and the index of the currently active
 * dynamic window.
 */
public abstract class SCSU
{
    // ------------------------------------------------------------------
    // Single-byte mode command values
    // ------------------------------------------------------------------

    /**
     * SQ<i>n</i>: quote from window <i>n</i>.
     * <p>
     * If the following byte is less than 0x80, quote from static window
     * <i>n</i>, else quote from dynamic window <i>n</i>. The same rule
     * applies to {@code SQ1} through {@code SQ7}.
     */
    static final byte SQ0 = 0x01; // Quote from window pair 0
    static final byte SQ1 = 0x02; // Quote from window pair 1
    static final byte SQ2 = 0x03; // Quote from window pair 2
    static final byte SQ3 = 0x04; // Quote from window pair 3
    static final byte SQ4 = 0x05; // Quote from window pair 4
    static final byte SQ5 = 0x06; // Quote from window pair 5
    static final byte SQ6 = 0x07; // Quote from window pair 6
    static final byte SQ7 = 0x08; // Quote from window pair 7

    static final byte SDX = 0x0B; // Define a window as extended
    static final byte Srs = 0x0C; // reserved

    static final byte SQU = 0x0E; // Quote a single Unicode character
    static final byte SCU = 0x0F; // Change to Unicode mode

    /**
     * SC<i>n</i>: change to window <i>n</i>.
     * <p>
     * If the following bytes are less than 0x80, interpret them as command
     * bytes or pass them through, else add the offset for dynamic window
     * <i>n</i>. The same rule applies to {@code SC1} through {@code SC7}.
     */
    static final byte SC0 = 0x10; // Select window 0
    static final byte SC1 = 0x11; // Select window 1
    static final byte SC2 = 0x12; // Select window 2
    static final byte SC3 = 0x13; // Select window 3
    static final byte SC4 = 0x14; // Select window 4
    static final byte SC5 = 0x15; // Select window 5
    static final byte SC6 = 0x16; // Select window 6
    static final byte SC7 = 0x17; // Select window 7
    static final byte SD0 = 0x18; // Define and select window 0
    static final byte SD1 = 0x19; // Define and select window 1
    static final byte SD2 = 0x1A; // Define and select window 2
    static final byte SD3 = 0x1B; // Define and select window 3
    static final byte SD4 = 0x1C; // Define and select window 4
    static final byte SD5 = 0x1D; // Define and select window 5
    static final byte SD6 = 0x1E; // Define and select window 6
    static final byte SD7 = 0x1F; // Define and select window 7

    // ------------------------------------------------------------------
    // Unicode mode command values
    // (values above 0x7F need an explicit cast because byte is signed)
    // ------------------------------------------------------------------

    static final byte UC0 = (byte) 0xE0; // Select window 0
    static final byte UC1 = (byte) 0xE1; // Select window 1
    static final byte UC2 = (byte) 0xE2; // Select window 2
    static final byte UC3 = (byte) 0xE3; // Select window 3
    static final byte UC4 = (byte) 0xE4; // Select window 4
    static final byte UC5 = (byte) 0xE5; // Select window 5
    static final byte UC6 = (byte) 0xE6; // Select window 6
    static final byte UC7 = (byte) 0xE7; // Select window 7
    static final byte UD0 = (byte) 0xE8; // Define and select window 0
    static final byte UD1 = (byte) 0xE9; // Define and select window 1
    static final byte UD2 = (byte) 0xEA; // Define and select window 2
    static final byte UD3 = (byte) 0xEB; // Define and select window 3
    static final byte UD4 = (byte) 0xEC; // Define and select window 4
    static final byte UD5 = (byte) 0xED; // Define and select window 5
    static final byte UD6 = (byte) 0xEE; // Define and select window 6
    static final byte UD7 = (byte) 0xEF; // Define and select window 7

    static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
    static final byte UDX = (byte) 0xF1; // Define a Window as extended
    static final byte Urs = (byte) 0xF2; // reserved

    // ------------------------------------------------------------------
    // Window offset tables
    // ------------------------------------------------------------------

    /** Constant offsets for the 8 static windows. */
    static final int[] staticOffset =
    {
        0x0000, // ASCII for quoted tags
        0x0080, // Latin - 1 Supplement (for access to punctuation)
        0x0100, // Latin Extended-A
        0x0300, // Combining Diacritical Marks
        0x2000, // General Punctuation
        0x2080, // Currency Symbols
        0x2100, // Letterlike Symbols and Number Forms
        0x3000  // CJK Symbols and punctuation
    };

    /** Initial offsets for the 8 dynamic (sliding) windows. */
    static final int[] initialDynamicOffset =
    {
        0x0080, // Latin-1
        0x00C0, // Latin Extended A   //@005 fixed from 0x0100
        0x0400, // Cyrillic
        0x0600, // Arabic
        0x0900, // Devanagari
        0x3040, // Hiragana
        0x30A0, // Katakana
        0xFF00  // Fullwidth ASCII
    };

    /**
     * Per-instance dynamic window offsets, initialized to the default
     * values. This is a private copy of {@link #initialDynamicOffset}, so
     * redefining a window on one instance never alters the shared defaults.
     */
    int[] dynamicOffset = initialDynamicOffset.clone();

    // The following members are common to encoder and decoder

    /** Index of the currently active dynamic window. */
    private int iWindow = 0;

    /**
     * Selects the active dynamic window.
     *
     * @param iWindow index of the window to make active; stored as given,
     *                no range check is performed here
     */
    protected void selectWindow(int iWindow)
    {
        this.iWindow = iWindow;
    }

    /**
     * Returns the index of the currently active dynamic window.
     *
     * @return the value most recently passed to {@link #selectWindow(int)},
     *         or 0 if no window has been selected since construction or
     *         the last {@link #reset()}
     */
    protected int getCurrentWindow()
    {
        return this.iWindow;
    }

    // ------------------------------------------------------------------
    // Values used when defining a window
    // ------------------------------------------------------------------

    /**
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore add {@link #gapOffset} to all values from
     * gapThreshold.
     */
    static final int gapThreshold = 0x68;

    /** Amount added to window values at or above {@link #gapThreshold}. */
    static final int gapOffset = 0xAC00;

    /** Values between reservedStart and {@link #fixedThreshold} are reserved. */
    static final int reservedStart = 0xA8;

    /** Use the table of predefined fixed offsets for values from fixedThreshold. */
    static final int fixedThreshold = 0xF9;

    /**
     * Table of fixed predefined offsets. Entry <i>i</i> corresponds to the
     * byte value {@code fixedThreshold + i}, as annotated on each entry.
     */
    static final int[] fixedOffset =
    {
        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
        /* 0xFA */ 0x0250, // IPA extensions
        /* 0xFB */ 0x0370, // Greek
        /* 0xFC */ 0x0530, // Armenian
        /* 0xFD */ 0x3040, // Hiragana
        /* 0xFE */ 0x30A0, // Katakana
        /* 0xFF */ 0xFF60  // Halfwidth Katakana
    };

    /**
     * Tells whether a character is compressible, i.e. lies outside the
     * range 0x3400 (inclusive) to 0xE000 (exclusive).
     *
     * @param ch the character to test
     * @return {@code true} if {@code ch < 0x3400} or {@code ch >= 0xE000}
     */
    public static boolean isCompressible(char ch)
    {
        return (ch < 0x3400 || ch >= 0xE000);
    }

    /**
     * Restores the initial state: every dynamic window offset goes back to
     * its default and window 0 becomes the active window.
     * <p>
     * Reset is only needed to bail out after an exception and restart with
     * new input.
     */
    public void reset()
    {
        // reset the dynamic windows
        System.arraycopy(initialDynamicOffset, 0,
                         dynamicOffset, 0, dynamicOffset.length);
        this.iWindow = 0;
    }
}
|