Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

SCSU.java 9.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. package com.healthmarketscience.jackcess.scsu;
  2. /*
  3. * This sample software accompanies Unicode Technical Report #6 and
  4. * distributed as is by Unicode, Inc., subject to the following:
  5. *
  6. * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
  7. *
  8. * Permission to use, copy, modify, and distribute this software
  9. * without fee is hereby granted provided that this copyright notice
  10. * appears in all copies.
  11. *
  12. * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
  13. * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
  14. * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
  15. * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  16. * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
  17. * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
  18. * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
  19. * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  20. *
  21. * @author Asmus Freytag
  22. *
  23. * @version 001 Dec 25 1996
  24. * @version 002 Jun 25 1997
  25. * @version 003 Jul 25 1997
  26. * @version 004 Aug 25 1997
  27. * @version 005 Sep 30 1998
  28. *
  29. * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
  30. * and are registered in some jurisdictions.
  31. **/
  32. /**
  33. Encoding text data in Unicode often requires more storage than using
  34. an existing 8-bit character set and limited to the subset of characters
  35. actually found in the text. The Unicode Compression Algorithm reduces
  36. the necessary storage while retaining the universality of Unicode.
  37. A full description of the algorithm can be found in document
  38. http://www.unicode.org/unicode/reports/tr6.html
  39. Summary
  40. The goal of the Unicode Compression Algorithm is the ability to
  41. * Express all code points in Unicode
  42. * Approximate storage size for traditional character sets
  43. * Work well for short strings
  44. * Provide transparency for Latin-1 data
  45. * Support very simple decoders
  46. * Support simple as well as sophisticated encoders
  47. If needed, further compression can be achieved by layering standard
  48. file or disk-block based compression algorithms on top.
  49. <H2>Features</H2>
  50. Languages using small alphabets would contain runs of characters that
  51. are coded close together in Unicode. These runs are interrupted only
  52. by punctuation characters, which are themselves coded in proximity to
  53. each other in Unicode (usually in the ASCII range).
  54. Two basic mechanisms in the compression algorithm account for these two
  55. cases, sliding windows and static windows. A window is an area of 128
  56. consecutive characters in Unicode. In the compressed data stream, each
  57. character from a sliding window would be represented as a byte between
  58. 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
  59. TAB) would always mean an ASCII character (or control).
  60. <H2>Notes on the Java implementation</H2>
  61. A limitation of Java is the exclusive use of a signed byte data type.
  62. The following workarounds are required:
  63. Copying a byte to an integer variable and adding 256 for 'negative'
  64. bytes gives an integer in the range 0-255.
  65. Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
  66. char values is unsigned.
  67. Extended characters require an int to store them. The sign is not an
  68. issue because only 1024*1024 + 65536 extended characters exist.
  69. **/
  public abstract class SCSU
  {
      // ------------------------------------------------------------------
      // Single Byte mode command values
      // ------------------------------------------------------------------

      /**
       * SQ<i>n</i>: Quote from Window.
       * <p>
       * If the following byte is less than 0x80, quote from
       * static window <i>n</i>, else quote from dynamic window <i>n</i>.
       */
      static final byte SQ0 = 0x01; // Quote from window pair 0
      static final byte SQ1 = 0x02; // Quote from window pair 1
      static final byte SQ2 = 0x03; // Quote from window pair 2
      static final byte SQ3 = 0x04; // Quote from window pair 3
      static final byte SQ4 = 0x05; // Quote from window pair 4
      static final byte SQ5 = 0x06; // Quote from window pair 5
      static final byte SQ6 = 0x07; // Quote from window pair 6
      static final byte SQ7 = 0x08; // Quote from window pair 7

      static final byte SDX = 0x0B; // Define a window as extended
      static final byte Srs = 0x0C; // reserved

      static final byte SQU = 0x0E; // Quote a single Unicode character
      static final byte SCU = 0x0F; // Change to Unicode mode

      /**
       * SC<i>n</i>: Change to Window <i>n</i>.
       * <p>
       * If the following bytes are less than 0x80, interpret them
       * as command bytes or pass them through, else add the offset
       * for dynamic window <i>n</i>.
       */
      static final byte SC0 = 0x10; // Select window 0
      static final byte SC1 = 0x11; // Select window 1
      static final byte SC2 = 0x12; // Select window 2
      static final byte SC3 = 0x13; // Select window 3
      static final byte SC4 = 0x14; // Select window 4
      static final byte SC5 = 0x15; // Select window 5
      static final byte SC6 = 0x16; // Select window 6
      static final byte SC7 = 0x17; // Select window 7

      static final byte SD0 = 0x18; // Define and select window 0
      static final byte SD1 = 0x19; // Define and select window 1
      static final byte SD2 = 0x1A; // Define and select window 2
      static final byte SD3 = 0x1B; // Define and select window 3
      static final byte SD4 = 0x1C; // Define and select window 4
      static final byte SD5 = 0x1D; // Define and select window 5
      static final byte SD6 = 0x1E; // Define and select window 6
      static final byte SD7 = 0x1F; // Define and select window 7

      // ------------------------------------------------------------------
      // U-prefixed command values. They lie above 0x7F, hence the explicit
      // (byte) casts: Java bytes are signed.
      // ------------------------------------------------------------------

      static final byte UC0 = (byte) 0xE0; // Select window 0
      static final byte UC1 = (byte) 0xE1; // Select window 1
      static final byte UC2 = (byte) 0xE2; // Select window 2
      static final byte UC3 = (byte) 0xE3; // Select window 3
      static final byte UC4 = (byte) 0xE4; // Select window 4
      static final byte UC5 = (byte) 0xE5; // Select window 5
      static final byte UC6 = (byte) 0xE6; // Select window 6
      static final byte UC7 = (byte) 0xE7; // Select window 7

      static final byte UD0 = (byte) 0xE8; // Define and select window 0
      static final byte UD1 = (byte) 0xE9; // Define and select window 1
      static final byte UD2 = (byte) 0xEA; // Define and select window 2
      static final byte UD3 = (byte) 0xEB; // Define and select window 3
      static final byte UD4 = (byte) 0xEC; // Define and select window 4
      static final byte UD5 = (byte) 0xED; // Define and select window 5
      static final byte UD6 = (byte) 0xEE; // Define and select window 6
      static final byte UD7 = (byte) 0xEF; // Define and select window 7

      static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
      static final byte UDX = (byte) 0xF1; // Define a Window as extended
      static final byte Urs = (byte) 0xF2; // reserved

      /** Constant offsets for the 8 static windows. */
      static final int[] staticOffset =
      {
          0x0000, // ASCII for quoted tags
          0x0080, // Latin - 1 Supplement (for access to punctuation)
          0x0100, // Latin Extended-A
          0x0300, // Combining Diacritical Marks
          0x2000, // General Punctuation
          0x2080, // Currency Symbols
          0x2100, // Letterlike Symbols and Number Forms
          0x3000  // CJK Symbols and punctuation
      };

      /** Initial offsets for the 8 dynamic (sliding) windows. */
      static final int[] initialDynamicOffset =
      {
          0x0080, // Latin-1
          0x00C0, // Latin Extended A   //@005 fixed from 0x0100
          0x0400, // Cyrillic
          0x0600, // Arabic
          0x0900, // Devanagari
          0x3040, // Hiragana
          0x30A0, // Katakana
          0xFF00  // Fullwidth ASCII
      };

      /**
       * Dynamic window offsets, initialized to the default values.
       * <p>
       * This is a per-instance copy of {@link #initialDynamicOffset}, so
       * writing to it never modifies the shared table of defaults.
       */
      int[] dynamicOffset = initialDynamicOffset.clone();

      // The following methods are common to encoder and decoder

      /** Index of the currently active dynamic window. */
      private int iWindow = 0;

      /**
       * Select the active dynamic window.
       *
       * @param iWindow index of the window to make active; the value is
       *                stored as given, without range checking
       */
      protected void selectWindow(int iWindow)
      {
          this.iWindow = iWindow;
      }

      /**
       * Return the active dynamic window.
       *
       * @return the index most recently passed to {@link #selectWindow(int)},
       *         or 0 if no window has been selected since construction or
       *         the last {@link #reset()}
       */
      protected int getCurrentWindow()
      {
          return this.iWindow;
      }

      // ------------------------------------------------------------------
      // These values are used in defineWindow
      // ------------------------------------------------------------------

      /**
       * Unicode code points from 3400 to E000 are not addressable by
       * dynamic window, since in these areas no short run alphabets are
       * found. Therefore add gapOffset to all values from gapThreshold.
       */
      static final int gapThreshold = 0x68;

      /** Amount added to all values from {@link #gapThreshold}. */
      static final int gapOffset = 0xAC00;

      /** Values between reservedStart and fixedThreshold are reserved. */
      static final int reservedStart = 0xA8;

      /** Use table of predefined fixed offsets for values from fixedThreshold. */
      static final int fixedThreshold = 0xF9;

      /**
       * Table of fixed predefined offsets. Each entry is annotated with the
       * byte value that indexes into it; the first entry corresponds to
       * {@link #fixedThreshold}.
       */
      static final int[] fixedOffset =
      {
          /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
          /* 0xFA */ 0x0250, // IPA extensions
          /* 0xFB */ 0x0370, // Greek
          /* 0xFC */ 0x0530, // Armenian
          /* 0xFD */ 0x3040, // Hiragana
          /* 0xFE */ 0x30A0, // Katakana
          /* 0xFF */ 0xFF60  // Halfwidth Katakana
      };

      /**
       * Whether a character is compressible.
       *
       * @param ch the character to test
       * @return {@code true} if {@code ch} is below 0x3400 or at/above 0xE000,
       *         {@code false} for anything in the range 0x3400..0xDFFF
       */
      public static boolean isCompressible(char ch)
      {
          return (ch < 0x3400 || ch >= 0xE000);
      }

      /**
       * Restore the initial state: every dynamic window offset goes back to
       * its default and window 0 becomes the active window.
       * <p>
       * reset is only needed to bail out after an exception and
       * restart with new input.
       */
      public void reset()
      {
          // reset the dynamic windows
          System.arraycopy(initialDynamicOffset, 0,
                           dynamicOffset, 0, dynamicOffset.length);
          this.iWindow = 0;
      }
  }