選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

Expand.java 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. package com.healthmarketscience.jackcess.scsu;
/*
 * This sample software accompanies Unicode Technical Report #6 and
 * distributed as is by Unicode, Inc., subject to the following:
 *
 * Copyright © 1996-1998 Unicode, Inc.. All Rights Reserved.
 *
 * Permission to use, copy, modify, and distribute this software
 * without fee is hereby granted provided that this copyright notice
 * appears in all copies.
 *
 * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
 * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
 * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
 * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
 * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
 * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
 *
 * @author Asmus Freytag
 *
 * @version 001 Dec 25 1996
 * @version 002 Jun 25 1997
 * @version 003 Jul 25 1997
 * @version 004 Aug 25 1997
 * @version 005 Sep 30 1998
 *
 * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
 * and are registered in some jurisdictions.
 **/
  32. /**
  33. Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
  34. <H2>Notes on the Java implementation</H2>
  35. A limitation of Java is the exclusive use of a signed byte data type.
  36. The following work arounds are required:
  37. Copying a byte to an integer variable and adding 256 for 'negative'
  38. bytes gives an integer in the range 0-255.
  39. Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
  40. char values is unsigned.
  41. Extended characters require an int to store them. The sign is not an
  42. issue because only 1024*1024 + 65536 extended characters exist.
  43. **/
  44. public class Expand extends SCSU
  45. {
  46. /** (re-)define (and select) a dynamic window
  47. A sliding window position cannot start at any Unicode value,
  48. so rather than providing an absolute offset, this function takes
  49. an index value which selects among the possible starting values.
  50. Most scripts in Unicode start on or near a half-block boundary
  51. so the default behaviour is to multiply the index by 0x80. Han,
  52. Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
  53. show very poor locality--therefore no sliding window can be set
  54. there. A jumpOffset is added to the index value to skip that region,
  55. and only 167 index values total are required to select all eligible
  56. half-blocks.
  57. Finally, a few scripts straddle half block boundaries. For them, a
  58. table of fixed offsets is used, and the index values from 0xF9 to
  59. 0xFF are used to select these special offsets.
  60. After (re-)defining a windows location it is selected so it is ready
  61. for use.
  62. Recall that all Windows are of the same length (128 code positions).
  63. @param iWindow - index of the window to be (re-)defined
  64. @param bOffset - index for the new offset value
  65. **/
  66. // @005 protected <-- private here and elsewhere
  67. protected void defineWindow(int iWindow, byte bOffset)
  68. throws IllegalInputException
  69. {
  70. int iOffset = (bOffset < 0 ? bOffset + 256 : bOffset);
  71. // 0 is a reserved value
  72. if (iOffset == 0)
  73. {
  74. throw new IllegalInputException();
  75. }
  76. else if (iOffset < gapThreshold)
  77. {
  78. dynamicOffset[iWindow] = iOffset << 7;
  79. }
  80. else if (iOffset < reservedStart)
  81. {
  82. dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
  83. }
  84. else if (iOffset < fixedThreshold)
  85. {
  86. // more reserved values
  87. throw new IllegalInputException("iOffset == "+iOffset);
  88. }
  89. else
  90. {
  91. dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold];
  92. }
  93. // make the redefined window the active one
  94. selectWindow(iWindow);
  95. }
  96. /** (re-)define (and select) a window as an extended dynamic window
  97. The surrogate area in Unicode allows access to 2**20 codes beyond the
  98. first 64K codes by combining one of 1024 characters from the High
  99. Surrogate Area with one of 1024 characters from the Low Surrogate
  100. Area (see Unicode 2.0 for the details).
  101. The tags SDX and UDX set the window such that each subsequent byte in
  102. the range 80 to FF represents a surrogate pair. The following diagram
  103. shows how the bits in the two bytes following the SDX or UDX, and a
  104. subsequent data byte, map onto the bits in the resulting surrogate pair.
  105. hbyte lbyte data
  106. nnnwwwww zzzzzyyy 1xxxxxxx
  107. high-surrogate low-surrogate
  108. 110110wwwwwzzzzz 110111yyyxxxxxxx
  109. @param chOffset - Since the three top bits of chOffset are not needed to
  110. set the location of the extended Window, they are used instead
  111. to select the window, thereby reducing the number of needed command codes.
  112. The bottom 13 bits of chOffset are used to calculate the offset relative to
  113. a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
  114. **/
  115. protected void defineExtendedWindow(char chOffset)
  116. {
  117. // The top 3 bits of iOffsetHi are the window index
  118. int iWindow = chOffset >>> 13;
  119. // Calculate the new offset
  120. dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);
  121. // make the redefined window the active one
  122. selectWindow(iWindow);
  123. }
  124. /** string buffer length used by the following functions */
  125. protected int iOut = 0;
  126. /** input cursor used by the following functions */
  127. protected int iIn = 0;
  128. /** expand input that is in Unicode mode
  129. @param in input byte array to be expanded
  130. @param iCur starting index
  131. @param sb string buffer to which to append expanded input
  132. @return the index for the lastc byte processed
  133. **/
  134. protected int expandUnicode(byte []in, int iCur, StringBuffer sb)
  135. throws IllegalInputException, EndOfInputException
  136. {
  137. for( ; iCur < in.length-1; iCur+=2 ) // step by 2:
  138. {
  139. byte b = in[iCur];
  140. if (b >= UC0 && b <= UC7)
  141. {
  142. Debug.out("SelectWindow: ", b);
  143. selectWindow(b - UC0);
  144. return iCur;
  145. }
  146. else if (b >= UD0 && b <= UD7)
  147. {
  148. defineWindow( b - UD0, in[iCur+1]);
  149. return iCur + 1;
  150. }
  151. else if (b == UDX)
  152. {
  153. if( iCur >= in.length - 2)
  154. {
  155. break; // buffer error
  156. }
  157. defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2]));
  158. return iCur + 2;
  159. }
  160. else if (b == UQU)
  161. {
  162. if( iCur >= in.length - 2)
  163. {
  164. break; // error
  165. }
  166. // Skip command byte and output Unicode character
  167. iCur++;
  168. }
  169. // output a Unicode character
  170. char ch = charFromTwoBytes(in[iCur], in[iCur+1]);
  171. sb.append((char)ch);
  172. iOut++;
  173. }
  174. if( iCur == in.length)
  175. {
  176. return iCur;
  177. }
  178. // Error condition
  179. throw new EndOfInputException();
  180. }
  181. /** assemble a char from two bytes
  182. In Java bytes are signed quantities, while chars are unsigned
  183. @return the character
  184. @param hi most significant byte
  185. @param lo least significant byte
  186. */
  187. public static char charFromTwoBytes(byte hi, byte lo)
  188. {
  189. char ch = (char)(lo >= 0 ? lo : 256 + lo);
  190. return (char)(ch + (char)((hi >= 0 ? hi : 256 + hi)<<8));
  191. }
  192. /** expand portion of the input that is in single byte mode **/
  193. protected String expandSingleByte(byte []in)
  194. throws IllegalInputException, EndOfInputException
  195. {
  196. /* Allocate the output buffer. Because of control codes, generally
  197. each byte of input results in fewer than one character of
  198. output. Using in.length as an intial allocation length should avoid
  199. the need to reallocate in mid-stream. The exception to this rule are
  200. surrogates. */
  201. StringBuffer sb = new StringBuffer(in.length);
  202. iOut = 0;
  203. // Loop until all input is exhausted or an error occurred
  204. int iCur;
  205. Loop:
  206. for( iCur = 0; iCur < in.length; iCur++ )
  207. {
  208. // DEBUG Debug.out("Expanding: ", iCur);
  209. // Default behaviour is that ASCII characters are passed through
  210. // (staticOffset[0] == 0) and characters with the high bit on are
  211. // offset by the current dynamic (or sliding) window (this.iWindow)
  212. int iStaticWindow = 0;
  213. int iDynamicWindow = getCurrentWindow();
  214. switch(in[iCur])
  215. {
  216. // Quote from a static Window
  217. case SQ0:
  218. case SQ1:
  219. case SQ2:
  220. case SQ3:
  221. case SQ4:
  222. case SQ5:
  223. case SQ6:
  224. case SQ7:
  225. Debug.out("SQn:", iStaticWindow);
  226. // skip the command byte and check for length
  227. if( iCur >= in.length - 1)
  228. {
  229. Debug.out("SQn missing argument: ", in, iCur);
  230. break Loop; // buffer length error
  231. }
  232. // Select window pair to quote from
  233. iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
  234. iCur ++;
  235. // FALL THROUGH
  236. default:
  237. // output as character
  238. if(in[iCur] >= 0)
  239. {
  240. // use static window
  241. int ch = in[iCur] + staticOffset[iStaticWindow];
  242. sb.append((char)ch);
  243. iOut++;
  244. }
  245. else
  246. {
  247. // use dynamic window
  248. int ch = (in[iCur] + 256); // adjust for signed bytes
  249. ch -= 0x80; // reduce to range 00..7F
  250. ch += dynamicOffset[iDynamicWindow];
  251. //DEBUG
  252. Debug.out("Dynamic: ", (char) ch);
  253. if (ch < 1<<16)
  254. {
  255. // in Unicode range, output directly
  256. sb.append((char)ch);
  257. iOut++;
  258. }
  259. else
  260. {
  261. // this is an extension character
  262. Debug.out("Extension character: ", ch);
  263. // compute and append the two surrogates:
  264. // translate from 10000..10FFFF to 0..FFFFF
  265. ch -= 0x10000;
  266. // high surrogate = top 10 bits added to D800
  267. sb.append((char)(0xD800 + (ch>>10)));
  268. iOut++;
  269. // low surrogate = bottom 10 bits added to DC00
  270. sb.append((char)(0xDC00 + (ch & ~0xFC00)));
  271. iOut++;
  272. }
  273. }
  274. break;
  275. // define a dynamic window as extended
  276. case SDX:
  277. iCur += 2;
  278. if( iCur >= in.length)
  279. {
  280. Debug.out("SDn missing argument: ", in, iCur -1);
  281. break Loop; // buffer length error
  282. }
  283. defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur]));
  284. break;
  285. // Position a dynamic Window
  286. case SD0:
  287. case SD1:
  288. case SD2:
  289. case SD3:
  290. case SD4:
  291. case SD5:
  292. case SD6:
  293. case SD7:
  294. iCur ++;
  295. if( iCur >= in.length)
  296. {
  297. Debug.out("SDn missing argument: ", in, iCur -1);
  298. break Loop; // buffer length error
  299. }
  300. defineWindow(in[iCur-1] - SD0, in[iCur]);
  301. break;
  302. // Select a new dynamic Window
  303. case SC0:
  304. case SC1:
  305. case SC2:
  306. case SC3:
  307. case SC4:
  308. case SC5:
  309. case SC6:
  310. case SC7:
  311. selectWindow(in[iCur] - SC0);
  312. break;
  313. case SCU:
  314. // switch to Unicode mode and continue parsing
  315. iCur = expandUnicode(in, iCur+1, sb);
  316. // DEBUG Debug.out("Expanded Unicode range until: ", iCur);
  317. break;
  318. case SQU:
  319. // directly extract one Unicode character
  320. iCur += 2;
  321. if( iCur >= in.length)
  322. {
  323. Debug.out("SQU missing argument: ", in, iCur - 2);
  324. break Loop; // buffer length error
  325. }
  326. else
  327. {
  328. char ch = charFromTwoBytes(in[iCur-1], in[iCur]);
  329. Debug.out("Quoted: ", ch);
  330. sb.append((char)ch);
  331. iOut++;
  332. }
  333. break;
  334. case Srs:
  335. throw new IllegalInputException();
  336. // break;
  337. }
  338. }
  339. if( iCur >= in.length)
  340. {
  341. //SUCCESS: all input used up
  342. sb.setLength(iOut);
  343. iIn = iCur;
  344. return sb.toString();
  345. }
  346. Debug.out("Length ==" + in.length+" iCur =", iCur);
  347. //ERROR: premature end of input
  348. throw new EndOfInputException();
  349. }
  350. /** expand a byte array containing compressed Unicode */
  351. public String expand (byte []in)
  352. throws IllegalInputException, EndOfInputException
  353. {
  354. String str = expandSingleByte(in);
  355. Debug.out("expand output: ", str.toCharArray());
  356. return str;
  357. }
  358. /** reset is called to start with new input, w/o creating a new
  359. instance */
  360. public void reset()
  361. {
  362. iOut = 0;
  363. iIn = 0;
  364. super.reset();
  365. }
  366. public int charsWritten()
  367. {
  368. return iOut;
  369. }
  370. public int bytesRead()
  371. {
  372. return iIn;
  373. }
  374. }