You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

Compress.java 21KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628
  1. package com.healthmarketscience.jackcess.scsu;
  2. /**
  3. * This sample software accompanies Unicode Technical Report #6 and
  4. * distributed as is by Unicode, Inc., subject to the following:
  5. *
  6. * Copyright 1996-1997 Unicode, Inc.. All Rights Reserved.
  7. *
  8. * Permission to use, copy, modify, and distribute this software
  9. * without fee is hereby granted provided that this copyright notice
  10. * appears in all copies.
  11. *
  12. * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
  13. * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
  14. * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
  15. * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  16. * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
  17. * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
  18. * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
  19. * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  20. *
  21. * @author Asmus Freytag
  22. *
  23. * @version 001 Dec 25 1996
  24. * @version 002 Jun 25 1997
  25. * @version 003 Jul 25 1997
  26. * @version 004 Aug 25 1997
  27. *
  28. * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
  29. * and are registered in some jurisdictions.
  30. **/
  31. /**
  32. This class implements a simple compression algorithm
  33. **/
  34. /*
  35. Note on exception handling
  36. This compressor is designed so that it can be restarted after
  37. an exception. All operations advancing input and/or output cursor
  38. (iIn and iOut) either complete an action, or set a state (fUnicodeMode)
  39. before updating the cursors.
  40. */
  41. public class Compress extends SCSU
  42. {
  /** index into the input array of the next input character to be read **/
  private int iIn;
  /** index into the output array of the next output byte to be written **/
  private int iOut;
  /** index in the output array of the SCU command byte that opened the
      pending Unicode run, or -1 when no such run is pending **/
  private int iSCU = -1;
  /** true if the next command byte is of the Uxx family, i.e. the output
      is currently in Unicode mode */
  private boolean fUnicodeMode = false;
  51. /** locate a window for a character given a table of offsets
  52. @param ch - character
  53. @param offsetTable - table of window offsets
  54. @return true if the character fits a window from the table of windows */
  55. private boolean locateWindow(int ch, int[] offsetTable)
  56. {
  57. // always try the current window first
  58. int iWin = getCurrentWindow();
  59. // if the character fits the current window
  60. // just use the current window
  61. if (iWin != - 1 && ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
  62. {
  63. return true;
  64. }
  65. // try all windows in order
  66. for (iWin = 0; iWin < offsetTable.length; iWin++)
  67. {
  68. if (ch >= offsetTable[iWin] && ch < offsetTable[iWin] + 0x80)
  69. {
  70. selectWindow(iWin);
  71. return true;
  72. }
  73. }
  74. // none found
  75. return false;
  76. }
  77. /** returns true if the character is ASCII, but not a control other than CR, LF and TAB */
  78. public static boolean isAsciiCrLfOrTab(int ch)
  79. {
  80. return (ch >= 0x20 && ch <= 0x7F) // ASCII
  81. || ch == 0x09 || ch == 0x0A || ch == 0x0D; // CR/LF or TAB
  82. }
  83. /** output a run of characters in single byte mode
  84. In single byte mode pass through characters in the ASCII range, but
  85. quote characters overlapping with compression command codes. Runs
  86. of characters fitting the current window are output as runs of bytes
  87. in the range 0x80-0xFF. Checks for and validates Surrogate Pairs.
  88. Uses and updates the current input and output cursors store in
  89. the instance variables <i>iIn</i> and <i>iOut</i>.
  90. @param in - input character array
  91. @param out - output byte array
  92. @return the next chaacter to be processed. This may be an extended character.
  93. **/
  94. @SuppressWarnings("fallthrough")
  95. public int outputSingleByteRun(char [] in, byte [] out)
  96. throws EndOfOutputException, EndOfInputException, IllegalInputException
  97. {
  98. int iWin = getCurrentWindow();
  99. while(iIn < in.length)
  100. {
  101. int outlen = 0;
  102. byte byte1 = 0;
  103. byte byte2 = 0;
  104. // get the input character
  105. int ch = in[iIn];
  106. int inlen = 1;
  107. // Check input for Surrogate pair
  108. if ( (ch & 0xF800) == 0xD800 )
  109. {
  110. if ( (ch & 0xFC00) == 0xDC00 )
  111. {
  112. // low surrogate out of order
  113. throw new IllegalInputException("Unpaired low surrogate: "+iIn);
  114. }
  115. else
  116. {
  117. // have high surrogate now get low surrogate
  118. if ( iIn >= in.length-1)
  119. {
  120. // premature end of input
  121. throw new EndOfInputException();
  122. }
  123. // get the char
  124. int ch2 = in[iIn+1];
  125. // make sure it's a low surrogate
  126. if ( (ch2 & 0xFC00) != 0xDC00 )
  127. {
  128. // a low surrogate was required
  129. throw new IllegalInputException("Unpaired high surrogate: "+(iIn+1));
  130. }
  131. // combine the two values
  132. ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
  133. // ch = ch<<10 + ch2 - 0x36F0000;
  134. inlen = 2;
  135. }
  136. }
  137. // ASCII Letter, NUL, CR, LF and TAB are always passed through
  138. if (isAsciiCrLfOrTab(ch) || ch == 0)
  139. {
  140. // pass through directcly
  141. byte2 = (byte)(ch & 0x7F);
  142. outlen = 1;
  143. }
  144. // All other control codes must be quoted
  145. else if (ch < 0x20)
  146. {
  147. byte1 = SQ0;
  148. byte2 = (byte)(ch);
  149. outlen = 2;
  150. }
  151. // Letters that fit the current dynamic window
  152. else if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
  153. {
  154. ch -= dynamicOffset[iWin];
  155. byte2 = (byte)(ch | 0x80);
  156. outlen = 1;
  157. }
  158. // check for room in the output array
  159. if (iOut + outlen >= out.length)
  160. {
  161. throw new EndOfOutputException();
  162. }
  163. switch(outlen)
  164. {
  165. default:
  166. // need to use some other compression mode for this
  167. // character so we terminate this loop
  168. return ch; // input not finished
  169. // output the characters
  170. case 2:
  171. out[iOut++] = byte1;
  172. // fall through
  173. case 1:
  174. out[iOut++] = byte2;
  175. break;
  176. }
  177. // advance input pointer
  178. iIn += inlen;
  179. }
  180. return 0; // input all used up
  181. }
  182. /** quote a single character in single byte mode
  183. Quoting a character (aka 'non-locking shift') gives efficient access
  184. to characters that occur in isolation--usually punctuation characters.
  185. When quoting a character from a dynamic window use 0x80 - 0xFF, when
  186. quoting a character from a static window use 0x00-0x7f.
  187. @param ch - character to be quoted
  188. @param out - output byte array
  189. **/
  190. private void quoteSingleByte(int ch, byte [] out)
  191. throws EndOfOutputException
  192. {
  193. Debug.out("Quoting SingleByte ", ch);
  194. int iWin = getCurrentWindow();
  195. // check for room in the output array
  196. if (iOut >= out.length -2)
  197. {
  198. throw new EndOfOutputException();
  199. }
  200. // Output command byte followed by
  201. out[iOut++] = (byte)(SQ0 + iWin);
  202. // Letter that fits the current dynamic window
  203. if (ch >= dynamicOffset[iWin] && ch < dynamicOffset[iWin] + 0x80)
  204. {
  205. ch -= dynamicOffset[iWin];
  206. out[iOut++] = (byte)(ch | 0x80);
  207. }
  208. // Letter that fits the current static window
  209. else if (ch >= staticOffset[iWin] && ch < staticOffset[iWin] + 0x80)
  210. {
  211. ch -= staticOffset[iWin];
  212. out[iOut++] = (byte)ch;
  213. }
  214. else
  215. {
  216. throw new IllegalStateException("ch = "+ch+" not valid in quoteSingleByte. Internal Compressor Error");
  217. }
  218. // advance input pointer
  219. iIn ++;
  220. Debug.out("New input: ", iIn);
  221. }
  222. /** output a run of characters in Unicode mode
  223. A run of Unicode mode consists of characters which are all in the
  224. range of non-compressible characters or isolated occurrence
  225. of any other characters. Characters in the range 0xE00-0xF2FF must
  226. be quoted to avoid overlap with the Unicode mode compression command codes.
  227. Uses and updates the current input and output cursors store in
  228. the instance variables <i>iIn</i> and <i>iOut</i>.
  229. NOTE: Characters from surrogate pairs are passed through and unlike single
  230. byte mode no checks are made for unpaired surrogate characters.
  231. @param in - input character array
  232. @param out - output byte array
  233. @return the next input character to be processed
  234. **/
  235. public char outputUnicodeRun(char [] in, byte [] out)
  236. throws EndOfOutputException
  237. {
  238. // current character
  239. char ch = 0;
  240. while(iIn < in.length)
  241. {
  242. // get current input and set default output length
  243. ch = in[iIn];
  244. int outlen = 2;
  245. // Characters in these ranges could potentially be compressed.
  246. // We require 2 or more compressible characters to break the run
  247. if (isCompressible(ch))
  248. {
  249. // check whether we can look ahead
  250. if( iIn < in.length - 1)
  251. {
  252. // DEBUG
  253. Debug.out("is-comp: ",ch);
  254. char ch2 = in[iIn + 1];
  255. if (isCompressible(ch2))
  256. {
  257. // at least 2 characters are compressible
  258. // break the run
  259. break;
  260. }
  261. //DEBUG
  262. Debug.out("no-comp: ",ch2);
  263. }
  264. // If we get here, the current character is only character
  265. // left in the input or it is followed by a non-compressible
  266. // character. In neither case do we gain by breaking the
  267. // run, so we proceed to output the character.
  268. if (ch >= 0xE000 && ch <= 0xF2FF)
  269. {
  270. // Characters in this range need to be escaped
  271. outlen = 3;
  272. }
  273. }
  274. // check that there is enough room to output the character
  275. if(iOut >= out.length - outlen)
  276. {
  277. // DEBUG
  278. Debug.out("End of Output @", iOut);
  279. // if we got here, we ran out of space in the output array
  280. throw new EndOfOutputException();
  281. }
  282. // output any characters that cannot be compressed,
  283. if (outlen == 3)
  284. {
  285. // output the quote character
  286. out[iOut++] = UQU;
  287. }
  288. // pass the Unicode character in MSB,LSB order
  289. out[iOut++] = (byte)(ch >>> 8);
  290. out[iOut++] = (byte)(ch & 0xFF);
  291. // advance input cursor
  292. iIn++;
  293. }
  294. // return the last character
  295. return ch;
  296. }
  297. static int iNextWindow = 3;
  298. /** redefine a window so it surrounds a given character value
  299. For now, this function uses window 3 exclusively (window 4
  300. for extended windows);
  301. @return true if a window was successfully defined
  302. @param ch - character around which window is positioned
  303. @param out - output byte array
  304. @param fCurUnicodeMode - type of window
  305. **/
  306. private boolean positionWindow(int ch, byte [] out, boolean fCurUnicodeMode)
  307. throws IllegalInputException, EndOfOutputException
  308. {
  309. int iWin = iNextWindow % 8; // simple LRU
  310. int iPosition = 0;
  311. // iPosition 0 is a reserved value
  312. if (ch < 0x80)
  313. {
  314. throw new IllegalStateException("ch < 0x80");
  315. //return false;
  316. }
  317. // Check the fixed offsets
  318. for (int i = 0; i < fixedOffset.length; i++)
  319. {
  320. if (ch >= fixedOffset[i] && ch < fixedOffset[i] + 0x80)
  321. {
  322. iPosition = i;
  323. break;
  324. }
  325. }
  326. if (iPosition != 0)
  327. {
  328. // DEBUG
  329. Debug.out("FIXED position is ", iPosition + 0xF9);
  330. // ch fits in a fixed offset window position
  331. dynamicOffset[iWin] = fixedOffset[iPosition];
  332. iPosition += 0xF9;
  333. }
  334. else if (ch < 0x3400)
  335. {
  336. // calculate a window position command and set the offset
  337. iPosition = ch >>> 7;
  338. dynamicOffset[iWin] = ch & 0xFF80;
  339. Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
  340. }
  341. else if (ch < 0xE000)
  342. {
  343. // attempt to place a window where none can go
  344. return false;
  345. }
  346. else if (ch <= 0xFFFF)
  347. {
  348. // calculate a window position command, accounting
  349. // for the gap in position values, and set the offset
  350. iPosition = ((ch - gapOffset)>>> 7);
  351. dynamicOffset[iWin] = ch & 0xFF80;
  352. Debug.out("Offset="+dynamicOffset[iWin]+", iPosition="+iPosition+" for char", ch);
  353. }
  354. else
  355. {
  356. // if we get here, the character is in the extended range.
  357. // Always use Window 4 to define an extended window
  358. iPosition = (ch - 0x10000) >>> 7;
  359. // DEBUG
  360. Debug.out("Try position Window at ", iPosition);
  361. iPosition |= iWin << 13;
  362. dynamicOffset[iWin] = ch & 0x1FFF80;
  363. }
  364. // Outputting window defintion command for the general cases
  365. if ( iPosition < 0x100 && iOut < out.length-1)
  366. {
  367. out[iOut++] = (byte) ((fCurUnicodeMode ? UD0 : SD0) + iWin);
  368. out[iOut++] = (byte) (iPosition & 0xFF);
  369. }
  370. // Output an extended window definiton command
  371. else if ( iPosition >= 0x100 && iOut < out.length - 2)
  372. {
  373. Debug.out("Setting extended window at ", iPosition);
  374. out[iOut++] = (fCurUnicodeMode ? UDX : SDX);
  375. out[iOut++] = (byte) ((iPosition >>> 8) & 0xFF);
  376. out[iOut++] = (byte) (iPosition & 0xFF);
  377. }
  378. else
  379. {
  380. throw new EndOfOutputException();
  381. }
  382. selectWindow(iWin);
  383. iNextWindow++;
  384. return true;
  385. }
  386. /**
  387. compress a Unicode character array with some simplifying assumptions
  388. **/
  389. public int simpleCompress(char [] in, int iStartIn, byte[] out, int iStartOut)
  390. throws IllegalInputException, EndOfInputException, EndOfOutputException
  391. {
  392. iIn = iStartIn;
  393. iOut = iStartOut;
  394. while (iIn < in.length)
  395. {
  396. int ch;
  397. // previously we switched to a Unicode run
  398. if (iSCU != -1)
  399. {
  400. Debug.out("Remaining", in, iIn);
  401. Debug.out("Output until ["+iOut+"]: ", out);
  402. // output characters as Unicode
  403. ch = outputUnicodeRun(in, out);
  404. // for single character Unicode runs (3 bytes) use quote
  405. if (iOut - iSCU == 3 )
  406. {
  407. // go back and fix up the SCU to an SQU instead
  408. out[iSCU] = SQU;
  409. iSCU = -1;
  410. continue;
  411. }
  412. else
  413. {
  414. iSCU = -1;
  415. fUnicodeMode = true;
  416. }
  417. }
  418. // next, try to output characters as single byte run
  419. else
  420. {
  421. ch = outputSingleByteRun(in, out);
  422. }
  423. // check whether we still have input
  424. if (iIn == in.length)
  425. {
  426. break; // no more input
  427. }
  428. // if we get here, we have a consistent value for ch, whether or
  429. // not it is an regular or extended character. Locate or define a
  430. // Window for the current character
  431. Debug.out("Output so far: ", out);
  432. Debug.out("Routing ch="+ch+" for Input", in, iIn);
  433. // Check that we have enough room to output the command byte
  434. if (iOut >= out.length - 1)
  435. {
  436. throw new EndOfOutputException();
  437. }
  438. // In order to switch away from Unicode mode, it is necessary
  439. // to select (or define) a window. If the characters that follow
  440. // the Unicode range are ASCII characters, we can't use them
  441. // to decide which window to select, since ASCII characters don't
  442. // influence window settings. This loop looks ahead until it finds
  443. // one compressible character that isn't in the ASCII range.
  444. for (int ich = iIn; ch < 0x80; ich++)
  445. {
  446. if (ich == in.length || !isCompressible(in[ich]))
  447. {
  448. // if there are only ASCII characters left,
  449. ch = in[iIn];
  450. break;
  451. }
  452. ch = in[ich]; // lookahead for next non-ASCII char
  453. }
  454. // The character value contained in ch here will only be used to select
  455. // output modes. Actual output of characters starts with in[iIn] and
  456. // only takes place near the top of the loop.
  457. int iprevWindow = getCurrentWindow();
  458. // try to locate a dynamic window
  459. if (ch < 0x80 || locateWindow(ch, dynamicOffset))
  460. {
  461. Debug.out("located dynamic window "+getCurrentWindow()+" at ", iOut+1);
  462. // lookahead to use SQn instead of SCn for single
  463. // character interruptions of runs in current window
  464. if(!fUnicodeMode && iIn < in.length -1)
  465. {
  466. char ch2 = in[iIn+1];
  467. if (ch2 >= dynamicOffset[iprevWindow] &&
  468. ch2 < dynamicOffset[iprevWindow] + 0x80)
  469. {
  470. quoteSingleByte(ch, out);
  471. selectWindow(iprevWindow);
  472. continue;
  473. }
  474. }
  475. out[iOut++] = (byte)((fUnicodeMode ? UC0 : SC0) + getCurrentWindow());
  476. fUnicodeMode = false;
  477. }
  478. // try to locate a static window
  479. else if (!fUnicodeMode && locateWindow(ch, staticOffset))
  480. {
  481. // static windows are not accessible from Unicode mode
  482. Debug.out("located a static window", getCurrentWindow());
  483. quoteSingleByte(ch, out);
  484. selectWindow(iprevWindow); // restore current Window settings
  485. continue;
  486. }
  487. // try to define a window around ch
  488. else if (positionWindow(ch, out, fUnicodeMode) )
  489. {
  490. fUnicodeMode = false;
  491. }
  492. // If all else fails, start a Unicode run
  493. else
  494. {
  495. iSCU = iOut;
  496. out[iOut++] = SCU;
  497. continue;
  498. }
  499. }
  500. return iOut - iStartOut;
  501. }
  502. public byte[] compress(String inStr)
  503. throws IllegalInputException, EndOfInputException
  504. {
  505. // Running out of room for output can cause non-optimal
  506. // compression. In order to not slow down compression too
  507. // much, not all intermediate state is constantly saved.
  508. byte [] out = new byte[inStr.length() * 2];
  509. char [] in = inStr.toCharArray();
  510. //DEBUG
  511. Debug.out("compress input: ",in);
  512. reset();
  513. while(true)
  514. {
  515. try
  516. {
  517. simpleCompress(in, charsRead(), out, bytesWritten());
  518. // if we get here things went fine.
  519. break;
  520. }
  521. catch (EndOfOutputException e)
  522. {
  523. // create a larger output buffer and continue
  524. byte [] largerOut = new byte[out.length * 2];
  525. System.arraycopy(out, 0, largerOut, 0, out.length);
  526. out = largerOut;
  527. }
  528. }
  529. byte [] trimmedOut = new byte[bytesWritten()];
  530. System.arraycopy(out, 0, trimmedOut, 0, trimmedOut.length);
  531. out = trimmedOut;
  532. Debug.out("compress output: ", out);
  533. return out;
  534. }
  535. /** reset is only needed to bail out after an exception and
  536. restart with new input */
  537. @Override
  538. public void reset()
  539. {
  540. super.reset();
  541. fUnicodeMode = false;
  542. iSCU = - 1;
  543. }
  544. /** returns the number of bytes written **/
  545. public int bytesWritten()
  546. {
  547. return iOut;
  548. }
  549. /** returns the number of bytes written **/
  550. public int charsRead()
  551. {
  552. return iIn;
  553. }
  554. }