You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. package com.healthmarketscience.jackcess.scsu;
  2. import java.io.*;
  3. import java.util.*;
  4. /**
  5. * This sample software accompanies Unicode Technical Report #6 and
  6. * distributed as is by Unicode, Inc., subject to the following:
  7. *
  8. * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
  9. *
  10. * Permission to use, copy, modify, and distribute this software
  11. * without fee is hereby granted provided that this copyright notice
  12. * appears in all copies.
  13. *
  14. * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
  15. * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
  16. * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  18. * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
  19. * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
  20. * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
  21. * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  22. *
  23. * @author Asmus Freytag
  24. *
  25. * @version 001 Dec 25 1996
  26. * @version 002 Jun 25 1997
  27. * @version 003 Jul 25 1997
  28. * @version 004 Aug 25 1997
  29. * @version 005 Sep 30 1998
  30. *
  31. * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
  32. * and are registered in some jurisdictions.
  33. **/
  34. /**
  35. Class CompressMain
  36. A small commandline driver interface for the compression routines
  37. Use the /? to get usage
  38. */
  39. public class CompressMain
  40. {
  41. static void usage()
  42. {
  43. System.err.println("java CompressMain /? : this usage information\n");
  44. System.err.println("java CompressMain /random : random test\n");
  45. System.err.println("java CompressMain /suite : suite test\n");
  46. System.err.println("java CompressMain /suite <file> : file test (file data may include \\uXXXX)\n");
  47. System.err.println("java CompressMain <string> : string test (string may include \\uXXXX)\n");
  48. System.err.println("java CompressMain /roundtrip <file>: check Unicode file for roundtrip\n");
  49. System.err.println("java CompressMain /compress <file> : compresses Unicode files (no \\uXXXX)\n");
  50. System.err.println("java CompressMain /expand <file> : expands into Unicode files\n");
  51. System.err.println("java CompressMain /byteswap <files>: swaps byte order of Unicode files\n");
  52. System.err.println("java CompressMain /display <files> : like expand, but creates a dump instead\n");
  53. System.err.println("java CompressMain /parse <files> : parses \\uXXXX into binary Unicode\n");
  54. }
  55. static void analyze(String text, int inlength, String result, int outlength)
  56. {
  57. boolean fSuccess = text.equals(result);
  58. Debug.out(fSuccess ? "Round trip OK" : "Round trip FAILED");
  59. if (!fSuccess && result != null)
  60. {
  61. int iLim = Math.min(text.length(), result.length());
  62. for (int i = 0; i < iLim; i++)
  63. {
  64. if (text.charAt(i) != result.charAt(i))
  65. {
  66. Debug.out("First Mismatch at "+ i +"=", result.charAt(i) );
  67. Debug.out("Original character "+ i +"=", text.charAt(i) );
  68. break;
  69. }
  70. }
  71. }
  72. else
  73. {
  74. Debug.out("Compressed: "+inlength+" chars to "+outlength+" bytes.");
  75. Debug.out(" Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%.");
  76. }
  77. }
  78. static void test2(String text)
  79. {
  80. byte bytes[] = null;
  81. String result = null;
  82. Debug.out("SCSU:\n");
  83. Compress compressor = new Compress();
  84. try
  85. {
  86. bytes = compressor.compress(text);
  87. Expand display = new Expand();
  88. result = display.expand(bytes);
  89. Debug.out("Input: ", text.toCharArray());
  90. Debug.out("Result: ", result.toCharArray());
  91. Debug.out("");
  92. Expand expander = new Expand();
  93. result = expander.expand(bytes);
  94. }
  95. catch (Exception e)
  96. {
  97. System.out.println(e);
  98. }
  99. int inlength = compressor.charsRead();
  100. int outlength = compressor.bytesWritten();
  101. analyze(text, inlength, result, outlength);
  102. }
  103. static void test(String text) throws Exception
  104. {
  105. test(text, false);
  106. }
  107. static void test(String text, boolean shouldFail)
  108. throws Exception
  109. {
  110. // Create an instance of the compressor
  111. Compress compressor = new Compress();
  112. byte [] bytes = null;
  113. String result = null;
  114. Exception failure = null;
  115. try {
  116. // perform compression
  117. bytes = compressor.compress(text);
  118. }
  119. catch(Exception e)
  120. {
  121. failure = e;
  122. }
  123. if(shouldFail) {
  124. if(failure == null) {
  125. throw new RuntimeException("Did not fail");
  126. }
  127. return;
  128. }
  129. if(failure != null) {
  130. throw failure;
  131. }
  132. Expand expander = new Expand();
  133. // perform expansion
  134. result = expander.expand(bytes);
  135. // analyze the results
  136. int inlength = compressor.charsRead();
  137. int outlength = compressor.bytesWritten();
  138. analyze(text, inlength, result, outlength);
  139. }
  140. public static void display(byte [] input)
  141. {
  142. try
  143. {
  144. Expand expand = new Expand();
  145. String text = expand.expand(input);
  146. Debug.out(text.toCharArray());
  147. }
  148. catch (Exception e)
  149. {
  150. System.out.println(e);
  151. }
  152. }
  153. public static String parse(String input)
  154. {
  155. StringTokenizer st = new StringTokenizer(input, "\\", true);
  156. Debug.out("Input: ", input);
  157. StringBuffer sb = new StringBuffer();
  158. while(st.hasMoreTokens())
  159. {
  160. String token = st.nextToken();
  161. Debug.out("Token: ", token);
  162. if (token.charAt(0) == '\\' && token.length() == 1)
  163. {
  164. if(st.hasMoreTokens())
  165. {
  166. token = st.nextToken();
  167. }
  168. if(token.charAt(0) == 'u')
  169. {
  170. Debug.out("Token: "+ token+ " ", sb.toString());
  171. String hexnum;
  172. if (token.length() > 5)
  173. {
  174. hexnum = token.substring(1,5);
  175. token = token.substring(5);
  176. }
  177. else
  178. {
  179. hexnum = token.substring(1);
  180. token = "";
  181. }
  182. sb.append((char)Integer.parseInt(hexnum, 16));
  183. }
  184. }
  185. sb.append(token);
  186. }
  187. return sb.toString();
  188. }
  189. public static void randomTest(int nTest)
  190. throws Exception
  191. {
  192. Random random = new Random();
  193. for(int n=0; n < nTest; n++)
  194. {
  195. int iLen = (int) (20 * random.nextFloat());
  196. StringBuffer sb = new StringBuffer(iLen);
  197. for(int i = 0; i < iLen; i++)
  198. {
  199. sb.append((char) (0xFFFF * random.nextFloat()));
  200. }
  201. test(sb.toString());
  202. }
  203. }
  204. @SuppressWarnings("deprecation")
  205. public static void fileTest(String name)
  206. throws Exception
  207. {
  208. DataInputStream dis = new DataInputStream(new FileInputStream(name));
  209. int iLine = 0;
  210. while(dis.available() != 0)
  211. {
  212. String line = dis.readLine();
  213. Debug.out("Line "+ iLine++ +" "+line);
  214. test(parse(line), false ); //false);// initially no debug info
  215. }
  216. }
  217. public static void displayFile(String name)
  218. throws IOException
  219. {
  220. DataInputStream dis = new DataInputStream(new FileInputStream(name));
  221. byte bytes[] = new byte[dis.available()];
  222. dis.read(bytes);
  223. display(bytes);
  224. }
  225. public static void decodeTest(String name)
  226. throws IOException
  227. {
  228. DataInputStream dis = new DataInputStream(new FileInputStream(name));
  229. byte bytes[] = new byte[dis.available()];
  230. dis.read(bytes);
  231. Expand expand = new Expand();
  232. char [] chars = null;
  233. try
  234. {
  235. String text = expand.expand(bytes);
  236. chars = text.toCharArray();
  237. }
  238. catch (Exception e)
  239. {
  240. System.out.println(e);
  241. }
  242. int inlength = expand.bytesRead();
  243. int iDot = name.lastIndexOf('.');
  244. StringBuffer sb = new StringBuffer(name);
  245. sb.setLength(iDot + 1);
  246. sb.append("txt");
  247. String outName = sb.toString();
  248. int outlength = expand.charsWritten();
  249. Debug.out("Expanded "+name+": "+inlength+" bytes to "+outName+" " +outlength+" chars." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 200 / inlength))+"%.");
  250. if (chars == null)
  251. return;
  252. writeUnicodeFile(outName, chars);
  253. }
  254. /** most of the next 3 functions should not be needed by JDK11 and later */
  255. private static int iMSB = 1;
  256. public static String readUnicodeFile(String name)
  257. {
  258. try
  259. {
  260. FileInputStream dis = new FileInputStream(name);
  261. byte b[] = new byte[2];
  262. StringBuffer sb = new StringBuffer();
  263. char ch = 0;
  264. iMSB = 1;
  265. int i = 0;
  266. for(i = 0; (dis.available() != 0); i++)
  267. {
  268. b[i%2] = (byte) dis.read();
  269. if ((i & 1) == 1)
  270. {
  271. ch = Expand.charFromTwoBytes(b[(i + iMSB)%2], b[(i + iMSB + 1) % 2]);
  272. }
  273. else
  274. {
  275. continue;
  276. }
  277. if (i == 1 && ch == '\uFEFF')
  278. continue; // throw away byte order mark
  279. if (i == 1 && ch == '\uFFFE')
  280. {
  281. iMSB ++; // flip byte order
  282. continue; // throw away byte order mark
  283. }
  284. sb.append(ch);
  285. }
  286. return sb.toString();
  287. }
  288. catch (IOException e)
  289. {
  290. System.err.println(e);
  291. return "";
  292. }
  293. }
  294. public static void writeUnicodeFile(String outName, char [] chars)
  295. throws IOException
  296. {
  297. DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
  298. if ((iMSB & 1) == 1)
  299. {
  300. dos.writeByte(0xFF);
  301. dos.writeByte(0xFE);
  302. }
  303. else
  304. {
  305. dos.writeByte(0xFE);
  306. dos.writeByte(0xFF);
  307. }
  308. byte b[] = new byte[2];
  309. for (int ich = 0; ich < chars.length; ich++)
  310. {
  311. b[(iMSB + 0)%2] = (byte) (chars[ich] >>> 8);
  312. b[(iMSB + 1)%2] = (byte) (chars[ich] & 0xFF);
  313. dos.write(b, 0, 2);
  314. }
  315. }
  316. static void byteswap(String name)
  317. throws IOException
  318. {
  319. String text = readUnicodeFile(name);
  320. char chars[] = text.toCharArray();
  321. writeUnicodeFile(name, chars);
  322. }
  323. @SuppressWarnings("deprecation")
  324. public static void parseFile(String name)
  325. throws IOException
  326. {
  327. DataInputStream dis = new DataInputStream(new FileInputStream(name));
  328. byte bytes[] = new byte[dis.available()];
  329. dis.read(bytes);
  330. // simplistic test
  331. int bom = (char) bytes[0] + (char) bytes[1];
  332. if (bom == 131069)
  333. {
  334. // FEFF or FFFE detected (either one sums to 131069)
  335. Debug.out(name + " is already in Unicode!");
  336. return;
  337. }
  338. // definitely assumes an ASCII file at this point
  339. String text = new String(bytes, 0);
  340. char chars[] = parse(text).toCharArray();
  341. writeUnicodeFile(name, chars);
  342. return;
  343. }
  344. public static void encodeTest(String name)
  345. throws Exception
  346. {
  347. String text = readUnicodeFile(name);
  348. // Create an instance of the compressor
  349. Compress compressor = new Compress();
  350. byte [] bytes = null;
  351. // perform compression
  352. bytes = compressor.compress(text);
  353. int inlength = compressor.charsRead();
  354. int iDot = name.lastIndexOf('.');
  355. StringBuffer sb = new StringBuffer(name);
  356. sb.setLength(iDot + 1);
  357. sb.append("csu");
  358. String outName = sb.toString();
  359. DataOutputStream dos = new DataOutputStream(new FileOutputStream(outName));
  360. dos.write(bytes, 0, bytes.length);
  361. int outlength = compressor.bytesWritten();
  362. Debug.out("Compressed "+name+": "+inlength+" chars to "+outName+" " +outlength+" bytes." + " Ratio: "+(outlength == 0 ? 0 :(outlength * 50 / inlength))+"%.");
  363. }
  364. public static void roundtripTest(String name)
  365. throws Exception
  366. {
  367. test(readUnicodeFile(name), false);// no debug info
  368. }
  369. /** The Main function */
  370. public static void main(String args[])
  371. throws Exception
  372. {
  373. int iArg = args.length;
  374. try
  375. {
  376. if (iArg != 0)
  377. {
  378. if (args[0].equalsIgnoreCase("/compress"))
  379. {
  380. while (--iArg > 0)
  381. {
  382. encodeTest(args[args.length - iArg]);
  383. }
  384. }
  385. else if (args[0].equalsIgnoreCase("/parse"))
  386. {
  387. while (--iArg > 0)
  388. {
  389. parseFile(args[args.length - iArg]);
  390. }
  391. }
  392. else if (args[0].equalsIgnoreCase("/expand"))
  393. {
  394. while (--iArg > 0)
  395. {
  396. decodeTest(args[args.length - iArg]);
  397. }
  398. }
  399. else if (args[0].equalsIgnoreCase("/display"))
  400. {
  401. while (--iArg > 0)
  402. {
  403. displayFile(args[args.length - iArg]);
  404. }
  405. }
  406. else if (args[0].equalsIgnoreCase("/roundtrip"))
  407. {
  408. while (--iArg > 0)
  409. {
  410. roundtripTest(args[args.length - iArg]);
  411. }
  412. }
  413. else if (args[0].equalsIgnoreCase("/byteswap"))
  414. {
  415. while (--iArg > 0)
  416. {
  417. byteswap(args[args.length - iArg]);
  418. }
  419. }else if (args[0].equalsIgnoreCase("/random"))
  420. {
  421. randomTest(8);
  422. }
  423. else if (args[0].equalsIgnoreCase("/suite"))
  424. {
  425. if (iArg == 1)
  426. {
  427. suiteTest();
  428. }
  429. else
  430. {
  431. while (--iArg > 0)
  432. {
  433. fileTest(args[args.length - iArg]);
  434. }
  435. }
  436. }
  437. else if (args[0].equalsIgnoreCase("/?"))
  438. {
  439. usage();
  440. }
  441. else
  442. {
  443. while (iArg > 0)
  444. {
  445. test2(parse(args[--iArg]));
  446. }
  447. }
  448. }
  449. else
  450. {
  451. usage();
  452. }
  453. }
  454. catch (IOException e)
  455. {
  456. System.err.println(e);
  457. }
  458. try
  459. {
  460. System.err.println("Done. Press enter to exit");
  461. System.in.read();
  462. }
  463. catch (IOException e)
  464. {
  465. }
  466. }
  467. static void suiteTest()
  468. throws Exception
  469. {
  470. Debug.out("Standard Compression test suite:");
  471. test("Hello \u9292 \u9192 World!");
  472. test("Hell\u0429o \u9292 \u9192 W\u00e4rld!");
  473. test("Hell\u0429o \u9292 \u9292W\u00e4rld!");
  474. test("\u0648\u06c8"); // catch missing reset
  475. test("\u0648\u06c8");
  476. test("\u4444\uE001"); // lowest quotable
  477. test("\u4444\uf2FF"); // highest quotable
  478. test("\u4444\uf188\u4444");
  479. test("\u4444\uf188\uf288");
  480. test("\u4444\uf188abc\0429\uf288");
  481. test("\u9292\u2222");
  482. test("Hell\u0429\u04230o \u9292 \u9292W\u00e4\u0192rld!");
  483. test("Hell\u0429o \u9292 \u9292W\u00e4rld!");
  484. test("Hello World!123456");
  485. test("Hello W\u0081\u011f\u0082!"); // Latin 1 run
  486. test("abc\u0301\u0302"); // uses SQn for u301 u302
  487. test("abc\u4411d"); // uses SQU
  488. test("abc\u4411\u4412d");// uses SCU
  489. test("abc\u0401\u0402\u047f\u00a5\u0405"); // uses SQn for ua5
  490. test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000"); // SJIS like data
  491. test("\u9292\u2222");
  492. test("\u9191\u9191\u3041\u9191\u3041\u3041\u3000");
  493. test("\u9999\u3051\u300c\u9999\u9999\u3060\u9999\u3065\u3065\u3065\u300c");
  494. test("\u3000\u266a\u30ea\u30f3\u30b4\u53ef\u611b\u3044\u3084\u53ef\u611b\u3044\u3084\u30ea\u30f3\u30b4\u3002");
  495. test(""); // empty input
  496. test("\u0000"); // smallest BMP character
  497. test("\uFFFF"); // largest BMP character
  498. test("\ud800\udc00"); // smallest surrogate
  499. test("\ud8ff\udcff"); // largest surrogate pair
  500. Debug.out("\nTHESE TESTS ARE SUPPOSED TO FAIL:");
  501. test("\ud800 \udc00", true); // unpaired surrogate (1)
  502. test("\udc00", true); // unpaired surrogate (2)
  503. test("\ud800", true); // unpaired surrogate (3)
  504. }
  505. }