You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TernaryTree.java 19KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634
  1. /*
  2. * $Id$
  3. * Copyright (C) 2001 The Apache Software Foundation. All rights reserved.
  4. * For details on use and redistribution please refer to the
  5. * LICENSE file included with these sources.
  6. */
  7. package org.apache.fop.layout.hyphenation;
  8. import java.util.Enumeration;
  9. import java.util.Stack;
  10. import java.io.Serializable;
  11. /**
  12. * <h2>Ternary Search Tree</h2>
  13. *
  14. * <p>A ternary search tree is a hibrid between a binary tree and
  15. * a digital search tree (trie). Keys are limited to strings.
  16. * A data value of type char is stored in each leaf node.
  17. * It can be used as an index (or pointer) to the data.
  18. * Branches that only contain one key are compressed to one node
  19. * by storing a pointer to the trailer substring of the key.
  20. * This class is intended to serve as base class or helper class
  21. * to implement Dictionary collections or the like. Ternary trees
  22. * have some nice properties as the following: the tree can be
  23. * traversed in sorted order, partial matches (wildcard) can be
  24. * implemented, retrieval of all keys within a given distance
  25. * from the target, etc. The storage requirements are higher than
  26. * a binary tree but a lot less than a trie. Performance is
  27. * comparable with a hash table, sometimes it outperforms a hash
  28. * function (most of the time can determine a miss faster than a hash).</p>
  29. *
  30. * <p>The main purpose of this java port is to serve as a base for
  31. * implementing TeX's hyphenation algorithm (see The TeXBook,
  32. * appendix H). Each language requires from 5000 to 15000 hyphenation
  33. * patterns which will be keys in this tree. The strings patterns
  34. * are usually small (from 2 to 5 characters), but each char in the
  35. * tree is stored in a node. Thus memory usage is the main concern.
  36. * We will sacrify 'elegance' to keep memory requirenments to the
  37. * minimum. Using java's char type as pointer (yes, I know pointer
  38. * it is a forbidden word in java) we can keep the size of the node
  39. * to be just 8 bytes (3 pointers and the data char). This gives
  40. * room for about 65000 nodes. In my tests the english patterns
  41. * took 7694 nodes and the german patterns 10055 nodes,
  42. * so I think we are safe.</p>
  43. *
  44. * <p>All said, this is a map with strings as keys and char as value.
  45. * Pretty limited!. It can be extended to a general map by
  46. * using the string representation of an object and using the
  47. * char value as an index to an array that contains the object
  48. * values.</p>
  49. *
  50. * @author cav@uniscope.co.jp
  51. */
  52. public class TernaryTree implements Cloneable, Serializable {
  53. /**
  54. * We use 4 arrays to represent a node. I guess I should have created
  55. * a proper node class, but somehow Knuth's pascal code made me forget
  56. * we now have a portable language with virtual memory management and
  57. * automatic garbage collection! And now is kind of late, furthermore,
  58. * if it ain't broken, don't fix it.
  59. */
  60. /**
  61. * Pointer to low branch and to rest of the key when it is
  62. * stored directly in this node, we don't have unions in java!
  63. */
  64. protected char[] lo;
  65. /**
  66. * Pointer to high branch.
  67. */
  68. protected char[] hi;
  69. /**
  70. * Pointer to equal branch and to data when this node is a string terminator.
  71. */
  72. protected char[] eq;
  73. /**
  74. * <P>The character stored in this node: splitchar
  75. * Two special values are reserved:</P>
  76. * <ul><li>0x0000 as string terminator</li>
  77. * <li>0xFFFF to indicate that the branch starting at
  78. * this node is compressed</li></ul>
  79. * <p>This shouldn't be a problem if we give the usual semantics to
  80. * strings since 0xFFFF is garanteed not to be an Unicode character.</p>
  81. */
  82. protected char[] sc;
  83. /**
  84. * This vector holds the trailing of the keys when the branch is compressed.
  85. */
  86. protected CharVector kv;
  87. protected char root;
  88. protected char freenode;
  89. protected int length; // number of items in tree
  90. protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
  91. TernaryTree() {
  92. init();
  93. }
  94. protected void init() {
  95. root = 0;
  96. freenode = 1;
  97. length = 0;
  98. lo = new char[BLOCK_SIZE];
  99. hi = new char[BLOCK_SIZE];
  100. eq = new char[BLOCK_SIZE];
  101. sc = new char[BLOCK_SIZE];
  102. kv = new CharVector();
  103. }
  104. /**
  105. * Branches are initially compressed, needing
  106. * one node per key plus the size of the string
  107. * key. They are decompressed as needed when
  108. * another key with same prefix
  109. * is inserted. This saves a lot of space,
  110. * specially for long keys.
  111. */
  112. public void insert(String key, char val) {
  113. // make sure we have enough room in the arrays
  114. int len = key.length()
  115. + 1; // maximum number of nodes that may be generated
  116. if (freenode + len > eq.length)
  117. redimNodeArrays(eq.length + BLOCK_SIZE);
  118. char strkey[] = new char[len--];
  119. key.getChars(0, len, strkey, 0);
  120. strkey[len] = 0;
  121. root = insert(root, strkey, 0, val);
  122. }
  123. public void insert(char[] key, int start, char val) {
  124. int len = strlen(key) + 1;
  125. if (freenode + len > eq.length)
  126. redimNodeArrays(eq.length + BLOCK_SIZE);
  127. root = insert(root, key, start, val);
  128. }
  129. /**
  130. * The actual insertion function, recursive version.
  131. */
  132. private char insert(char p, char[] key, int start, char val) {
  133. int len = strlen(key, start);
  134. if (p == 0) {
  135. // this means there is no branch, this node will start a new branch.
  136. // Instead of doing that, we store the key somewhere else and create
  137. // only one node with a pointer to the key
  138. p = freenode++;
  139. eq[p] = val; // holds data
  140. length++;
  141. hi[p] = 0;
  142. if (len > 0) {
  143. sc[p] = 0xFFFF; // indicates branch is compressed
  144. lo[p] = (char)kv.alloc(len
  145. + 1); // use 'lo' to hold pointer to key
  146. strcpy(kv.getArray(), lo[p], key, start);
  147. } else {
  148. sc[p] = 0;
  149. lo[p] = 0;
  150. }
  151. return p;
  152. }
  153. if (sc[p] == 0xFFFF) {
  154. // branch is compressed: need to decompress
  155. // this will generate garbage in the external key array
  156. // but we can do some garbage collection later
  157. char pp = freenode++;
  158. lo[pp] = lo[p]; // previous pointer to key
  159. eq[pp] = eq[p]; // previous pointer to data
  160. lo[p] = 0;
  161. if (len > 0) {
  162. sc[p] = kv.get(lo[pp]);
  163. eq[p] = pp;
  164. lo[pp]++;
  165. if (kv.get(lo[pp]) == 0) {
  166. // key completly decompressed leaving garbage in key array
  167. lo[pp] = 0;
  168. sc[pp] = 0;
  169. hi[pp] = 0;
  170. } else
  171. sc[pp] =
  172. 0xFFFF; // we only got first char of key, rest is still there
  173. } else {
  174. // In this case we can save a node by swapping the new node
  175. // with the compressed node
  176. sc[pp] = 0xFFFF;
  177. hi[p] = pp;
  178. sc[p] = 0;
  179. eq[p] = val;
  180. length++;
  181. return p;
  182. }
  183. }
  184. char s = key[start];
  185. if (s < sc[p])
  186. lo[p] = insert(lo[p], key, start, val);
  187. else if (s == sc[p]) {
  188. if (s != 0)
  189. eq[p] = insert(eq[p], key, start + 1, val);
  190. else {
  191. // key already in tree, overwrite data
  192. eq[p] = val;
  193. }
  194. } else
  195. hi[p] = insert(hi[p], key, start, val);
  196. return p;
  197. }
  198. /**
  199. * Compares 2 null terminated char arrays
  200. */
  201. public static int strcmp(char[] a, int startA, char[] b, int startB) {
  202. for (; a[startA] == b[startB]; startA++, startB++)
  203. if (a[startA] == 0)
  204. return 0;
  205. return a[startA] - b[startB];
  206. }
  207. /**
  208. * Compares a string with null terminated char array
  209. */
  210. public static int strcmp(String str, char[] a, int start) {
  211. int i, d, len = str.length();
  212. for (i = 0; i < len; i++) {
  213. d = (int)str.charAt(i) - a[start + i];
  214. if (d != 0)
  215. return d;
  216. if (a[start + i] == 0)
  217. return d;
  218. }
  219. if (a[start + i] != 0)
  220. return (int)-a[start + i];
  221. return 0;
  222. }
  223. public static void strcpy(char[] dst, int di, char[] src, int si) {
  224. while (src[si] != 0)
  225. dst[di++] = src[si++];
  226. dst[di] = 0;
  227. }
  228. public static int strlen(char[] a, int start) {
  229. int len = 0;
  230. for (int i = start; i < a.length && a[i] != 0; i++)
  231. len++;
  232. return len;
  233. }
  234. public static int strlen(char[] a) {
  235. return strlen(a, 0);
  236. }
  237. public int find(String key) {
  238. int len = key.length();
  239. char strkey[] = new char[len + 1];
  240. key.getChars(0, len, strkey, 0);
  241. strkey[len] = 0;
  242. return find(strkey, 0);
  243. }
  244. public int find(char[] key, int start) {
  245. int d;
  246. char p = root;
  247. int i = start;
  248. char c;
  249. while (p != 0) {
  250. if (sc[p] == 0xFFFF) {
  251. if (strcmp(key, i, kv.getArray(), lo[p]) == 0)
  252. return eq[p];
  253. else
  254. return -1;
  255. }
  256. c = key[i];
  257. d = c - sc[p];
  258. if (d == 0) {
  259. if (c == 0)
  260. return eq[p];
  261. i++;
  262. p = eq[p];
  263. } else if (d < 0)
  264. p = lo[p];
  265. else
  266. p = hi[p];
  267. }
  268. return -1;
  269. }
  270. public boolean knows(String key) {
  271. return (find(key) >= 0);
  272. }
  273. // redimension the arrays
  274. private void redimNodeArrays(int newsize) {
  275. int len = newsize < lo.length ? newsize : lo.length;
  276. char[] na = new char[newsize];
  277. System.arraycopy(lo, 0, na, 0, len);
  278. lo = na;
  279. na = new char[newsize];
  280. System.arraycopy(hi, 0, na, 0, len);
  281. hi = na;
  282. na = new char[newsize];
  283. System.arraycopy(eq, 0, na, 0, len);
  284. eq = na;
  285. na = new char[newsize];
  286. System.arraycopy(sc, 0, na, 0, len);
  287. sc = na;
  288. }
  289. public int size() {
  290. return length;
  291. }
  292. public Object clone() {
  293. TernaryTree t = new TernaryTree();
  294. t.lo = (char[])this.lo.clone();
  295. t.hi = (char[])this.hi.clone();
  296. t.eq = (char[])this.eq.clone();
  297. t.sc = (char[])this.sc.clone();
  298. t.kv = (CharVector)this.kv.clone();
  299. t.root = this.root;
  300. t.freenode = this.freenode;
  301. t.length = this.length;
  302. return t;
  303. }
  304. /**
  305. * Recursively insert the median first and then the median of the
  306. * lower and upper halves, and so on in order to get a balanced
  307. * tree. The array of keys is assumed to be sorted in ascending
  308. * order.
  309. */
  310. protected void insertBalanced(String[] k, char[] v, int offset, int n) {
  311. int m;
  312. if (n < 1)
  313. return;
  314. m = n >> 1;
  315. insert(k[m + offset], v[m + offset]);
  316. insertBalanced(k, v, offset, m);
  317. insertBalanced(k, v, offset + m + 1, n - m - 1);
  318. }
  319. /**
  320. * Balance the tree for best search performance
  321. */
  322. public void balance() {
  323. // System.out.print("Before root splitchar = "); System.out.println(sc[root]);
  324. int i = 0, n = length;
  325. String[] k = new String[n];
  326. char[] v = new char[n];
  327. Iterator iter = new Iterator();
  328. while (iter.hasMoreElements()) {
  329. v[i] = iter.getValue();
  330. k[i++] = (String)iter.nextElement();
  331. }
  332. init();
  333. insertBalanced(k, v, 0, n);
  334. // With uniform letter distribution sc[root] should be around 'm'
  335. // System.out.print("After root splitchar = "); System.out.println(sc[root]);
  336. }
  337. /**
  338. * Each node stores a character (splitchar) which is part of
  339. * some key(s). In a compressed branch (one that only contain
  340. * a single string key) the trailer of the key which is not
  341. * already in nodes is stored externally in the kv array.
  342. * As items are inserted, key substrings decrease.
  343. * Some substrings may completely disappear when the whole
  344. * branch is totally decompressed.
  345. * The tree is traversed to find the key substrings actually
  346. * used. In addition, duplicate substrings are removed using
  347. * a map (implemented with a TernaryTree!).
  348. *
  349. */
  350. public void trimToSize() {
  351. // first balance the tree for best performance
  352. balance();
  353. // redimension the node arrays
  354. redimNodeArrays(freenode);
  355. // ok, compact kv array
  356. CharVector kx = new CharVector();
  357. kx.alloc(1);
  358. TernaryTree map = new TernaryTree();
  359. compact(kx, map, root);
  360. kv = kx;
  361. kv.trimToSize();
  362. }
  363. private void compact(CharVector kx, TernaryTree map, char p) {
  364. int k;
  365. if (p == 0)
  366. return;
  367. if (sc[p] == 0xFFFF) {
  368. k = map.find(kv.getArray(), lo[p]);
  369. if (k < 0) {
  370. k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
  371. strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
  372. map.insert(kx.getArray(), k, (char)k);
  373. }
  374. lo[p] = (char)k;
  375. } else {
  376. compact(kx, map, lo[p]);
  377. if (sc[p] != 0)
  378. compact(kx, map, eq[p]);
  379. compact(kx, map, hi[p]);
  380. }
  381. }
  382. public Enumeration keys() {
  383. return new Iterator();
  384. }
  385. public class Iterator implements Enumeration {
  386. /**
  387. * current node index
  388. */
  389. int cur;
  390. /**
  391. * current key
  392. */
  393. String curkey;
  394. private class Item implements Cloneable {
  395. char parent;
  396. char child;
  397. public Item() {
  398. parent = 0;
  399. child = 0;
  400. }
  401. public Item(char p, char c) {
  402. parent = p;
  403. child = c;
  404. }
  405. public Object clone() {
  406. return new Item(parent, child);
  407. }
  408. }
  409. /**
  410. * Node stack
  411. */
  412. Stack ns;
  413. /**
  414. * key stack implemented with a StringBuffer
  415. */
  416. StringBuffer ks;
  417. public Iterator() {
  418. cur = -1;
  419. ns = new Stack();
  420. ks = new StringBuffer();
  421. rewind();
  422. }
  423. public void rewind() {
  424. ns.removeAllElements();
  425. ks.setLength(0);
  426. cur = root;
  427. run();
  428. }
  429. public Object nextElement() {
  430. String res = new String(curkey);
  431. cur = up();
  432. run();
  433. return res;
  434. }
  435. public char getValue() {
  436. if (cur >= 0)
  437. return eq[cur];
  438. return 0;
  439. }
  440. public boolean hasMoreElements() {
  441. return (cur != -1);
  442. }
  443. /**
  444. * traverse upwards
  445. */
  446. private int up() {
  447. Item i = new Item();
  448. int res = 0;
  449. if (ns.empty())
  450. return -1;
  451. if (cur != 0 && sc[cur] == 0)
  452. return lo[cur];
  453. boolean climb = true;
  454. while (climb) {
  455. i = (Item)ns.pop();
  456. i.child++;
  457. switch (i.child) {
  458. case 1:
  459. if (sc[i.parent] != 0) {
  460. res = eq[i.parent];
  461. ns.push(i.clone());
  462. ks.append(sc[i.parent]);
  463. } else {
  464. i.child++;
  465. ns.push(i.clone());
  466. res = hi[i.parent];
  467. }
  468. climb = false;
  469. break;
  470. case 2:
  471. res = hi[i.parent];
  472. ns.push(i.clone());
  473. if (ks.length() > 0)
  474. ks.setLength(ks.length() - 1); // pop
  475. climb = false;
  476. break;
  477. default:
  478. if (ns.empty())
  479. return -1;
  480. climb = true;
  481. break;
  482. }
  483. }
  484. return res;
  485. }
  486. /**
  487. * traverse the tree to find next key
  488. */
  489. private int run() {
  490. if (cur == -1)
  491. return -1;
  492. boolean leaf = false;
  493. for (; ; ) {
  494. // first go down on low branch until leaf or compressed branch
  495. while (cur != 0) {
  496. if (sc[cur] == 0xFFFF) {
  497. leaf = true;
  498. break;
  499. }
  500. ns.push(new Item((char)cur, '\u0000'));
  501. if (sc[cur] == 0) {
  502. leaf = true;
  503. break;
  504. }
  505. cur = lo[cur];
  506. }
  507. if (leaf)
  508. break;
  509. // nothing found, go up one node and try again
  510. cur = up();
  511. if (cur == -1) {
  512. return -1;
  513. }
  514. }
  515. // The current node should be a data node and
  516. // the key should be in the key stack (at least partially)
  517. StringBuffer buf = new StringBuffer(ks.toString());
  518. if (sc[cur] == 0xFFFF) {
  519. int p = lo[cur];
  520. while (kv.get(p) != 0)
  521. buf.append(kv.get(p++));
  522. }
  523. curkey = buf.toString();
  524. return 0;
  525. }
  526. }
  527. public void printStats() {
  528. System.out.println("Number of keys = " + Integer.toString(length));
  529. System.out.println("Node count = " + Integer.toString(freenode));
  530. // System.out.println("Array length = " + Integer.toString(eq.length));
  531. System.out.println("Key Array length = "
  532. + Integer.toString(kv.length()));
  533. /*
  534. * for(int i=0; i<kv.length(); i++)
  535. * if ( kv.get(i) != 0 )
  536. * System.out.print(kv.get(i));
  537. * else
  538. * System.out.println("");
  539. * System.out.println("Keys:");
  540. * for(Enumeration enum = keys(); enum.hasMoreElements(); )
  541. * System.out.println(enum.nextElement());
  542. */
  543. }
  544. public static void main(String[] args) throws Exception {
  545. TernaryTree tt = new TernaryTree();
  546. tt.insert("Carlos", 'C');
  547. tt.insert("Car", 'r');
  548. tt.insert("palos", 'l');
  549. tt.insert("pa", 'p');
  550. tt.trimToSize();
  551. System.out.println((char)tt.find("Car"));
  552. System.out.println((char)tt.find("Carlos"));
  553. System.out.println((char)tt.find("alto"));
  554. tt.printStats();
  555. }
  556. }