aboutsummaryrefslogtreecommitdiffstats
path: root/src/java/org/apache/fop/hyphenation/TernaryTree.java
diff options
context:
space:
mode:
authorGlen Mazza <gmazza@apache.org>2004-09-06 18:03:12 +0000
committerGlen Mazza <gmazza@apache.org>2004-09-06 18:03:12 +0000
commit04e3d089d6faef74065f9de7fdbec0862f97669b (patch)
treeaaec09825d8cea2eac0dd86ee239f1a52be71b47 /src/java/org/apache/fop/hyphenation/TernaryTree.java
parenta0831593665d120fcd6da96b0c33795e21531818 (diff)
downloadxmlgraphics-fop-04e3d089d6faef74065f9de7fdbec0862f97669b.tar.gz
xmlgraphics-fop-04e3d089d6faef74065f9de7fdbec0862f97669b.zip
PR:
Obtained from: Submitted by: Reviewed by: Moved hyphenation package to org.apache.fop.hyphenation git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@197909 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/java/org/apache/fop/hyphenation/TernaryTree.java')
-rw-r--r--src/java/org/apache/fop/hyphenation/TernaryTree.java669
1 files changed, 669 insertions, 0 deletions
diff --git a/src/java/org/apache/fop/hyphenation/TernaryTree.java b/src/java/org/apache/fop/hyphenation/TernaryTree.java
new file mode 100644
index 000000000..d026dc858
--- /dev/null
+++ b/src/java/org/apache/fop/hyphenation/TernaryTree.java
@@ -0,0 +1,669 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id$ */
+
+package org.apache.fop.hyphenation;
+
+import java.util.Enumeration;
+import java.util.Stack;
+import java.io.Serializable;
+
+/**
+ * <h2>Ternary Search Tree.</h2>
+ *
+ * <p>A ternary search tree is a hibrid between a binary tree and
+ * a digital search tree (trie). Keys are limited to strings.
+ * A data value of type char is stored in each leaf node.
+ * It can be used as an index (or pointer) to the data.
+ * Branches that only contain one key are compressed to one node
+ * by storing a pointer to the trailer substring of the key.
+ * This class is intended to serve as base class or helper class
+ * to implement Dictionary collections or the like. Ternary trees
+ * have some nice properties as the following: the tree can be
+ * traversed in sorted order, partial matches (wildcard) can be
+ * implemented, retrieval of all keys within a given distance
+ * from the target, etc. The storage requirements are higher than
+ * a binary tree but a lot less than a trie. Performance is
+ * comparable with a hash table, sometimes it outperforms a hash
+ * function (most of the time can determine a miss faster than a hash).</p>
+ *
+ * <p>The main purpose of this java port is to serve as a base for
+ * implementing TeX's hyphenation algorithm (see The TeXBook,
+ * appendix H). Each language requires from 5000 to 15000 hyphenation
+ * patterns which will be keys in this tree. The strings patterns
+ * are usually small (from 2 to 5 characters), but each char in the
+ * tree is stored in a node. Thus memory usage is the main concern.
+ * We will sacrify 'elegance' to keep memory requirenments to the
+ * minimum. Using java's char type as pointer (yes, I know pointer
+ * it is a forbidden word in java) we can keep the size of the node
+ * to be just 8 bytes (3 pointers and the data char). This gives
+ * room for about 65000 nodes. In my tests the english patterns
+ * took 7694 nodes and the german patterns 10055 nodes,
+ * so I think we are safe.</p>
+ *
+ * <p>All said, this is a map with strings as keys and char as value.
+ * Pretty limited!. It can be extended to a general map by
+ * using the string representation of an object and using the
+ * char value as an index to an array that contains the object
+ * values.</p>
+ *
+ * @author cav@uniscope.co.jp
+ */
+
+public class TernaryTree implements Cloneable, Serializable {
+
+ /**
+ * We use 4 arrays to represent a node. I guess I should have created
+ * a proper node class, but somehow Knuth's pascal code made me forget
+ * we now have a portable language with virtual memory management and
+ * automatic garbage collection! And now is kind of late, furthermore,
+ * if it ain't broken, don't fix it.
+ */
+
+ /**
+ * Pointer to low branch and to rest of the key when it is
+ * stored directly in this node, we don't have unions in java!
+ */
+ protected char[] lo;
+
+ /**
+ * Pointer to high branch.
+ */
+ protected char[] hi;
+
+ /**
+ * Pointer to equal branch and to data when this node is a string terminator.
+ */
+ protected char[] eq;
+
+ /**
+ * <P>The character stored in this node: splitchar.
+ * Two special values are reserved:</P>
+ * <ul><li>0x0000 as string terminator</li>
+ * <li>0xFFFF to indicate that the branch starting at
+ * this node is compressed</li></ul>
+ * <p>This shouldn't be a problem if we give the usual semantics to
+ * strings since 0xFFFF is garanteed not to be an Unicode character.</p>
+ */
+ protected char[] sc;
+
+ /**
+ * This vector holds the trailing of the keys when the branch is compressed.
+ */
+ protected CharVector kv;
+
+ protected char root;
+ protected char freenode;
+ protected int length; // number of items in tree
+
+ protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
+
+ TernaryTree() {
+ init();
+ }
+
+ protected void init() {
+ root = 0;
+ freenode = 1;
+ length = 0;
+ lo = new char[BLOCK_SIZE];
+ hi = new char[BLOCK_SIZE];
+ eq = new char[BLOCK_SIZE];
+ sc = new char[BLOCK_SIZE];
+ kv = new CharVector();
+ }
+
+ /**
+ * Branches are initially compressed, needing
+ * one node per key plus the size of the string
+ * key. They are decompressed as needed when
+ * another key with same prefix
+ * is inserted. This saves a lot of space,
+ * specially for long keys.
+ */
+ public void insert(String key, char val) {
+ // make sure we have enough room in the arrays
+ int len = key.length()
+ + 1; // maximum number of nodes that may be generated
+ if (freenode + len > eq.length) {
+ redimNodeArrays(eq.length + BLOCK_SIZE);
+ }
+ char strkey[] = new char[len--];
+ key.getChars(0, len, strkey, 0);
+ strkey[len] = 0;
+ root = insert(root, strkey, 0, val);
+ }
+
+ public void insert(char[] key, int start, char val) {
+ int len = strlen(key) + 1;
+ if (freenode + len > eq.length) {
+ redimNodeArrays(eq.length + BLOCK_SIZE);
+ }
+ root = insert(root, key, start, val);
+ }
+
+ /**
+ * The actual insertion function, recursive version.
+ */
+ private char insert(char p, char[] key, int start, char val) {
+ int len = strlen(key, start);
+ if (p == 0) {
+ // this means there is no branch, this node will start a new branch.
+ // Instead of doing that, we store the key somewhere else and create
+ // only one node with a pointer to the key
+ p = freenode++;
+ eq[p] = val; // holds data
+ length++;
+ hi[p] = 0;
+ if (len > 0) {
+ sc[p] = 0xFFFF; // indicates branch is compressed
+ lo[p] = (char)kv.alloc(len
+ + 1); // use 'lo' to hold pointer to key
+ strcpy(kv.getArray(), lo[p], key, start);
+ } else {
+ sc[p] = 0;
+ lo[p] = 0;
+ }
+ return p;
+ }
+
+ if (sc[p] == 0xFFFF) {
+ // branch is compressed: need to decompress
+ // this will generate garbage in the external key array
+ // but we can do some garbage collection later
+ char pp = freenode++;
+ lo[pp] = lo[p]; // previous pointer to key
+ eq[pp] = eq[p]; // previous pointer to data
+ lo[p] = 0;
+ if (len > 0) {
+ sc[p] = kv.get(lo[pp]);
+ eq[p] = pp;
+ lo[pp]++;
+ if (kv.get(lo[pp]) == 0) {
+ // key completly decompressed leaving garbage in key array
+ lo[pp] = 0;
+ sc[pp] = 0;
+ hi[pp] = 0;
+ } else {
+ // we only got first char of key, rest is still there
+ sc[pp] = 0xFFFF;
+ }
+ } else {
+ // In this case we can save a node by swapping the new node
+ // with the compressed node
+ sc[pp] = 0xFFFF;
+ hi[p] = pp;
+ sc[p] = 0;
+ eq[p] = val;
+ length++;
+ return p;
+ }
+ }
+ char s = key[start];
+ if (s < sc[p]) {
+ lo[p] = insert(lo[p], key, start, val);
+ } else if (s == sc[p]) {
+ if (s != 0) {
+ eq[p] = insert(eq[p], key, start + 1, val);
+ } else {
+ // key already in tree, overwrite data
+ eq[p] = val;
+ }
+ } else {
+ hi[p] = insert(hi[p], key, start, val);
+ }
+ return p;
+ }
+
+ /**
+ * Compares 2 null terminated char arrays
+ */
+ public static int strcmp(char[] a, int startA, char[] b, int startB) {
+ for (; a[startA] == b[startB]; startA++, startB++) {
+ if (a[startA] == 0) {
+ return 0;
+ }
+ }
+ return a[startA] - b[startB];
+ }
+
+ /**
+ * Compares a string with null terminated char array
+ */
+ public static int strcmp(String str, char[] a, int start) {
+ int i, d, len = str.length();
+ for (i = 0; i < len; i++) {
+ d = (int)str.charAt(i) - a[start + i];
+ if (d != 0) {
+ return d;
+ }
+ if (a[start + i] == 0) {
+ return d;
+ }
+ }
+ if (a[start + i] != 0) {
+ return (int)-a[start + i];
+ }
+ return 0;
+
+ }
+
+ public static void strcpy(char[] dst, int di, char[] src, int si) {
+ while (src[si] != 0) {
+ dst[di++] = src[si++];
+ }
+ dst[di] = 0;
+ }
+
+ public static int strlen(char[] a, int start) {
+ int len = 0;
+ for (int i = start; i < a.length && a[i] != 0; i++) {
+ len++;
+ }
+ return len;
+ }
+
+ public static int strlen(char[] a) {
+ return strlen(a, 0);
+ }
+
+ public int find(String key) {
+ int len = key.length();
+ char strkey[] = new char[len + 1];
+ key.getChars(0, len, strkey, 0);
+ strkey[len] = 0;
+
+ return find(strkey, 0);
+ }
+
+ public int find(char[] key, int start) {
+ int d;
+ char p = root;
+ int i = start;
+ char c;
+
+ while (p != 0) {
+ if (sc[p] == 0xFFFF) {
+ if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
+ return eq[p];
+ } else {
+ return -1;
+ }
+ }
+ c = key[i];
+ d = c - sc[p];
+ if (d == 0) {
+ if (c == 0) {
+ return eq[p];
+ }
+ i++;
+ p = eq[p];
+ } else if (d < 0) {
+ p = lo[p];
+ } else {
+ p = hi[p];
+ }
+ }
+ return -1;
+ }
+
+ public boolean knows(String key) {
+ return (find(key) >= 0);
+ }
+
+ // redimension the arrays
+ private void redimNodeArrays(int newsize) {
+ int len = newsize < lo.length ? newsize : lo.length;
+ char[] na = new char[newsize];
+ System.arraycopy(lo, 0, na, 0, len);
+ lo = na;
+ na = new char[newsize];
+ System.arraycopy(hi, 0, na, 0, len);
+ hi = na;
+ na = new char[newsize];
+ System.arraycopy(eq, 0, na, 0, len);
+ eq = na;
+ na = new char[newsize];
+ System.arraycopy(sc, 0, na, 0, len);
+ sc = na;
+ }
+
+ public int size() {
+ return length;
+ }
+
+ public Object clone() {
+ TernaryTree t = new TernaryTree();
+ t.lo = (char[])this.lo.clone();
+ t.hi = (char[])this.hi.clone();
+ t.eq = (char[])this.eq.clone();
+ t.sc = (char[])this.sc.clone();
+ t.kv = (CharVector)this.kv.clone();
+ t.root = this.root;
+ t.freenode = this.freenode;
+ t.length = this.length;
+
+ return t;
+ }
+
+ /**
+ * Recursively insert the median first and then the median of the
+ * lower and upper halves, and so on in order to get a balanced
+ * tree. The array of keys is assumed to be sorted in ascending
+ * order.
+ */
+ protected void insertBalanced(String[] k, char[] v, int offset, int n) {
+ int m;
+ if (n < 1) {
+ return;
+ }
+ m = n >> 1;
+
+ insert(k[m + offset], v[m + offset]);
+ insertBalanced(k, v, offset, m);
+
+ insertBalanced(k, v, offset + m + 1, n - m - 1);
+ }
+
+
+ /**
+ * Balance the tree for best search performance
+ */
+ public void balance() {
+ // System.out.print("Before root splitchar = "); System.out.println(sc[root]);
+
+ int i = 0, n = length;
+ String[] k = new String[n];
+ char[] v = new char[n];
+ Iterator iter = new Iterator();
+ while (iter.hasMoreElements()) {
+ v[i] = iter.getValue();
+ k[i++] = (String)iter.nextElement();
+ }
+ init();
+ insertBalanced(k, v, 0, n);
+
+ // With uniform letter distribution sc[root] should be around 'm'
+ // System.out.print("After root splitchar = "); System.out.println(sc[root]);
+ }
+
+ /**
+ * Each node stores a character (splitchar) which is part of
+ * some key(s). In a compressed branch (one that only contain
+ * a single string key) the trailer of the key which is not
+ * already in nodes is stored externally in the kv array.
+ * As items are inserted, key substrings decrease.
+ * Some substrings may completely disappear when the whole
+ * branch is totally decompressed.
+ * The tree is traversed to find the key substrings actually
+ * used. In addition, duplicate substrings are removed using
+ * a map (implemented with a TernaryTree!).
+ *
+ */
+ public void trimToSize() {
+ // first balance the tree for best performance
+ balance();
+
+ // redimension the node arrays
+ redimNodeArrays(freenode);
+
+ // ok, compact kv array
+ CharVector kx = new CharVector();
+ kx.alloc(1);
+ TernaryTree map = new TernaryTree();
+ compact(kx, map, root);
+ kv = kx;
+ kv.trimToSize();
+ }
+
+ private void compact(CharVector kx, TernaryTree map, char p) {
+ int k;
+ if (p == 0) {
+ return;
+ }
+ if (sc[p] == 0xFFFF) {
+ k = map.find(kv.getArray(), lo[p]);
+ if (k < 0) {
+ k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
+ strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
+ map.insert(kx.getArray(), k, (char)k);
+ }
+ lo[p] = (char)k;
+ } else {
+ compact(kx, map, lo[p]);
+ if (sc[p] != 0) {
+ compact(kx, map, eq[p]);
+ }
+ compact(kx, map, hi[p]);
+ }
+ }
+
+
+ public Enumeration keys() {
+ return new Iterator();
+ }
+
+ public class Iterator implements Enumeration {
+
+ /**
+ * current node index
+ */
+ int cur;
+
+ /**
+ * current key
+ */
+ String curkey;
+
+ private class Item implements Cloneable {
+ char parent;
+ char child;
+
+ public Item() {
+ parent = 0;
+ child = 0;
+ }
+
+ public Item(char p, char c) {
+ parent = p;
+ child = c;
+ }
+
+ public Object clone() {
+ return new Item(parent, child);
+ }
+
+ }
+
+ /**
+ * Node stack
+ */
+ Stack ns;
+
+ /**
+ * key stack implemented with a StringBuffer
+ */
+ StringBuffer ks;
+
+ public Iterator() {
+ cur = -1;
+ ns = new Stack();
+ ks = new StringBuffer();
+ rewind();
+ }
+
+ public void rewind() {
+ ns.removeAllElements();
+ ks.setLength(0);
+ cur = root;
+ run();
+ }
+
+ public Object nextElement() {
+ String res = new String(curkey);
+ cur = up();
+ run();
+ return res;
+ }
+
+ public char getValue() {
+ if (cur >= 0) {
+ return eq[cur];
+ }
+ return 0;
+ }
+
+ public boolean hasMoreElements() {
+ return (cur != -1);
+ }
+
+ /**
+ * traverse upwards
+ */
+ private int up() {
+ Item i = new Item();
+ int res = 0;
+
+ if (ns.empty()) {
+ return -1;
+ }
+
+ if (cur != 0 && sc[cur] == 0) {
+ return lo[cur];
+ }
+
+ boolean climb = true;
+
+ while (climb) {
+ i = (Item)ns.pop();
+ i.child++;
+ switch (i.child) {
+ case 1:
+ if (sc[i.parent] != 0) {
+ res = eq[i.parent];
+ ns.push(i.clone());
+ ks.append(sc[i.parent]);
+ } else {
+ i.child++;
+ ns.push(i.clone());
+ res = hi[i.parent];
+ }
+ climb = false;
+ break;
+
+ case 2:
+ res = hi[i.parent];
+ ns.push(i.clone());
+ if (ks.length() > 0) {
+ ks.setLength(ks.length() - 1); // pop
+ }
+ climb = false;
+ break;
+
+ default:
+ if (ns.empty()) {
+ return -1;
+ }
+ climb = true;
+ break;
+ }
+ }
+ return res;
+ }
+
+ /**
+ * traverse the tree to find next key
+ */
+ private int run() {
+ if (cur == -1) {
+ return -1;
+ }
+
+ boolean leaf = false;
+ while (true) {
+ // first go down on low branch until leaf or compressed branch
+ while (cur != 0) {
+ if (sc[cur] == 0xFFFF) {
+ leaf = true;
+ break;
+ }
+ ns.push(new Item((char)cur, '\u0000'));
+ if (sc[cur] == 0) {
+ leaf = true;
+ break;
+ }
+ cur = lo[cur];
+ }
+ if (leaf) {
+ break;
+ }
+ // nothing found, go up one node and try again
+ cur = up();
+ if (cur == -1) {
+ return -1;
+ }
+ }
+ // The current node should be a data node and
+ // the key should be in the key stack (at least partially)
+ StringBuffer buf = new StringBuffer(ks.toString());
+ if (sc[cur] == 0xFFFF) {
+ int p = lo[cur];
+ while (kv.get(p) != 0) {
+ buf.append(kv.get(p++));
+ }
+ }
+ curkey = buf.toString();
+ return 0;
+ }
+
+ }
+
+ public void printStats() {
+ System.out.println("Number of keys = " + Integer.toString(length));
+ System.out.println("Node count = " + Integer.toString(freenode));
+ // System.out.println("Array length = " + Integer.toString(eq.length));
+ System.out.println("Key Array length = "
+ + Integer.toString(kv.length()));
+
+ /*
+ * for(int i=0; i<kv.length(); i++)
+ * if ( kv.get(i) != 0 )
+ * System.out.print(kv.get(i));
+ * else
+ * System.out.println("");
+ * System.out.println("Keys:");
+ * for(Enumeration enum = keys(); enum.hasMoreElements(); )
+ * System.out.println(enum.nextElement());
+ */
+
+ }
+
+ public static void main(String[] args) throws Exception {
+ TernaryTree tt = new TernaryTree();
+ tt.insert("Carlos", 'C');
+ tt.insert("Car", 'r');
+ tt.insert("palos", 'l');
+ tt.insert("pa", 'p');
+ tt.trimToSize();
+ System.out.println((char)tt.find("Car"));
+ System.out.println((char)tt.find("Carlos"));
+ System.out.println((char)tt.find("alto"));
+ tt.printStats();
+ }
+
+}
+