From 04e3d089d6faef74065f9de7fdbec0862f97669b Mon Sep 17 00:00:00 2001 From: Glen Mazza Date: Mon, 6 Sep 2004 18:03:12 +0000 Subject: PR: Obtained from: Submitted by: Reviewed by: Moved hyphenation package to org.apache.fop.hyphenation git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@197909 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/fop/hyphenation/ByteVector.java | 126 ++++ .../org/apache/fop/hyphenation/CharVector.java | 136 +++++ src/java/org/apache/fop/hyphenation/Hyphen.java | 70 +++ .../org/apache/fop/hyphenation/Hyphenation.java | 85 +++ .../fop/hyphenation/HyphenationException.java | 34 ++ .../apache/fop/hyphenation/HyphenationTree.java | 584 ++++++++++++++++++ .../org/apache/fop/hyphenation/Hyphenator.java | 280 +++++++++ .../apache/fop/hyphenation/PatternConsumer.java | 57 ++ .../org/apache/fop/hyphenation/PatternParser.java | 433 +++++++++++++ .../org/apache/fop/hyphenation/TernaryTree.java | 669 +++++++++++++++++++++ .../apache/fop/layout/hyphenation/ByteVector.java | 126 ---- .../apache/fop/layout/hyphenation/CharVector.java | 136 ----- .../org/apache/fop/layout/hyphenation/Hyphen.java | 70 --- .../apache/fop/layout/hyphenation/Hyphenation.java | 85 --- .../layout/hyphenation/HyphenationException.java | 34 -- .../fop/layout/hyphenation/HyphenationTree.java | 584 ------------------ .../apache/fop/layout/hyphenation/Hyphenator.java | 280 --------- .../fop/layout/hyphenation/PatternConsumer.java | 57 -- .../fop/layout/hyphenation/PatternParser.java | 433 ------------- .../apache/fop/layout/hyphenation/TernaryTree.java | 669 --------------------- .../apache/fop/layoutmgr/LineLayoutManager.java | 4 +- .../fop/tools/anttasks/SerializeHyphPattern.java | 4 +- 22 files changed, 2478 insertions(+), 2478 deletions(-) create mode 100644 src/java/org/apache/fop/hyphenation/ByteVector.java create mode 100644 src/java/org/apache/fop/hyphenation/CharVector.java create mode 100644 src/java/org/apache/fop/hyphenation/Hyphen.java create 
mode 100644 src/java/org/apache/fop/hyphenation/Hyphenation.java create mode 100644 src/java/org/apache/fop/hyphenation/HyphenationException.java create mode 100644 src/java/org/apache/fop/hyphenation/HyphenationTree.java create mode 100644 src/java/org/apache/fop/hyphenation/Hyphenator.java create mode 100644 src/java/org/apache/fop/hyphenation/PatternConsumer.java create mode 100644 src/java/org/apache/fop/hyphenation/PatternParser.java create mode 100644 src/java/org/apache/fop/hyphenation/TernaryTree.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/ByteVector.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/CharVector.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/Hyphen.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/Hyphenation.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/HyphenationException.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/HyphenationTree.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/Hyphenator.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/PatternConsumer.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/PatternParser.java delete mode 100644 src/java/org/apache/fop/layout/hyphenation/TernaryTree.java (limited to 'src/java/org/apache') diff --git a/src/java/org/apache/fop/hyphenation/ByteVector.java b/src/java/org/apache/fop/hyphenation/ByteVector.java new file mode 100644 index 000000000..f3c232dc6 --- /dev/null +++ b/src/java/org/apache/fop/hyphenation/ByteVector.java @@ -0,0 +1,126 @@ +/* + * Copyright 1999-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.hyphenation; + +import java.io.Serializable; + +/** + * This class implements a simple byte vector with access to the + * underlying array. + * + * @author Carlos Villegas + */ +public class ByteVector implements Serializable { + + /** + * Capacity increment size + */ + private static final int DEFAULT_BLOCK_SIZE = 2048; + private int blockSize; + + /** + * The encapsulated array + */ + private byte[] array; + + /** + * Points to next free item + */ + private int n; + + public ByteVector() { + this(DEFAULT_BLOCK_SIZE); + } + + public ByteVector(int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = new byte[blockSize]; + n = 0; + } + + public ByteVector(byte[] a) { + blockSize = DEFAULT_BLOCK_SIZE; + array = a; + n = 0; + } + + public ByteVector(byte[] a, int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = a; + n = 0; + } + + public byte[] getArray() { + return array; + } + + /** + * return number of items in array + */ + public int length() { + return n; + } + + /** + * returns current capacity of array + */ + public int capacity() { + return array.length; + } + + public void put(int index, byte val) { + array[index] = val; + } + + public byte get(int index) { + return array[index]; + } + + /** + * This is to implement memory allocation in the array. Like malloc(). 
+ */ + public int alloc(int size) { + int index = n; + int len = array.length; + if (n + size >= len) { + byte[] aux = new byte[len + blockSize]; + System.arraycopy(array, 0, aux, 0, len); + array = aux; + } + n += size; + return index; + } + + public void trimToSize() { + if (n < array.length) { + byte[] aux = new byte[n]; + System.arraycopy(array, 0, aux, 0, n); + array = aux; + } + } + +} diff --git a/src/java/org/apache/fop/hyphenation/CharVector.java b/src/java/org/apache/fop/hyphenation/CharVector.java new file mode 100644 index 000000000..bc4109920 --- /dev/null +++ b/src/java/org/apache/fop/hyphenation/CharVector.java @@ -0,0 +1,136 @@ +/* + * Copyright 1999-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.hyphenation; + +import java.io.Serializable; + +/** + * This class implements a simple char vector with access to the + * underlying array. 
+ * + * @author Carlos Villegas + */ +public class CharVector implements Cloneable, Serializable { + + /** + * Capacity increment size + */ + private static final int DEFAULT_BLOCK_SIZE = 2048; + private int blockSize; + + /** + * The encapsulated array + */ + private char[] array; + + /** + * Points to next free item + */ + private int n; + + public CharVector() { + this(DEFAULT_BLOCK_SIZE); + } + + public CharVector(int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = new char[blockSize]; + n = 0; + } + + public CharVector(char[] a) { + blockSize = DEFAULT_BLOCK_SIZE; + array = a; + n = a.length; + } + + public CharVector(char[] a, int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = a; + n = a.length; + } + + /** + * Reset Vector but don't resize or clear elements + */ + public void clear() { + n = 0; + } + + public Object clone() { + CharVector cv = new CharVector((char[])array.clone(), blockSize); + cv.n = this.n; + return cv; + } + + public char[] getArray() { + return array; + } + + /** + * return number of items in array + */ + public int length() { + return n; + } + + /** + * returns current capacity of array + */ + public int capacity() { + return array.length; + } + + public void put(int index, char val) { + array[index] = val; + } + + public char get(int index) { + return array[index]; + } + + public int alloc(int size) { + int index = n; + int len = array.length; + if (n + size >= len) { + char[] aux = new char[len + blockSize]; + System.arraycopy(array, 0, aux, 0, len); + array = aux; + } + n += size; + return index; + } + + public void trimToSize() { + if (n < array.length) { + char[] aux = new char[n]; + System.arraycopy(array, 0, aux, 0, n); + array = aux; + } + } + +} diff --git a/src/java/org/apache/fop/hyphenation/Hyphen.java b/src/java/org/apache/fop/hyphenation/Hyphen.java new file mode 100644 index 
000000000..84474c98b --- /dev/null +++ b/src/java/org/apache/fop/hyphenation/Hyphen.java @@ -0,0 +1,70 @@ +/* + * Copyright 1999-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.hyphenation; + +import java.io.Serializable; + +/** + * This class represents a hyphen. A 'full' hyphen is made of 3 parts: + * the pre-break text, post-break text and no-break. If no line-break + * is generated at this position, the no-break text is used, otherwise, + * pre-break and post-break are used. Typically, pre-break is equal to + * the hyphen character and the others are empty. However, this general + * scheme allows support for cases in some languages where words change + * spelling if they're split across lines, like german's 'backen' which + * hyphenates 'bak-ken'. BTW, this comes from TeX. 
+ * + * @author Carlos Villegas + */ + +public class Hyphen implements Serializable { + public String preBreak; + public String noBreak; + public String postBreak; + + Hyphen(String pre, String no, String post) { + preBreak = pre; + noBreak = no; + postBreak = post; + } + + Hyphen(String pre) { + preBreak = pre; + noBreak = null; + postBreak = null; + } + + public String toString() { + if (noBreak == null + && postBreak == null + && preBreak != null + && preBreak.equals("-")) { + return "-"; + } + StringBuffer res = new StringBuffer("{"); + res.append(preBreak); + res.append("}{"); + res.append(postBreak); + res.append("}{"); + res.append(noBreak); + res.append('}'); + return res.toString(); + } + +} diff --git a/src/java/org/apache/fop/hyphenation/Hyphenation.java b/src/java/org/apache/fop/hyphenation/Hyphenation.java new file mode 100644 index 000000000..7a922cbdf --- /dev/null +++ b/src/java/org/apache/fop/hyphenation/Hyphenation.java @@ -0,0 +1,85 @@ +/* + * Copyright 1999-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.hyphenation; + +/** + * This class represents a hyphenated word. 
+ * + * @author Carlos Villegas + */ +public class Hyphenation { + + private int[] hyphenPoints; + private String word; + + /** + * number of hyphenation points in word + */ + private int len; + + /** + * rawWord as made of alternating strings and {@link Hyphen Hyphen} + * instances + */ + Hyphenation(String word, int[] points) { + this.word = word; + hyphenPoints = points; + len = points.length; + } + + /** + * @return the number of hyphenation points in the word + */ + public int length() { + return len; + } + + /** + * @return the pre-break text, not including the hyphen character + */ + public String getPreHyphenText(int index) { + return word.substring(0, hyphenPoints[index]); + } + + /** + * @return the post-break text + */ + public String getPostHyphenText(int index) { + return word.substring(hyphenPoints[index]); + } + + /** + * @return the hyphenation points + */ + public int[] getHyphenationPoints() { + return hyphenPoints; + } + + public String toString() { + StringBuffer str = new StringBuffer(); + int start = 0; + for (int i = 0; i < len; i++) { + str.append(word.substring(start, hyphenPoints[i]) + "-"); + start = hyphenPoints[i]; + } + str.append(word.substring(start)); + return str.toString(); + } + +} diff --git a/src/java/org/apache/fop/hyphenation/HyphenationException.java b/src/java/org/apache/fop/hyphenation/HyphenationException.java new file mode 100644 index 000000000..d66ce985d --- /dev/null +++ b/src/java/org/apache/fop/hyphenation/HyphenationException.java @@ -0,0 +1,34 @@ +/* + * Copyright 1999-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.hyphenation; + +/** + * @author Carlos Villegas + * (todo) Derive from FOPException + */ +public class HyphenationException extends Exception { + + /** + * @see java.lang.Throwable#Throwable(String) + */ + public HyphenationException(String msg) { + super(msg); + } + +} diff --git a/src/java/org/apache/fop/hyphenation/HyphenationTree.java b/src/java/org/apache/fop/hyphenation/HyphenationTree.java new file mode 100644 index 000000000..51a1a875e --- /dev/null +++ b/src/java/org/apache/fop/hyphenation/HyphenationTree.java @@ -0,0 +1,584 @@ +/* + * Copyright 1999-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* $Id$ */ + +package org.apache.fop.hyphenation; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; + +/** + * This tree structure stores the hyphenation patterns in an efficient + * way for fast lookup. It provides the provides the method to + * hyphenate a word. + * + * @author Carlos Villegas + */ +public class HyphenationTree extends TernaryTree + implements PatternConsumer, Serializable { + + /** + * value space: stores the inteletter values + */ + protected ByteVector vspace; + + /** + * This map stores hyphenation exceptions + */ + protected HashMap stoplist; + + /** + * This map stores the character classes + */ + protected TernaryTree classmap; + + /** + * Temporary map to store interletter values on pattern loading. + */ + private transient TernaryTree ivalues; + + public HyphenationTree() { + stoplist = new HashMap(23); // usually a small table + classmap = new TernaryTree(); + vspace = new ByteVector(); + vspace.alloc(1); // this reserves index 0, which we don't use + } + + /** + * Packs the values by storing them in 4 bits, two values into a byte + * Values range is from 0 to 9. We use zero as terminator, + * so we'll add 1 to the value. + * @param values a string of digits from '0' to '9' representing the + * interletter values. + * @return the index into the vspace array where the packed values + * are stored. + */ + protected int packValues(String values) { + int i, n = values.length(); + int m = (n & 1) == 1 ? 
(n >> 1) + 2 : (n >> 1) + 1; + int offset = vspace.alloc(m); + byte[] va = vspace.getArray(); + for (i = 0; i < n; i++) { + int j = i >> 1; + byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f); + if ((i & 1) == 1) { + va[j + offset] = (byte)(va[j + offset] | v); + } else { + va[j + offset] = (byte)(v << 4); // big endian + } + } + va[m - 1 + offset] = 0; // terminator + return offset; + } + + protected String unpackValues(int k) { + StringBuffer buf = new StringBuffer(); + byte v = vspace.get(k++); + while (v != 0) { + char c = (char)((v >>> 4) - 1 + '0'); + buf.append(c); + c = (char)(v & 0x0f); + if (c == 0) { + break; + } + c = (char)(c - 1 + '0'); + buf.append(c); + v = vspace.get(k++); + } + return buf.toString(); + } + + /** + * Read hyphenation patterns from an XML file. + */ + public void loadPatterns(String filename) throws HyphenationException { + PatternParser pp = new PatternParser(this); + ivalues = new TernaryTree(); + + pp.parse(filename); + + // patterns/values should be now in the tree + // let's optimize a bit + trimToSize(); + vspace.trimToSize(); + classmap.trimToSize(); + + // get rid of the auxiliary map + ivalues = null; + } + + public String findPattern(String pat) { + int k = super.find(pat); + if (k >= 0) { + return unpackValues(k); + } + return ""; + } + + /** + * String compare, returns 0 if equal or + * t is a substring of s + */ + protected int hstrcmp(char[] s, int si, char[] t, int ti) { + for (; s[si] == t[ti]; si++, ti++) { + if (s[si] == 0) { + return 0; + } + } + if (t[ti] == 0) { + return 0; + } + return s[si] - t[ti]; + } + + protected byte[] getValues(int k) { + StringBuffer buf = new StringBuffer(); + byte v = vspace.get(k++); + while (v != 0) { + char c = (char)((v >>> 4) - 1); + buf.append(c); + c = (char)(v & 0x0f); + if (c == 0) { + break; + } + c = (char)(c - 1); + buf.append(c); + v = vspace.get(k++); + } + byte[] res = new byte[buf.length()]; + for (int i = 0; i < res.length; i++) { + res[i] = (byte)buf.charAt(i); + 
} + return res; + } + + /** + *

Search for all possible partial matches of word starting + * at index and update interletter values. In other words, it + * does something like:

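+ * + * for(i=0; i<patterns.length; i++) { + * if ( word.substring(index).startsWith(patterns[i]) ) + * update_interletter(values[i]); + * } + * + *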

But it is done in an efficient way since the patterns are + * stored in a ternary tree. In fact, this is the whole purpose + * of having the tree: doing this search without having to test + * every single pattern. The number of patterns for languages + * such as English ranges from 4000 to 10000. Thus, doing thousands + * of string comparisons for each word to hyphenate would be + * really slow without the tree. The tradeoff is memory, but + * using a ternary tree instead of a trie almost halves the + * memory used by Lout or TeX. It's also faster than using + * a hash table.

+ * @param word null terminated word to match + * @param index start index from word + * @param il interletter values array to update + */ + protected void searchPatterns(char[] word, int index, byte[] il) { + byte[] values; + int i = index; + char p, q; + char sp = word[i]; + p = root; + + while (p > 0 && p < sc.length) { + if (sc[p] == 0xFFFF) { + if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { + values = getValues(eq[p]); // data pointer is in eq[] + int j = index; + for (int k = 0; k < values.length; k++) { + if (j < il.length && values[k] > il[j]) { + il[j] = values[k]; + } + j++; + } + } + return; + } + int d = sp - sc[p]; + if (d == 0) { + if (sp == 0) { + break; + } + sp = word[++i]; + p = eq[p]; + q = p; + + // look for a pattern ending at this position by searching for + // the null char ( splitchar == 0 ) + while (q > 0 && q < sc.length) { + if (sc[q] == 0xFFFF) { // stop at compressed branch + break; + } + if (sc[q] == 0) { + values = getValues(eq[q]); + int j = index; + for (int k = 0; k < values.length; k++) { + if (j < il.length && values[k] > il[j]) { + il[j] = values[k]; + } + j++; + } + break; + } else { + q = lo[q]; + + /** + * actually the code should be: + * q = sc[q] < 0 ? hi[q] : lo[q]; + * but java chars are unsigned + */ + } + } + } else { + p = d < 0 ? lo[p] : hi[p]; + } + } + } + + /** + * Hyphenate word and return a Hyphenation object. + * @param word the word to be hyphenated + * @param remainCharCount Minimum number of characters allowed + * before the hyphenation point. + * @param pushCharCount Minimum number of characters allowed after + * the hyphenation point. + * @return a {@link Hyphenation Hyphenation} object representing + * the hyphenated word or null if word is not hyphenated. 
+ */ + public Hyphenation hyphenate(String word, int remainCharCount, + int pushCharCount) { + char[] w = word.toCharArray(); + return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); + } + + /** + * w = "****nnllllllnnn*****", + * where n is a non-letter, l is a letter, + * all n may be absent, the first n is at offset, + * the first l is at offset + iIgnoreAtBeginning; + * word = ".llllll.'\0'***", + * where all l in w are copied into word. + * In the first part of the routine len = w.length, + * in the second part of the routine len = word.length. + * Three indices are used: + * index(w), the index in w, + * index(word), the index in word, + * letterindex(word), the index in the letter part of word. + * The following relations exist: + * index(w) = offset + i - 1 + * index(word) = i - iIgnoreAtBeginning + * letterindex(word) = index(word) - 1 + * (see first loop). + * It follows that: + * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning + * index(w) = letterindex(word) + offset + iIgnoreAtBeginning + */ + + /** + * Hyphenate word and return an array of hyphenation points. + * @param w char array that contains the word + * @param offset Offset to first character in word + * @param len Length of word + * @param remainCharCount Minimum number of characters allowed + * before the hyphenation point. + * @param pushCharCount Minimum number of characters allowed after + * the hyphenation point. + * @return a {@link Hyphenation Hyphenation} object representing + * the hyphenated word or null if word is not hyphenated. + */ + public Hyphenation hyphenate(char[] w, int offset, int len, + int remainCharCount, int pushCharCount) { + int i; + char[] word = new char[len + 3]; + + // normalize word + char[] c = new char[2]; + int iIgnoreAtBeginning = 0; + int iLength = len; + boolean bEndOfLetters = false; + for (i = 1; i <= len; i++) { + c[0] = w[offset + i - 1]; + int nc = classmap.find(c, 0); + if (nc < 0) { // found a non-letter character ... 
+ if (i == (1 + iIgnoreAtBeginning)) { + // ... before any letter character + iIgnoreAtBeginning ++; + } else { + // ... after a letter character + bEndOfLetters = true; + } + iLength --; + } else { + if (!bEndOfLetters) { + word[i - iIgnoreAtBeginning] = (char)nc; + } else { + return null; + } + } + } + len = iLength; + if (len < (remainCharCount + pushCharCount)) { + // word is too short to be hyphenated + return null; + } + int[] result = new int[len + 1]; + int k = 0; + + // check exception list first + String sw = new String(word, 1, len); + if (stoplist.containsKey(sw)) { + // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null) + ArrayList hw = (ArrayList)stoplist.get(sw); + int j = 0; + for (i = 0; i < hw.size(); i++) { + Object o = hw.get(i); + // j = index(sw) = letterindex(word)? + // result[k] = corresponding index(w) + if (o instanceof String) { + j += ((String)o).length(); + if (j >= remainCharCount && j < (len - pushCharCount)) { + result[k++] = j + iIgnoreAtBeginning; + } + } + } + } else { + // use algorithm to get hyphenation points + word[0] = '.'; // word start marker + word[len + 1] = '.'; // word end marker + word[len + 2] = 0; // null terminated + byte[] il = new byte[len + 3]; // initialized to zero + for (i = 0; i < len + 1; i++) { + searchPatterns(word, i, il); + } + + // hyphenation points are located where interletter value is odd + // i is letterindex(word), + // i + 1 is index(word), + // result[k] = corresponding index(w) + for (i = 0; i < len; i++) { + if (((il[i + 1] & 1) == 1) && i >= remainCharCount + && i <= (len - pushCharCount)) { + result[k++] = i + iIgnoreAtBeginning; + } + } + } + + + if (k > 0) { + // trim result array + int[] res = new int[k]; + System.arraycopy(result, 0, res, 0, k); + return new Hyphenation(new String(w, offset, len), res); + } else { + return null; + } + } + + /** + * Add a character class to the tree. 
It is used by + * {@link PatternParser PatternParser} as callback to + * add character classes. Character classes define the + * valid word characters for hyphenation. If a word contains + * a character not defined in any of the classes, it is not hyphenated. + * It also defines a way to normalize the characters in order + * to compare them with the stored patterns. Usually pattern + * files use only lower case characters, in this case a class + * for letter 'a', for example, should be defined as "aA", the first + * character being the normalization char. + */ + public void addClass(String chargroup) { + if (chargroup.length() > 0) { + char equivChar = chargroup.charAt(0); + char[] key = new char[2]; + key[1] = 0; + for (int i = 0; i < chargroup.length(); i++) { + key[0] = chargroup.charAt(i); + classmap.insert(key, 0, equivChar); + } + } + } + + /** + * Add an exception to the tree. It is used by + * {@link PatternParser PatternParser} class as callback to + * store the hyphenation exceptions. + * @param word normalized word + * @param hyphenatedword a vector of alternating strings and + * {@link Hyphen hyphen} objects. + */ + public void addException(String word, ArrayList hyphenatedword) { + stoplist.put(word, hyphenatedword); + } + + /** + * Add a pattern to the tree. Mainly, to be used by + * {@link PatternParser PatternParser} class as callback to + * add a pattern to the tree. + * @param pattern the hyphenation pattern + * @param ivalue interletter weight values indicating the + * desirability and priority of hyphenating at a given point + * within the pattern. It should contain only digit characters. + * (i.e. '0' to '9'). 
+ */ + public void addPattern(String pattern, String ivalue) { + int k = ivalues.find(ivalue); + if (k <= 0) { + k = packValues(ivalue); + ivalues.insert(ivalue, (char)k); + } + insert(pattern, (char)k); + } + + public void printStats() { + System.out.println("Value space size = " + + Integer.toString(vspace.length())); + super.printStats(); + + } + + public static void main(String[] argv) throws Exception { + HyphenationTree ht = null; + int minCharCount = 2; + BufferedReader in = + new BufferedReader(new java.io.InputStreamReader(System.in)); + while (true) { + System.out.print("l:\tload patterns from XML\n" + + "L:\tload patterns from serialized object\n" + + "s:\tset minimun character count\n" + + "w:\twrite hyphenation tree to object file\n" + + "h:\thyphenate\n" + + "f:\tfind pattern\n" + + "b:\tbenchmark\n" + + "q:\tquit\n\n" + + "Command:"); + String token = in.readLine().trim(); + if (token.equals("f")) { + System.out.print("Pattern: "); + token = in.readLine().trim(); + System.out.println("Values: " + ht.findPattern(token)); + } else if (token.equals("s")) { + System.out.print("Minimun value: "); + token = in.readLine().trim(); + minCharCount = Integer.parseInt(token); + } else if (token.equals("l")) { + ht = new HyphenationTree(); + System.out.print("XML file name: "); + token = in.readLine().trim(); + ht.loadPatterns(token); + } else if (token.equals("L")) { + ObjectInputStream ois = null; + System.out.print("Object file name: "); + token = in.readLine().trim(); + try { + ois = new ObjectInputStream(new FileInputStream(token)); + ht = (HyphenationTree)ois.readObject(); + } catch (Exception e) { + e.printStackTrace(); + } finally { + if (ois != null) { + try { + ois.close(); + } catch (IOException e) { + //ignore + } + } + } + } else if (token.equals("w")) { + System.out.print("Object file name: "); + token = in.readLine().trim(); + ObjectOutputStream oos = null; + try { + oos = new ObjectOutputStream(new FileOutputStream(token)); + oos.writeObject(ht); 
+ } catch (Exception e) { + e.printStackTrace(); + } finally { + if (oos != null) { + try { + oos.flush(); + } catch (IOException e) { + //ignore + } + try { + oos.close(); + } catch (IOException e) { + //ignore + } + } + } + } else if (token.equals("h")) { + System.out.print("Word: "); + token = in.readLine().trim(); + System.out.print("Hyphenation points: "); + System.out.println(ht.hyphenate(token, minCharCount, + minCharCount)); + } else if (token.equals("b")) { + if (ht == null) { + System.out.println("No patterns has been loaded."); + break; + } + System.out.print("Word list filename: "); + token = in.readLine().trim(); + long starttime = 0; + int counter = 0; + try { + BufferedReader reader = + new BufferedReader(new FileReader(token)); + String line; + + starttime = System.currentTimeMillis(); + while ((line = reader.readLine()) != null) { + // System.out.print("\nline: "); + Hyphenation hyp = ht.hyphenate(line, minCharCount, + minCharCount); + if (hyp != null) { + String hword = hyp.toString(); + // System.out.println(line); + // System.out.println(hword); + } else { + // System.out.println("No hyphenation"); + } + counter++; + } + } catch (Exception ioe) { + System.out.println("Exception " + ioe); + ioe.printStackTrace(); + } + long endtime = System.currentTimeMillis(); + long result = endtime - starttime; + System.out.println(counter + " words in " + result + + " Millisekunden hyphenated"); + + } else if (token.equals("q")) { + break; + } + } + + } + +} diff --git a/src/java/org/apache/fop/hyphenation/Hyphenator.java b/src/java/org/apache/fop/hyphenation/Hyphenator.java new file mode 100644 index 000000000..1e8c52e66 --- /dev/null +++ b/src/java/org/apache/fop/hyphenation/Hyphenator.java @@ -0,0 +1,280 @@ +/* + * Copyright 1999-2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id$ */
+
+package org.apache.fop.hyphenation;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.util.Hashtable;
+
+/**
+ * This class is the main entry point to the hyphenation package.
+ * You can use only the static methods or create an instance.
+ * Hyphenation trees are looked up by a language/country key, kept in a
+ * static cache, and loaded either from a class path resource
+ * (<code>hyph/&lt;key&gt;.hyp</code>) or from a file in a directory.
+ *
+ * @author Carlos Villegas
+ */
+public class Hyphenator {
+
+    /**@todo Don't use statics */
+    // Cache of already loaded trees; the map key is the language code,
+    // optionally followed by "_" and the country code.
+    private static Hashtable hyphenTrees = new Hashtable();
+
+    // Tree used by the instance hyphenate() methods; null if none was found.
+    private HyphenationTree hyphenTree = null;
+    // Passed as the two trailing int arguments of HyphenationTree.hyphenate().
+    // NOTE(review): presumably the minimum number of characters kept before
+    // and pushed after a break - confirm against HyphenationTree.
+    private int remainCharCount = 2;
+    private int pushCharCount = 2;
+    // Guards diagnostic output (statistics print, disabled log calls).
+    private static boolean errorDump = false;
+
+    /**
+     * Creates a hyphenator bound to the tree for the given language.
+     * @param lang the language code
+     * @param country the country code; null or "none" means no country
+     * @param leftMin initial value of the remain character count
+     * @param rightMin initial value of the push character count
+     */
+    public Hyphenator(String lang, String country, int leftMin,
+                      int rightMin) {
+        hyphenTree = getHyphenationTree(lang, country);
+        remainCharCount = leftMin;
+        pushCharCount = rightMin;
+    }
+
+    /**
+     * Returns the hyphenation tree for a language and country.
+     * The cache is consulted first (full key, then language only); on a
+     * miss the tree is loaded with {@link #getFopHyphenationTree(String)}
+     * and, if that fails, with
+     * {@link #getUserHyphenationTree(String, String)} using the
+     * hard-coded directory "/hyph". A loaded tree is cached under the
+     * full key.
+     * @param lang the language code
+     * @param country the country code; null or "none" means no country
+     * @return the tree, or null if none could be found
+     */
+    public static HyphenationTree getHyphenationTree(String lang,
+            String country) {
+        String key = lang;
+        // check whether the country code has been used
+        if (country != null && !country.equals("none")) {
+            key += "_" + country;
+        }
+        // first try to find it in the cache
+        if (hyphenTrees.containsKey(key)) {
+            return (HyphenationTree)hyphenTrees.get(key);
+        }
+        if (hyphenTrees.containsKey(lang)) {
+            return (HyphenationTree)hyphenTrees.get(lang);
+        }
+
+        HyphenationTree hTree = getFopHyphenationTree(key);
+        if (hTree == null) {
+            String hyphenDir = "/hyph";
+            if (hyphenDir != null) {
+                hTree = getUserHyphenationTree(key, hyphenDir);
+            }
+        }
+        // put it into the pattern cache
+        if (hTree != null) {
+            hyphenTrees.put(key, hTree);
+        } else {
+            /**@todo Proper logging please */
+            //log.error("Couldn't find hyphenation pattern "
+            //                       + key);
+        }
+        return hTree;
+    }
+
+    /**
+     * Opens the resource "hyph/&lt;key&gt;.hyp", first through the thread
+     * context class loader and then through this class.
+     * @param key the language or language_country key
+     * @return the resource stream, or null if the resource was not found
+     */
+    private static InputStream getResourceStream(String key) {
+        InputStream is = null;
+        // Try the context class loader first to locate the pattern resource.
+        // getContextClassLoader is looked up reflectively; any failure is
+        // ignored and the class-relative lookup below is used instead.
+        try {
+            java.lang.reflect.Method getCCL =
+                Thread.class.getMethod("getContextClassLoader", new Class[0]);
+            if (getCCL != null) {
+                ClassLoader contextClassLoader =
+                    (ClassLoader)getCCL.invoke(Thread.currentThread(),
+                                               new Object[0]);
+                is = contextClassLoader.getResourceAsStream("hyph/" + key
+                                                            + ".hyp");
+            }
+        } catch (Exception e) {
+            //ignore, fallback further down
+        }
+
+        if (is == null) {
+            is = Hyphenator.class.getResourceAsStream("/hyph/" + key
+                                                      + ".hyp");
+        }
+
+        return is;
+    }
+
+    /**
+     * Loads a serialized hyphenation tree from the class path.
+     * If no resource exists for a five character key (such as "en_US"),
+     * the first two characters are tried as a plain language key.
+     * Exceptions while reading are printed and result in a null return.
+     * @param key the language or language_country key
+     * @return the deserialized tree, or null if it could not be loaded
+     */
+    public static HyphenationTree getFopHyphenationTree(String key) {
+        HyphenationTree hTree = null;
+        ObjectInputStream ois = null;
+        InputStream is = null;
+        try {
+            is = getResourceStream(key);
+            if (is == null) {
+                if (key.length() == 5) {
+                    is = getResourceStream(key.substring(0, 2));
+                    if (is != null) {
+                        //log.error("Couldn't find hyphenation pattern "
+                        //                       + key
+                        //                       + "\nusing general language pattern "
+                        //                       + key.substring(0, 2)
+                        //                       + " instead.");
+                    } else {
+                        if (errorDump) {
+                            //log.error("Couldn't find precompiled "
+                            //                       + "fop hyphenation pattern "
+                            //                       + key + ".hyp");
+                        }
+                        return null;
+                    }
+                } else {
+                    if (errorDump) {
+                        //log.error("Couldn't find precompiled "
+                        //                       + "fop hyphenation pattern "
+                        //                       + key + ".hyp");
+                    }
+                    return null;
+                }
+            }
+            // NOTE(review): if this constructor throws, ois stays null and
+            // the finally block below never closes 'is'.
+            ois = new ObjectInputStream(is);
+            hTree = (HyphenationTree)ois.readObject();
+        } catch (Exception e) {
+            /**@todo proper logging please */
+            e.printStackTrace();
+        } finally {
+            if (ois != null) {
+                try {
+                    ois.close();
+                } catch (IOException e) {
+                    //log.error("can't close hyphenation object stream");
+                }
+            }
+        }
+        return hTree;
+    }
+
+    /**
+     * Loads a hyphenation tree from a file in the given directory:
+     * first the serialized file "&lt;key&gt;.hyp", otherwise the raw
+     * pattern file "&lt;key&gt;.xml".
+     * @param key the language or language_country key (file base name)
+     * @param hyphenDir the directory containing the pattern files
+     * @return the tree, or null if no file exists or loading failed
+     */
+    public static HyphenationTree getUserHyphenationTree(String key,
+            String hyphenDir) {
+        HyphenationTree hTree = null;
+        // The following convention is used here: the key is taken as the
+        // base name of the file. First we try name + ".hyp", assuming a
+        // serialized HyphenationTree. If that file does not exist we try
+        // name + ".xml", assuming a raw hyphenation pattern file.
+
+        // first try serialized object
+        File hyphenFile = new File(hyphenDir, key + ".hyp");
+        if (hyphenFile.exists()) {
+            ObjectInputStream ois = null;
+            try {
+                ois = new ObjectInputStream(new BufferedInputStream(
+                        new FileInputStream(hyphenFile)));
+                hTree = (HyphenationTree)ois.readObject();
+            } catch (Exception e) {
+                /**@todo Proper logging please */
+                e.printStackTrace();
+            } finally {
+                if (ois != null) {
+                    try {
+                        ois.close();
+                    } catch (IOException e) {
+                        //ignore
+                    }
+                }
+            }
+            // null here if deserialization failed; the XML file is not tried
+            return hTree;
+        } else {
+
+            // try the raw XML file
+            hyphenFile = new File(hyphenDir, key + ".xml");
+            if (hyphenFile.exists()) {
+                hTree = new HyphenationTree();
+                if (errorDump) {
+                    //log.error("reading " + hyphenDir + key
+                    //                       + ".xml");
+                }
+                try {
+                    hTree.loadPatterns(hyphenFile.getPath());
+                    if (errorDump) {
+                        System.out.println("Stats: ");
+                        hTree.printStats();
+                    }
+                    return hTree;
+                } catch (HyphenationException ex) {
+                    if (errorDump) {
+                        //log.error("Can't load user patterns "
+                        //                       + "from xml file " + hyphenDir
+                        //                       + key + ".xml");
+                    }
+                    return null;
+                }
+            } else {
+                if (errorDump) {
+                    //log.error("Tried to load "
+                    //                       + hyphenFile.toString()
+                    //                       + "\nCannot find compiled nor xml file for "
+                    //                       + "hyphenation pattern" + key);
+                }
+                return null;
+            }
+        }
+    }
+
+    /**
+     * Hyphenates a word with the tree for the given language and country.
+     * @param lang the language code
+     * @param country the country code; null or "none" means no country
+     * @param word the word to hyphenate
+     * @param leftMin passed through to HyphenationTree.hyphenate()
+     * @param rightMin passed through to HyphenationTree.hyphenate()
+     * @return the result of HyphenationTree.hyphenate(), or null if no
+     *         tree is available for the language
+     */
+    public static Hyphenation hyphenate(String lang, String country,
+                                        String word, int leftMin,
+                                        int rightMin) {
+        HyphenationTree hTree = getHyphenationTree(lang, country);
+        if (hTree == null) {
+            //log.error("Error building hyphenation tree for language "
+            //                       + lang);
+            return null;
+        }
+        return hTree.hyphenate(word, leftMin, rightMin);
+    }
+
+    /**
+     * Hyphenates a word given as a section of a character array with the
+     * tree for the given language and country.
+     * @param lang the language code
+     * @param country the country code; null or "none" means no country
+     * @param word the array containing the word
+     * @param offset passed through to HyphenationTree.hyphenate()
+     * @param len passed through to HyphenationTree.hyphenate()
+     * @param leftMin passed through to HyphenationTree.hyphenate()
+     * @param rightMin passed through to HyphenationTree.hyphenate()
+     * @return the result of HyphenationTree.hyphenate(), or null if no
+     *         tree is available for the language
+     */
+    public static Hyphenation hyphenate(String lang, String country,
+                                        char[] word, int offset, int len,
+                                        int leftMin, int rightMin) {
+        HyphenationTree hTree = getHyphenationTree(lang, country);
+        if (hTree == null) {
+            //log.error("Error building hyphenation tree for language "
+            //                       + lang);
+            return null;
+        }
+        return hTree.hyphenate(word, offset, len, leftMin, rightMin);
+    }
+
+    /**
+     * Sets the remain character count used by the instance methods.
+     * @param min the new value
+     */
+    public void setMinRemainCharCount(int min) {
+        remainCharCount = min;
+    }
+
+    /**
+     * Sets the push character count used by the instance methods.
+     * @param min the new value
+     */
+    public void setMinPushCharCount(int min) {
+        pushCharCount = min;
+    }
+
+    /**
+     * Switches this instance to the tree for another language.
+     * @param lang the language code
+     * @param country the country code; null or "none" means no country
+     */
+    public void setLanguage(String lang, String country) {
+        hyphenTree = getHyphenationTree(lang, country);
+    }
+
+    /**
+     * Hyphenates a section of a character array with this instance's
+     * tree and character counts.
+     * @return the hyphenation result, or null if this instance has no tree
+     */
+    public Hyphenation hyphenate(char[] word, int offset, int len) {
+        if (hyphenTree == null) {
+            return null;
+        }
+        return hyphenTree.hyphenate(word, offset, len, remainCharCount,
+                                    pushCharCount);
+    }
+
+    /**
+     * Hyphenates a word with this instance's tree and character counts.
+     * @return the hyphenation result, or null if this instance has no tree
+     */
+    public Hyphenation hyphenate(String word) {
+        if (hyphenTree == null) {
+            return null;
+        }
+        return hyphenTree.hyphenate(word, remainCharCount, pushCharCount);
+    }
+
+}
diff --git a/src/java/org/apache/fop/hyphenation/PatternConsumer.java b/src/java/org/apache/fop/hyphenation/PatternConsumer.java
new file mode 100644
index 000000000..4f63de381
--- /dev/null
+++ b/src/java/org/apache/fop/hyphenation/PatternConsumer.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id$ */
+
+package org.apache.fop.hyphenation;
+
+import java.util.ArrayList;
+
+/**
+ * This interface is used to connect the XML pattern file parser to
+ * the hyphenation tree.
+ *
+ * @author Carlos Villegas
+ */
+public interface PatternConsumer {
+
+    /**
+     * Add a character class.
+     * A character class defines characters that are considered
+     * equivalent for the purpose of hyphenation (e.g. "aA"). It
+     * usually means to ignore case.
+     * @param chargroup character group
+     */
+    void addClass(String chargroup);
+
+    /**
+     * Add a hyphenation exception. An exception replaces the
+     * result obtained by the algorithm for cases for which this
+     * fails or the user wants to provide his own hyphenation.
+     * @param word the exception word
+     * @param hyphenatedword a list of alternating Strings and
+     * {@link Hyphen Hyphen} instances
+     */
+    void addException(String word, ArrayList hyphenatedword);
+
+    /**
+     * Add hyphenation patterns.
+     * @param pattern the pattern
+     * @param values interletter values expressed as a string of
+     * digit characters.
+     */
+    void addPattern(String pattern, String values);
+
+}
diff --git a/src/java/org/apache/fop/hyphenation/PatternParser.java b/src/java/org/apache/fop/hyphenation/PatternParser.java
new file mode 100644
index 000000000..bde436f82
--- /dev/null
+++ b/src/java/org/apache/fop/hyphenation/PatternParser.java
@@ -0,0 +1,433 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id$ */
+
+package org.apache.fop.hyphenation;
+
+// SAX
+import org.xml.sax.XMLReader;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.Attributes;
+
+// Java
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.net.URL;
+
+/**
+ * A SAX document handler to read and parse hyphenation patterns
+ * from an XML file. Character data is split into whitespace separated
+ * tokens which are forwarded to a {@link PatternConsumer} according to
+ * the element currently being read (classes, exceptions or patterns).
+ * The class implements PatternConsumer itself only so that
+ * {@link #main(String[])} can print what was parsed.
+ *
+ * @author Carlos Villegas
+ */
+public class PatternParser extends DefaultHandler implements PatternConsumer {
+
+    // the SAX reader doing the actual parsing
+    XMLReader parser;
+    // one of the ELEM_* constants, or 0 outside the known elements
+    int currElement;
+    // receiver of the parsed classes, exceptions and patterns
+    PatternConsumer consumer;
+    // characters of the token currently being assembled
+    StringBuffer token;
+    // parts (Strings and Hyphens) of the exception currently being read
+    ArrayList exception;
+    // character marking a hyphenation point inside exception words
+    char hyphenChar;
+    // message recorded by the last ErrorHandler callback
+    String errMsg;
+
+    static final int ELEM_CLASSES = 1;
+    static final int ELEM_EXCEPTIONS = 2;
+    static final int ELEM_PATTERNS = 3;
+    static final int ELEM_HYPHEN = 4;
+
+    /**
+     * Creates a parser without a consumer; call
+     * {@link #setConsumer(PatternConsumer)} before parsing.
+     * @throws HyphenationException if no SAX parser could be created
+     */
+    public PatternParser() throws HyphenationException {
+        token = new StringBuffer();
+        parser = createParser();
+        parser.setContentHandler(this);
+        parser.setErrorHandler(this);
+        hyphenChar = '-';    // default
+
+    }
+
+    /**
+     * Creates a parser that reports to the given consumer.
+     * @param consumer receiver of the parsed items
+     * @throws HyphenationException if no SAX parser could be created
+     */
+    public PatternParser(PatternConsumer consumer)
+            throws HyphenationException {
+        this();
+        this.consumer = consumer;
+    }
+
+    /**
+     * Sets the receiver of the parsed items.
+     * @param consumer receiver of the parsed items
+     */
+    public void setConsumer(PatternConsumer consumer) {
+        this.consumer = consumer;
+    }
+
+    /**
+     * Parses a hyphenation pattern file.
+     * @param filename the name of the XML pattern file
+     * @throws HyphenationException if parsing or reading fails
+     */
+    public void parse(String filename) throws HyphenationException {
+        InputSource uri = fileInputSource(filename);
+
+        try {
+            parser.parse(uri);
+        } catch (SAXException e) {
+            // NOTE(review): errMsg is only assigned by warning(), error()
+            // and fatalError(); for a SAXException raised any other way it
+            // may still be null or left over from an earlier warning.
+            throw new HyphenationException(errMsg);
+        } catch (IOException e) {
+            throw new HyphenationException(e.getMessage());
+        } catch (NullPointerException e) {
+            throw new HyphenationException("SAX parser not available");
+        }
+    }
+
+    /**
+     * creates a SAX parser, using the value of org.xml.sax.parser
+     * defaulting to org.apache.xerces.parsers.SAXParser
+     *
+     * @return the created SAX parser
+     * @throws HyphenationException if the class cannot be found,
+     *         instantiated or accessed, or is not an XMLReader
+     */
+    static XMLReader createParser() throws HyphenationException {
+        String parserClassName = System.getProperty("org.xml.sax.parser");
+        if (parserClassName == null) {
+            parserClassName = "org.apache.xerces.parsers.SAXParser";
+        }
+        // System.out.println("using SAX parser " + parserClassName);
+
+        try {
+            return (XMLReader)Class.forName(parserClassName).newInstance();
+        } catch (ClassNotFoundException e) {
+            throw new HyphenationException("Could not find "
+                                           + parserClassName);
+        } catch (InstantiationException e) {
+            throw new HyphenationException("Could not instantiate "
+                                           + parserClassName);
+        } catch (IllegalAccessException e) {
+            throw new HyphenationException("Could not access "
+                                           + parserClassName);
+        } catch (ClassCastException e) {
+            throw new HyphenationException(parserClassName
+                                           + " is not a SAX driver");
+        }
+    }
+
+    /**
+     * create an InputSource from a file name
+     *
+     * @param filename the name of the file
+     * @return the InputSource created
+     * @throws HyphenationException if the file URL cannot be built
+     */
+    protected static InputSource fileInputSource(String filename)
+            throws HyphenationException {
+
+        /* this code adapted from James Clark's in XT */
+        File file = new File(filename);
+        String path = file.getAbsolutePath();
+        // turn the platform specific path into a '/' separated one
+        String fSep = System.getProperty("file.separator");
+        if (fSep != null && fSep.length() == 1) {
+            path = path.replace(fSep.charAt(0), '/');
+        }
+        if (path.length() > 0 && path.charAt(0) != '/') {
+            path = '/' + path;
+        }
+        try {
+            return new InputSource(new URL("file", null, path).toString());
+        } catch (java.net.MalformedURLException e) {
+            throw new HyphenationException("unexpected MalformedURLException");
+        }
+    }
+
+    /**
+     * Extracts the next whitespace delimited token from a chunk of
+     * character data. Consumed characters are removed from
+     * <code>chars</code>. A token not yet terminated by whitespace is
+     * kept in the <code>token</code> buffer, so a word split across
+     * several calls is assembled correctly.
+     * @param chars the remaining character data; modified by this call
+     * @return the next complete token, or null if <code>chars</code> was
+     *         exhausted before a token was completed
+     */
+    protected String readToken(StringBuffer chars) {
+        String word;
+        boolean space = false;
+        int i;
+        // skip leading whitespace
+        for (i = 0; i < chars.length(); i++) {
+            if (Character.isWhitespace(chars.charAt(i))) {
+                space = true;
+            } else {
+                break;
+            }
+        }
+        if (space) {
+            // chars.delete(0,i);
+            for (int countr = i; countr < chars.length(); countr++) {
+                chars.setCharAt(countr - i, chars.charAt(countr));
+            }
+            chars.setLength(chars.length() - i);
+            // the whitespace ends a token left over from a previous call
+            if (token.length() > 0) {
+                word = token.toString();
+                token.setLength(0);
+                return word;
+            }
+        }
+        // find the end of the next token
+        space = false;
+        for (i = 0; i < chars.length(); i++) {
+            if (Character.isWhitespace(chars.charAt(i))) {
+                space = true;
+                break;
+            }
+        }
+        token.append(chars.toString().substring(0, i));
+        // chars.delete(0,i);
+        for (int countr = i; countr < chars.length(); countr++) {
+            chars.setCharAt(countr - i, chars.charAt(countr));
+        }
+        chars.setLength(chars.length() - i);
+        if (space) {
+            word = token.toString();
+            token.setLength(0);
+            return word;
+        }
+        // no terminating whitespace: chars is empty at this point and the
+        // partial token stays buffered for the next call
+        token.append(chars);
+        return null;
+    }
+
+    /**
+     * Returns the given pattern word with all digit characters removed.
+     * @param word a pattern word with embedded digits
+     * @return the non-digit characters of the word
+     */
+    protected static String getPattern(String word) {
+        StringBuffer pat = new StringBuffer();
+        int len = word.length();
+        for (int i = 0; i < len; i++) {
+            if (!Character.isDigit(word.charAt(i))) {
+                pat.append(word.charAt(i));
+            }
+        }
+        return pat.toString();
+    }
+
+    /**
+     * Splits the String items of an exception at the hyphen character,
+     * inserting a {@link Hyphen} at each occurrence. Items that are not
+     * Strings are copied unchanged.
+     * @param ex list of Strings and Hyphens
+     * @return a new list with the normalized items
+     */
+    protected ArrayList normalizeException(ArrayList ex) {
+        ArrayList res = new ArrayList();
+        for (int i = 0; i < ex.size(); i++) {
+            Object item = ex.get(i);
+            if (item instanceof String) {
+                String str = (String)item;
+                StringBuffer buf = new StringBuffer();
+                for (int j = 0; j < str.length(); j++) {
+                    char c = str.charAt(j);
+                    if (c != hyphenChar) {
+                        buf.append(c);
+                    } else {
+                        // the text before the hyphen is added even if empty
+                        res.add(buf.toString());
+                        buf.setLength(0);
+                        char[] h = new char[1];
+                        h[0] = hyphenChar;
+                        // we use here hyphenChar which is not necessarily
+                        // the one to be printed
+                        res.add(new Hyphen(new String(h), null, null));
+                    }
+                }
+                if (buf.length() > 0) {
+                    res.add(buf.toString());
+                }
+            } else {
+                res.add(item);
+            }
+        }
+        return res;
+    }
+
+    /**
+     * Builds the plain exception word from a list of Strings and
+     * Hyphens: Strings are concatenated, a Hyphen contributes its
+     * <code>noBreak</code> text if that is not null.
+     * @param ex list of Strings and Hyphens
+     * @return the word without hyphenation marks
+     */
+    protected String getExceptionWord(ArrayList ex) {
+        StringBuffer res = new StringBuffer();
+        for (int i = 0; i < ex.size(); i++) {
+            Object item = ex.get(i);
+            if (item instanceof String) {
+                res.append((String)item);
+            } else {
+                if (((Hyphen)item).noBreak != null) {
+                    res.append(((Hyphen)item).noBreak);
+                }
+            }
+        }
+        return res.toString();
+    }
+
+    /**
+     * Derives the interletter values of a pattern word. The word plus a
+     * sentinel letter is scanned: a digit is copied and the character
+     * following it is skipped, any other character yields '0'.
+     * @param pat a pattern word with embedded digits
+     * @return the interletter values as a string of digit characters
+     */
+    protected static String getInterletterValues(String pat) {
+        StringBuffer il = new StringBuffer();
+        String word = pat + "a";    // add dummy letter to serve as sentinel
+        int len = word.length();
+        for (int i = 0; i < len; i++) {
+            char c = word.charAt(i);
+            if (Character.isDigit(c)) {
+                il.append(c);
+                i++;
+            } else {
+                il.append('0');
+            }
+        }
+        return il.toString();
+    }
+
+    //
+    // DocumentHandler methods
+    //
+
+    /**
+     * Start element. Records which element is being read; a
+     * "hyphen-char" element sets the hyphen character from its "value"
+     * attribute, a "hyphen" element appends the pending token and a
+     * Hyphen built from the "pre", "no" and "post" attributes to the
+     * current exception. The token buffer is cleared in every case.
+     */
+    public void startElement(String uri, String local, String raw,
+                             Attributes attrs) {
+        if (local.equals("hyphen-char")) {
+            String h = attrs.getValue("value");
+            if (h != null && h.length() == 1) {
+                hyphenChar = h.charAt(0);
+            }
+        } else if (local.equals("classes")) {
+            currElement = ELEM_CLASSES;
+        } else if (local.equals("patterns")) {
+            currElement = ELEM_PATTERNS;
+        } else if (local.equals("exceptions")) {
+            currElement = ELEM_EXCEPTIONS;
+            exception = new ArrayList();
+        } else if (local.equals("hyphen")) {
+            if (token.length() > 0) {
+                exception.add(token.toString());
+            }
+            exception.add(new Hyphen(attrs.getValue("pre"),
+                                     attrs.getValue("no"),
+                                     attrs.getValue("post")));
+            currElement = ELEM_HYPHEN;
+        }
+        token.setLength(0);
+    }
+
+    /**
+     * End element. Forwards a token still pending in the buffer to the
+     * consumer and resets the element state; after a "hyphen" element
+     * the state returns to the enclosing exceptions element.
+     */
+    public void endElement(String uri, String local, String raw) {
+
+        if (token.length() > 0) {
+            String word = token.toString();
+            switch (currElement) {
+            case ELEM_CLASSES:
+                consumer.addClass(word);
+                break;
+            case ELEM_EXCEPTIONS:
+                // NOTE(review): unlike characters(), the exception list
+                // is not cleared after it has been handed over here.
+                exception.add(word);
+                exception = normalizeException(exception);
+                consumer.addException(getExceptionWord(exception),
+                                      (ArrayList)exception.clone());
+                break;
+            case ELEM_PATTERNS:
+                consumer.addPattern(getPattern(word),
+                                    getInterletterValues(word));
+                break;
+            case ELEM_HYPHEN:
+                // nothing to do
+                break;
+            }
+            if (currElement != ELEM_HYPHEN) {
+                token.setLength(0);
+            }
+        }
+        if (currElement == ELEM_HYPHEN) {
+            currElement = ELEM_EXCEPTIONS;
+        } else {
+            currElement = 0;
+        }
+
+    }
+
+    /**
+     * Characters. Splits the character data into tokens and forwards
+     * every completed token to the consumer according to the current
+     * element.
+     */
+    public void characters(char ch[], int start, int length) {
+        StringBuffer chars = new StringBuffer(length);
+        chars.append(ch, start, length);
+        String word = readToken(chars);
+        while (word != null) {
+            // System.out.println("\"" + word + "\"");
+            switch (currElement) {
+            case ELEM_CLASSES:
+                consumer.addClass(word);
+                break;
+            case ELEM_EXCEPTIONS:
+                exception.add(word);
+                exception = normalizeException(exception);
+                consumer.addException(getExceptionWord(exception),
+                                      (ArrayList)exception.clone());
+                exception.clear();
+                break;
+            case ELEM_PATTERNS:
+                consumer.addPattern(getPattern(word),
+                                    getInterletterValues(word));
+                break;
+            }
+            word = readToken(chars);
+        }
+
+    }
+
+    //
+    // ErrorHandler methods
+    //
+
+    /**
+     * Warning. Only records the message in <code>errMsg</code>.
+     */
+    public void warning(SAXParseException ex) {
+        errMsg = "[Warning] " + getLocationString(ex) + ": "
+                 + ex.getMessage();
+    }
+
+    /**
+     * Error. Only records the message in <code>errMsg</code>.
+     */
+    public void error(SAXParseException ex) {
+        errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
+    }
+
+    /**
+     * Fatal error. Records the message in <code>errMsg</code> and
+     * rethrows the exception.
+     */
+    public void fatalError(SAXParseException ex) throws SAXException {
+        errMsg = "[Fatal Error] " + getLocationString(ex) + ": "
+                 + ex.getMessage();
+        throw ex;
+    }
+
+    /**
+     * Returns a string of the location, in the form
+     * <code>file:line:column</code>, where file is the last path
+     * segment of the system id (omitted if there is no system id).
+     */
+    private String getLocationString(SAXParseException ex) {
+        StringBuffer str = new StringBuffer();
+
+        String systemId = ex.getSystemId();
+        if (systemId != null) {
+            int index = systemId.lastIndexOf('/');
+            if (index != -1) {
+                systemId = systemId.substring(index + 1);
+            }
+            str.append(systemId);
+        }
+        str.append(':');
+        str.append(ex.getLineNumber());
+        str.append(':');
+        str.append(ex.getColumnNumber());
+
+        return str.toString();
+
+    }    // getLocationString(SAXParseException):String
+
+
+    // PatternConsumer implementation for testing purposes
+    public void addClass(String c) {
+        System.out.println("class: " + c);
+    }
+
+    public void addException(String w, ArrayList e) {
+        System.out.println("exception: " + w + " : " + e.toString());
+    }
+
+    public void addPattern(String p, String v) {
+        System.out.println("pattern: " + p + " : " + v);
+    }
+
+    /**
+     * Test driver: parses the file named by the first argument and
+     * prints every class, exception and pattern found.
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length > 0) {
+            PatternParser pp = new PatternParser();
+            pp.setConsumer(pp);
+            pp.parse(args[0]);
+        }
+    }
+
+}
diff --git a/src/java/org/apache/fop/hyphenation/TernaryTree.java b/src/java/org/apache/fop/hyphenation/TernaryTree.java
new file mode 100644
index 000000000..d026dc858
--- /dev/null
+++ b/src/java/org/apache/fop/hyphenation/TernaryTree.java
@@ -0,0 +1,669 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +/* $Id$ */ + +package org.apache.fop.hyphenation; + +import java.util.Enumeration; +import java.util.Stack; +import java.io.Serializable; + +/** + *

Ternary Search Tree.

+ * + *

A ternary search tree is a hybrid between a binary tree and + * a digital search tree (trie). Keys are limited to strings. + * A data value of type char is stored in each leaf node. + * It can be used as an index (or pointer) to the data. + * Branches that only contain one key are compressed to one node + * by storing a pointer to the trailer substring of the key. + * This class is intended to serve as base class or helper class + * to implement Dictionary collections or the like. Ternary trees + * have some nice properties, such as the following: the tree can be + * traversed in sorted order, partial matches (wildcard) can be + * implemented, retrieval of all keys within a given distance + * from the target, etc. The storage requirements are higher than + * a binary tree but a lot less than a trie. Performance is + * comparable with a hash table, sometimes it outperforms a hash + * function (most of the time can determine a miss faster than a hash).

+ * + *

The main purpose of this java port is to serve as a base for + * implementing TeX's hyphenation algorithm (see The TeXBook, + * appendix H). Each language requires from 5000 to 15000 hyphenation + * patterns which will be keys in this tree. The string patterns + * are usually small (from 2 to 5 characters), but each char in the + * tree is stored in a node. Thus memory usage is the main concern. + * We will sacrifice 'elegance' to keep memory requirements to the + * minimum. Using java's char type as pointer (yes, I know pointer + * is a forbidden word in java) we can keep the size of the node + * to be just 8 bytes (3 pointers and the data char). This gives + * room for about 65000 nodes. In my tests the english patterns + * took 7694 nodes and the german patterns 10055 nodes, + * so I think we are safe.

+ * + *

All said, this is a map with strings as keys and char as value. + * Pretty limited!. It can be extended to a general map by + * using the string representation of an object and using the + * char value as an index to an array that contains the object + * values.

+ * + * @author cav@uniscope.co.jp + */ + +public class TernaryTree implements Cloneable, Serializable { + + /** + * We use 4 arrays to represent a node. I guess I should have created + * a proper node class, but somehow Knuth's pascal code made me forget + * we now have a portable language with virtual memory management and + * automatic garbage collection! And now is kind of late, furthermore, + * if it ain't broken, don't fix it. + */ + + /** + * Pointer to low branch and to rest of the key when it is + * stored directly in this node, we don't have unions in java! + */ + protected char[] lo; + + /** + * Pointer to high branch. + */ + protected char[] hi; + + /** + * Pointer to equal branch and to data when this node is a string terminator. + */ + protected char[] eq; + + /** + *

The character stored in this node: splitchar. + * Two special values are reserved:

+ *
  • 0x0000 as string terminator
  • + *
  • 0xFFFF to indicate that the branch starting at + * this node is compressed
+ *

This shouldn't be a problem if we give the usual semantics to + * strings since 0xFFFF is guaranteed not to be a Unicode character.

+ */ + protected char[] sc; + + /** + * This vector holds the trailing of the keys when the branch is compressed. + */ + protected CharVector kv; + + protected char root; + protected char freenode; + protected int length; // number of items in tree + + protected static final int BLOCK_SIZE = 2048; // allocation size for arrays + + TernaryTree() { + init(); + } + + protected void init() { + root = 0; + freenode = 1; + length = 0; + lo = new char[BLOCK_SIZE]; + hi = new char[BLOCK_SIZE]; + eq = new char[BLOCK_SIZE]; + sc = new char[BLOCK_SIZE]; + kv = new CharVector(); + } + + /** + * Branches are initially compressed, needing + * one node per key plus the size of the string + * key. They are decompressed as needed when + * another key with same prefix + * is inserted. This saves a lot of space, + * specially for long keys. + */ + public void insert(String key, char val) { + // make sure we have enough room in the arrays + int len = key.length() + + 1; // maximum number of nodes that may be generated + if (freenode + len > eq.length) { + redimNodeArrays(eq.length + BLOCK_SIZE); + } + char strkey[] = new char[len--]; + key.getChars(0, len, strkey, 0); + strkey[len] = 0; + root = insert(root, strkey, 0, val); + } + + public void insert(char[] key, int start, char val) { + int len = strlen(key) + 1; + if (freenode + len > eq.length) { + redimNodeArrays(eq.length + BLOCK_SIZE); + } + root = insert(root, key, start, val); + } + + /** + * The actual insertion function, recursive version. + */ + private char insert(char p, char[] key, int start, char val) { + int len = strlen(key, start); + if (p == 0) { + // this means there is no branch, this node will start a new branch. 
+ // Instead of doing that, we store the key somewhere else and create + // only one node with a pointer to the key + p = freenode++; + eq[p] = val; // holds data + length++; + hi[p] = 0; + if (len > 0) { + sc[p] = 0xFFFF; // indicates branch is compressed + lo[p] = (char)kv.alloc(len + + 1); // use 'lo' to hold pointer to key + strcpy(kv.getArray(), lo[p], key, start); + } else { + sc[p] = 0; + lo[p] = 0; + } + return p; + } + + if (sc[p] == 0xFFFF) { + // branch is compressed: need to decompress + // this will generate garbage in the external key array + // but we can do some garbage collection later + char pp = freenode++; + lo[pp] = lo[p]; // previous pointer to key + eq[pp] = eq[p]; // previous pointer to data + lo[p] = 0; + if (len > 0) { + sc[p] = kv.get(lo[pp]); + eq[p] = pp; + lo[pp]++; + if (kv.get(lo[pp]) == 0) { + // key completly decompressed leaving garbage in key array + lo[pp] = 0; + sc[pp] = 0; + hi[pp] = 0; + } else { + // we only got first char of key, rest is still there + sc[pp] = 0xFFFF; + } + } else { + // In this case we can save a node by swapping the new node + // with the compressed node + sc[pp] = 0xFFFF; + hi[p] = pp; + sc[p] = 0; + eq[p] = val; + length++; + return p; + } + } + char s = key[start]; + if (s < sc[p]) { + lo[p] = insert(lo[p], key, start, val); + } else if (s == sc[p]) { + if (s != 0) { + eq[p] = insert(eq[p], key, start + 1, val); + } else { + // key already in tree, overwrite data + eq[p] = val; + } + } else { + hi[p] = insert(hi[p], key, start, val); + } + return p; + } + + /** + * Compares 2 null terminated char arrays + */ + public static int strcmp(char[] a, int startA, char[] b, int startB) { + for (; a[startA] == b[startB]; startA++, startB++) { + if (a[startA] == 0) { + return 0; + } + } + return a[startA] - b[startB]; + } + + /** + * Compares a string with null terminated char array + */ + public static int strcmp(String str, char[] a, int start) { + int i, d, len = str.length(); + for (i = 0; i < len; i++) { + 
d = (int)str.charAt(i) - a[start + i]; + if (d != 0) { + return d; + } + if (a[start + i] == 0) { + return d; + } + } + if (a[start + i] != 0) { + return (int)-a[start + i]; + } + return 0; + + } + + public static void strcpy(char[] dst, int di, char[] src, int si) { + while (src[si] != 0) { + dst[di++] = src[si++]; + } + dst[di] = 0; + } + + public static int strlen(char[] a, int start) { + int len = 0; + for (int i = start; i < a.length && a[i] != 0; i++) { + len++; + } + return len; + } + + public static int strlen(char[] a) { + return strlen(a, 0); + } + + public int find(String key) { + int len = key.length(); + char strkey[] = new char[len + 1]; + key.getChars(0, len, strkey, 0); + strkey[len] = 0; + + return find(strkey, 0); + } + + public int find(char[] key, int start) { + int d; + char p = root; + int i = start; + char c; + + while (p != 0) { + if (sc[p] == 0xFFFF) { + if (strcmp(key, i, kv.getArray(), lo[p]) == 0) { + return eq[p]; + } else { + return -1; + } + } + c = key[i]; + d = c - sc[p]; + if (d == 0) { + if (c == 0) { + return eq[p]; + } + i++; + p = eq[p]; + } else if (d < 0) { + p = lo[p]; + } else { + p = hi[p]; + } + } + return -1; + } + + public boolean knows(String key) { + return (find(key) >= 0); + } + + // redimension the arrays + private void redimNodeArrays(int newsize) { + int len = newsize < lo.length ? 
newsize : lo.length; + char[] na = new char[newsize]; + System.arraycopy(lo, 0, na, 0, len); + lo = na; + na = new char[newsize]; + System.arraycopy(hi, 0, na, 0, len); + hi = na; + na = new char[newsize]; + System.arraycopy(eq, 0, na, 0, len); + eq = na; + na = new char[newsize]; + System.arraycopy(sc, 0, na, 0, len); + sc = na; + } + + public int size() { + return length; + } + + public Object clone() { + TernaryTree t = new TernaryTree(); + t.lo = (char[])this.lo.clone(); + t.hi = (char[])this.hi.clone(); + t.eq = (char[])this.eq.clone(); + t.sc = (char[])this.sc.clone(); + t.kv = (CharVector)this.kv.clone(); + t.root = this.root; + t.freenode = this.freenode; + t.length = this.length; + + return t; + } + + /** + * Recursively insert the median first and then the median of the + * lower and upper halves, and so on in order to get a balanced + * tree. The array of keys is assumed to be sorted in ascending + * order. + */ + protected void insertBalanced(String[] k, char[] v, int offset, int n) { + int m; + if (n < 1) { + return; + } + m = n >> 1; + + insert(k[m + offset], v[m + offset]); + insertBalanced(k, v, offset, m); + + insertBalanced(k, v, offset + m + 1, n - m - 1); + } + + + /** + * Balance the tree for best search performance + */ + public void balance() { + // System.out.print("Before root splitchar = "); System.out.println(sc[root]); + + int i = 0, n = length; + String[] k = new String[n]; + char[] v = new char[n]; + Iterator iter = new Iterator(); + while (iter.hasMoreElements()) { + v[i] = iter.getValue(); + k[i++] = (String)iter.nextElement(); + } + init(); + insertBalanced(k, v, 0, n); + + // With uniform letter distribution sc[root] should be around 'm' + // System.out.print("After root splitchar = "); System.out.println(sc[root]); + } + + /** + * Each node stores a character (splitchar) which is part of + * some key(s). 
In a compressed branch (one that only contain + * a single string key) the trailer of the key which is not + * already in nodes is stored externally in the kv array. + * As items are inserted, key substrings decrease. + * Some substrings may completely disappear when the whole + * branch is totally decompressed. + * The tree is traversed to find the key substrings actually + * used. In addition, duplicate substrings are removed using + * a map (implemented with a TernaryTree!). + * + */ + public void trimToSize() { + // first balance the tree for best performance + balance(); + + // redimension the node arrays + redimNodeArrays(freenode); + + // ok, compact kv array + CharVector kx = new CharVector(); + kx.alloc(1); + TernaryTree map = new TernaryTree(); + compact(kx, map, root); + kv = kx; + kv.trimToSize(); + } + + private void compact(CharVector kx, TernaryTree map, char p) { + int k; + if (p == 0) { + return; + } + if (sc[p] == 0xFFFF) { + k = map.find(kv.getArray(), lo[p]); + if (k < 0) { + k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1); + strcpy(kx.getArray(), k, kv.getArray(), lo[p]); + map.insert(kx.getArray(), k, (char)k); + } + lo[p] = (char)k; + } else { + compact(kx, map, lo[p]); + if (sc[p] != 0) { + compact(kx, map, eq[p]); + } + compact(kx, map, hi[p]); + } + } + + + public Enumeration keys() { + return new Iterator(); + } + + public class Iterator implements Enumeration { + + /** + * current node index + */ + int cur; + + /** + * current key + */ + String curkey; + + private class Item implements Cloneable { + char parent; + char child; + + public Item() { + parent = 0; + child = 0; + } + + public Item(char p, char c) { + parent = p; + child = c; + } + + public Object clone() { + return new Item(parent, child); + } + + } + + /** + * Node stack + */ + Stack ns; + + /** + * key stack implemented with a StringBuffer + */ + StringBuffer ks; + + public Iterator() { + cur = -1; + ns = new Stack(); + ks = new StringBuffer(); + rewind(); + } + + public void 
rewind() { + ns.removeAllElements(); + ks.setLength(0); + cur = root; + run(); + } + + public Object nextElement() { + String res = new String(curkey); + cur = up(); + run(); + return res; + } + + public char getValue() { + if (cur >= 0) { + return eq[cur]; + } + return 0; + } + + public boolean hasMoreElements() { + return (cur != -1); + } + + /** + * traverse upwards + */ + private int up() { + Item i = new Item(); + int res = 0; + + if (ns.empty()) { + return -1; + } + + if (cur != 0 && sc[cur] == 0) { + return lo[cur]; + } + + boolean climb = true; + + while (climb) { + i = (Item)ns.pop(); + i.child++; + switch (i.child) { + case 1: + if (sc[i.parent] != 0) { + res = eq[i.parent]; + ns.push(i.clone()); + ks.append(sc[i.parent]); + } else { + i.child++; + ns.push(i.clone()); + res = hi[i.parent]; + } + climb = false; + break; + + case 2: + res = hi[i.parent]; + ns.push(i.clone()); + if (ks.length() > 0) { + ks.setLength(ks.length() - 1); // pop + } + climb = false; + break; + + default: + if (ns.empty()) { + return -1; + } + climb = true; + break; + } + } + return res; + } + + /** + * traverse the tree to find next key + */ + private int run() { + if (cur == -1) { + return -1; + } + + boolean leaf = false; + while (true) { + // first go down on low branch until leaf or compressed branch + while (cur != 0) { + if (sc[cur] == 0xFFFF) { + leaf = true; + break; + } + ns.push(new Item((char)cur, '\u0000')); + if (sc[cur] == 0) { + leaf = true; + break; + } + cur = lo[cur]; + } + if (leaf) { + break; + } + // nothing found, go up one node and try again + cur = up(); + if (cur == -1) { + return -1; + } + } + // The current node should be a data node and + // the key should be in the key stack (at least partially) + StringBuffer buf = new StringBuffer(ks.toString()); + if (sc[cur] == 0xFFFF) { + int p = lo[cur]; + while (kv.get(p) != 0) { + buf.append(kv.get(p++)); + } + } + curkey = buf.toString(); + return 0; + } + + } + + public void printStats() { + 
System.out.println("Number of keys = " + Integer.toString(length)); + System.out.println("Node count = " + Integer.toString(freenode)); + // System.out.println("Array length = " + Integer.toString(eq.length)); + System.out.println("Key Array length = " + + Integer.toString(kv.length())); + + /* + * for(int i=0; i - */ -public class ByteVector implements Serializable { - - /** - * Capacity increment size - */ - private static final int DEFAULT_BLOCK_SIZE = 2048; - private int blockSize; - - /** - * The encapsulated array - */ - private byte[] array; - - /** - * Points to next free item - */ - private int n; - - public ByteVector() { - this(DEFAULT_BLOCK_SIZE); - } - - public ByteVector(int capacity) { - if (capacity > 0) { - blockSize = capacity; - } else { - blockSize = DEFAULT_BLOCK_SIZE; - } - array = new byte[blockSize]; - n = 0; - } - - public ByteVector(byte[] a) { - blockSize = DEFAULT_BLOCK_SIZE; - array = a; - n = 0; - } - - public ByteVector(byte[] a, int capacity) { - if (capacity > 0) { - blockSize = capacity; - } else { - blockSize = DEFAULT_BLOCK_SIZE; - } - array = a; - n = 0; - } - - public byte[] getArray() { - return array; - } - - /** - * return number of items in array - */ - public int length() { - return n; - } - - /** - * returns current capacity of array - */ - public int capacity() { - return array.length; - } - - public void put(int index, byte val) { - array[index] = val; - } - - public byte get(int index) { - return array[index]; - } - - /** - * This is to implement memory allocation in the array. Like malloc(). 
- */ - public int alloc(int size) { - int index = n; - int len = array.length; - if (n + size >= len) { - byte[] aux = new byte[len + blockSize]; - System.arraycopy(array, 0, aux, 0, len); - array = aux; - } - n += size; - return index; - } - - public void trimToSize() { - if (n < array.length) { - byte[] aux = new byte[n]; - System.arraycopy(array, 0, aux, 0, n); - array = aux; - } - } - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/CharVector.java b/src/java/org/apache/fop/layout/hyphenation/CharVector.java deleted file mode 100644 index 181191edb..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/CharVector.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -import java.io.Serializable; - -/** - * This class implements a simple char vector with access to the - * underlying array. 
- * - * @author Carlos Villegas - */ -public class CharVector implements Cloneable, Serializable { - - /** - * Capacity increment size - */ - private static final int DEFAULT_BLOCK_SIZE = 2048; - private int blockSize; - - /** - * The encapsulated array - */ - private char[] array; - - /** - * Points to next free item - */ - private int n; - - public CharVector() { - this(DEFAULT_BLOCK_SIZE); - } - - public CharVector(int capacity) { - if (capacity > 0) { - blockSize = capacity; - } else { - blockSize = DEFAULT_BLOCK_SIZE; - } - array = new char[blockSize]; - n = 0; - } - - public CharVector(char[] a) { - blockSize = DEFAULT_BLOCK_SIZE; - array = a; - n = a.length; - } - - public CharVector(char[] a, int capacity) { - if (capacity > 0) { - blockSize = capacity; - } else { - blockSize = DEFAULT_BLOCK_SIZE; - } - array = a; - n = a.length; - } - - /** - * Reset Vector but don't resize or clear elements - */ - public void clear() { - n = 0; - } - - public Object clone() { - CharVector cv = new CharVector((char[])array.clone(), blockSize); - cv.n = this.n; - return cv; - } - - public char[] getArray() { - return array; - } - - /** - * return number of items in array - */ - public int length() { - return n; - } - - /** - * returns current capacity of array - */ - public int capacity() { - return array.length; - } - - public void put(int index, char val) { - array[index] = val; - } - - public char get(int index) { - return array[index]; - } - - public int alloc(int size) { - int index = n; - int len = array.length; - if (n + size >= len) { - char[] aux = new char[len + blockSize]; - System.arraycopy(array, 0, aux, 0, len); - array = aux; - } - n += size; - return index; - } - - public void trimToSize() { - if (n < array.length) { - char[] aux = new char[n]; - System.arraycopy(array, 0, aux, 0, n); - array = aux; - } - } - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/Hyphen.java b/src/java/org/apache/fop/layout/hyphenation/Hyphen.java deleted file mode 
100644 index cbf506ade..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/Hyphen.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -import java.io.Serializable; - -/** - * This class represents a hyphen. A 'full' hyphen is made of 3 parts: - * the pre-break text, post-break text and no-break. If no line-break - * is generated at this position, the no-break text is used, otherwise, - * pre-break and post-break are used. Typically, pre-break is equal to - * the hyphen character and the others are empty. However, this general - * scheme allows support for cases in some languages where words change - * spelling if they're split across lines, like german's 'backen' which - * hyphenates 'bak-ken'. BTW, this comes from TeX. 
- * - * @author Carlos Villegas - */ - -public class Hyphen implements Serializable { - public String preBreak; - public String noBreak; - public String postBreak; - - Hyphen(String pre, String no, String post) { - preBreak = pre; - noBreak = no; - postBreak = post; - } - - Hyphen(String pre) { - preBreak = pre; - noBreak = null; - postBreak = null; - } - - public String toString() { - if (noBreak == null - && postBreak == null - && preBreak != null - && preBreak.equals("-")) { - return "-"; - } - StringBuffer res = new StringBuffer("{"); - res.append(preBreak); - res.append("}{"); - res.append(postBreak); - res.append("}{"); - res.append(noBreak); - res.append('}'); - return res.toString(); - } - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/Hyphenation.java b/src/java/org/apache/fop/layout/hyphenation/Hyphenation.java deleted file mode 100644 index 4183ceef0..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/Hyphenation.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -/** - * This class represents a hyphenated word. 
- * - * @author Carlos Villegas - */ -public class Hyphenation { - - private int[] hyphenPoints; - private String word; - - /** - * number of hyphenation points in word - */ - private int len; - - /** - * rawWord as made of alternating strings and {@link Hyphen Hyphen} - * instances - */ - Hyphenation(String word, int[] points) { - this.word = word; - hyphenPoints = points; - len = points.length; - } - - /** - * @return the number of hyphenation points in the word - */ - public int length() { - return len; - } - - /** - * @return the pre-break text, not including the hyphen character - */ - public String getPreHyphenText(int index) { - return word.substring(0, hyphenPoints[index]); - } - - /** - * @return the post-break text - */ - public String getPostHyphenText(int index) { - return word.substring(hyphenPoints[index]); - } - - /** - * @return the hyphenation points - */ - public int[] getHyphenationPoints() { - return hyphenPoints; - } - - public String toString() { - StringBuffer str = new StringBuffer(); - int start = 0; - for (int i = 0; i < len; i++) { - str.append(word.substring(start, hyphenPoints[i]) + "-"); - start = hyphenPoints[i]; - } - str.append(word.substring(start)); - return str.toString(); - } - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/HyphenationException.java b/src/java/org/apache/fop/layout/hyphenation/HyphenationException.java deleted file mode 100644 index 9276fe5fa..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/HyphenationException.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -/** - * @author Carlos Villegas - * (todo) Derive from FOPException - */ -public class HyphenationException extends Exception { - - /** - * @see java.lang.Throwable#Throwable(String) - */ - public HyphenationException(String msg) { - super(msg); - } - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/HyphenationTree.java b/src/java/org/apache/fop/layout/hyphenation/HyphenationTree.java deleted file mode 100644 index 0a39e84b5..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/HyphenationTree.java +++ /dev/null @@ -1,584 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashMap; - -/** - * This tree structure stores the hyphenation patterns in an efficient - * way for fast lookup. It provides the provides the method to - * hyphenate a word. - * - * @author Carlos Villegas - */ -public class HyphenationTree extends TernaryTree - implements PatternConsumer, Serializable { - - /** - * value space: stores the inteletter values - */ - protected ByteVector vspace; - - /** - * This map stores hyphenation exceptions - */ - protected HashMap stoplist; - - /** - * This map stores the character classes - */ - protected TernaryTree classmap; - - /** - * Temporary map to store interletter values on pattern loading. - */ - private transient TernaryTree ivalues; - - public HyphenationTree() { - stoplist = new HashMap(23); // usually a small table - classmap = new TernaryTree(); - vspace = new ByteVector(); - vspace.alloc(1); // this reserves index 0, which we don't use - } - - /** - * Packs the values by storing them in 4 bits, two values into a byte - * Values range is from 0 to 9. We use zero as terminator, - * so we'll add 1 to the value. - * @param values a string of digits from '0' to '9' representing the - * interletter values. - * @return the index into the vspace array where the packed values - * are stored. - */ - protected int packValues(String values) { - int i, n = values.length(); - int m = (n & 1) == 1 ? 
(n >> 1) + 2 : (n >> 1) + 1; - int offset = vspace.alloc(m); - byte[] va = vspace.getArray(); - for (i = 0; i < n; i++) { - int j = i >> 1; - byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f); - if ((i & 1) == 1) { - va[j + offset] = (byte)(va[j + offset] | v); - } else { - va[j + offset] = (byte)(v << 4); // big endian - } - } - va[m - 1 + offset] = 0; // terminator - return offset; - } - - protected String unpackValues(int k) { - StringBuffer buf = new StringBuffer(); - byte v = vspace.get(k++); - while (v != 0) { - char c = (char)((v >>> 4) - 1 + '0'); - buf.append(c); - c = (char)(v & 0x0f); - if (c == 0) { - break; - } - c = (char)(c - 1 + '0'); - buf.append(c); - v = vspace.get(k++); - } - return buf.toString(); - } - - /** - * Read hyphenation patterns from an XML file. - */ - public void loadPatterns(String filename) throws HyphenationException { - PatternParser pp = new PatternParser(this); - ivalues = new TernaryTree(); - - pp.parse(filename); - - // patterns/values should be now in the tree - // let's optimize a bit - trimToSize(); - vspace.trimToSize(); - classmap.trimToSize(); - - // get rid of the auxiliary map - ivalues = null; - } - - public String findPattern(String pat) { - int k = super.find(pat); - if (k >= 0) { - return unpackValues(k); - } - return ""; - } - - /** - * String compare, returns 0 if equal or - * t is a substring of s - */ - protected int hstrcmp(char[] s, int si, char[] t, int ti) { - for (; s[si] == t[ti]; si++, ti++) { - if (s[si] == 0) { - return 0; - } - } - if (t[ti] == 0) { - return 0; - } - return s[si] - t[ti]; - } - - protected byte[] getValues(int k) { - StringBuffer buf = new StringBuffer(); - byte v = vspace.get(k++); - while (v != 0) { - char c = (char)((v >>> 4) - 1); - buf.append(c); - c = (char)(v & 0x0f); - if (c == 0) { - break; - } - c = (char)(c - 1); - buf.append(c); - v = vspace.get(k++); - } - byte[] res = new byte[buf.length()]; - for (int i = 0; i < res.length; i++) { - res[i] = (byte)buf.charAt(i); - 
} - return res; - } - - /** - *

Search for all possible partial matches of word starting - * at index an update interletter values. In other words, it - * does something like:

- * - * for(i=0; i - *

But it is done in an efficient way since the patterns are - * stored in a ternary tree. In fact, this is the whole purpose - * of having the tree: doing this search without having to test - * every single pattern. The number of patterns for languages - * such as English range from 4000 to 10000. Thus, doing thousands - * of string comparisons for each word to hyphenate would be - * really slow without the tree. The tradeoff is memory, but - * using a ternary tree instead of a trie, almost halves the - * the memory used by Lout or TeX. It's also faster than using - * a hash table

- * @param word null terminated word to match - * @param index start index from word - * @param il interletter values array to update - */ - protected void searchPatterns(char[] word, int index, byte[] il) { - byte[] values; - int i = index; - char p, q; - char sp = word[i]; - p = root; - - while (p > 0 && p < sc.length) { - if (sc[p] == 0xFFFF) { - if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { - values = getValues(eq[p]); // data pointer is in eq[] - int j = index; - for (int k = 0; k < values.length; k++) { - if (j < il.length && values[k] > il[j]) { - il[j] = values[k]; - } - j++; - } - } - return; - } - int d = sp - sc[p]; - if (d == 0) { - if (sp == 0) { - break; - } - sp = word[++i]; - p = eq[p]; - q = p; - - // look for a pattern ending at this position by searching for - // the null char ( splitchar == 0 ) - while (q > 0 && q < sc.length) { - if (sc[q] == 0xFFFF) { // stop at compressed branch - break; - } - if (sc[q] == 0) { - values = getValues(eq[q]); - int j = index; - for (int k = 0; k < values.length; k++) { - if (j < il.length && values[k] > il[j]) { - il[j] = values[k]; - } - j++; - } - break; - } else { - q = lo[q]; - - /** - * actually the code should be: - * q = sc[q] < 0 ? hi[q] : lo[q]; - * but java chars are unsigned - */ - } - } - } else { - p = d < 0 ? lo[p] : hi[p]; - } - } - } - - /** - * Hyphenate word and return a Hyphenation object. - * @param word the word to be hyphenated - * @param remainCharCount Minimum number of characters allowed - * before the hyphenation point. - * @param pushCharCount Minimum number of characters allowed after - * the hyphenation point. - * @return a {@link Hyphenation Hyphenation} object representing - * the hyphenated word or null if word is not hyphenated. 
- */ - public Hyphenation hyphenate(String word, int remainCharCount, - int pushCharCount) { - char[] w = word.toCharArray(); - return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); - } - - /** - * w = "****nnllllllnnn*****", - * where n is a non-letter, l is a letter, - * all n may be absent, the first n is at offset, - * the first l is at offset + iIgnoreAtBeginning; - * word = ".llllll.'\0'***", - * where all l in w are copied into word. - * In the first part of the routine len = w.length, - * in the second part of the routine len = word.length. - * Three indices are used: - * index(w), the index in w, - * index(word), the index in word, - * letterindex(word), the index in the letter part of word. - * The following relations exist: - * index(w) = offset + i - 1 - * index(word) = i - iIgnoreAtBeginning - * letterindex(word) = index(word) - 1 - * (see first loop). - * It follows that: - * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning - * index(w) = letterindex(word) + offset + iIgnoreAtBeginning - */ - - /** - * Hyphenate word and return an array of hyphenation points. - * @param w char array that contains the word - * @param offset Offset to first character in word - * @param len Length of word - * @param remainCharCount Minimum number of characters allowed - * before the hyphenation point. - * @param pushCharCount Minimum number of characters allowed after - * the hyphenation point. - * @return a {@link Hyphenation Hyphenation} object representing - * the hyphenated word or null if word is not hyphenated. - */ - public Hyphenation hyphenate(char[] w, int offset, int len, - int remainCharCount, int pushCharCount) { - int i; - char[] word = new char[len + 3]; - - // normalize word - char[] c = new char[2]; - int iIgnoreAtBeginning = 0; - int iLength = len; - boolean bEndOfLetters = false; - for (i = 1; i <= len; i++) { - c[0] = w[offset + i - 1]; - int nc = classmap.find(c, 0); - if (nc < 0) { // found a non-letter character ... 
- if (i == (1 + iIgnoreAtBeginning)) { - // ... before any letter character - iIgnoreAtBeginning ++; - } else { - // ... after a letter character - bEndOfLetters = true; - } - iLength --; - } else { - if (!bEndOfLetters) { - word[i - iIgnoreAtBeginning] = (char)nc; - } else { - return null; - } - } - } - len = iLength; - if (len < (remainCharCount + pushCharCount)) { - // word is too short to be hyphenated - return null; - } - int[] result = new int[len + 1]; - int k = 0; - - // check exception list first - String sw = new String(word, 1, len); - if (stoplist.containsKey(sw)) { - // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null) - ArrayList hw = (ArrayList)stoplist.get(sw); - int j = 0; - for (i = 0; i < hw.size(); i++) { - Object o = hw.get(i); - // j = index(sw) = letterindex(word)? - // result[k] = corresponding index(w) - if (o instanceof String) { - j += ((String)o).length(); - if (j >= remainCharCount && j < (len - pushCharCount)) { - result[k++] = j + iIgnoreAtBeginning; - } - } - } - } else { - // use algorithm to get hyphenation points - word[0] = '.'; // word start marker - word[len + 1] = '.'; // word end marker - word[len + 2] = 0; // null terminated - byte[] il = new byte[len + 3]; // initialized to zero - for (i = 0; i < len + 1; i++) { - searchPatterns(word, i, il); - } - - // hyphenation points are located where interletter value is odd - // i is letterindex(word), - // i + 1 is index(word), - // result[k] = corresponding index(w) - for (i = 0; i < len; i++) { - if (((il[i + 1] & 1) == 1) && i >= remainCharCount - && i <= (len - pushCharCount)) { - result[k++] = i + iIgnoreAtBeginning; - } - } - } - - - if (k > 0) { - // trim result array - int[] res = new int[k]; - System.arraycopy(result, 0, res, 0, k); - return new Hyphenation(new String(w, offset, len), res); - } else { - return null; - } - } - - /** - * Add a character class to the tree. 
It is used by - * {@link PatternParser PatternParser} as callback to - * add character classes. Character classes define the - * valid word characters for hyphenation. If a word contains - * a character not defined in any of the classes, it is not hyphenated. - * It also defines a way to normalize the characters in order - * to compare them with the stored patterns. Usually pattern - * files use only lower case characters, in this case a class - * for letter 'a', for example, should be defined as "aA", the first - * character being the normalization char. - */ - public void addClass(String chargroup) { - if (chargroup.length() > 0) { - char equivChar = chargroup.charAt(0); - char[] key = new char[2]; - key[1] = 0; - for (int i = 0; i < chargroup.length(); i++) { - key[0] = chargroup.charAt(i); - classmap.insert(key, 0, equivChar); - } - } - } - - /** - * Add an exception to the tree. It is used by - * {@link PatternParser PatternParser} class as callback to - * store the hyphenation exceptions. - * @param word normalized word - * @param hyphenatedword a vector of alternating strings and - * {@link Hyphen hyphen} objects. - */ - public void addException(String word, ArrayList hyphenatedword) { - stoplist.put(word, hyphenatedword); - } - - /** - * Add a pattern to the tree. Mainly, to be used by - * {@link PatternParser PatternParser} class as callback to - * add a pattern to the tree. - * @param pattern the hyphenation pattern - * @param ivalue interletter weight values indicating the - * desirability and priority of hyphenating at a given point - * within the pattern. It should contain only digit characters. - * (i.e. '0' to '9'). 
- */ - public void addPattern(String pattern, String ivalue) { - int k = ivalues.find(ivalue); - if (k <= 0) { - k = packValues(ivalue); - ivalues.insert(ivalue, (char)k); - } - insert(pattern, (char)k); - } - - public void printStats() { - System.out.println("Value space size = " - + Integer.toString(vspace.length())); - super.printStats(); - - } - - public static void main(String[] argv) throws Exception { - HyphenationTree ht = null; - int minCharCount = 2; - BufferedReader in = - new BufferedReader(new java.io.InputStreamReader(System.in)); - while (true) { - System.out.print("l:\tload patterns from XML\n" - + "L:\tload patterns from serialized object\n" - + "s:\tset minimun character count\n" - + "w:\twrite hyphenation tree to object file\n" - + "h:\thyphenate\n" - + "f:\tfind pattern\n" - + "b:\tbenchmark\n" - + "q:\tquit\n\n" - + "Command:"); - String token = in.readLine().trim(); - if (token.equals("f")) { - System.out.print("Pattern: "); - token = in.readLine().trim(); - System.out.println("Values: " + ht.findPattern(token)); - } else if (token.equals("s")) { - System.out.print("Minimun value: "); - token = in.readLine().trim(); - minCharCount = Integer.parseInt(token); - } else if (token.equals("l")) { - ht = new HyphenationTree(); - System.out.print("XML file name: "); - token = in.readLine().trim(); - ht.loadPatterns(token); - } else if (token.equals("L")) { - ObjectInputStream ois = null; - System.out.print("Object file name: "); - token = in.readLine().trim(); - try { - ois = new ObjectInputStream(new FileInputStream(token)); - ht = (HyphenationTree)ois.readObject(); - } catch (Exception e) { - e.printStackTrace(); - } finally { - if (ois != null) { - try { - ois.close(); - } catch (IOException e) { - //ignore - } - } - } - } else if (token.equals("w")) { - System.out.print("Object file name: "); - token = in.readLine().trim(); - ObjectOutputStream oos = null; - try { - oos = new ObjectOutputStream(new FileOutputStream(token)); - oos.writeObject(ht); 
- } catch (Exception e) { - e.printStackTrace(); - } finally { - if (oos != null) { - try { - oos.flush(); - } catch (IOException e) { - //ignore - } - try { - oos.close(); - } catch (IOException e) { - //ignore - } - } - } - } else if (token.equals("h")) { - System.out.print("Word: "); - token = in.readLine().trim(); - System.out.print("Hyphenation points: "); - System.out.println(ht.hyphenate(token, minCharCount, - minCharCount)); - } else if (token.equals("b")) { - if (ht == null) { - System.out.println("No patterns has been loaded."); - break; - } - System.out.print("Word list filename: "); - token = in.readLine().trim(); - long starttime = 0; - int counter = 0; - try { - BufferedReader reader = - new BufferedReader(new FileReader(token)); - String line; - - starttime = System.currentTimeMillis(); - while ((line = reader.readLine()) != null) { - // System.out.print("\nline: "); - Hyphenation hyp = ht.hyphenate(line, minCharCount, - minCharCount); - if (hyp != null) { - String hword = hyp.toString(); - // System.out.println(line); - // System.out.println(hword); - } else { - // System.out.println("No hyphenation"); - } - counter++; - } - } catch (Exception ioe) { - System.out.println("Exception " + ioe); - ioe.printStackTrace(); - } - long endtime = System.currentTimeMillis(); - long result = endtime - starttime; - System.out.println(counter + " words in " + result - + " Millisekunden hyphenated"); - - } else if (token.equals("q")) { - break; - } - } - - } - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/Hyphenator.java b/src/java/org/apache/fop/layout/hyphenation/Hyphenator.java deleted file mode 100644 index f6545680d..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/Hyphenator.java +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.util.Hashtable; - -/** - * This class is the main entry point to the hyphenation package. - * You can use only the static methods or create an instance. - * - * @author Carlos Villegas - */ -public class Hyphenator { - - /**@todo Don't use statics */ - private static Hashtable hyphenTrees = new Hashtable(); - - private HyphenationTree hyphenTree = null; - private int remainCharCount = 2; - private int pushCharCount = 2; - private static boolean errorDump = false; - - public Hyphenator(String lang, String country, int leftMin, - int rightMin) { - hyphenTree = getHyphenationTree(lang, country); - remainCharCount = leftMin; - pushCharCount = rightMin; - } - - public static HyphenationTree getHyphenationTree(String lang, - String country) { - String key = lang; - // check whether the country code has been used - if (country != null && !country.equals("none")) { - key += "_" + country; - } - // first try to find it in the cache - if (hyphenTrees.containsKey(key)) { - return (HyphenationTree)hyphenTrees.get(key); - } - if (hyphenTrees.containsKey(lang)) { - return (HyphenationTree)hyphenTrees.get(lang); - } - - HyphenationTree hTree = getFopHyphenationTree(key); - if (hTree == null) { - String hyphenDir = "/hyph"; - if (hyphenDir != null) { - hTree = getUserHyphenationTree(key, hyphenDir); - } 
- } - // put it into the pattern cache - if (hTree != null) { - hyphenTrees.put(key, hTree); - } else { - /**@todo Proper logging please */ - //log.error("Couldn't find hyphenation pattern " - // + key); - } - return hTree; - } - - private static InputStream getResourceStream(String key) { - InputStream is = null; - // Try to use Context Class Loader to load the properties file. - try { - java.lang.reflect.Method getCCL = - Thread.class.getMethod("getContextClassLoader", new Class[0]); - if (getCCL != null) { - ClassLoader contextClassLoader = - (ClassLoader)getCCL.invoke(Thread.currentThread(), - new Object[0]); - is = contextClassLoader.getResourceAsStream("hyph/" + key - + ".hyp"); - } - } catch (Exception e) { - //ignore, fallback further down - } - - if (is == null) { - is = Hyphenator.class.getResourceAsStream("/hyph/" + key - + ".hyp"); - } - - return is; - } - - public static HyphenationTree getFopHyphenationTree(String key) { - HyphenationTree hTree = null; - ObjectInputStream ois = null; - InputStream is = null; - try { - is = getResourceStream(key); - if (is == null) { - if (key.length() == 5) { - is = getResourceStream(key.substring(0, 2)); - if (is != null) { - //log.error("Couldn't find hyphenation pattern " - // + key - // + "\nusing general language pattern " - // + key.substring(0, 2) - // + " instead."); - } else { - if (errorDump) { - //log.error("Couldn't find precompiled " - // + "fop hyphenation pattern " - // + key + ".hyp"); - } - return null; - } - } else { - if (errorDump) { - //log.error("Couldn't find precompiled " - // + "fop hyphenation pattern " - // + key + ".hyp"); - } - return null; - } - } - ois = new ObjectInputStream(is); - hTree = (HyphenationTree)ois.readObject(); - } catch (Exception e) { - /**@todo proper logging please */ - e.printStackTrace(); - } finally { - if (ois != null) { - try { - ois.close(); - } catch (IOException e) { - //log.error("can't close hyphenation object stream"); - } - } - } - return hTree; - } - - /** 
- * load tree from serialized file or xml file - * using configuration settings - */ - public static HyphenationTree getUserHyphenationTree(String key, - String hyphenDir) { - HyphenationTree hTree = null; - // I use here the following convention. The file name specified in - // the configuration is taken as the base name. First we try - // name + ".hyp" assuming a serialized HyphenationTree. If that fails - // we try name + ".xml", assumming a raw hyphenation pattern file. - - // first try serialized object - File hyphenFile = new File(hyphenDir, key + ".hyp"); - if (hyphenFile.exists()) { - ObjectInputStream ois = null; - try { - ois = new ObjectInputStream(new BufferedInputStream( - new FileInputStream(hyphenFile))); - hTree = (HyphenationTree)ois.readObject(); - } catch (Exception e) { - /**@todo Proper logging please */ - e.printStackTrace(); - } finally { - if (ois != null) { - try { - ois.close(); - } catch (IOException e) { - //ignore - } - } - } - return hTree; - } else { - - // try the raw XML file - hyphenFile = new File(hyphenDir, key + ".xml"); - if (hyphenFile.exists()) { - hTree = new HyphenationTree(); - if (errorDump) { - //log.error("reading " + hyphenDir + key - // + ".xml"); - } - try { - hTree.loadPatterns(hyphenFile.getPath()); - if (errorDump) { - System.out.println("Stats: "); - hTree.printStats(); - } - return hTree; - } catch (HyphenationException ex) { - if (errorDump) { - //log.error("Can't load user patterns " - // + "from xml file " + hyphenDir - // + key + ".xml"); - } - return null; - } - } else { - if (errorDump) { - //log.error("Tried to load " - // + hyphenFile.toString() - // + "\nCannot find compiled nor xml file for " - // + "hyphenation pattern" + key); - } - return null; - } - } - } - - public static Hyphenation hyphenate(String lang, String country, - String word, int leftMin, - int rightMin) { - HyphenationTree hTree = getHyphenationTree(lang, country); - if (hTree == null) { - //log.error("Error building hyphenation tree 
for language " - // + lang); - return null; - } - return hTree.hyphenate(word, leftMin, rightMin); - } - - public static Hyphenation hyphenate(String lang, String country, - char[] word, int offset, int len, - int leftMin, int rightMin) { - HyphenationTree hTree = getHyphenationTree(lang, country); - if (hTree == null) { - //log.error("Error building hyphenation tree for language " - // + lang); - return null; - } - return hTree.hyphenate(word, offset, len, leftMin, rightMin); - } - - public void setMinRemainCharCount(int min) { - remainCharCount = min; - } - - public void setMinPushCharCount(int min) { - pushCharCount = min; - } - - public void setLanguage(String lang, String country) { - hyphenTree = getHyphenationTree(lang, country); - } - - public Hyphenation hyphenate(char[] word, int offset, int len) { - if (hyphenTree == null) { - return null; - } - return hyphenTree.hyphenate(word, offset, len, remainCharCount, - pushCharCount); - } - - public Hyphenation hyphenate(String word) { - if (hyphenTree == null) { - return null; - } - return hyphenTree.hyphenate(word, remainCharCount, pushCharCount); - } - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/PatternConsumer.java b/src/java/org/apache/fop/layout/hyphenation/PatternConsumer.java deleted file mode 100644 index 7db7fff71..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/PatternConsumer.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -import java.util.ArrayList; - -/** - * This interface is used to connect the XML pattern file parser to - * the hyphenation tree. - * - * @author Carlos Villegas - */ -public interface PatternConsumer { - - /** - * Add a character class. - * A character class defines characters that are considered - * equivalent for the purpose of hyphenation (e.g. "aA"). It - * usually means to ignore case. - * @param chargroup character group - */ - void addClass(String chargroup); - - /** - * Add a hyphenation exception. An exception replaces the - * result obtained by the algorithm for cases for which this - * fails or the user wants to provide his own hyphenation. - * A hyphenatedword is a vector of alternating String's and - * {@link Hyphen Hyphen} instances - */ - void addException(String word, ArrayList hyphenatedword); - - /** - * Add hyphenation patterns. - * @param pattern the pattern - * @param values interletter values expressed as a string of - * digit characters. - */ - void addPattern(String pattern, String values); - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/PatternParser.java b/src/java/org/apache/fop/layout/hyphenation/PatternParser.java deleted file mode 100644 index ec047d971..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/PatternParser.java +++ /dev/null @@ -1,433 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -// SAX -import org.xml.sax.XMLReader; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.SAXParseException; -import org.xml.sax.helpers.DefaultHandler; -import org.xml.sax.Attributes; - -// Java -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.net.URL; - -/** - * A SAX document handler to read and parse hyphenation patterns - * from a XML file. - * - * @author Carlos Villegas - */ -public class PatternParser extends DefaultHandler implements PatternConsumer { - - XMLReader parser; - int currElement; - PatternConsumer consumer; - StringBuffer token; - ArrayList exception; - char hyphenChar; - String errMsg; - - static final int ELEM_CLASSES = 1; - static final int ELEM_EXCEPTIONS = 2; - static final int ELEM_PATTERNS = 3; - static final int ELEM_HYPHEN = 4; - - public PatternParser() throws HyphenationException { - token = new StringBuffer(); - parser = createParser(); - parser.setContentHandler(this); - parser.setErrorHandler(this); - hyphenChar = '-'; // default - - } - - public PatternParser(PatternConsumer consumer) - throws HyphenationException { - this(); - this.consumer = consumer; - } - - public void setConsumer(PatternConsumer consumer) { - this.consumer = consumer; - } - - public void parse(String filename) throws HyphenationException { - InputSource uri = fileInputSource(filename); - - try { - parser.parse(uri); - } catch (SAXException e) { - throw new HyphenationException(errMsg); - } catch 
(IOException e) { - throw new HyphenationException(e.getMessage()); - } catch (NullPointerException e) { - throw new HyphenationException("SAX parser not available"); - } - } - - /** - * creates a SAX parser, using the value of org.xml.sax.parser - * defaulting to org.apache.xerces.parsers.SAXParser - * - * @return the created SAX parser - */ - static XMLReader createParser() throws HyphenationException { - String parserClassName = System.getProperty("org.xml.sax.parser"); - if (parserClassName == null) { - parserClassName = "org.apache.xerces.parsers.SAXParser"; - } - // System.out.println("using SAX parser " + parserClassName); - - try { - return (XMLReader)Class.forName(parserClassName).newInstance(); - } catch (ClassNotFoundException e) { - throw new HyphenationException("Could not find " - + parserClassName); - } catch (InstantiationException e) { - throw new HyphenationException("Could not instantiate " - + parserClassName); - } catch (IllegalAccessException e) { - throw new HyphenationException("Could not access " - + parserClassName); - } catch (ClassCastException e) { - throw new HyphenationException(parserClassName - + " is not a SAX driver"); - } - } - - /** - * create an InputSource from a file name - * - * @param filename the name of the file - * @return the InputSource created - */ - protected static InputSource fileInputSource(String filename) - throws HyphenationException { - - /* this code adapted from James Clark's in XT */ - File file = new File(filename); - String path = file.getAbsolutePath(); - String fSep = System.getProperty("file.separator"); - if (fSep != null && fSep.length() == 1) { - path = path.replace(fSep.charAt(0), '/'); - } - if (path.length() > 0 && path.charAt(0) != '/') { - path = '/' + path; - } - try { - return new InputSource(new URL("file", null, path).toString()); - } catch (java.net.MalformedURLException e) { - throw new HyphenationException("unexpected MalformedURLException"); - } - } - - protected String 
readToken(StringBuffer chars) { - String word; - boolean space = false; - int i; - for (i = 0; i < chars.length(); i++) { - if (Character.isWhitespace(chars.charAt(i))) { - space = true; - } else { - break; - } - } - if (space) { - // chars.delete(0,i); - for (int countr = i; countr < chars.length(); countr++) { - chars.setCharAt(countr - i, chars.charAt(countr)); - } - chars.setLength(chars.length() - i); - if (token.length() > 0) { - word = token.toString(); - token.setLength(0); - return word; - } - } - space = false; - for (i = 0; i < chars.length(); i++) { - if (Character.isWhitespace(chars.charAt(i))) { - space = true; - break; - } - } - token.append(chars.toString().substring(0, i)); - // chars.delete(0,i); - for (int countr = i; countr < chars.length(); countr++) { - chars.setCharAt(countr - i, chars.charAt(countr)); - } - chars.setLength(chars.length() - i); - if (space) { - word = token.toString(); - token.setLength(0); - return word; - } - token.append(chars); - return null; - } - - protected static String getPattern(String word) { - StringBuffer pat = new StringBuffer(); - int len = word.length(); - for (int i = 0; i < len; i++) { - if (!Character.isDigit(word.charAt(i))) { - pat.append(word.charAt(i)); - } - } - return pat.toString(); - } - - protected ArrayList normalizeException(ArrayList ex) { - ArrayList res = new ArrayList(); - for (int i = 0; i < ex.size(); i++) { - Object item = ex.get(i); - if (item instanceof String) { - String str = (String)item; - StringBuffer buf = new StringBuffer(); - for (int j = 0; j < str.length(); j++) { - char c = str.charAt(j); - if (c != hyphenChar) { - buf.append(c); - } else { - res.add(buf.toString()); - buf.setLength(0); - char[] h = new char[1]; - h[0] = hyphenChar; - // we use here hyphenChar which is not necessarily - // the one to be printed - res.add(new Hyphen(new String(h), null, null)); - } - } - if (buf.length() > 0) { - res.add(buf.toString()); - } - } else { - res.add(item); - } - } - return res; - } 
- - protected String getExceptionWord(ArrayList ex) { - StringBuffer res = new StringBuffer(); - for (int i = 0; i < ex.size(); i++) { - Object item = ex.get(i); - if (item instanceof String) { - res.append((String)item); - } else { - if (((Hyphen)item).noBreak != null) { - res.append(((Hyphen)item).noBreak); - } - } - } - return res.toString(); - } - - protected static String getInterletterValues(String pat) { - StringBuffer il = new StringBuffer(); - String word = pat + "a"; // add dummy letter to serve as sentinel - int len = word.length(); - for (int i = 0; i < len; i++) { - char c = word.charAt(i); - if (Character.isDigit(c)) { - il.append(c); - i++; - } else { - il.append('0'); - } - } - return il.toString(); - } - - // - // DocumentHandler methods - // - - /** - * Start element. - */ - public void startElement(String uri, String local, String raw, - Attributes attrs) { - if (local.equals("hyphen-char")) { - String h = attrs.getValue("value"); - if (h != null && h.length() == 1) { - hyphenChar = h.charAt(0); - } - } else if (local.equals("classes")) { - currElement = ELEM_CLASSES; - } else if (local.equals("patterns")) { - currElement = ELEM_PATTERNS; - } else if (local.equals("exceptions")) { - currElement = ELEM_EXCEPTIONS; - exception = new ArrayList(); - } else if (local.equals("hyphen")) { - if (token.length() > 0) { - exception.add(token.toString()); - } - exception.add(new Hyphen(attrs.getValue("pre"), - attrs.getValue("no"), - attrs.getValue("post"))); - currElement = ELEM_HYPHEN; - } - token.setLength(0); - } - - public void endElement(String uri, String local, String raw) { - - if (token.length() > 0) { - String word = token.toString(); - switch (currElement) { - case ELEM_CLASSES: - consumer.addClass(word); - break; - case ELEM_EXCEPTIONS: - exception.add(word); - exception = normalizeException(exception); - consumer.addException(getExceptionWord(exception), - (ArrayList)exception.clone()); - break; - case ELEM_PATTERNS: - 
consumer.addPattern(getPattern(word), - getInterletterValues(word)); - break; - case ELEM_HYPHEN: - // nothing to do - break; - } - if (currElement != ELEM_HYPHEN) { - token.setLength(0); - } - } - if (currElement == ELEM_HYPHEN) { - currElement = ELEM_EXCEPTIONS; - } else { - currElement = 0; - } - - } - - /** - * Characters. - */ - public void characters(char ch[], int start, int length) { - StringBuffer chars = new StringBuffer(length); - chars.append(ch, start, length); - String word = readToken(chars); - while (word != null) { - // System.out.println("\"" + word + "\""); - switch (currElement) { - case ELEM_CLASSES: - consumer.addClass(word); - break; - case ELEM_EXCEPTIONS: - exception.add(word); - exception = normalizeException(exception); - consumer.addException(getExceptionWord(exception), - (ArrayList)exception.clone()); - exception.clear(); - break; - case ELEM_PATTERNS: - consumer.addPattern(getPattern(word), - getInterletterValues(word)); - break; - } - word = readToken(chars); - } - - } - - // - // ErrorHandler methods - // - - /** - * Warning. - */ - public void warning(SAXParseException ex) { - errMsg = "[Warning] " + getLocationString(ex) + ": " - + ex.getMessage(); - } - - /** - * Error. - */ - public void error(SAXParseException ex) { - errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage(); - } - - /** - * Fatal error. - */ - public void fatalError(SAXParseException ex) throws SAXException { - errMsg = "[Fatal Error] " + getLocationString(ex) + ": " - + ex.getMessage(); - throw ex; - } - - /** - * Returns a string of the location. 
- */ - private String getLocationString(SAXParseException ex) { - StringBuffer str = new StringBuffer(); - - String systemId = ex.getSystemId(); - if (systemId != null) { - int index = systemId.lastIndexOf('/'); - if (index != -1) { - systemId = systemId.substring(index + 1); - } - str.append(systemId); - } - str.append(':'); - str.append(ex.getLineNumber()); - str.append(':'); - str.append(ex.getColumnNumber()); - - return str.toString(); - - } // getLocationString(SAXParseException):String - - - // PatternConsumer implementation for testing purposes - public void addClass(String c) { - System.out.println("class: " + c); - } - - public void addException(String w, ArrayList e) { - System.out.println("exception: " + w + " : " + e.toString()); - } - - public void addPattern(String p, String v) { - System.out.println("pattern: " + p + " : " + v); - } - - public static void main(String[] args) throws Exception { - if (args.length > 0) { - PatternParser pp = new PatternParser(); - pp.setConsumer(pp); - pp.parse(args[0]); - } - } - -} diff --git a/src/java/org/apache/fop/layout/hyphenation/TernaryTree.java b/src/java/org/apache/fop/layout/hyphenation/TernaryTree.java deleted file mode 100644 index 62c95bfb6..000000000 --- a/src/java/org/apache/fop/layout/hyphenation/TernaryTree.java +++ /dev/null @@ -1,669 +0,0 @@ -/* - * Copyright 1999-2004 The Apache Software Foundation. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* $Id$ */ - -package org.apache.fop.layout.hyphenation; - -import java.util.Enumeration; -import java.util.Stack; -import java.io.Serializable; - -/** - *

Ternary Search Tree.

- * - *

A ternary search tree is a hibrid between a binary tree and - * a digital search tree (trie). Keys are limited to strings. - * A data value of type char is stored in each leaf node. - * It can be used as an index (or pointer) to the data. - * Branches that only contain one key are compressed to one node - * by storing a pointer to the trailer substring of the key. - * This class is intended to serve as base class or helper class - * to implement Dictionary collections or the like. Ternary trees - * have some nice properties as the following: the tree can be - * traversed in sorted order, partial matches (wildcard) can be - * implemented, retrieval of all keys within a given distance - * from the target, etc. The storage requirements are higher than - * a binary tree but a lot less than a trie. Performance is - * comparable with a hash table, sometimes it outperforms a hash - * function (most of the time can determine a miss faster than a hash).

- * - *

The main purpose of this java port is to serve as a base for - * implementing TeX's hyphenation algorithm (see The TeXBook, - * appendix H). Each language requires from 5000 to 15000 hyphenation - * patterns which will be keys in this tree. The strings patterns - * are usually small (from 2 to 5 characters), but each char in the - * tree is stored in a node. Thus memory usage is the main concern. - * We will sacrify 'elegance' to keep memory requirenments to the - * minimum. Using java's char type as pointer (yes, I know pointer - * it is a forbidden word in java) we can keep the size of the node - * to be just 8 bytes (3 pointers and the data char). This gives - * room for about 65000 nodes. In my tests the english patterns - * took 7694 nodes and the german patterns 10055 nodes, - * so I think we are safe.

- * - *

All said, this is a map with strings as keys and char as value. - * Pretty limited!. It can be extended to a general map by - * using the string representation of an object and using the - * char value as an index to an array that contains the object - * values.

- * - * @author cav@uniscope.co.jp - */ - -public class TernaryTree implements Cloneable, Serializable { - - /** - * We use 4 arrays to represent a node. I guess I should have created - * a proper node class, but somehow Knuth's pascal code made me forget - * we now have a portable language with virtual memory management and - * automatic garbage collection! And now is kind of late, furthermore, - * if it ain't broken, don't fix it. - */ - - /** - * Pointer to low branch and to rest of the key when it is - * stored directly in this node, we don't have unions in java! - */ - protected char[] lo; - - /** - * Pointer to high branch. - */ - protected char[] hi; - - /** - * Pointer to equal branch and to data when this node is a string terminator. - */ - protected char[] eq; - - /** - *

The character stored in this node: splitchar. - * Two special values are reserved:

- *
  • 0x0000 as string terminator
  • - *
  • 0xFFFF to indicate that the branch starting at - * this node is compressed
- *

This shouldn't be a problem if we give the usual semantics to - * strings since 0xFFFF is garanteed not to be an Unicode character.

- */ - protected char[] sc; - - /** - * This vector holds the trailing of the keys when the branch is compressed. - */ - protected CharVector kv; - - protected char root; - protected char freenode; - protected int length; // number of items in tree - - protected static final int BLOCK_SIZE = 2048; // allocation size for arrays - - TernaryTree() { - init(); - } - - protected void init() { - root = 0; - freenode = 1; - length = 0; - lo = new char[BLOCK_SIZE]; - hi = new char[BLOCK_SIZE]; - eq = new char[BLOCK_SIZE]; - sc = new char[BLOCK_SIZE]; - kv = new CharVector(); - } - - /** - * Branches are initially compressed, needing - * one node per key plus the size of the string - * key. They are decompressed as needed when - * another key with same prefix - * is inserted. This saves a lot of space, - * specially for long keys. - */ - public void insert(String key, char val) { - // make sure we have enough room in the arrays - int len = key.length() - + 1; // maximum number of nodes that may be generated - if (freenode + len > eq.length) { - redimNodeArrays(eq.length + BLOCK_SIZE); - } - char strkey[] = new char[len--]; - key.getChars(0, len, strkey, 0); - strkey[len] = 0; - root = insert(root, strkey, 0, val); - } - - public void insert(char[] key, int start, char val) { - int len = strlen(key) + 1; - if (freenode + len > eq.length) { - redimNodeArrays(eq.length + BLOCK_SIZE); - } - root = insert(root, key, start, val); - } - - /** - * The actual insertion function, recursive version. - */ - private char insert(char p, char[] key, int start, char val) { - int len = strlen(key, start); - if (p == 0) { - // this means there is no branch, this node will start a new branch. 
- // Instead of doing that, we store the key somewhere else and create - // only one node with a pointer to the key - p = freenode++; - eq[p] = val; // holds data - length++; - hi[p] = 0; - if (len > 0) { - sc[p] = 0xFFFF; // indicates branch is compressed - lo[p] = (char)kv.alloc(len - + 1); // use 'lo' to hold pointer to key - strcpy(kv.getArray(), lo[p], key, start); - } else { - sc[p] = 0; - lo[p] = 0; - } - return p; - } - - if (sc[p] == 0xFFFF) { - // branch is compressed: need to decompress - // this will generate garbage in the external key array - // but we can do some garbage collection later - char pp = freenode++; - lo[pp] = lo[p]; // previous pointer to key - eq[pp] = eq[p]; // previous pointer to data - lo[p] = 0; - if (len > 0) { - sc[p] = kv.get(lo[pp]); - eq[p] = pp; - lo[pp]++; - if (kv.get(lo[pp]) == 0) { - // key completly decompressed leaving garbage in key array - lo[pp] = 0; - sc[pp] = 0; - hi[pp] = 0; - } else { - // we only got first char of key, rest is still there - sc[pp] = 0xFFFF; - } - } else { - // In this case we can save a node by swapping the new node - // with the compressed node - sc[pp] = 0xFFFF; - hi[p] = pp; - sc[p] = 0; - eq[p] = val; - length++; - return p; - } - } - char s = key[start]; - if (s < sc[p]) { - lo[p] = insert(lo[p], key, start, val); - } else if (s == sc[p]) { - if (s != 0) { - eq[p] = insert(eq[p], key, start + 1, val); - } else { - // key already in tree, overwrite data - eq[p] = val; - } - } else { - hi[p] = insert(hi[p], key, start, val); - } - return p; - } - - /** - * Compares 2 null terminated char arrays - */ - public static int strcmp(char[] a, int startA, char[] b, int startB) { - for (; a[startA] == b[startB]; startA++, startB++) { - if (a[startA] == 0) { - return 0; - } - } - return a[startA] - b[startB]; - } - - /** - * Compares a string with null terminated char array - */ - public static int strcmp(String str, char[] a, int start) { - int i, d, len = str.length(); - for (i = 0; i < len; i++) { - 
d = (int)str.charAt(i) - a[start + i]; - if (d != 0) { - return d; - } - if (a[start + i] == 0) { - return d; - } - } - if (a[start + i] != 0) { - return (int)-a[start + i]; - } - return 0; - - } - - public static void strcpy(char[] dst, int di, char[] src, int si) { - while (src[si] != 0) { - dst[di++] = src[si++]; - } - dst[di] = 0; - } - - public static int strlen(char[] a, int start) { - int len = 0; - for (int i = start; i < a.length && a[i] != 0; i++) { - len++; - } - return len; - } - - public static int strlen(char[] a) { - return strlen(a, 0); - } - - public int find(String key) { - int len = key.length(); - char strkey[] = new char[len + 1]; - key.getChars(0, len, strkey, 0); - strkey[len] = 0; - - return find(strkey, 0); - } - - public int find(char[] key, int start) { - int d; - char p = root; - int i = start; - char c; - - while (p != 0) { - if (sc[p] == 0xFFFF) { - if (strcmp(key, i, kv.getArray(), lo[p]) == 0) { - return eq[p]; - } else { - return -1; - } - } - c = key[i]; - d = c - sc[p]; - if (d == 0) { - if (c == 0) { - return eq[p]; - } - i++; - p = eq[p]; - } else if (d < 0) { - p = lo[p]; - } else { - p = hi[p]; - } - } - return -1; - } - - public boolean knows(String key) { - return (find(key) >= 0); - } - - // redimension the arrays - private void redimNodeArrays(int newsize) { - int len = newsize < lo.length ? 
newsize : lo.length; - char[] na = new char[newsize]; - System.arraycopy(lo, 0, na, 0, len); - lo = na; - na = new char[newsize]; - System.arraycopy(hi, 0, na, 0, len); - hi = na; - na = new char[newsize]; - System.arraycopy(eq, 0, na, 0, len); - eq = na; - na = new char[newsize]; - System.arraycopy(sc, 0, na, 0, len); - sc = na; - } - - public int size() { - return length; - } - - public Object clone() { - TernaryTree t = new TernaryTree(); - t.lo = (char[])this.lo.clone(); - t.hi = (char[])this.hi.clone(); - t.eq = (char[])this.eq.clone(); - t.sc = (char[])this.sc.clone(); - t.kv = (CharVector)this.kv.clone(); - t.root = this.root; - t.freenode = this.freenode; - t.length = this.length; - - return t; - } - - /** - * Recursively insert the median first and then the median of the - * lower and upper halves, and so on in order to get a balanced - * tree. The array of keys is assumed to be sorted in ascending - * order. - */ - protected void insertBalanced(String[] k, char[] v, int offset, int n) { - int m; - if (n < 1) { - return; - } - m = n >> 1; - - insert(k[m + offset], v[m + offset]); - insertBalanced(k, v, offset, m); - - insertBalanced(k, v, offset + m + 1, n - m - 1); - } - - - /** - * Balance the tree for best search performance - */ - public void balance() { - // System.out.print("Before root splitchar = "); System.out.println(sc[root]); - - int i = 0, n = length; - String[] k = new String[n]; - char[] v = new char[n]; - Iterator iter = new Iterator(); - while (iter.hasMoreElements()) { - v[i] = iter.getValue(); - k[i++] = (String)iter.nextElement(); - } - init(); - insertBalanced(k, v, 0, n); - - // With uniform letter distribution sc[root] should be around 'm' - // System.out.print("After root splitchar = "); System.out.println(sc[root]); - } - - /** - * Each node stores a character (splitchar) which is part of - * some key(s). 
In a compressed branch (one that only contain - * a single string key) the trailer of the key which is not - * already in nodes is stored externally in the kv array. - * As items are inserted, key substrings decrease. - * Some substrings may completely disappear when the whole - * branch is totally decompressed. - * The tree is traversed to find the key substrings actually - * used. In addition, duplicate substrings are removed using - * a map (implemented with a TernaryTree!). - * - */ - public void trimToSize() { - // first balance the tree for best performance - balance(); - - // redimension the node arrays - redimNodeArrays(freenode); - - // ok, compact kv array - CharVector kx = new CharVector(); - kx.alloc(1); - TernaryTree map = new TernaryTree(); - compact(kx, map, root); - kv = kx; - kv.trimToSize(); - } - - private void compact(CharVector kx, TernaryTree map, char p) { - int k; - if (p == 0) { - return; - } - if (sc[p] == 0xFFFF) { - k = map.find(kv.getArray(), lo[p]); - if (k < 0) { - k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1); - strcpy(kx.getArray(), k, kv.getArray(), lo[p]); - map.insert(kx.getArray(), k, (char)k); - } - lo[p] = (char)k; - } else { - compact(kx, map, lo[p]); - if (sc[p] != 0) { - compact(kx, map, eq[p]); - } - compact(kx, map, hi[p]); - } - } - - - public Enumeration keys() { - return new Iterator(); - } - - public class Iterator implements Enumeration { - - /** - * current node index - */ - int cur; - - /** - * current key - */ - String curkey; - - private class Item implements Cloneable { - char parent; - char child; - - public Item() { - parent = 0; - child = 0; - } - - public Item(char p, char c) { - parent = p; - child = c; - } - - public Object clone() { - return new Item(parent, child); - } - - } - - /** - * Node stack - */ - Stack ns; - - /** - * key stack implemented with a StringBuffer - */ - StringBuffer ks; - - public Iterator() { - cur = -1; - ns = new Stack(); - ks = new StringBuffer(); - rewind(); - } - - public void 
rewind() { - ns.removeAllElements(); - ks.setLength(0); - cur = root; - run(); - } - - public Object nextElement() { - String res = new String(curkey); - cur = up(); - run(); - return res; - } - - public char getValue() { - if (cur >= 0) { - return eq[cur]; - } - return 0; - } - - public boolean hasMoreElements() { - return (cur != -1); - } - - /** - * traverse upwards - */ - private int up() { - Item i = new Item(); - int res = 0; - - if (ns.empty()) { - return -1; - } - - if (cur != 0 && sc[cur] == 0) { - return lo[cur]; - } - - boolean climb = true; - - while (climb) { - i = (Item)ns.pop(); - i.child++; - switch (i.child) { - case 1: - if (sc[i.parent] != 0) { - res = eq[i.parent]; - ns.push(i.clone()); - ks.append(sc[i.parent]); - } else { - i.child++; - ns.push(i.clone()); - res = hi[i.parent]; - } - climb = false; - break; - - case 2: - res = hi[i.parent]; - ns.push(i.clone()); - if (ks.length() > 0) { - ks.setLength(ks.length() - 1); // pop - } - climb = false; - break; - - default: - if (ns.empty()) { - return -1; - } - climb = true; - break; - } - } - return res; - } - - /** - * traverse the tree to find next key - */ - private int run() { - if (cur == -1) { - return -1; - } - - boolean leaf = false; - while (true) { - // first go down on low branch until leaf or compressed branch - while (cur != 0) { - if (sc[cur] == 0xFFFF) { - leaf = true; - break; - } - ns.push(new Item((char)cur, '\u0000')); - if (sc[cur] == 0) { - leaf = true; - break; - } - cur = lo[cur]; - } - if (leaf) { - break; - } - // nothing found, go up one node and try again - cur = up(); - if (cur == -1) { - return -1; - } - } - // The current node should be a data node and - // the key should be in the key stack (at least partially) - StringBuffer buf = new StringBuffer(ks.toString()); - if (sc[cur] == 0xFFFF) { - int p = lo[cur]; - while (kv.get(p) != 0) { - buf.append(kv.get(p++)); - } - } - curkey = buf.toString(); - return 0; - } - - } - - public void printStats() { - 
System.out.println("Number of keys = " + Integer.toString(length)); - System.out.println("Node count = " + Integer.toString(freenode)); - // System.out.println("Array length = " + Integer.toString(eq.length)); - System.out.println("Key Array length = " - + Integer.toString(kv.length())); - - /* - * for(int i=0; i