You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

HyphenationTree.java 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. /* $Id$
  2. * Copyright (C) 2001 The Apache Software Foundation. All rights reserved.
  3. * For details on use and redistribution please refer to the
  4. * LICENSE file included with these sources.
  5. */
  6. package org.apache.fop.layout.hyphenation;
  7. import java.io.*;
  8. import java.util.Vector;
  9. import java.util.Hashtable;
  10. /**
  11. * This tree structure stores the hyphenation patterns in an efficient
  12. * way for fast lookup. It provides the method to
  13. * hyphenate a word.
  14. *
  15. * @author Carlos Villegas <cav@uniscope.co.jp>
  16. */
  17. public class HyphenationTree extends TernaryTree
  18. implements PatternConsumer, Serializable
  19. {
  20. /** value space: stores the interletter values */
  21. protected ByteVector vspace;
  22. /** This map stores hyphenation exceptions */
  23. protected Hashtable stoplist;
  24. /** This map stores the character classes */
  25. protected TernaryTree classmap;
  26. /** Temporary map to store interletter values on pattern loading. */
  27. private transient TernaryTree ivalues;
  /**
   * Creates an empty tree: no patterns, no character classes and
   * no hyphenation exceptions are loaded yet.
   */
  public HyphenationTree() {
    // value space; slot 0 is reserved right away and never used for data
    vspace = new ByteVector();
    vspace.alloc(1);
    classmap = new TernaryTree();
    // the exception list is usually a small table
    stoplist = new Hashtable(23);
  }
  35. /**
  36. * Packs the values by storing them in 4 bits, two values into a byte
  37. * Values range is from 0 to 9. We use zero as terminator,
  38. * so we'll add 1 to the value.
  39. * @param values a string of digits from '0' to '9' representing the
  40. * interletter values.
  41. * @return the index into the vspace array where the packed values
  42. * are stored.
  43. */
  44. protected int packValues(String values)
  45. {
  46. int i, n = values.length();
  47. int m = (n & 1) == 1 ? (n>>1)+2 : (n>>1)+1;
  48. int offset = vspace.alloc(m);
  49. byte[] va = vspace.getArray();
  50. for(i=0; i<n; i++) {
  51. int j = i>>1;
  52. byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f);
  53. if ( (i&1) == 1 )
  54. va[j+offset] = (byte)( va[j+offset] | v );
  55. else
  56. va[j+offset] = (byte)(v << 4); // big endian
  57. }
  58. va[m-1+offset] = 0; // terminator
  59. return offset;
  60. }
  61. protected String unpackValues(int k)
  62. {
  63. StringBuffer buf = new StringBuffer();
  64. byte v = vspace.get(k++);
  65. while( v != 0 ) {
  66. char c = (char)((v >>> 4) -1 + '0');
  67. buf.append(c);
  68. c = (char)(v & 0x0f);
  69. if ( c == 0 ) break;
  70. c = (char)(c - 1 + '0');
  71. buf.append(c);
  72. v = vspace.get(k++);
  73. }
  74. return buf.toString();
  75. }
  76. /**
  77. * Read hyphenation patterns from an XML file.
  78. */
  79. public void loadPatterns(String filename)
  80. throws HyphenationException
  81. {
  82. PatternParser pp = new PatternParser(this);
  83. ivalues = new TernaryTree();
  84. pp.parse(filename);
  85. // patterns/values should be now in the tree
  86. // let's optimize a bit
  87. trimToSize();
  88. vspace.trimToSize();
  89. classmap.trimToSize();
  90. // get rid of the auxiliary map
  91. ivalues = null;
  92. }
  93. public String findPattern(String pat)
  94. {
  95. int k = super.find(pat);
  96. if ( k >= 0 )
  97. return unpackValues(k);
  98. return "";
  99. }
  100. /**
  101. * String compare, returns 0 if equal or
  102. * t is a substring of s
  103. */
  104. protected int hstrcmp(char[] s, int si, char[] t, int ti)
  105. {
  106. for ( ; s[si] == t[ti]; si++, ti++)
  107. if (s[si] == 0)
  108. return 0;
  109. if ( t[ti] == 0 )
  110. return 0;
  111. return s[si] - t[ti];
  112. }
  113. protected byte[] getValues(int k)
  114. {
  115. StringBuffer buf = new StringBuffer();
  116. byte v = vspace.get(k++);
  117. while( v != 0 ) {
  118. char c = (char)((v >>> 4) - 1);
  119. buf.append(c);
  120. c = (char)(v & 0x0f);
  121. if ( c == 0 ) break;
  122. c = (char)(c - 1);
  123. buf.append(c);
  124. v = vspace.get(k++);
  125. }
  126. byte[] res = new byte[buf.length()];
  127. for(int i=0;i<res.length;i++)
  128. res[i] = (byte)buf.charAt(i);
  129. return res;
  130. }
  131. /**
  132. * <p>Search for all possible partial matches of word starting
  133. * at index an update interletter values. In other words, it
  134. * does something like:</p>
  135. * <code>
  136. * for(i=0; i<patterns.length; i++) {
  137. * if ( word.substring(index).startsWidth(patterns[i]) )
  138. * update_interletter_values(patterns[i]);
  139. * }
  140. * </code>
  141. * <p>But it is done in an efficient way since the patterns are
  142. * stored in a ternary tree. In fact, this is the whole purpose
  143. * of having the tree: doing this search without having to test
  144. * every single pattern. The number of patterns for languages
  145. * such as English range from 4000 to 10000. Thus, doing thousands
  146. * of string comparisons for each word to hyphenate would be
  147. * really slow without the tree. The tradeoff is memory, but
  148. * using a ternary tree instead of a trie, almost halves the
  149. * the memory used by Lout or TeX. It's also faster than using
  150. * a hash table</p>
  151. * @param word null terminated word to match
  152. * @param index start index from word
  153. * @param il interletter values array to update
  154. */
  155. protected void searchPatterns(char[] word, int index, byte[] il)
  156. {
  157. byte[] values;
  158. int i=index;
  159. char p, q;
  160. char sp = word[i];
  161. p = root;
  162. while( p > 0 && p < sc.length){
  163. if (sc[p] == 0xFFFF) {
  164. if ( hstrcmp(word, i, kv.getArray(), lo[p]) == 0 ) {
  165. values = getValues(eq[p]); // data pointer is in eq[]
  166. int j=index;
  167. for(int k=0; k<values.length; k++) {
  168. if ( j < il.length && values[k] > il[j] )
  169. il[j] = values[k];
  170. j++;
  171. }
  172. }
  173. return;
  174. }
  175. int d = sp - sc[p];
  176. if ( d == 0 ) {
  177. if ( sp == 0 ) {
  178. break;
  179. }
  180. sp = word[++i];
  181. p = eq[p];
  182. q = p;
  183. // look for a pattern ending at this position by searching for
  184. // the null char ( splitchar == 0 )
  185. while ( q > 0 && q < sc.length ) {
  186. if ( sc[q] == 0xFFFF ) {// stop at compressed branch
  187. break;
  188. }
  189. if ( sc[q] == 0 ) {
  190. values = getValues(eq[q]);
  191. int j=index;
  192. for(int k=0; k<values.length; k++) {
  193. if (j < il.length && values[k] > il[j] ) {
  194. il[j] = values[k];
  195. }
  196. j++;
  197. }
  198. break;
  199. } else {
  200. q = lo[q];
  201. /** actually the code should be:
  202. q = sc[q] < 0 ? hi[q] : lo[q];
  203. but java chars are unsigned
  204. */
  205. }
  206. }
  207. } else
  208. p = d < 0 ? lo[p] : hi[p];
  209. }
  210. }
  211. /**
  212. * Hyphenate word and return a Hyphenation object.
  213. * @param word the word to be hyphenated
  214. * @param remainCharCount Minimum number of characters allowed
  215. * before the hyphenation point.
  216. * @param pushCharCount Minimum number of characters allowed after
  217. * the hyphenation point.
  218. * @return a {@link Hyphenation Hyphenation} object representing
  219. * the hyphenated word or null if word is not hyphenated.
  220. */
  221. public Hyphenation hyphenate(String word, int remainCharCount, int pushCharCount)
  222. {
  223. char[] w = word.toCharArray();
  224. return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
  225. }
  226. /**
  227. * Hyphenate word and return an array of hyphenation points.
  228. * @param w char array that contains the word
  229. * @param offset Offset to first character in word
  230. * @param len Length of word
  231. * @param remainCharCount Minimum number of characters allowed
  232. * before the hyphenation point.
  233. * @param pushCharCount Minimum number of characters allowed after
  234. * the hyphenation point.
  235. * @return a {@link Hyphenation Hyphenation} object representing
  236. * the hyphenated word or null if word is not hyphenated.
  237. */
  238. public Hyphenation hyphenate(char[] w, int offset, int len,
  239. int remainCharCount, int pushCharCount)
  240. {
  241. int i;
  242. char[] word = new char[len+3];
  243. // normalize word
  244. char[] c = new char[2];
  245. for(i=1; i<=len; i++) {
  246. c[0] = w[offset+i-1];
  247. int nc = classmap.find(c,0);
  248. if ( nc < 0 ) { // found a non-letter character, abort
  249. return null;
  250. }
  251. word[i] = (char)nc;
  252. }
  253. int[] result = new int[len+1];
  254. int k=0;
  255. // check exception list first
  256. String sw = new String(word,1,len);
  257. if ( stoplist.containsKey(sw) ) {
  258. // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
  259. Vector hw = (Vector)stoplist.get(sw);
  260. int j = 0;
  261. for(i=0; i<hw.size(); i++) {
  262. Object o = hw.elementAt(i);
  263. if ( o instanceof String ) {
  264. j += ((String)o).length();
  265. if ( j >= remainCharCount && j < (len - pushCharCount) )
  266. result[k++] = j;
  267. }
  268. }
  269. } else {
  270. // use algorithm to get hyphenation points
  271. word[0] = '.'; // word start marker
  272. word[len+1] = '.'; // word end marker
  273. word[len+2] = 0; // null terminated
  274. byte[] il = new byte[len+3]; // initialized to zero
  275. for(i=0; i<len+1; i++) {
  276. searchPatterns(word, i, il);
  277. }
  278. // hyphenation points are located where interletter value is odd
  279. for(i=0; i<len; i++) {
  280. if ( ((il[i+1] & 1) == 1) && i >= remainCharCount
  281. && i < (len-pushCharCount) ) {
  282. result[k++] = i;
  283. }
  284. }
  285. }
  286. if ( k > 0 ) {
  287. // trim result array
  288. int[] res = new int[k];
  289. System.arraycopy(result, 0, res, 0, k);
  290. return new Hyphenation(new String(w,offset,len), res);
  291. } else {
  292. return null;
  293. }
  294. }
  295. /**
  296. * Add a character class to the tree. It is used by
  297. * {@link PatternParser PatternParser} as callback to
  298. * add character classes. Character classes define the
  299. * valid word characters for hyphenation. If a word contains
  300. * a character not defined in any of the classes, it is not hyphenated.
  301. * It also defines a way to normalize the characters in order
  302. * to compare them with the stored patterns. Usually pattern
  303. * files use only lower case characters, in this case a class
  304. * for letter 'a', for example, should be defined as "aA", the first
  305. * character being the normalization char.
  306. */
  307. public void addClass(String chargroup)
  308. {
  309. if ( chargroup.length() > 0 ) {
  310. char equivChar = chargroup.charAt(0);
  311. char[] key = new char[2];
  312. key[1] = 0;
  313. for(int i=0; i<chargroup.length(); i++ ) {
  314. key[0] = chargroup.charAt(i);
  315. classmap.insert(key, 0, equivChar);
  316. }
  317. }
  318. }
  319. /**
  320. * Add an exception to the tree. It is used by
  321. * {@link PatternParser PatternParser} class as callback to
  322. * store the hyphenation exceptions.
  323. * @param word normalized word
  324. * @param hyphenatedword a vector of alternating strings and
  325. * {@link Hyphen hyphen} objects.
  326. */
  327. public void addException(String word, Vector hyphenatedword)
  328. {
  329. stoplist.put(word, hyphenatedword);
  330. }
  331. /**
  332. * Add a pattern to the tree. Mainly, to be used by
  333. * {@link PatternParser PatternParser} class as callback to
  334. * add a pattern to the tree.
  335. * @param pattern the hyphenation pattern
  336. * @param ivalue interletter weight values indicating the
  337. * desirability and priority of hyphenating at a given point
  338. * within the pattern. It should contain only digit characters.
  339. * (i.e. '0' to '9').
  340. */
  341. public void addPattern(String pattern, String ivalue)
  342. {
  343. int k = ivalues.find(ivalue);
  344. if ( k <= 0 ) {
  345. k = packValues(ivalue);
  346. ivalues.insert(ivalue, (char)k);
  347. }
  348. insert(pattern, (char)k);
  349. }
  350. public void printStats()
  351. {
  352. System.out.println("Value space size = " + Integer.toString(vspace.length()));
  353. super.printStats();
  354. }
  355. public static void main(String[] argv)
  356. throws Exception
  357. {
  358. HyphenationTree ht = null;
  359. int minCharCount = 2;
  360. BufferedReader in
  361. = new BufferedReader(new InputStreamReader(System.in));
  362. for(;;) {
  363. System.out.print("l:\tload patterns from XML\nL:\tload patterns from serialized object\ns:\tset minimun character count\nw:\twrite hyphenation tree to object file\nh:\thyphenate\nf:\tfind pattern\nb:\tbenchmark\nq:\tquit\n\nCommand:");
  364. String token = in.readLine().trim();
  365. if ( token.equals("f") ) {
  366. System.out.print("Pattern: ");
  367. token = in.readLine().trim();
  368. System.out.println("Values: " + ht.findPattern(token));
  369. } else if ( token.equals("s")) {
  370. System.out.print("Minimun value: " );
  371. token = in.readLine().trim();
  372. minCharCount = Integer.parseInt(token);
  373. } else if ( token.equals("l") ) {
  374. ht = new HyphenationTree();
  375. System.out.print("XML file name: ");
  376. token = in.readLine().trim();
  377. ht.loadPatterns(token);
  378. } else if ( token.equals("L") ) {
  379. ObjectInputStream ois = null;
  380. System.out.print("Object file name: ");
  381. token = in.readLine().trim();
  382. try {
  383. ois = new ObjectInputStream(new FileInputStream(token));
  384. ht = (HyphenationTree)ois.readObject();
  385. }
  386. catch (Exception e) {
  387. e.printStackTrace();
  388. }
  389. finally {
  390. if ( ois != null ) {
  391. try { ois.close(); }
  392. catch (IOException e) { }
  393. }
  394. }
  395. } else if ( token.equals("w") ) {
  396. System.out.print("Object file name: ");
  397. token = in.readLine().trim();
  398. ObjectOutputStream oos = null;
  399. try {
  400. oos = new ObjectOutputStream(new FileOutputStream(token));
  401. oos.writeObject(ht);
  402. }
  403. catch (Exception e) {
  404. e.printStackTrace();
  405. }
  406. finally {
  407. if ( oos != null ) {
  408. try { oos.flush(); }
  409. catch (IOException e) {}
  410. try { oos.close(); }
  411. catch (IOException e) {}
  412. }
  413. }
  414. } else if ( token.equals("h") ) {
  415. System.out.print("Word: ");
  416. token = in.readLine().trim();
  417. System.out.print("Hyphenation points: ");
  418. System.out.println(ht.hyphenate(token,minCharCount,minCharCount));
  419. } else if ( token.equals("b") ) {
  420. if ( ht == null ) {
  421. System.out.println("No patterns has been loaded.");
  422. break;
  423. }
  424. System.out.print("Word list filename: ");
  425. token = in.readLine().trim();
  426. long starttime = 0;
  427. int counter = 0;;
  428. try {
  429. BufferedReader reader = new BufferedReader ( new FileReader(token));
  430. String line;
  431. starttime = System.currentTimeMillis();
  432. while ((line = reader.readLine())!= null) {
  433. //System.out.print("\nline: ");
  434. Hyphenation hyp = ht.hyphenate(line,minCharCount,minCharCount);
  435. if (hyp != null) {
  436. String hword = hyp.toString();
  437. //System.out.println(line);
  438. //System.out.println(hword);
  439. } else {
  440. //System.out.println("No hyphenation");
  441. }
  442. counter++;
  443. }
  444. } catch (Exception ioe) {
  445. System.out.println("Exception " + ioe);
  446. ioe.printStackTrace();
  447. }
  448. long endtime = System.currentTimeMillis();
  449. long result = endtime - starttime;
  450. System.out.println(counter+ " words in " + result + " Millisekunden hyphenated");
  451. } else if ( token.equals("q") )
  452. break;
  453. }
  454. }
  455. }