You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PatternParser.java 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /* $Id$ */
  18. package org.apache.fop.hyphenation;
  19. // SAX
  20. import org.xml.sax.XMLReader;
  21. import org.xml.sax.InputSource;
  22. import org.xml.sax.SAXException;
  23. import org.xml.sax.SAXParseException;
  24. import org.xml.sax.helpers.DefaultHandler;
  25. import org.xml.sax.Attributes;
  26. // Java
  27. import java.io.File;
  28. import java.io.FileNotFoundException;
  29. import java.io.FileOutputStream;
  30. import java.io.IOException;
  31. import java.io.InputStream;
  32. import java.io.PrintStream;
  33. import java.net.MalformedURLException;
  34. import java.util.ArrayList;
  35. import javax.xml.parsers.SAXParserFactory;
  36. /**
  37. * A SAX document handler to read and parse hyphenation patterns
  38. * from a XML file.
  39. *
  40. * @author Carlos Villegas <cav@uniscope.co.jp>
  41. */
  42. public class PatternParser extends DefaultHandler implements PatternConsumer {
  43. XMLReader parser;
  44. int currElement;
  45. PatternConsumer consumer;
  46. StringBuffer token;
  47. ArrayList exception;
  48. char hyphenChar;
  49. String errMsg;
  50. boolean hasClasses = false;
  51. static final int ELEM_CLASSES = 1;
  52. static final int ELEM_EXCEPTIONS = 2;
  53. static final int ELEM_PATTERNS = 3;
  54. static final int ELEM_HYPHEN = 4;
  55. public PatternParser() throws HyphenationException {
  56. this.consumer = this;
  57. token = new StringBuffer();
  58. parser = createParser();
  59. parser.setContentHandler(this);
  60. parser.setErrorHandler(this);
  61. hyphenChar = '-'; // default
  62. }
  63. public PatternParser(PatternConsumer consumer) throws HyphenationException {
  64. this();
  65. this.consumer = consumer;
  66. }
  67. /**
  68. * Parses a hyphenation pattern file.
  69. * @param filename the filename
  70. * @throws HyphenationException In case of an exception while parsing
  71. */
  72. public void parse(String filename) throws HyphenationException {
  73. parse(new File(filename));
  74. }
  75. /**
  76. * Parses a hyphenation pattern file.
  77. * @param file the pattern file
  78. * @throws HyphenationException In case of an exception while parsing
  79. */
  80. public void parse(File file) throws HyphenationException {
  81. try {
  82. InputSource src = new InputSource(file.toURI().toURL().toExternalForm());
  83. parse(src);
  84. } catch (MalformedURLException e) {
  85. throw new HyphenationException("Error converting the File '" + file + "' to a URL: "
  86. + e.getMessage());
  87. }
  88. }
  89. /**
  90. * Parses a hyphenation pattern file.
  91. * @param source the InputSource for the file
  92. * @throws HyphenationException In case of an exception while parsing
  93. */
  94. public void parse(InputSource source) throws HyphenationException {
  95. try {
  96. parser.parse(source);
  97. } catch (FileNotFoundException fnfe) {
  98. throw new HyphenationException("File not found: " + fnfe.getMessage());
  99. } catch (IOException ioe) {
  100. throw new HyphenationException(ioe.getMessage());
  101. } catch (SAXException e) {
  102. throw new HyphenationException(errMsg);
  103. }
  104. }
  105. /**
  106. * Creates a SAX parser using JAXP
  107. * @return the created SAX parser
  108. */
  109. static XMLReader createParser() {
  110. try {
  111. SAXParserFactory factory = SAXParserFactory.newInstance();
  112. factory.setNamespaceAware(true);
  113. return factory.newSAXParser().getXMLReader();
  114. } catch (Exception e) {
  115. throw new RuntimeException("Couldn't create XMLReader: " + e.getMessage());
  116. }
  117. }
  118. protected String readToken(StringBuffer chars) {
  119. String word;
  120. boolean space = false;
  121. int i;
  122. for (i = 0; i < chars.length(); i++) {
  123. if (Character.isWhitespace(chars.charAt(i))) {
  124. space = true;
  125. } else {
  126. break;
  127. }
  128. }
  129. if (space) {
  130. // chars.delete(0,i);
  131. for (int countr = i; countr < chars.length(); countr++) {
  132. chars.setCharAt(countr - i, chars.charAt(countr));
  133. }
  134. chars.setLength(chars.length() - i);
  135. if (token.length() > 0) {
  136. word = token.toString();
  137. token.setLength(0);
  138. return word;
  139. }
  140. }
  141. space = false;
  142. for (i = 0; i < chars.length(); i++) {
  143. if (Character.isWhitespace(chars.charAt(i))) {
  144. space = true;
  145. break;
  146. }
  147. }
  148. token.append(chars.toString().substring(0, i));
  149. // chars.delete(0,i);
  150. for (int countr = i; countr < chars.length(); countr++) {
  151. chars.setCharAt(countr - i, chars.charAt(countr));
  152. }
  153. chars.setLength(chars.length() - i);
  154. if (space) {
  155. word = token.toString();
  156. token.setLength(0);
  157. return word;
  158. }
  159. token.append(chars);
  160. return null;
  161. }
  162. protected static String getPattern(String word) {
  163. StringBuffer pat = new StringBuffer();
  164. int len = word.length();
  165. for (int i = 0; i < len; i++) {
  166. if (!Character.isDigit(word.charAt(i))) {
  167. pat.append(word.charAt(i));
  168. }
  169. }
  170. return pat.toString();
  171. }
  172. protected ArrayList normalizeException(ArrayList ex) {
  173. ArrayList res = new ArrayList();
  174. for (int i = 0; i < ex.size(); i++) {
  175. Object item = ex.get(i);
  176. if (item instanceof String) {
  177. String str = (String)item;
  178. StringBuffer buf = new StringBuffer();
  179. for (int j = 0; j < str.length(); j++) {
  180. char c = str.charAt(j);
  181. if (c != hyphenChar) {
  182. buf.append(c);
  183. } else {
  184. res.add(buf.toString());
  185. buf.setLength(0);
  186. char[] h = new char[1];
  187. h[0] = hyphenChar;
  188. // we use here hyphenChar which is not necessarily
  189. // the one to be printed
  190. res.add(new Hyphen(new String(h), null, null));
  191. }
  192. }
  193. if (buf.length() > 0) {
  194. res.add(buf.toString());
  195. }
  196. } else {
  197. res.add(item);
  198. }
  199. }
  200. return res;
  201. }
  202. protected String getExceptionWord(ArrayList ex) {
  203. StringBuffer res = new StringBuffer();
  204. for (int i = 0; i < ex.size(); i++) {
  205. Object item = ex.get(i);
  206. if (item instanceof String) {
  207. res.append((String)item);
  208. } else {
  209. if (((Hyphen)item).noBreak != null) {
  210. res.append(((Hyphen)item).noBreak);
  211. }
  212. }
  213. }
  214. return res.toString();
  215. }
  216. protected static String getInterletterValues(String pat) {
  217. StringBuffer il = new StringBuffer();
  218. String word = pat + "a"; // add dummy letter to serve as sentinel
  219. int len = word.length();
  220. for (int i = 0; i < len; i++) {
  221. char c = word.charAt(i);
  222. if (Character.isDigit(c)) {
  223. il.append(c);
  224. i++;
  225. } else {
  226. il.append('0');
  227. }
  228. }
  229. return il.toString();
  230. }
  231. protected void getExternalClasses() throws SAXException {
  232. XMLReader mainParser = parser;
  233. parser = createParser();
  234. parser.setContentHandler(this);
  235. parser.setErrorHandler(this);
  236. InputStream stream = this.getClass().getResourceAsStream("classes.xml");
  237. InputSource source = new InputSource(stream);
  238. try {
  239. parser.parse(source);
  240. } catch (IOException ioe) {
  241. throw new SAXException(ioe.getMessage());
  242. } finally {
  243. parser = mainParser;
  244. }
  245. }
  246. //
  247. // ContentHandler methods
  248. //
  249. /**
  250. * {@inheritDoc}
  251. * @throws SAXException
  252. */
  253. public void startElement(String uri, String local, String raw,
  254. Attributes attrs) throws SAXException {
  255. if (local.equals("hyphen-char")) {
  256. String h = attrs.getValue("value");
  257. if (h != null && h.length() == 1) {
  258. hyphenChar = h.charAt(0);
  259. }
  260. } else if (local.equals("classes")) {
  261. currElement = ELEM_CLASSES;
  262. } else if (local.equals("patterns")) {
  263. if (!hasClasses) {
  264. getExternalClasses();
  265. }
  266. currElement = ELEM_PATTERNS;
  267. } else if (local.equals("exceptions")) {
  268. if (!hasClasses) {
  269. getExternalClasses();
  270. }
  271. currElement = ELEM_EXCEPTIONS;
  272. exception = new ArrayList();
  273. } else if (local.equals("hyphen")) {
  274. if (token.length() > 0) {
  275. exception.add(token.toString());
  276. }
  277. exception.add(new Hyphen(attrs.getValue("pre"),
  278. attrs.getValue("no"),
  279. attrs.getValue("post")));
  280. currElement = ELEM_HYPHEN;
  281. }
  282. token.setLength(0);
  283. }
  284. /**
  285. * {@inheritDoc}
  286. */
  287. public void endElement(String uri, String local, String raw) {
  288. if (token.length() > 0) {
  289. String word = token.toString();
  290. switch (currElement) {
  291. case ELEM_CLASSES:
  292. consumer.addClass(word);
  293. break;
  294. case ELEM_EXCEPTIONS:
  295. exception.add(word);
  296. exception = normalizeException(exception);
  297. consumer.addException(getExceptionWord(exception),
  298. (ArrayList)exception.clone());
  299. break;
  300. case ELEM_PATTERNS:
  301. consumer.addPattern(getPattern(word),
  302. getInterletterValues(word));
  303. break;
  304. case ELEM_HYPHEN:
  305. // nothing to do
  306. break;
  307. }
  308. if (currElement != ELEM_HYPHEN) {
  309. token.setLength(0);
  310. }
  311. }
  312. if (currElement == ELEM_CLASSES) {
  313. hasClasses = true;
  314. }
  315. if (currElement == ELEM_HYPHEN) {
  316. currElement = ELEM_EXCEPTIONS;
  317. } else {
  318. currElement = 0;
  319. }
  320. }
  321. /**
  322. * {@inheritDoc}
  323. */
  324. public void characters(char ch[], int start, int length) {
  325. StringBuffer chars = new StringBuffer(length);
  326. chars.append(ch, start, length);
  327. String word = readToken(chars);
  328. while (word != null) {
  329. // System.out.println("\"" + word + "\"");
  330. switch (currElement) {
  331. case ELEM_CLASSES:
  332. consumer.addClass(word);
  333. break;
  334. case ELEM_EXCEPTIONS:
  335. exception.add(word);
  336. exception = normalizeException(exception);
  337. consumer.addException(getExceptionWord(exception),
  338. (ArrayList)exception.clone());
  339. exception.clear();
  340. break;
  341. case ELEM_PATTERNS:
  342. consumer.addPattern(getPattern(word),
  343. getInterletterValues(word));
  344. break;
  345. }
  346. word = readToken(chars);
  347. }
  348. }
  349. //
  350. // ErrorHandler methods
  351. //
  352. /**
  353. * {@inheritDoc}
  354. */
  355. public void warning(SAXParseException ex) {
  356. errMsg = "[Warning] " + getLocationString(ex) + ": "
  357. + ex.getMessage();
  358. }
  359. /**
  360. * {@inheritDoc}
  361. */
  362. public void error(SAXParseException ex) {
  363. errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
  364. }
  365. /**
  366. * {@inheritDoc}
  367. */
  368. public void fatalError(SAXParseException ex) throws SAXException {
  369. errMsg = "[Fatal Error] " + getLocationString(ex) + ": "
  370. + ex.getMessage();
  371. throw ex;
  372. }
  373. /**
  374. * Returns a string of the location.
  375. */
  376. private String getLocationString(SAXParseException ex) {
  377. StringBuffer str = new StringBuffer();
  378. String systemId = ex.getSystemId();
  379. if (systemId != null) {
  380. int index = systemId.lastIndexOf('/');
  381. if (index != -1) {
  382. systemId = systemId.substring(index + 1);
  383. }
  384. str.append(systemId);
  385. }
  386. str.append(':');
  387. str.append(ex.getLineNumber());
  388. str.append(':');
  389. str.append(ex.getColumnNumber());
  390. return str.toString();
  391. } // getLocationString(SAXParseException):String
  392. // PatternConsumer implementation for testing purposes
  393. public void addClass(String c) {
  394. testOut.println("class: " + c);
  395. }
  396. public void addException(String w, ArrayList e) {
  397. testOut.println("exception: " + w + " : " + e.toString());
  398. }
  399. public void addPattern(String p, String v) {
  400. testOut.println("pattern: " + p + " : " + v);
  401. }
  402. private PrintStream testOut = System.out;
  403. /**
  404. * @param testOut the testOut to set
  405. */
  406. public void setTestOut(PrintStream testOut) {
  407. this.testOut = testOut;
  408. }
  409. public void closeTestOut() {
  410. testOut.flush();
  411. testOut.close();
  412. }
  413. public static void main(String[] args) throws Exception {
  414. if (args.length > 0) {
  415. PatternParser pp = new PatternParser();
  416. PrintStream p = null;
  417. if (args.length > 1) {
  418. FileOutputStream f = new FileOutputStream(args[1]);
  419. p = new PrintStream(f, false, "utf-8");
  420. pp.setTestOut(p);
  421. }
  422. pp.parse(args[0]);
  423. if (pp != null) {
  424. pp.closeTestOut();
  425. }
  426. }
  427. }
  428. }