You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PatternParser.java 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /* $Id$ */
  18. package org.apache.fop.hyphenation;
  19. import java.io.File;
  20. import java.io.FileNotFoundException;
  21. import java.io.FileOutputStream;
  22. import java.io.IOException;
  23. import java.io.InputStream;
  24. import java.io.PrintStream;
  25. import java.net.MalformedURLException;
  26. import java.util.ArrayList;
  27. import javax.xml.parsers.SAXParserFactory;
  28. import org.xml.sax.Attributes;
  29. import org.xml.sax.InputSource;
  30. import org.xml.sax.SAXException;
  31. import org.xml.sax.SAXParseException;
  32. import org.xml.sax.XMLReader;
  33. import org.xml.sax.helpers.DefaultHandler;
  34. /**
  35. * <p>A SAX document handler to read and parse hyphenation patterns
  36. * from a XML file.</p>
  37. *
  38. * <p>This work was authored by Carlos Villegas (cav@uniscope.co.jp).</p>
  39. */
  40. public class PatternParser extends DefaultHandler implements PatternConsumer {
  41. private XMLReader parser;
  42. private int currElement;
  43. private PatternConsumer consumer;
  44. private StringBuffer token;
  45. private ArrayList exception;
  46. private char hyphenChar;
  47. private String errMsg;
  48. private boolean hasClasses = false;
  49. static final int ELEM_CLASSES = 1;
  50. static final int ELEM_EXCEPTIONS = 2;
  51. static final int ELEM_PATTERNS = 3;
  52. static final int ELEM_HYPHEN = 4;
  53. /**
  54. * Construct a pattern parser.
  55. * @throws HyphenationException if a hyphenation exception is raised
  56. */
  57. public PatternParser() throws HyphenationException {
  58. this.consumer = this;
  59. token = new StringBuffer();
  60. parser = createParser();
  61. parser.setContentHandler(this);
  62. parser.setErrorHandler(this);
  63. hyphenChar = '-'; // default
  64. }
  65. /**
  66. * Construct a pattern parser.
  67. * @param consumer a pattern consumer
  68. * @throws HyphenationException if a hyphenation exception is raised
  69. */
  70. public PatternParser(PatternConsumer consumer) throws HyphenationException {
  71. this();
  72. this.consumer = consumer;
  73. }
  74. /**
  75. * Parses a hyphenation pattern file.
  76. * @param filename the filename
  77. * @throws HyphenationException In case of an exception while parsing
  78. */
  79. public void parse(String filename) throws HyphenationException {
  80. parse(new File(filename));
  81. }
  82. /**
  83. * Parses a hyphenation pattern file.
  84. * @param file the pattern file
  85. * @throws HyphenationException In case of an exception while parsing
  86. */
  87. public void parse(File file) throws HyphenationException {
  88. try {
  89. InputSource src = new InputSource(file.toURI().toURL().toExternalForm());
  90. parse(src);
  91. } catch (MalformedURLException e) {
  92. throw new HyphenationException("Error converting the File '" + file + "' to a URL: "
  93. + e.getMessage());
  94. }
  95. }
  96. /**
  97. * Parses a hyphenation pattern file.
  98. * @param source the InputSource for the file
  99. * @throws HyphenationException In case of an exception while parsing
  100. */
  101. public void parse(InputSource source) throws HyphenationException {
  102. try {
  103. parser.parse(source);
  104. } catch (FileNotFoundException fnfe) {
  105. throw new HyphenationException("File not found: " + fnfe.getMessage());
  106. } catch (IOException ioe) {
  107. throw new HyphenationException(ioe.getMessage());
  108. } catch (SAXException e) {
  109. throw new HyphenationException(errMsg);
  110. }
  111. }
  112. /**
  113. * Creates a SAX parser using JAXP
  114. * @return the created SAX parser
  115. */
  116. static XMLReader createParser() {
  117. try {
  118. SAXParserFactory factory = SAXParserFactory.newInstance();
  119. factory.setNamespaceAware(true);
  120. return factory.newSAXParser().getXMLReader();
  121. } catch (Exception e) {
  122. throw new RuntimeException("Couldn't create XMLReader: " + e.getMessage());
  123. }
  124. }
  125. private String readToken(StringBuffer chars) {
  126. String word;
  127. boolean space = false;
  128. int i;
  129. for (i = 0; i < chars.length(); i++) {
  130. if (Character.isWhitespace(chars.charAt(i))) {
  131. space = true;
  132. } else {
  133. break;
  134. }
  135. }
  136. if (space) {
  137. // chars.delete(0,i);
  138. for (int countr = i; countr < chars.length(); countr++) {
  139. chars.setCharAt(countr - i, chars.charAt(countr));
  140. }
  141. chars.setLength(chars.length() - i);
  142. if (token.length() > 0) {
  143. word = token.toString();
  144. token.setLength(0);
  145. return word;
  146. }
  147. }
  148. space = false;
  149. for (i = 0; i < chars.length(); i++) {
  150. if (Character.isWhitespace(chars.charAt(i))) {
  151. space = true;
  152. break;
  153. }
  154. }
  155. token.append(chars.toString().substring(0, i));
  156. // chars.delete(0,i);
  157. for (int countr = i; countr < chars.length(); countr++) {
  158. chars.setCharAt(countr - i, chars.charAt(countr));
  159. }
  160. chars.setLength(chars.length() - i);
  161. if (space) {
  162. word = token.toString();
  163. token.setLength(0);
  164. return word;
  165. }
  166. token.append(chars);
  167. return null;
  168. }
  169. private static String getPattern(String word) {
  170. StringBuffer pat = new StringBuffer();
  171. int len = word.length();
  172. for (int i = 0; i < len; i++) {
  173. if (!Character.isDigit(word.charAt(i))) {
  174. pat.append(word.charAt(i));
  175. }
  176. }
  177. return pat.toString();
  178. }
  179. private ArrayList normalizeException(ArrayList ex) {
  180. ArrayList res = new ArrayList();
  181. for (int i = 0; i < ex.size(); i++) {
  182. Object item = ex.get(i);
  183. if (item instanceof String) {
  184. String str = (String)item;
  185. StringBuffer buf = new StringBuffer();
  186. for (int j = 0; j < str.length(); j++) {
  187. char c = str.charAt(j);
  188. if (c != hyphenChar) {
  189. buf.append(c);
  190. } else {
  191. res.add(buf.toString());
  192. buf.setLength(0);
  193. char[] h = new char[1];
  194. h[0] = hyphenChar;
  195. // we use here hyphenChar which is not necessarily
  196. // the one to be printed
  197. res.add(new Hyphen(new String(h), null, null));
  198. }
  199. }
  200. if (buf.length() > 0) {
  201. res.add(buf.toString());
  202. }
  203. } else {
  204. res.add(item);
  205. }
  206. }
  207. return res;
  208. }
  209. private String getExceptionWord(ArrayList ex) {
  210. StringBuffer res = new StringBuffer();
  211. for (int i = 0; i < ex.size(); i++) {
  212. Object item = ex.get(i);
  213. if (item instanceof String) {
  214. res.append((String)item);
  215. } else {
  216. if (((Hyphen)item).noBreak != null) {
  217. res.append(((Hyphen)item).noBreak);
  218. }
  219. }
  220. }
  221. return res.toString();
  222. }
  223. private static String getInterletterValues(String pat) {
  224. StringBuffer il = new StringBuffer();
  225. String word = pat + "a"; // add dummy letter to serve as sentinel
  226. int len = word.length();
  227. for (int i = 0; i < len; i++) {
  228. char c = word.charAt(i);
  229. if (Character.isDigit(c)) {
  230. il.append(c);
  231. i++;
  232. } else {
  233. il.append('0');
  234. }
  235. }
  236. return il.toString();
  237. }
  238. /** @throws SAXException if not caught */
  239. protected void getExternalClasses() throws SAXException {
  240. XMLReader mainParser = parser;
  241. parser = createParser();
  242. parser.setContentHandler(this);
  243. parser.setErrorHandler(this);
  244. InputStream stream = this.getClass().getResourceAsStream("classes.xml");
  245. InputSource source = new InputSource(stream);
  246. try {
  247. parser.parse(source);
  248. } catch (IOException ioe) {
  249. throw new SAXException(ioe.getMessage());
  250. } finally {
  251. parser = mainParser;
  252. }
  253. }
  254. //
  255. // ContentHandler methods
  256. //
  257. /**
  258. * {@inheritDoc}
  259. * @throws SAXException
  260. */
  261. public void startElement(String uri, String local, String raw,
  262. Attributes attrs) throws SAXException {
  263. if (local.equals("hyphen-char")) {
  264. String h = attrs.getValue("value");
  265. if (h != null && h.length() == 1) {
  266. hyphenChar = h.charAt(0);
  267. }
  268. } else if (local.equals("classes")) {
  269. currElement = ELEM_CLASSES;
  270. } else if (local.equals("patterns")) {
  271. if (!hasClasses) {
  272. getExternalClasses();
  273. }
  274. currElement = ELEM_PATTERNS;
  275. } else if (local.equals("exceptions")) {
  276. if (!hasClasses) {
  277. getExternalClasses();
  278. }
  279. currElement = ELEM_EXCEPTIONS;
  280. exception = new ArrayList();
  281. } else if (local.equals("hyphen")) {
  282. if (token.length() > 0) {
  283. exception.add(token.toString());
  284. }
  285. exception.add(new Hyphen(attrs.getValue("pre"),
  286. attrs.getValue("no"),
  287. attrs.getValue("post")));
  288. currElement = ELEM_HYPHEN;
  289. }
  290. token.setLength(0);
  291. }
  292. /**
  293. * {@inheritDoc}
  294. */
  295. public void endElement(String uri, String local, String raw) {
  296. if (token.length() > 0) {
  297. String word = token.toString();
  298. switch (currElement) {
  299. case ELEM_CLASSES:
  300. consumer.addClass(word);
  301. break;
  302. case ELEM_EXCEPTIONS:
  303. exception.add(word);
  304. exception = normalizeException(exception);
  305. consumer.addException(getExceptionWord(exception),
  306. (ArrayList)exception.clone());
  307. break;
  308. case ELEM_PATTERNS:
  309. consumer.addPattern(getPattern(word),
  310. getInterletterValues(word));
  311. break;
  312. case ELEM_HYPHEN:
  313. // nothing to do
  314. break;
  315. default:
  316. break;
  317. }
  318. if (currElement != ELEM_HYPHEN) {
  319. token.setLength(0);
  320. }
  321. }
  322. if (currElement == ELEM_CLASSES) {
  323. hasClasses = true;
  324. }
  325. if (currElement == ELEM_HYPHEN) {
  326. currElement = ELEM_EXCEPTIONS;
  327. } else {
  328. currElement = 0;
  329. }
  330. }
  331. /**
  332. * {@inheritDoc}
  333. */
  334. public void characters(char[] ch, int start, int length) {
  335. StringBuffer chars = new StringBuffer(length);
  336. chars.append(ch, start, length);
  337. String word = readToken(chars);
  338. while (word != null) {
  339. // System.out.println("\"" + word + "\"");
  340. switch (currElement) {
  341. case ELEM_CLASSES:
  342. consumer.addClass(word);
  343. break;
  344. case ELEM_EXCEPTIONS:
  345. exception.add(word);
  346. exception = normalizeException(exception);
  347. consumer.addException(getExceptionWord(exception),
  348. (ArrayList)exception.clone());
  349. exception.clear();
  350. break;
  351. case ELEM_PATTERNS:
  352. consumer.addPattern(getPattern(word),
  353. getInterletterValues(word));
  354. break;
  355. default:
  356. break;
  357. }
  358. word = readToken(chars);
  359. }
  360. }
  361. //
  362. // ErrorHandler methods
  363. //
  364. /**
  365. * {@inheritDoc}
  366. */
  367. public void warning(SAXParseException ex) {
  368. errMsg = "[Warning] " + getLocationString(ex) + ": "
  369. + ex.getMessage();
  370. }
  371. /**
  372. * {@inheritDoc}
  373. */
  374. public void error(SAXParseException ex) {
  375. errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
  376. }
  377. /**
  378. * {@inheritDoc}
  379. */
  380. public void fatalError(SAXParseException ex) throws SAXException {
  381. errMsg = "[Fatal Error] " + getLocationString(ex) + ": "
  382. + ex.getMessage();
  383. throw ex;
  384. }
  385. /**
  386. * Returns a string of the location.
  387. */
  388. private String getLocationString(SAXParseException ex) {
  389. StringBuffer str = new StringBuffer();
  390. String systemId = ex.getSystemId();
  391. if (systemId != null) {
  392. int index = systemId.lastIndexOf('/');
  393. if (index != -1) {
  394. systemId = systemId.substring(index + 1);
  395. }
  396. str.append(systemId);
  397. }
  398. str.append(':');
  399. str.append(ex.getLineNumber());
  400. str.append(':');
  401. str.append(ex.getColumnNumber());
  402. return str.toString();
  403. } // getLocationString(SAXParseException):String
  404. /**
  405. * For testing purposes only.
  406. * {@inheritDoc}
  407. */
  408. public void addClass(String c) {
  409. testOut.println("class: " + c);
  410. }
  411. /**
  412. * For testing purposes only.
  413. * {@inheritDoc}
  414. */
  415. public void addException(String w, ArrayList e) {
  416. testOut.println("exception: " + w + " : " + e.toString());
  417. }
  418. /**
  419. * For testing purposes only.
  420. * {@inheritDoc}
  421. */
  422. public void addPattern(String p, String v) {
  423. testOut.println("pattern: " + p + " : " + v);
  424. }
  425. private PrintStream testOut = System.out;
  426. /**
  427. * Set test out stream.
  428. * @param testOut the testOut to set
  429. */
  430. public void setTestOut(PrintStream testOut) {
  431. this.testOut = testOut;
  432. }
  433. /**
  434. * Close test out file.
  435. */
  436. public void closeTestOut() {
  437. testOut.flush();
  438. testOut.close();
  439. }
  440. /**
  441. * Main entry point when used as an application.
  442. * @param args array of command line arguments
  443. * @throws Exception in case of uncaught exception
  444. */
  445. public static void main(String[] args) throws Exception {
  446. if (args.length > 0) {
  447. PatternParser pp = new PatternParser();
  448. PrintStream p = null;
  449. if (args.length > 1) {
  450. FileOutputStream f = new FileOutputStream(args[1]);
  451. p = new PrintStream(f, false, "utf-8");
  452. pp.setTestOut(p);
  453. }
  454. pp.parse(args[0]);
  455. if (pp != null) {
  456. pp.closeTestOut();
  457. }
  458. }
  459. }
  460. }