You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

GujaratiScriptProcessor.java 21KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /* $Id$ */
  18. package org.apache.fop.complexscripts.scripts;
  19. import org.apache.commons.logging.Log;
  20. import org.apache.commons.logging.LogFactory;
  21. import org.apache.fop.complexscripts.util.GlyphSequence;
  22. // CSOFF: LineLengthCheck
  23. /**
  24. * <p>The <code>GujaratiScriptProcessor</code> class implements a script processor for
  25. * performing glyph substitution and positioning operations on content associated with the Gujarati script.</p>
  26. *
  27. * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
  28. */
  29. public class GujaratiScriptProcessor extends IndicScriptProcessor {
    /** Logging instance for this class, obtained from the commons-logging {@code LogFactory}. */
    private static final Log log = LogFactory.getLog(GujaratiScriptProcessor.class);
    /**
     * Constructs a Gujarati script processor.
     * @param script the script identifier, passed unchanged to the superclass constructor
     */
    GujaratiScriptProcessor(String script) {
        super(script);
    }
    /**
     * Obtain the syllabizer class used by this script processor.
     * @return the nested {@code GujaratiSyllabizer} class
     */
    @Override
    protected Class<? extends GujaratiSyllabizer> getSyllabizerClass() {
        return GujaratiSyllabizer.class;
    }
  39. @Override
  40. // find rightmost pre-base matra
  41. protected int findPreBaseMatra(GlyphSequence gs) {
  42. int ng = gs.getGlyphCount();
  43. int lk = -1;
  44. for (int i = ng; i > 0; i--) {
  45. int k = i - 1;
  46. if (containsPreBaseMatra(gs, k)) {
  47. lk = k;
  48. break;
  49. }
  50. }
  51. return lk;
  52. }
  53. @Override
  54. // find leftmost pre-base matra target, starting from source
  55. protected int findPreBaseMatraTarget(GlyphSequence gs, int source) {
  56. int ng = gs.getGlyphCount();
  57. int lk = -1;
  58. for (int i = (source < ng) ? source : ng; i > 0; i--) {
  59. int k = i - 1;
  60. if (containsConsonant(gs, k)) {
  61. if (containsHalfConsonant(gs, k)) {
  62. lk = k;
  63. } else if (lk == -1) {
  64. lk = k;
  65. } else {
  66. break;
  67. }
  68. }
  69. }
  70. return lk;
  71. }
  72. private static boolean containsPreBaseMatra(GlyphSequence gs, int k) {
  73. GlyphSequence.CharAssociation a = gs.getAssociation(k);
  74. int[] ca = gs.getCharacterArray(false);
  75. for (int i = a.getStart(), e = a.getEnd(); i < e; i++) {
  76. if (isPreM(ca [ i ])) {
  77. return true;
  78. }
  79. }
  80. return false;
  81. }
  82. private static boolean containsConsonant(GlyphSequence gs, int k) {
  83. GlyphSequence.CharAssociation a = gs.getAssociation(k);
  84. int[] ca = gs.getCharacterArray(false);
  85. for (int i = a.getStart(), e = a.getEnd(); i < e; i++) {
  86. if (isC(ca [ i ])) {
  87. return true;
  88. }
  89. }
  90. return false;
  91. }
  92. private static boolean containsHalfConsonant(GlyphSequence gs, int k) {
  93. Boolean half = (Boolean) gs.getAssociation(k) .getPredication("half");
  94. return (half != null) ? half.booleanValue() : false;
  95. }
  96. @Override
  97. protected int findReph(GlyphSequence gs) {
  98. int ng = gs.getGlyphCount();
  99. int li = -1;
  100. for (int i = 0; i < ng; i++) {
  101. if (containsReph(gs, i)) {
  102. li = i;
  103. break;
  104. }
  105. }
  106. return li;
  107. }
  108. @Override
  109. protected int findRephTarget(GlyphSequence gs, int source) {
  110. int ng = gs.getGlyphCount();
  111. int c1 = -1;
  112. int c2 = -1;
  113. // first candidate target is after first non-half consonant
  114. for (int i = 0; i < ng; i++) {
  115. if ((i != source) && containsConsonant(gs, i)) {
  116. if (!containsHalfConsonant(gs, i)) {
  117. c1 = i + 1;
  118. break;
  119. }
  120. }
  121. }
  122. // second candidate target is after last non-prebase matra after first candidate or before first syllable or vedic mark
  123. for (int i = (c1 >= 0) ? c1 : 0; i < ng; i++) {
  124. if (containsMatra(gs, i) && !containsPreBaseMatra(gs, i)) {
  125. c2 = i + 1;
  126. } else if (containsOtherMark(gs, i)) {
  127. c2 = i;
  128. break;
  129. }
  130. }
  131. if (c2 >= 0) {
  132. return c2;
  133. } else if (c1 >= 0) {
  134. return c1;
  135. } else {
  136. return source;
  137. }
  138. }
  139. private static boolean containsReph(GlyphSequence gs, int k) {
  140. Boolean rphf = (Boolean) gs.getAssociation(k) .getPredication("rphf");
  141. return (rphf != null) ? rphf.booleanValue() : false;
  142. }
  143. private static boolean containsMatra(GlyphSequence gs, int k) {
  144. GlyphSequence.CharAssociation a = gs.getAssociation(k);
  145. int[] ca = gs.getCharacterArray(false);
  146. for (int i = a.getStart(), e = a.getEnd(); i < e; i++) {
  147. if (isM(ca [ i ])) {
  148. return true;
  149. }
  150. }
  151. return false;
  152. }
  153. private static boolean containsOtherMark(GlyphSequence gs, int k) {
  154. GlyphSequence.CharAssociation a = gs.getAssociation(k);
  155. int[] ca = gs.getCharacterArray(false);
  156. for (int i = a.getStart(), e = a.getEnd(); i < e; i++) {
  157. switch (typeOf(ca [ i ])) {
  158. case C_T: // tone (e.g., udatta, anudatta)
  159. case C_A: // accent (e.g., acute, grave)
  160. case C_O: // other (e.g., candrabindu, anusvara, visarga, etc)
  161. return true;
  162. default:
  163. break;
  164. }
  165. }
  166. return false;
  167. }
  168. private static class GujaratiSyllabizer extends DefaultSyllabizer {
  169. GujaratiSyllabizer(String script, String language) {
  170. super(script, language);
  171. }
  172. @Override
  173. // | C ...
  174. protected int findStartOfSyllable(int[] ca, int s, int e) {
  175. if ((s < 0) || (s >= e)) {
  176. return -1;
  177. } else {
  178. while (s < e) {
  179. int c = ca [ s ];
  180. if (isC(c)) {
  181. break;
  182. } else {
  183. s++;
  184. }
  185. }
  186. return s;
  187. }
  188. }
  189. @Override
  190. // D* L? | ...
  191. protected int findEndOfSyllable(int[] ca, int s, int e) {
  192. if ((s < 0) || (s >= e)) {
  193. return -1;
  194. } else {
  195. int nd = 0;
  196. int nl = 0;
  197. int i;
  198. // consume dead consonants
  199. while ((i = isDeadConsonant(ca, s, e)) > s) {
  200. s = i;
  201. nd++;
  202. }
  203. // consume zero or one live consonant
  204. if ((i = isLiveConsonant(ca, s, e)) > s) {
  205. s = i;
  206. nl++;
  207. }
  208. return ((nd > 0) || (nl > 0)) ? s : -1;
  209. }
  210. }
  211. // D := ( C N? H )?
  212. private int isDeadConsonant(int[] ca, int s, int e) {
  213. if (s < 0) {
  214. return -1;
  215. } else {
  216. int c;
  217. int i = 0;
  218. int nc = 0;
  219. int nh = 0;
  220. do {
  221. // C
  222. if ((s + i) < e) {
  223. c = ca [ s + i ];
  224. if (isC(c)) {
  225. i++;
  226. nc++;
  227. } else {
  228. break;
  229. }
  230. }
  231. // N?
  232. if ((s + i) < e) {
  233. c = ca [ s + 1 ];
  234. if (isN(c)) {
  235. i++;
  236. }
  237. }
  238. // H
  239. if ((s + i) < e) {
  240. c = ca [ s + i ];
  241. if (isH(c)) {
  242. i++;
  243. nh++;
  244. } else {
  245. break;
  246. }
  247. }
  248. } while (false);
  249. return (nc > 0) && (nh > 0) ? s + i : -1;
  250. }
  251. }
  252. // L := ( (C|V) N? X* )?; where X = ( MATRA | ACCENT MARK | TONE MARK | OTHER MARK )
  253. private int isLiveConsonant(int[] ca, int s, int e) {
  254. if (s < 0) {
  255. return -1;
  256. } else {
  257. int c;
  258. int i = 0;
  259. int nc = 0;
  260. int nv = 0;
  261. int nx = 0;
  262. do {
  263. // C
  264. if ((s + i) < e) {
  265. c = ca [ s + i ];
  266. if (isC(c)) {
  267. i++;
  268. nc++;
  269. } else if (isV(c)) {
  270. i++;
  271. nv++;
  272. } else {
  273. break;
  274. }
  275. }
  276. // N?
  277. if ((s + i) < e) {
  278. c = ca [ s + i ];
  279. if (isN(c)) {
  280. i++;
  281. }
  282. }
  283. // X*
  284. while ((s + i) < e) {
  285. c = ca [ s + i ];
  286. if (isX(c)) {
  287. i++;
  288. nx++;
  289. } else {
  290. break;
  291. }
  292. }
  293. } while (false);
  294. // if no X but has H, then ignore C|I
  295. if (nx == 0) {
  296. if ((s + i) < e) {
  297. c = ca [ s + i ];
  298. if (isH(c)) {
  299. if (nc > 0) {
  300. nc--;
  301. } else if (nv > 0) {
  302. nv--;
  303. }
  304. }
  305. }
  306. }
  307. return ((nc > 0) || (nv > 0)) ? s + i : -1;
  308. }
  309. }
  310. }
    // gujarati character types; a CCA entry holds one of these values in its
    // low byte (extracted with C_M_TYPE by typeOf)
    static final short C_U = 0; // unassigned
    static final short C_C = 1; // consonant
    static final short C_V = 2; // vowel
    static final short C_M = 3; // vowel sign (matra)
    static final short C_S = 4; // symbol or sign
    static final short C_T = 5; // tone mark
    static final short C_A = 6; // accent mark
    static final short C_P = 7; // punctuation
    static final short C_D = 8; // digit
    static final short C_H = 9; // halant (virama)
    static final short C_O = 10; // other signs
    // flag bits that may be OR-ed onto a type value in a CCA entry (tested by hasFlag)
    static final short C_N = 0x0100; // nukta(ized)
    static final short C_R = 0x0200; // reph(ized)
    static final short C_PRE = 0x0400; // pre-base
    // masks separating the type value from the flag bits
    static final short C_M_TYPE = 0x00FF; // type mask
    static final short C_M_FLAGS = 0x7F00; // flag mask
    // gujarati block range
    static final int CCA_START = 0x0A80; // first code point mapped by cca
    static final int CCA_END = 0x0B00; // last code point + 1 mapped by cca
    // gujarati character type lookups, indexed by (code point - CCA_START);
    // each entry is a C_* type value, optionally OR-ed with flag bits
    // NOTE(review): the UNASSIGNED entries appear to reflect an older Unicode version;
    // later versions assign some of these code points (e.g. U+0AF0, U+0AF9) - confirm
    // against the current Gujarati code chart before relying on them
    static final short[] CCA = {
        C_U, // 0x0A80 // UNASSIGNED
        C_O, // 0x0A81 // CANDRABINDU
        C_O, // 0x0A82 // ANUSVARA
        C_O, // 0x0A83 // VISARGA
        C_U, // 0x0A84 // UNASSIGNED
        C_V, // 0x0A85 // A
        C_V, // 0x0A86 // AA
        C_V, // 0x0A87 // I
        C_V, // 0x0A88 // II
        C_V, // 0x0A89 // U
        C_V, // 0x0A8A // UU
        C_V, // 0x0A8B // VOCALIC R
        C_V, // 0x0A8C // VOCALIC L
        C_V, // 0x0A8D // CANDRA E
        C_U, // 0x0A8E // UNASSIGNED
        C_V, // 0x0A8F // E
        C_V, // 0x0A90 // AI
        C_V, // 0x0A91 // CANDRA O
        C_U, // 0x0A92 // UNASSIGNED
        C_V, // 0x0A93 // O
        C_V, // 0x0A94 // AU
        C_C, // 0x0A95 // KA
        C_C, // 0x0A96 // KHA
        C_C, // 0x0A97 // GA
        C_C, // 0x0A98 // GHA
        C_C, // 0x0A99 // NGA
        C_C, // 0x0A9A // CA
        C_C, // 0x0A9B // CHA
        C_C, // 0x0A9C // JA
        C_C, // 0x0A9D // JHA
        C_C, // 0x0A9E // NYA
        C_C, // 0x0A9F // TTA
        C_C, // 0x0AA0 // TTHA
        C_C, // 0x0AA1 // DDA
        C_C, // 0x0AA2 // DDHA
        C_C, // 0x0AA3 // NNA
        C_C, // 0x0AA4 // TA
        C_C, // 0x0AA5 // THA
        C_C, // 0x0AA6 // DA
        C_C, // 0x0AA7 // DHA
        C_C, // 0x0AA8 // NA
        C_U, // 0x0AA9 // UNASSIGNED
        C_C, // 0x0AAA // PA
        C_C, // 0x0AAB // PHA
        C_C, // 0x0AAC // BA
        C_C, // 0x0AAD // BHA
        C_C, // 0x0AAE // MA
        C_C, // 0x0AAF // YA
        C_C | C_R, // 0x0AB0 // RA
        C_U, // 0x0AB1 // UNASSIGNED
        C_C, // 0x0AB2 // LA
        C_C, // 0x0AB3 // LLA
        C_U, // 0x0AB4 // UNASSIGNED
        C_C, // 0x0AB5 // VA
        C_C, // 0x0AB6 // SHA
        C_C, // 0x0AB7 // SSA
        C_C, // 0x0AB8 // SA
        C_C, // 0x0AB9 // HA
        C_U, // 0x0ABA // UNASSIGNED
        C_U, // 0x0ABB // UNASSIGNED
        C_N, // 0x0ABC // NUKTA (flag bit only; the type bits are C_U)
        C_S, // 0x0ABD // AVAGRAHA
        C_M, // 0x0ABE // AA
        C_M | C_PRE, // 0x0ABF // I
        C_M, // 0x0AC0 // II
        C_M, // 0x0AC1 // U
        C_M, // 0x0AC2 // UU
        C_M, // 0x0AC3 // VOCALIC R
        C_M, // 0x0AC4 // VOCALIC RR
        C_M, // 0x0AC5 // CANDRA E
        C_U, // 0x0AC6 // UNASSIGNED
        C_M, // 0x0AC7 // E
        C_M, // 0x0AC8 // AI
        C_M, // 0x0AC9 // CANDRA O
        C_U, // 0x0ACA // UNASSIGNED
        C_M, // 0x0ACB // O
        C_M, // 0x0ACC // AU
        C_H, // 0x0ACD // VIRAMA (HALANT)
        C_U, // 0x0ACE // UNASSIGNED
        C_U, // 0x0ACF // UNASSIGNED
        C_S, // 0x0AD0 // OM
        C_U, // 0x0AD1 // UNASSIGNED
        C_U, // 0x0AD2 // UNASSIGNED
        C_U, // 0x0AD3 // UNASSIGNED
        C_U, // 0x0AD4 // UNASSIGNED
        C_U, // 0x0AD5 // UNASSIGNED
        C_U, // 0x0AD6 // UNASSIGNED
        C_U, // 0x0AD7 // UNASSIGNED
        C_U, // 0x0AD8 // UNASSIGNED
        C_U, // 0x0AD9 // UNASSIGNED
        C_U, // 0x0ADA // UNASSIGNED
        C_U, // 0x0ADB // UNASSIGNED
        C_U, // 0x0ADC // UNASSIGNED
        C_U, // 0x0ADD // UNASSIGNED
        C_U, // 0x0ADE // UNASSIGNED
        C_U, // 0x0ADF // UNASSIGNED
        C_V, // 0x0AE0 // VOCALIC RR
        C_V, // 0x0AE1 // VOCALIC LL
        C_M, // 0x0AE2 // VOCALIC L
        C_M, // 0x0AE3 // VOCALIC LL
        C_U, // 0x0AE4 // UNASSIGNED
        C_U, // 0x0AE5 // UNASSIGNED
        C_D, // 0x0AE6 // ZERO
        C_D, // 0x0AE7 // ONE
        C_D, // 0x0AE8 // TWO
        C_D, // 0x0AE9 // THREE
        C_D, // 0x0AEA // FOUR
        C_D, // 0x0AEB // FIVE
        C_D, // 0x0AEC // SIX
        C_D, // 0x0AED // SEVEN
        C_D, // 0x0AEE // EIGHT
        C_D, // 0x0AEF // NINE
        C_U, // 0x0AF0 // UNASSIGNED
        C_S, // 0x0AF1 // RUPEE SIGN
        C_U, // 0x0AF2 // UNASSIGNED
        C_U, // 0x0AF3 // UNASSIGNED
        C_U, // 0x0AF4 // UNASSIGNED
        C_U, // 0x0AF5 // UNASSIGNED
        C_U, // 0x0AF6 // UNASSIGNED
        C_U, // 0x0AF7 // UNASSIGNED
        C_U, // 0x0AF8 // UNASSIGNED
        C_U, // 0x0AF9 // UNASSIGNED
        C_U, // 0x0AFA // UNASSIGNED
        C_U, // 0x0AFB // UNASSIGNED
        C_U, // 0x0AFC // UNASSIGNED
        C_U, // 0x0AFD // UNASSIGNED
        C_U, // 0x0AFE // UNASSIGNED
        C_U // 0x0AFF // UNASSIGNED
    };
  462. static int typeOf(int c) {
  463. if ((c >= CCA_START) && (c < CCA_END)) {
  464. return CCA [ c - CCA_START ] & C_M_TYPE;
  465. } else {
  466. return C_U;
  467. }
  468. }
    /**
     * Determine whether a code point has the given character type.
     * @param c the code point
     * @param t the type to compare against (one of the C_* type values)
     * @return true if {@code typeOf(c)} equals {@code t}
     */
    static boolean isType(int c, int t) {
        return typeOf(c) == t;
    }
  472. static boolean hasFlag(int c, int f) {
  473. if ((c >= CCA_START) && (c < CCA_END)) {
  474. return (CCA [ c - CCA_START ] & f) == f;
  475. } else {
  476. return false;
  477. }
  478. }
  479. static boolean isC(int c) {
  480. return isType(c, C_C);
  481. }
  482. static boolean isR(int c) {
  483. return isType(c, C_C) && hasR(c);
  484. }
  485. static boolean isV(int c) {
  486. return isType(c, C_V);
  487. }
  488. static boolean isN(int c) {
  489. return c == 0x0ABC;
  490. }
  491. static boolean isH(int c) {
  492. return c == 0x0ACD;
  493. }
  494. static boolean isM(int c) {
  495. return isType(c, C_M);
  496. }
  497. static boolean isPreM(int c) {
  498. return isType(c, C_M) && hasFlag(c, C_PRE);
  499. }
  500. static boolean isX(int c) {
  501. switch (typeOf(c)) {
  502. case C_M: // matra (combining vowel)
  503. case C_A: // accent mark
  504. case C_T: // tone mark
  505. case C_O: // other (modifying) mark
  506. return true;
  507. default:
  508. return false;
  509. }
  510. }
    /**
     * Determine whether a code point's table entry has the reph flag set.
     * @param c the code point
     * @return the result of {@code hasFlag(c, C_R)}
     */
    static boolean hasR(int c) {
        return hasFlag(c, C_R);
    }
    /**
     * Determine whether a code point's table entry has the nukta flag set.
     * @param c the code point
     * @return the result of {@code hasFlag(c, C_N)}
     */
    static boolean hasN(int c) {
        return hasFlag(c, C_N);
    }
  517. }