You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

CharScript.java 35KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /* $Id$ */
  18. package org.apache.fop.complexscripts.util;
  19. import java.util.Arrays;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.Iterator;
  23. import java.util.Map;
  24. import java.util.Set;
  25. import org.apache.fop.util.CharUtilities;
  26. /**
  27. * <p>Script related utilities.</p>
  28. *
  29. * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
  30. */
  31. public final class CharScript {
  32. // CSOFF: LineLength
  33. //
  34. // The following script codes are based on ISO 15924. Codes less than 1000 are
  35. // official assignments from 15924; those equal to or greater than 1000 are FOP
  36. // implementation specific.
  37. //
  38. /** hebrew script constant */
  39. public static final int SCRIPT_HEBREW = 125; // 'hebr'
  40. /** mongolian script constant */
  41. public static final int SCRIPT_MONGOLIAN = 145; // 'mong'
  42. /** arabic script constant */
  43. public static final int SCRIPT_ARABIC = 160; // 'arab'
  44. /** greek script constant */
  45. public static final int SCRIPT_GREEK = 200; // 'grek'
  46. /** latin script constant */
  47. public static final int SCRIPT_LATIN = 215; // 'latn'
  48. /** cyrillic script constant */
  49. public static final int SCRIPT_CYRILLIC = 220; // 'cyrl'
  50. /** georgian script constant */
  51. public static final int SCRIPT_GEORGIAN = 240; // 'geor'
  52. /** bopomofo script constant */
  53. public static final int SCRIPT_BOPOMOFO = 285; // 'bopo'
  54. /** hangul script constant */
  55. public static final int SCRIPT_HANGUL = 286; // 'hang'
  56. /** gurmukhi script constant */
  57. public static final int SCRIPT_GURMUKHI = 310; // 'guru'
  58. /** gurmukhi 2 script constant */
  59. public static final int SCRIPT_GURMUKHI_2 = 1310; // 'gur2' -- MSFT (pseudo) script tag for variant shaping semantics
  60. /** devanagari script constant */
  61. public static final int SCRIPT_DEVANAGARI = 315; // 'deva'
  62. /** devanagari 2 script constant */
  63. public static final int SCRIPT_DEVANAGARI_2 = 1315; // 'dev2' -- MSFT (pseudo) script tag for variant shaping semantics
  64. /** gujarati script constant */
  65. public static final int SCRIPT_GUJARATI = 320; // 'gujr'
  66. /** gujarati 2 script constant */
  67. public static final int SCRIPT_GUJARATI_2 = 1320; // 'gjr2' -- MSFT (pseudo) script tag for variant shaping semantics
  68. /** bengali script constant */
  69. public static final int SCRIPT_BENGALI = 326; // 'beng'
  70. /** bengali 2 script constant */
  71. public static final int SCRIPT_BENGALI_2 = 1326; // 'bng2' -- MSFT (pseudo) script tag for variant shaping semantics
  72. /** oriya script constant */
  73. public static final int SCRIPT_ORIYA = 327; // 'orya'
  74. /** oriya 2 script constant */
  75. public static final int SCRIPT_ORIYA_2 = 1327; // 'ory2' -- MSFT (pseudo) script tag for variant shaping semantics
  76. /** tibetan script constant */
  77. public static final int SCRIPT_TIBETAN = 330; // 'tibt'
  78. /** telugu script constant */
  79. public static final int SCRIPT_TELUGU = 340; // 'telu'
  80. /** telugu 2 script constant */
  81. public static final int SCRIPT_TELUGU_2 = 1340; // 'tel2' -- MSFT (pseudo) script tag for variant shaping semantics
  82. /** kannada script constant */
  83. public static final int SCRIPT_KANNADA = 345; // 'knda'
  84. /** kannada 2 script constant */
  85. public static final int SCRIPT_KANNADA_2 = 1345; // 'knd2' -- MSFT (pseudo) script tag for variant shaping semantics
  86. /** tamil script constant */
  87. public static final int SCRIPT_TAMIL = 346; // 'taml'
  88. /** tamil 2 script constant */
  89. public static final int SCRIPT_TAMIL_2 = 1346; // 'tml2' -- MSFT (pseudo) script tag for variant shaping semantics
  90. /** malayalam script constant */
  91. public static final int SCRIPT_MALAYALAM = 347; // 'mlym'
  92. /** malayalam 2 script constant */
  93. public static final int SCRIPT_MALAYALAM_2 = 1347; // 'mlm2' -- MSFT (pseudo) script tag for variant shaping semantics
  94. /** sinhalese script constant */
  95. public static final int SCRIPT_SINHALESE = 348; // 'sinh'
  96. /** burmese script constant */
  97. public static final int SCRIPT_BURMESE = 350; // 'mymr'
  98. /** thai script constant */
  99. public static final int SCRIPT_THAI = 352; // 'thai'
  100. /** khmer script constant */
  101. public static final int SCRIPT_KHMER = 355; // 'khmr'
  102. /** lao script constant */
  103. public static final int SCRIPT_LAO = 356; // 'laoo'
  104. /** hiragana script constant */
  105. public static final int SCRIPT_HIRAGANA = 410; // 'hira'
  106. /** ethiopic script constant */
  107. public static final int SCRIPT_ETHIOPIC = 430; // 'ethi'
  108. /** han script constant */
  109. public static final int SCRIPT_HAN = 500; // 'hani'
  110. /** katakana script constant */
  111. public static final int SCRIPT_KATAKANA = 410; // 'kana'
  112. /** math script constant */
  113. public static final int SCRIPT_MATH = 995; // 'zmth'
  114. /** symbol script constant */
  115. public static final int SCRIPT_SYMBOL = 996; // 'zsym'
  116. /** undetermined script constant */
  117. public static final int SCRIPT_UNDETERMINED = 998; // 'zyyy'
  118. /** uncoded script constant */
  119. public static final int SCRIPT_UNCODED = 999; // 'zzzz'
  120. /**
  121. * A static (class) parameter indicating whether V2 indic shaping
  122. * rules apply or not, with default being <code>true</code>.
  123. */
  124. private static final boolean USE_V2_INDIC = true;
  // Private no-argument constructor: the class is final and declares no other
  // visible constructor, so it cannot be instantiated from outside.
  125. private CharScript() {
  126. }
  127. /**
  128. * Determine if character c is punctuation.
  129. * @param c a character represented as a unicode scalar value
  130. * @return true if character is punctuation
  131. */
  132. public static boolean isPunctuation(int c) {
  133. if ((c >= 0x0021) && (c <= 0x002F)) { // basic latin punctuation
  134. return true;
  135. } else if ((c >= 0x003A) && (c <= 0x0040)) { // basic latin punctuation
  136. return true;
  137. } else if ((c >= 0x005F) && (c <= 0x0060)) { // basic latin punctuation
  138. return true;
  139. } else if ((c >= 0x007E) && (c <= 0x007E)) { // basic latin punctuation
  140. return true;
  141. } else if ((c >= 0x007E) && (c <= 0x007E)) { // basic latin punctuation
  142. return true;
  143. } else if ((c >= 0x00A1) && (c <= 0x00BF)) { // latin supplement punctuation
  144. return true;
  145. } else if ((c >= 0x00D7) && (c <= 0x00D7)) { // latin supplement punctuation
  146. return true;
  147. } else if ((c >= 0x00F7) && (c <= 0x00F7)) { // latin supplement punctuation
  148. return true;
  149. } else if ((c >= 0x2000) && (c <= 0x206F)) { // general punctuation
  150. return true;
  151. } else { // [TBD] - not complete
  152. return false;
  153. }
  154. }
  155. /**
  156. * Determine if character c is a digit.
  157. * @param c a character represented as a unicode scalar value
  158. * @return true if character is a digit
  159. */
  160. public static boolean isDigit(int c) {
  161. if ((c >= 0x0030) && (c <= 0x0039)) { // basic latin digits
  162. return true;
  163. } else { // [TBD] - not complete
  164. return false;
  165. }
  166. }
  167. /**
  168. * Determine if character c belong to the hebrew script.
  169. * @param c a character represented as a unicode scalar value
  170. * @return true if character belongs to hebrew script
  171. */
  172. public static boolean isHebrew(int c) {
  173. if ((c >= 0x0590) && (c <= 0x05FF)) { // hebrew block
  174. return true;
  175. } else if ((c >= 0xFB00) && (c <= 0xFB4F)) { // hebrew presentation forms block
  176. return true;
  177. } else {
  178. return false;
  179. }
  180. }
  181. /**
  182. * Determine if character c belong to the mongolian script.
  183. * @param c a character represented as a unicode scalar value
  184. * @return true if character belongs to mongolian script
  185. */
  186. public static boolean isMongolian(int c) {
  187. if ((c >= 0x1800) && (c <= 0x18AF)) { // mongolian block
  188. return true;
  189. } else {
  190. return false;
  191. }
  192. }
  193. /**
  194. * Determine if character c belong to the arabic script.
  195. * @param c a character represented as a unicode scalar value
  196. * @return true if character belongs to arabic script
  197. */
  198. public static boolean isArabic(int c) {
  199. if ((c >= 0x0600) && (c <= 0x06FF)) { // arabic block
  200. return true;
  201. } else if ((c >= 0x0750) && (c <= 0x077F)) { // arabic supplement block
  202. return true;
  203. } else if ((c >= 0xFB50) && (c <= 0xFDFF)) { // arabic presentation forms a block
  204. return true;
  205. } else if ((c >= 0xFE70) && (c <= 0xFEFF)) { // arabic presentation forms b block
  206. return true;
  207. } else {
  208. return false;
  209. }
  210. }
  211. /**
  212. * Determine if character c belong to the greek script.
  213. * @param c a character represented as a unicode scalar value
  214. * @return true if character belongs to greek script
  215. */
  216. public static boolean isGreek(int c) {
  217. if ((c >= 0x0370) && (c <= 0x03FF)) { // greek (and coptic) block
  218. return true;
  219. } else if ((c >= 0x1F00) && (c <= 0x1FFF)) { // greek extended block
  220. return true;
  221. } else {
  222. return false;
  223. }
  224. }
  225. /**
  226. * Determine if character c belong to the latin script.
  227. * @param c a character represented as a unicode scalar value
  228. * @return true if character belongs to latin script
  229. */
  230. public static boolean isLatin(int c) {
  231. if ((c >= 0x0041) && (c <= 0x005A)) { // basic latin upper case
  232. return true;
  233. } else if ((c >= 0x0061) && (c <= 0x007A)) { // basic latin lower case
  234. return true;
  235. } else if ((c >= 0x00C0) && (c <= 0x00D6)) { // latin supplement upper case
  236. return true;
  237. } else if ((c >= 0x00D8) && (c <= 0x00DF)) { // latin supplement upper case
  238. return true;
  239. } else if ((c >= 0x00E0) && (c <= 0x00F6)) { // latin supplement lower case
  240. return true;
  241. } else if ((c >= 0x00F8) && (c <= 0x00FF)) { // latin supplement lower case
  242. return true;
  243. } else if ((c >= 0x0100) && (c <= 0x017F)) { // latin extended a
  244. return true;
  245. } else if ((c >= 0x0180) && (c <= 0x024F)) { // latin extended b
  246. return true;
  247. } else if ((c >= 0x1E00) && (c <= 0x1EFF)) { // latin extended additional
  248. return true;
  249. } else if ((c >= 0x2C60) && (c <= 0x2C7F)) { // latin extended c
  250. return true;
  251. } else if ((c >= 0xA720) && (c <= 0xA7FF)) { // latin extended d
  252. return true;
  253. } else if ((c >= 0xFB00) && (c <= 0xFB0F)) { // latin ligatures
  254. return true;
  255. } else {
  256. return false;
  257. }
  258. }
  259. /**
  260. * Determine if character c belong to the cyrillic script.
  261. * @param c a character represented as a unicode scalar value
  262. * @return true if character belongs to cyrillic script
  263. */
  264. public static boolean isCyrillic(int c) {
  265. if ((c >= 0x0400) && (c <= 0x04FF)) { // cyrillic block
  266. return true;
  267. } else if ((c >= 0x0500) && (c <= 0x052F)) { // cyrillic supplement block
  268. return true;
  269. } else if ((c >= 0x2DE0) && (c <= 0x2DFF)) { // cyrillic extended-a block
  270. return true;
  271. } else if ((c >= 0xA640) && (c <= 0xA69F)) { // cyrillic extended-b block
  272. return true;
  273. } else {
  274. return false;
  275. }
  276. }
  277. /**
  278. * Determine if character c belong to the georgian script.
  279. * @param c a character represented as a unicode scalar value
  280. * @return true if character belongs to georgian script
  281. */
  282. public static boolean isGeorgian(int c) {
  283. if ((c >= 0x10A0) && (c <= 0x10FF)) { // georgian block
  284. return true;
  285. } else if ((c >= 0x2D00) && (c <= 0x2D2F)) { // georgian supplement block
  286. return true;
  287. } else {
  288. return false;
  289. }
  290. }
  291. /**
  292. * Determine if character c belong to the hangul script.
  293. * @param c a character represented as a unicode scalar value
  294. * @return true if character belongs to hangul script
  295. */
  296. public static boolean isHangul(int c) {
  297. if ((c >= 0x1100) && (c <= 0x11FF)) { // hangul jamo
  298. return true;
  299. } else if ((c >= 0x3130) && (c <= 0x318F)) { // hangul compatibility jamo
  300. return true;
  301. } else if ((c >= 0xA960) && (c <= 0xA97F)) { // hangul jamo extended a
  302. return true;
  303. } else if ((c >= 0xAC00) && (c <= 0xD7A3)) { // hangul syllables
  304. return true;
  305. } else if ((c >= 0xD7B0) && (c <= 0xD7FF)) { // hangul jamo extended a
  306. return true;
  307. } else {
  308. return false;
  309. }
  310. }
  311. /**
  312. * Determine if character c belong to the gurmukhi script.
  313. * @param c a character represented as a unicode scalar value
  314. * @return true if character belongs to gurmukhi script
  315. */
  316. public static boolean isGurmukhi(int c) {
  317. if ((c >= 0x0A00) && (c <= 0x0A7F)) { // gurmukhi block
  318. return true;
  319. } else {
  320. return false;
  321. }
  322. }
  323. /**
  324. * Determine if character c belong to the devanagari script.
  325. * @param c a character represented as a unicode scalar value
  326. * @return true if character belongs to devanagari script
  327. */
  328. public static boolean isDevanagari(int c) {
  329. if ((c >= 0x0900) && (c <= 0x097F)) { // devangari block
  330. return true;
  331. } else if ((c >= 0xA8E0) && (c <= 0xA8FF)) { // devangari extended block
  332. return true;
  333. } else {
  334. return false;
  335. }
  336. }
  337. /**
  338. * Determine if character c belong to the gujarati script.
  339. * @param c a character represented as a unicode scalar value
  340. * @return true if character belongs to gujarati script
  341. */
  342. public static boolean isGujarati(int c) {
  343. if ((c >= 0x0A80) && (c <= 0x0AFF)) { // gujarati block
  344. return true;
  345. } else {
  346. return false;
  347. }
  348. }
  349. /**
  350. * Determine if character c belong to the bengali script.
  351. * @param c a character represented as a unicode scalar value
  352. * @return true if character belongs to bengali script
  353. */
  354. public static boolean isBengali(int c) {
  355. if ((c >= 0x0980) && (c <= 0x09FF)) { // bengali block
  356. return true;
  357. } else {
  358. return false;
  359. }
  360. }
  361. /**
  362. * Determine if character c belong to the oriya script.
  363. * @param c a character represented as a unicode scalar value
  364. * @return true if character belongs to oriya script
  365. */
  366. public static boolean isOriya(int c) {
  367. if ((c >= 0x0B00) && (c <= 0x0B7F)) { // oriya block
  368. return true;
  369. } else {
  370. return false;
  371. }
  372. }
  373. /**
  374. * Determine if character c belong to the tibetan script.
  375. * @param c a character represented as a unicode scalar value
  376. * @return true if character belongs to tibetan script
  377. */
  378. public static boolean isTibetan(int c) {
  379. if ((c >= 0x0F00) && (c <= 0x0FFF)) { // tibetan block
  380. return true;
  381. } else {
  382. return false;
  383. }
  384. }
  385. /**
  386. * Determine if character c belong to the telugu script.
  387. * @param c a character represented as a unicode scalar value
  388. * @return true if character belongs to telugu script
  389. */
  390. public static boolean isTelugu(int c) {
  391. if ((c >= 0x0C00) && (c <= 0x0C7F)) { // telugu block
  392. return true;
  393. } else {
  394. return false;
  395. }
  396. }
  397. /**
  398. * Determine if character c belong to the kannada script.
  399. * @param c a character represented as a unicode scalar value
  400. * @return true if character belongs to kannada script
  401. */
  402. public static boolean isKannada(int c) {
  403. if ((c >= 0x0C00) && (c <= 0x0C7F)) { // kannada block
  404. return true;
  405. } else {
  406. return false;
  407. }
  408. }
  409. /**
  410. * Determine if character c belong to the tamil script.
  411. * @param c a character represented as a unicode scalar value
  412. * @return true if character belongs to tamil script
  413. */
  414. public static boolean isTamil(int c) {
  415. if ((c >= 0x0B80) && (c <= 0x0BFF)) { // tamil block
  416. return true;
  417. } else {
  418. return false;
  419. }
  420. }
  421. /**
  422. * Determine if character c belong to the malayalam script.
  423. * @param c a character represented as a unicode scalar value
  424. * @return true if character belongs to malayalam script
  425. */
  426. public static boolean isMalayalam(int c) {
  427. if ((c >= 0x0D00) && (c <= 0x0D7F)) { // malayalam block
  428. return true;
  429. } else {
  430. return false;
  431. }
  432. }
  433. /**
  434. * Determine if character c belong to the sinhalese script.
  435. * @param c a character represented as a unicode scalar value
  436. * @return true if character belongs to sinhalese script
  437. */
  438. public static boolean isSinhalese(int c) {
  439. if ((c >= 0x0D80) && (c <= 0x0DFF)) { // sinhala block
  440. return true;
  441. } else {
  442. return false;
  443. }
  444. }
  445. /**
  446. * Determine if character c belong to the burmese script.
  447. * @param c a character represented as a unicode scalar value
  448. * @return true if character belongs to burmese script
  449. */
  450. public static boolean isBurmese(int c) {
  451. if ((c >= 0x1000) && (c <= 0x109F)) { // burmese (myanmar) block
  452. return true;
  453. } else if ((c >= 0xAA60) && (c <= 0xAA7F)) { // burmese (myanmar) extended block
  454. return true;
  455. } else {
  456. return false;
  457. }
  458. }
  459. /**
  460. * Determine if character c belong to the thai script.
  461. * @param c a character represented as a unicode scalar value
  462. * @return true if character belongs to thai script
  463. */
  464. public static boolean isThai(int c) {
  465. if ((c >= 0x0E00) && (c <= 0x0E7F)) { // thai block
  466. return true;
  467. } else {
  468. return false;
  469. }
  470. }
  471. /**
  472. * Determine if character c belong to the khmer script.
  473. * @param c a character represented as a unicode scalar value
  474. * @return true if character belongs to khmer script
  475. */
  476. public static boolean isKhmer(int c) {
  477. if ((c >= 0x1780) && (c <= 0x17FF)) { // khmer block
  478. return true;
  479. } else if ((c >= 0x19E0) && (c <= 0x19FF)) { // khmer symbols block
  480. return true;
  481. } else {
  482. return false;
  483. }
  484. }
  485. /**
  486. * Determine if character c belong to the lao script.
  487. * @param c a character represented as a unicode scalar value
  488. * @return true if character belongs to lao script
  489. */
  490. public static boolean isLao(int c) {
  491. if ((c >= 0x0E80) && (c <= 0x0EFF)) { // lao block
  492. return true;
  493. } else {
  494. return false;
  495. }
  496. }
  497. /**
  498. * Determine if character c belong to the ethiopic (amharic) script.
  499. * @param c a character represented as a unicode scalar value
  500. * @return true if character belongs to ethiopic (amharic) script
  501. */
  502. public static boolean isEthiopic(int c) {
  503. if ((c >= 0x1200) && (c <= 0x137F)) { // ethiopic block
  504. return true;
  505. } else if ((c >= 0x1380) && (c <= 0x139F)) { // ethoipic supplement block
  506. return true;
  507. } else if ((c >= 0x2D80) && (c <= 0x2DDF)) { // ethoipic extended block
  508. return true;
  509. } else if ((c >= 0xAB00) && (c <= 0xAB2F)) { // ethoipic extended-a block
  510. return true;
  511. } else {
  512. return false;
  513. }
  514. }
  515. /**
  516. * Determine if character c belong to the han (unified cjk) script.
  517. * @param c a character represented as a unicode scalar value
  518. * @return true if character belongs to han (unified cjk) script
  519. */
  520. public static boolean isHan(int c) {
  521. if ((c >= 0x3400) && (c <= 0x4DBF)) {
  522. return true; // cjk unified ideographs extension a
  523. } else if ((c >= 0x4E00) && (c <= 0x9FFF)) {
  524. return true; // cjk unified ideographs
  525. } else if ((c >= 0xF900) && (c <= 0xFAFF)) {
  526. return true; // cjk compatibility ideographs
  527. } else if ((c >= 0x20000) && (c <= 0x2A6DF)) {
  528. return true; // cjk unified ideographs extension b
  529. } else if ((c >= 0x2A700) && (c <= 0x2B73F)) {
  530. return true; // cjk unified ideographs extension c
  531. } else if ((c >= 0x2F800) && (c <= 0x2FA1F)) {
  532. return true; // cjk compatibility ideographs supplement
  533. } else {
  534. return false;
  535. }
  536. }
  537. /**
  538. * Determine if character c belong to the bopomofo script.
  539. * @param c a character represented as a unicode scalar value
  540. * @return true if character belongs to bopomofo script
  541. */
  542. public static boolean isBopomofo(int c) {
  543. if ((c >= 0x3100) && (c <= 0x312F)) {
  544. return true;
  545. } else {
  546. return false;
  547. }
  548. }
  549. /**
  550. * Determine if character c belong to the hiragana script.
  551. * @param c a character represented as a unicode scalar value
  552. * @return true if character belongs to hiragana script
  553. */
  554. public static boolean isHiragana(int c) {
  555. if ((c >= 0x3040) && (c <= 0x309F)) {
  556. return true;
  557. } else {
  558. return false;
  559. }
  560. }
  561. /**
  562. * Determine if character c belong to the katakana script.
  563. * @param c a character represented as a unicode scalar value
  564. * @return true if character belongs to katakana script
  565. */
  566. public static boolean isKatakana(int c) {
  567. if ((c >= 0x30A0) && (c <= 0x30FF)) {
  568. return true;
  569. } else if ((c >= 0x31F0) && (c <= 0x31FF)) {
  570. return true;
  571. } else {
  572. return false;
  573. }
  574. }
  575. /**
  576. * Obtain ISO15924 numeric script code of character. If script is not or cannot be determined,
  577. * then the script code 998 ('zyyy') is returned.
  578. * @param c the character to obtain script
  579. * @return an ISO15924 script code
  580. */
  581. public static int scriptOf(int c) { // [TBD] - needs optimization!!!
  582. if (CharUtilities.isAnySpace(c)) {
  583. return SCRIPT_UNDETERMINED;
  584. } else if (isPunctuation(c)) {
  585. return SCRIPT_UNDETERMINED;
  586. } else if (isDigit(c)) {
  587. return SCRIPT_UNDETERMINED;
  588. } else if (isLatin(c)) {
  589. return SCRIPT_LATIN;
  590. } else if (isCyrillic(c)) {
  591. return SCRIPT_CYRILLIC;
  592. } else if (isGreek(c)) {
  593. return SCRIPT_GREEK;
  594. } else if (isHan(c)) {
  595. return SCRIPT_HAN;
  596. } else if (isBopomofo(c)) {
  597. return SCRIPT_BOPOMOFO;
  598. } else if (isKatakana(c)) {
  599. return SCRIPT_KATAKANA;
  600. } else if (isHiragana(c)) {
  601. return SCRIPT_HIRAGANA;
  602. } else if (isHangul(c)) {
  603. return SCRIPT_HANGUL;
  604. } else if (isArabic(c)) {
  605. return SCRIPT_ARABIC;
  606. } else if (isHebrew(c)) {
  607. return SCRIPT_HEBREW;
  608. } else if (isMongolian(c)) {
  609. return SCRIPT_MONGOLIAN;
  610. } else if (isGeorgian(c)) {
  611. return SCRIPT_GEORGIAN;
  612. } else if (isGurmukhi(c)) {
  613. return useV2IndicRules(SCRIPT_GURMUKHI);
  614. } else if (isDevanagari(c)) {
  615. return useV2IndicRules(SCRIPT_DEVANAGARI);
  616. } else if (isGujarati(c)) {
  617. return useV2IndicRules(SCRIPT_GUJARATI);
  618. } else if (isBengali(c)) {
  619. return useV2IndicRules(SCRIPT_BENGALI);
  620. } else if (isOriya(c)) {
  621. return useV2IndicRules(SCRIPT_ORIYA);
  622. } else if (isTibetan(c)) {
  623. return SCRIPT_TIBETAN;
  624. } else if (isTelugu(c)) {
  625. return useV2IndicRules(SCRIPT_TELUGU);
  626. } else if (isKannada(c)) {
  627. return useV2IndicRules(SCRIPT_KANNADA);
  628. } else if (isTamil(c)) {
  629. return useV2IndicRules(SCRIPT_TAMIL);
  630. } else if (isMalayalam(c)) {
  631. return useV2IndicRules(SCRIPT_MALAYALAM);
  632. } else if (isSinhalese(c)) {
  633. return SCRIPT_SINHALESE;
  634. } else if (isBurmese(c)) {
  635. return SCRIPT_BURMESE;
  636. } else if (isThai(c)) {
  637. return SCRIPT_THAI;
  638. } else if (isKhmer(c)) {
  639. return SCRIPT_KHMER;
  640. } else if (isLao(c)) {
  641. return SCRIPT_LAO;
  642. } else if (isEthiopic(c)) {
  643. return SCRIPT_ETHIOPIC;
  644. } else {
  645. return SCRIPT_UNDETERMINED;
  646. }
  647. }
  648. /**
  649. * Obtain the V2 indic script code corresponding to V1 indic script code SC if
  650. * and only iff V2 indic rules apply; otherwise return SC.
  651. * @param sc a V1 indic script code
  652. * @return either SC or the V2 flavor of SC if V2 indic rules apply
  653. */
  654. public static int useV2IndicRules(int sc) {
  655. if (USE_V2_INDIC) {
  656. return (sc < 1000) ? (sc + 1000) : sc;
  657. } else {
  658. return sc;
  659. }
  660. }
  661. /**
  662. * Obtain the script codes of each character in a character sequence. If script
  663. * is not or cannot be determined for some character, then the script code 998
  664. * ('zyyy') is returned.
  665. * @param cs the character sequence
  666. * @return a (possibly empty) array of script codes
  667. */
  668. public static int[] scriptsOf(CharSequence cs) {
  669. Set s = new HashSet();
  670. for (int i = 0, n = cs.length(); i < n; i++) {
  671. s.add(Integer.valueOf(scriptOf(cs.charAt(i))));
  672. }
  673. int[] sa = new int [ s.size() ];
  674. int ns = 0;
  675. for (Iterator it = s.iterator(); it.hasNext();) {
  676. sa [ ns++ ] = ((Integer) it.next()) .intValue();
  677. }
  678. Arrays.sort(sa);
  679. return sa;
  680. }
  681. /**
  682. * Determine the dominant script of a character sequence.
  683. * @param cs the character sequence
  684. * @return the dominant script or SCRIPT_UNDETERMINED
  685. */
  686. public static int dominantScript(CharSequence cs) {
  687. Map m = new HashMap();
  688. for (int i = 0, n = cs.length(); i < n; i++) {
  689. int c = cs.charAt(i);
  690. int s = scriptOf(c);
  691. Integer k = Integer.valueOf(s);
  692. Integer v = (Integer) m.get(k);
  693. if (v != null) {
  694. m.put(k, Integer.valueOf(v.intValue() + 1));
  695. } else {
  696. m.put(k, Integer.valueOf(0));
  697. }
  698. }
  699. int sMax = -1;
  700. int cMax = -1;
  701. for (Iterator it = m.entrySet().iterator(); it.hasNext();) {
  702. Map.Entry e = (Map.Entry) it.next();
  703. Integer k = (Integer) e.getKey();
  704. int s = k.intValue();
  705. switch (s) {
  706. case SCRIPT_UNDETERMINED:
  707. case SCRIPT_UNCODED:
  708. break;
  709. default:
  710. Integer v = (Integer) e.getValue();
  711. assert v != null;
  712. int c = v.intValue();
  713. if (c > cMax) {
  714. cMax = c;
  715. sMax = s;
  716. }
  717. break;
  718. }
  719. }
  720. if (sMax < 0) {
  721. sMax = SCRIPT_UNDETERMINED;
  722. }
  723. return sMax;
  724. }
  725. /**
  726. * Determine if script tag denotes an 'Indic' script, where a
  727. * script is an 'Indic' script if it is intended to be processed by
  728. * the generic 'Indic' Script Processor.
  729. * @param script a script tag
  730. * @return true if script tag is a designated 'Indic' script
  731. */
  732. public static boolean isIndicScript(String script) {
  733. return isIndicScript(scriptCodeFromTag(script));
  734. }
  735. /**
  736. * Determine if script tag denotes an 'Indic' script, where a
  737. * script is an 'Indic' script if it is intended to be processed by
  738. * the generic 'Indic' Script Processor.
  739. * @param script a script code
  740. * @return true if script code is a designated 'Indic' script
  741. */
  742. public static boolean isIndicScript(int script) {
  743. switch (script) {
  744. case SCRIPT_BENGALI:
  745. case SCRIPT_BENGALI_2:
  746. case SCRIPT_BURMESE:
  747. case SCRIPT_DEVANAGARI:
  748. case SCRIPT_DEVANAGARI_2:
  749. case SCRIPT_GUJARATI:
  750. case SCRIPT_GUJARATI_2:
  751. case SCRIPT_GURMUKHI:
  752. case SCRIPT_GURMUKHI_2:
  753. case SCRIPT_KANNADA:
  754. case SCRIPT_KANNADA_2:
  755. case SCRIPT_MALAYALAM:
  756. case SCRIPT_MALAYALAM_2:
  757. case SCRIPT_ORIYA:
  758. case SCRIPT_ORIYA_2:
  759. case SCRIPT_TAMIL:
  760. case SCRIPT_TAMIL_2:
  761. case SCRIPT_TELUGU:
  762. case SCRIPT_TELUGU_2:
  763. return true;
  764. default:
  765. return false;
  766. }
  767. }
  768. /**
  769. * Determine the script tag associated with an internal script code.
  770. * @param code the script code
  771. * @return a script tag
  772. */
  773. public static String scriptTagFromCode(int code) {
  774. Map<Integer, String> m = getScriptTagsMap();
  775. if (m != null) {
  776. String tag;
  777. if ((tag = m.get(Integer.valueOf(code))) != null) {
  778. return tag;
  779. } else {
  780. return "";
  781. }
  782. } else {
  783. return "";
  784. }
  785. }
  786. /**
  787. * Determine the internal script code associated with a script tag.
  788. * @param tag the script tag
  789. * @return a script code
  790. */
  791. public static int scriptCodeFromTag(String tag) {
  792. Map<String, Integer> m = getScriptCodeMap();
  793. if (m != null) {
  794. Integer c;
  795. if ((c = m.get(tag)) != null) {
  796. return (int) c;
  797. } else {
  798. return SCRIPT_UNDETERMINED;
  799. }
  800. } else {
  801. return SCRIPT_UNDETERMINED;
  802. }
  803. }
  /** Map from internal script code to script tag; null until makeScriptMaps() has populated it. */
  804. private static Map<Integer, String> scriptTagsMap;
  /** Map from script tag to internal script code; null until makeScriptMaps() has populated it. */
  805. private static Map<String, Integer> scriptCodeMap;
  806. private static void putScriptTag(Map tm, Map cm, int code, String tag) {
  807. assert tag != null;
  808. assert tag.length() != 0;
  809. assert code >= 0;
  810. assert code < 2000;
  811. tm.put(Integer.valueOf(code), tag);
  812. cm.put(tag, Integer.valueOf(code));
  813. }
  814. private static void makeScriptMaps() {
  815. HashMap<Integer, String> tm = new HashMap<Integer, String>();
  816. HashMap<String, Integer> cm = new HashMap<String, Integer>();
  817. putScriptTag(tm, cm, SCRIPT_HEBREW, "hebr");
  818. putScriptTag(tm, cm, SCRIPT_MONGOLIAN, "mong");
  819. putScriptTag(tm, cm, SCRIPT_ARABIC, "arab");
  820. putScriptTag(tm, cm, SCRIPT_GREEK, "grek");
  821. putScriptTag(tm, cm, SCRIPT_LATIN, "latn");
  822. putScriptTag(tm, cm, SCRIPT_CYRILLIC, "cyrl");
  823. putScriptTag(tm, cm, SCRIPT_GEORGIAN, "geor");
  824. putScriptTag(tm, cm, SCRIPT_BOPOMOFO, "bopo");
  825. putScriptTag(tm, cm, SCRIPT_HANGUL, "hang");
  826. putScriptTag(tm, cm, SCRIPT_GURMUKHI, "guru");
  827. putScriptTag(tm, cm, SCRIPT_GURMUKHI_2, "gur2");
  828. putScriptTag(tm, cm, SCRIPT_DEVANAGARI, "deva");
  829. putScriptTag(tm, cm, SCRIPT_DEVANAGARI_2, "dev2");
  830. putScriptTag(tm, cm, SCRIPT_GUJARATI, "gujr");
  831. putScriptTag(tm, cm, SCRIPT_GUJARATI_2, "gjr2");
  832. putScriptTag(tm, cm, SCRIPT_BENGALI, "beng");
  833. putScriptTag(tm, cm, SCRIPT_BENGALI_2, "bng2");
  834. putScriptTag(tm, cm, SCRIPT_ORIYA, "orya");
  835. putScriptTag(tm, cm, SCRIPT_ORIYA_2, "ory2");
  836. putScriptTag(tm, cm, SCRIPT_TIBETAN, "tibt");
  837. putScriptTag(tm, cm, SCRIPT_TELUGU, "telu");
  838. putScriptTag(tm, cm, SCRIPT_TELUGU_2, "tel2");
  839. putScriptTag(tm, cm, SCRIPT_KANNADA, "knda");
  840. putScriptTag(tm, cm, SCRIPT_KANNADA_2, "knd2");
  841. putScriptTag(tm, cm, SCRIPT_TAMIL, "taml");
  842. putScriptTag(tm, cm, SCRIPT_TAMIL_2, "tml2");
  843. putScriptTag(tm, cm, SCRIPT_MALAYALAM, "mlym");
  844. putScriptTag(tm, cm, SCRIPT_MALAYALAM_2, "mlm2");
  845. putScriptTag(tm, cm, SCRIPT_SINHALESE, "sinh");
  846. putScriptTag(tm, cm, SCRIPT_BURMESE, "mymr");
  847. putScriptTag(tm, cm, SCRIPT_THAI, "thai");
  848. putScriptTag(tm, cm, SCRIPT_KHMER, "khmr");
  849. putScriptTag(tm, cm, SCRIPT_LAO, "laoo");
  850. putScriptTag(tm, cm, SCRIPT_HIRAGANA, "hira");
  851. putScriptTag(tm, cm, SCRIPT_ETHIOPIC, "ethi");
  852. putScriptTag(tm, cm, SCRIPT_HAN, "hani");
  853. putScriptTag(tm, cm, SCRIPT_KATAKANA, "kana");
  854. putScriptTag(tm, cm, SCRIPT_MATH, "zmth");
  855. putScriptTag(tm, cm, SCRIPT_SYMBOL, "zsym");
  856. putScriptTag(tm, cm, SCRIPT_UNDETERMINED, "zyyy");
  857. putScriptTag(tm, cm, SCRIPT_UNCODED, "zzzz");
  858. scriptTagsMap = tm;
  859. scriptCodeMap = cm;
  860. }
  861. private static Map<Integer, String> getScriptTagsMap() {
  862. if (scriptTagsMap == null) {
  863. makeScriptMaps();
  864. }
  865. return scriptTagsMap;
  866. }
  867. private static Map<String, Integer> getScriptCodeMap() {
  868. if (scriptCodeMap == null) {
  869. makeScriptMaps();
  870. }
  871. return scriptCodeMap;
  872. }
  873. }