You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

CharScript.java 35KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /* $Id$ */
  18. package org.apache.fop.complexscripts.util;
  19. import java.util.Arrays;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.Map;
  23. import java.util.Set;
  24. import org.apache.fop.util.CharUtilities;
  25. /**
  26. * <p>Script related utilities.</p>
  27. *
  28. * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
  29. */
  30. public final class CharScript {
  31. // CSOFF: LineLength
  32. //
  33. // The following script codes are based on ISO 15924. Codes less than 1000 are
  34. // official assignments from 15924; those equal to or greater than 1000 are FOP
  35. // implementation specific.
  36. //
  37. /** hebrew script constant */
  38. public static final int SCRIPT_HEBREW = 125; // 'hebr'
  39. /** mongolian script constant */
  40. public static final int SCRIPT_MONGOLIAN = 145; // 'mong'
  41. /** arabic script constant */
  42. public static final int SCRIPT_ARABIC = 160; // 'arab'
  43. /** greek script constant */
  44. public static final int SCRIPT_GREEK = 200; // 'grek'
  45. /** latin script constant */
  46. public static final int SCRIPT_LATIN = 215; // 'latn'
  47. /** cyrillic script constant */
  48. public static final int SCRIPT_CYRILLIC = 220; // 'cyrl'
  49. /** georgian script constant */
  50. public static final int SCRIPT_GEORGIAN = 240; // 'geor'
  51. /** bopomofo script constant */
  52. public static final int SCRIPT_BOPOMOFO = 285; // 'bopo'
  53. /** hangul script constant */
  54. public static final int SCRIPT_HANGUL = 286; // 'hang'
  55. /** gurmukhi script constant */
  56. public static final int SCRIPT_GURMUKHI = 310; // 'guru'
  57. /** gurmukhi 2 script constant */
  58. public static final int SCRIPT_GURMUKHI_2 = 1310; // 'gur2' -- MSFT (pseudo) script tag for variant shaping semantics
  59. /** devanagari script constant */
  60. public static final int SCRIPT_DEVANAGARI = 315; // 'deva'
  61. /** devanagari 2 script constant */
  62. public static final int SCRIPT_DEVANAGARI_2 = 1315; // 'dev2' -- MSFT (pseudo) script tag for variant shaping semantics
  63. /** gujarati script constant */
  64. public static final int SCRIPT_GUJARATI = 320; // 'gujr'
  65. /** gujarati 2 script constant */
  66. public static final int SCRIPT_GUJARATI_2 = 1320; // 'gjr2' -- MSFT (pseudo) script tag for variant shaping semantics
  67. /** bengali script constant */
  68. public static final int SCRIPT_BENGALI = 326; // 'beng'
  69. /** bengali 2 script constant */
  70. public static final int SCRIPT_BENGALI_2 = 1326; // 'bng2' -- MSFT (pseudo) script tag for variant shaping semantics
  71. /** oriya script constant */
  72. public static final int SCRIPT_ORIYA = 327; // 'orya'
  73. /** oriya 2 script constant */
  74. public static final int SCRIPT_ORIYA_2 = 1327; // 'ory2' -- MSFT (pseudo) script tag for variant shaping semantics
  75. /** tibetan script constant */
  76. public static final int SCRIPT_TIBETAN = 330; // 'tibt'
  77. /** telugu script constant */
  78. public static final int SCRIPT_TELUGU = 340; // 'telu'
  79. /** telugu 2 script constant */
  80. public static final int SCRIPT_TELUGU_2 = 1340; // 'tel2' -- MSFT (pseudo) script tag for variant shaping semantics
  81. /** kannada script constant */
  82. public static final int SCRIPT_KANNADA = 345; // 'knda'
  83. /** kannada 2 script constant */
  84. public static final int SCRIPT_KANNADA_2 = 1345; // 'knd2' -- MSFT (pseudo) script tag for variant shaping semantics
  85. /** tamil script constant */
  86. public static final int SCRIPT_TAMIL = 346; // 'taml'
  87. /** tamil 2 script constant */
  88. public static final int SCRIPT_TAMIL_2 = 1346; // 'tml2' -- MSFT (pseudo) script tag for variant shaping semantics
  89. /** malayalam script constant */
  90. public static final int SCRIPT_MALAYALAM = 347; // 'mlym'
  91. /** malayalam 2 script constant */
  92. public static final int SCRIPT_MALAYALAM_2 = 1347; // 'mlm2' -- MSFT (pseudo) script tag for variant shaping semantics
  93. /** sinhalese script constant */
  94. public static final int SCRIPT_SINHALESE = 348; // 'sinh'
  95. /** burmese script constant */
  96. public static final int SCRIPT_BURMESE = 350; // 'mymr'
  97. /** thai script constant */
  98. public static final int SCRIPT_THAI = 352; // 'thai'
  99. /** khmer script constant */
  100. public static final int SCRIPT_KHMER = 355; // 'khmr'
  101. /** lao script constant */
  102. public static final int SCRIPT_LAO = 356; // 'laoo'
  103. /** hiragana script constant */
  104. public static final int SCRIPT_HIRAGANA = 410; // 'hira'
  105. /** ethiopic script constant */
  106. public static final int SCRIPT_ETHIOPIC = 430; // 'ethi'
  107. /** han script constant */
  108. public static final int SCRIPT_HAN = 500; // 'hani'
  109. /** katakana script constant */
  110. public static final int SCRIPT_KATAKANA = 410; // 'kana'
  111. /** math script constant */
  112. public static final int SCRIPT_MATH = 995; // 'zmth'
  113. /** symbol script constant */
  114. public static final int SCRIPT_SYMBOL = 996; // 'zsym'
  115. /** undetermined script constant */
  116. public static final int SCRIPT_UNDETERMINED = 998; // 'zyyy'
  117. /** uncoded script constant */
  118. public static final int SCRIPT_UNCODED = 999; // 'zzzz'
  119. /**
  120. * A static (class) parameter indicating whether V2 indic shaping
  121. * rules apply or not, with default being <code>true</code>.
  122. */
  123. private static final boolean USE_V2_INDIC = true;
  /** Not instantiable: the constructor is private and this chunk exposes only static members. */
  private CharScript() {
      // intentionally empty
  }
  126. /**
  127. * Determine if character c is punctuation.
  128. * @param c a character represented as a unicode scalar value
  129. * @return true if character is punctuation
  130. */
  131. public static boolean isPunctuation(int c) {
  132. if ((c >= 0x0021) && (c <= 0x002F)) { // basic latin punctuation
  133. return true;
  134. } else if ((c >= 0x003A) && (c <= 0x0040)) { // basic latin punctuation
  135. return true;
  136. } else if ((c >= 0x005F) && (c <= 0x0060)) { // basic latin punctuation
  137. return true;
  138. } else if ((c >= 0x007E) && (c <= 0x007E)) { // basic latin punctuation
  139. return true;
  140. } else if ((c >= 0x00A1) && (c <= 0x00BF)) { // latin supplement punctuation
  141. return true;
  142. } else if ((c >= 0x00D7) && (c <= 0x00D7)) { // latin supplement punctuation
  143. return true;
  144. } else if ((c >= 0x00F7) && (c <= 0x00F7)) { // latin supplement punctuation
  145. return true;
  146. } else if ((c >= 0x2000) && (c <= 0x206F)) { // general punctuation
  147. return true;
  148. } else { // [TBD] - not complete
  149. return false;
  150. }
  151. }
  152. /**
  153. * Determine if character c is a digit.
  154. * @param c a character represented as a unicode scalar value
  155. * @return true if character is a digit
  156. */
  157. public static boolean isDigit(int c) {
  158. if ((c >= 0x0030) && (c <= 0x0039)) { // basic latin digits
  159. return true;
  160. } else { // [TBD] - not complete
  161. return false;
  162. }
  163. }
  164. /**
  165. * Determine if character c belong to the hebrew script.
  166. * @param c a character represented as a unicode scalar value
  167. * @return true if character belongs to hebrew script
  168. */
  169. public static boolean isHebrew(int c) {
  170. if ((c >= 0x0590) && (c <= 0x05FF)) { // hebrew block
  171. return true;
  172. } else if ((c >= 0xFB00) && (c <= 0xFB4F)) { // hebrew presentation forms block
  173. return true;
  174. } else {
  175. return false;
  176. }
  177. }
  178. /**
  179. * Determine if character c belong to the mongolian script.
  180. * @param c a character represented as a unicode scalar value
  181. * @return true if character belongs to mongolian script
  182. */
  183. public static boolean isMongolian(int c) {
  184. if ((c >= 0x1800) && (c <= 0x18AF)) { // mongolian block
  185. return true;
  186. } else {
  187. return false;
  188. }
  189. }
  190. /**
  191. * Determine if character c belong to the arabic script.
  192. * @param c a character represented as a unicode scalar value
  193. * @return true if character belongs to arabic script
  194. */
  195. public static boolean isArabic(int c) {
  196. if ((c >= 0x0600) && (c <= 0x06FF)) { // arabic block
  197. return true;
  198. } else if ((c >= 0x0750) && (c <= 0x077F)) { // arabic supplement block
  199. return true;
  200. } else if ((c >= 0xFB50) && (c <= 0xFDFF)) { // arabic presentation forms a block
  201. return true;
  202. } else if ((c >= 0xFE70) && (c <= 0xFEFF)) { // arabic presentation forms b block
  203. return true;
  204. } else {
  205. return false;
  206. }
  207. }
  208. /**
  209. * Determine if character c belong to the greek script.
  210. * @param c a character represented as a unicode scalar value
  211. * @return true if character belongs to greek script
  212. */
  213. public static boolean isGreek(int c) {
  214. if ((c >= 0x0370) && (c <= 0x03FF)) { // greek (and coptic) block
  215. return true;
  216. } else if ((c >= 0x1F00) && (c <= 0x1FFF)) { // greek extended block
  217. return true;
  218. } else {
  219. return false;
  220. }
  221. }
  222. /**
  223. * Determine if character c belong to the latin script.
  224. * @param c a character represented as a unicode scalar value
  225. * @return true if character belongs to latin script
  226. */
  227. public static boolean isLatin(int c) {
  228. if ((c >= 0x0041) && (c <= 0x005A)) { // basic latin upper case
  229. return true;
  230. } else if ((c >= 0x0061) && (c <= 0x007A)) { // basic latin lower case
  231. return true;
  232. } else if ((c >= 0x00C0) && (c <= 0x00D6)) { // latin supplement upper case
  233. return true;
  234. } else if ((c >= 0x00D8) && (c <= 0x00DF)) { // latin supplement upper case
  235. return true;
  236. } else if ((c >= 0x00E0) && (c <= 0x00F6)) { // latin supplement lower case
  237. return true;
  238. } else if ((c >= 0x00F8) && (c <= 0x00FF)) { // latin supplement lower case
  239. return true;
  240. } else if ((c >= 0x0100) && (c <= 0x017F)) { // latin extended a
  241. return true;
  242. } else if ((c >= 0x0180) && (c <= 0x024F)) { // latin extended b
  243. return true;
  244. } else if ((c >= 0x1E00) && (c <= 0x1EFF)) { // latin extended additional
  245. return true;
  246. } else if ((c >= 0x2C60) && (c <= 0x2C7F)) { // latin extended c
  247. return true;
  248. } else if ((c >= 0xA720) && (c <= 0xA7FF)) { // latin extended d
  249. return true;
  250. } else if ((c >= 0xFB00) && (c <= 0xFB0F)) { // latin ligatures
  251. return true;
  252. } else {
  253. return false;
  254. }
  255. }
  256. /**
  257. * Determine if character c belong to the cyrillic script.
  258. * @param c a character represented as a unicode scalar value
  259. * @return true if character belongs to cyrillic script
  260. */
  261. public static boolean isCyrillic(int c) {
  262. if ((c >= 0x0400) && (c <= 0x04FF)) { // cyrillic block
  263. return true;
  264. } else if ((c >= 0x0500) && (c <= 0x052F)) { // cyrillic supplement block
  265. return true;
  266. } else if ((c >= 0x2DE0) && (c <= 0x2DFF)) { // cyrillic extended-a block
  267. return true;
  268. } else if ((c >= 0xA640) && (c <= 0xA69F)) { // cyrillic extended-b block
  269. return true;
  270. } else {
  271. return false;
  272. }
  273. }
  274. /**
  275. * Determine if character c belong to the georgian script.
  276. * @param c a character represented as a unicode scalar value
  277. * @return true if character belongs to georgian script
  278. */
  279. public static boolean isGeorgian(int c) {
  280. if ((c >= 0x10A0) && (c <= 0x10FF)) { // georgian block
  281. return true;
  282. } else if ((c >= 0x2D00) && (c <= 0x2D2F)) { // georgian supplement block
  283. return true;
  284. } else {
  285. return false;
  286. }
  287. }
  288. /**
  289. * Determine if character c belong to the hangul script.
  290. * @param c a character represented as a unicode scalar value
  291. * @return true if character belongs to hangul script
  292. */
  293. public static boolean isHangul(int c) {
  294. if ((c >= 0x1100) && (c <= 0x11FF)) { // hangul jamo
  295. return true;
  296. } else if ((c >= 0x3130) && (c <= 0x318F)) { // hangul compatibility jamo
  297. return true;
  298. } else if ((c >= 0xA960) && (c <= 0xA97F)) { // hangul jamo extended a
  299. return true;
  300. } else if ((c >= 0xAC00) && (c <= 0xD7A3)) { // hangul syllables
  301. return true;
  302. } else if ((c >= 0xD7B0) && (c <= 0xD7FF)) { // hangul jamo extended a
  303. return true;
  304. } else {
  305. return false;
  306. }
  307. }
  308. /**
  309. * Determine if character c belong to the gurmukhi script.
  310. * @param c a character represented as a unicode scalar value
  311. * @return true if character belongs to gurmukhi script
  312. */
  313. public static boolean isGurmukhi(int c) {
  314. if ((c >= 0x0A00) && (c <= 0x0A7F)) { // gurmukhi block
  315. return true;
  316. } else {
  317. return false;
  318. }
  319. }
  320. /**
  321. * Determine if character c belong to the devanagari script.
  322. * @param c a character represented as a unicode scalar value
  323. * @return true if character belongs to devanagari script
  324. */
  325. public static boolean isDevanagari(int c) {
  326. if ((c >= 0x0900) && (c <= 0x097F)) { // devangari block
  327. return true;
  328. } else if ((c >= 0xA8E0) && (c <= 0xA8FF)) { // devangari extended block
  329. return true;
  330. } else {
  331. return false;
  332. }
  333. }
  334. /**
  335. * Determine if character c belong to the gujarati script.
  336. * @param c a character represented as a unicode scalar value
  337. * @return true if character belongs to gujarati script
  338. */
  339. public static boolean isGujarati(int c) {
  340. if ((c >= 0x0A80) && (c <= 0x0AFF)) { // gujarati block
  341. return true;
  342. } else {
  343. return false;
  344. }
  345. }
  346. /**
  347. * Determine if character c belong to the bengali script.
  348. * @param c a character represented as a unicode scalar value
  349. * @return true if character belongs to bengali script
  350. */
  351. public static boolean isBengali(int c) {
  352. if ((c >= 0x0980) && (c <= 0x09FF)) { // bengali block
  353. return true;
  354. } else {
  355. return false;
  356. }
  357. }
  358. /**
  359. * Determine if character c belong to the oriya script.
  360. * @param c a character represented as a unicode scalar value
  361. * @return true if character belongs to oriya script
  362. */
  363. public static boolean isOriya(int c) {
  364. if ((c >= 0x0B00) && (c <= 0x0B7F)) { // oriya block
  365. return true;
  366. } else {
  367. return false;
  368. }
  369. }
  370. /**
  371. * Determine if character c belong to the tibetan script.
  372. * @param c a character represented as a unicode scalar value
  373. * @return true if character belongs to tibetan script
  374. */
  375. public static boolean isTibetan(int c) {
  376. if ((c >= 0x0F00) && (c <= 0x0FFF)) { // tibetan block
  377. return true;
  378. } else {
  379. return false;
  380. }
  381. }
  382. /**
  383. * Determine if character c belong to the telugu script.
  384. * @param c a character represented as a unicode scalar value
  385. * @return true if character belongs to telugu script
  386. */
  387. public static boolean isTelugu(int c) {
  388. if ((c >= 0x0C00) && (c <= 0x0C7F)) { // telugu block
  389. return true;
  390. } else {
  391. return false;
  392. }
  393. }
  394. /**
  395. * Determine if character c belong to the kannada script.
  396. * @param c a character represented as a unicode scalar value
  397. * @return true if character belongs to kannada script
  398. */
  399. public static boolean isKannada(int c) {
  400. if ((c >= 0x0C00) && (c <= 0x0C7F)) { // kannada block
  401. return true;
  402. } else {
  403. return false;
  404. }
  405. }
  406. /**
  407. * Determine if character c belong to the tamil script.
  408. * @param c a character represented as a unicode scalar value
  409. * @return true if character belongs to tamil script
  410. */
  411. public static boolean isTamil(int c) {
  412. if ((c >= 0x0B80) && (c <= 0x0BFF)) { // tamil block
  413. return true;
  414. } else {
  415. return false;
  416. }
  417. }
  418. /**
  419. * Determine if character c belong to the malayalam script.
  420. * @param c a character represented as a unicode scalar value
  421. * @return true if character belongs to malayalam script
  422. */
  423. public static boolean isMalayalam(int c) {
  424. if ((c >= 0x0D00) && (c <= 0x0D7F)) { // malayalam block
  425. return true;
  426. } else {
  427. return false;
  428. }
  429. }
  430. /**
  431. * Determine if character c belong to the sinhalese script.
  432. * @param c a character represented as a unicode scalar value
  433. * @return true if character belongs to sinhalese script
  434. */
  435. public static boolean isSinhalese(int c) {
  436. if ((c >= 0x0D80) && (c <= 0x0DFF)) { // sinhala block
  437. return true;
  438. } else {
  439. return false;
  440. }
  441. }
  442. /**
  443. * Determine if character c belong to the burmese script.
  444. * @param c a character represented as a unicode scalar value
  445. * @return true if character belongs to burmese script
  446. */
  447. public static boolean isBurmese(int c) {
  448. if ((c >= 0x1000) && (c <= 0x109F)) { // burmese (myanmar) block
  449. return true;
  450. } else if ((c >= 0xAA60) && (c <= 0xAA7F)) { // burmese (myanmar) extended block
  451. return true;
  452. } else {
  453. return false;
  454. }
  455. }
  456. /**
  457. * Determine if character c belong to the thai script.
  458. * @param c a character represented as a unicode scalar value
  459. * @return true if character belongs to thai script
  460. */
  461. public static boolean isThai(int c) {
  462. if ((c >= 0x0E00) && (c <= 0x0E7F)) { // thai block
  463. return true;
  464. } else {
  465. return false;
  466. }
  467. }
  468. /**
  469. * Determine if character c belong to the khmer script.
  470. * @param c a character represented as a unicode scalar value
  471. * @return true if character belongs to khmer script
  472. */
  473. public static boolean isKhmer(int c) {
  474. if ((c >= 0x1780) && (c <= 0x17FF)) { // khmer block
  475. return true;
  476. } else if ((c >= 0x19E0) && (c <= 0x19FF)) { // khmer symbols block
  477. return true;
  478. } else {
  479. return false;
  480. }
  481. }
  482. /**
  483. * Determine if character c belong to the lao script.
  484. * @param c a character represented as a unicode scalar value
  485. * @return true if character belongs to lao script
  486. */
  487. public static boolean isLao(int c) {
  488. if ((c >= 0x0E80) && (c <= 0x0EFF)) { // lao block
  489. return true;
  490. } else {
  491. return false;
  492. }
  493. }
  494. /**
  495. * Determine if character c belong to the ethiopic (amharic) script.
  496. * @param c a character represented as a unicode scalar value
  497. * @return true if character belongs to ethiopic (amharic) script
  498. */
  499. public static boolean isEthiopic(int c) {
  500. if ((c >= 0x1200) && (c <= 0x137F)) { // ethiopic block
  501. return true;
  502. } else if ((c >= 0x1380) && (c <= 0x139F)) { // ethoipic supplement block
  503. return true;
  504. } else if ((c >= 0x2D80) && (c <= 0x2DDF)) { // ethoipic extended block
  505. return true;
  506. } else if ((c >= 0xAB00) && (c <= 0xAB2F)) { // ethoipic extended-a block
  507. return true;
  508. } else {
  509. return false;
  510. }
  511. }
  512. /**
  513. * Determine if character c belong to the han (unified cjk) script.
  514. * @param c a character represented as a unicode scalar value
  515. * @return true if character belongs to han (unified cjk) script
  516. */
  517. public static boolean isHan(int c) {
  518. if ((c >= 0x3400) && (c <= 0x4DBF)) {
  519. return true; // cjk unified ideographs extension a
  520. } else if ((c >= 0x4E00) && (c <= 0x9FFF)) {
  521. return true; // cjk unified ideographs
  522. } else if ((c >= 0xF900) && (c <= 0xFAFF)) {
  523. return true; // cjk compatibility ideographs
  524. } else if ((c >= 0x20000) && (c <= 0x2A6DF)) {
  525. return true; // cjk unified ideographs extension b
  526. } else if ((c >= 0x2A700) && (c <= 0x2B73F)) {
  527. return true; // cjk unified ideographs extension c
  528. } else if ((c >= 0x2F800) && (c <= 0x2FA1F)) {
  529. return true; // cjk compatibility ideographs supplement
  530. } else {
  531. return false;
  532. }
  533. }
  534. /**
  535. * Determine if character c belong to the bopomofo script.
  536. * @param c a character represented as a unicode scalar value
  537. * @return true if character belongs to bopomofo script
  538. */
  539. public static boolean isBopomofo(int c) {
  540. if ((c >= 0x3100) && (c <= 0x312F)) {
  541. return true;
  542. } else {
  543. return false;
  544. }
  545. }
  546. /**
  547. * Determine if character c belong to the hiragana script.
  548. * @param c a character represented as a unicode scalar value
  549. * @return true if character belongs to hiragana script
  550. */
  551. public static boolean isHiragana(int c) {
  552. if ((c >= 0x3040) && (c <= 0x309F)) {
  553. return true;
  554. } else {
  555. return false;
  556. }
  557. }
  558. /**
  559. * Determine if character c belong to the katakana script.
  560. * @param c a character represented as a unicode scalar value
  561. * @return true if character belongs to katakana script
  562. */
  563. public static boolean isKatakana(int c) {
  564. if ((c >= 0x30A0) && (c <= 0x30FF)) {
  565. return true;
  566. } else if ((c >= 0x31F0) && (c <= 0x31FF)) {
  567. return true;
  568. } else {
  569. return false;
  570. }
  571. }
  572. /**
  573. * Obtain ISO15924 numeric script code of character. If script is not or cannot be determined,
  574. * then the script code 998 ('zyyy') is returned.
  575. * @param c the character to obtain script
  576. * @return an ISO15924 script code
  577. */
  578. public static int scriptOf(int c) { // [TBD] - needs optimization!!!
  579. if (CharUtilities.isAnySpace(c)) {
  580. return SCRIPT_UNDETERMINED;
  581. } else if (isPunctuation(c)) {
  582. return SCRIPT_UNDETERMINED;
  583. } else if (isDigit(c)) {
  584. return SCRIPT_UNDETERMINED;
  585. } else if (isLatin(c)) {
  586. return SCRIPT_LATIN;
  587. } else if (isCyrillic(c)) {
  588. return SCRIPT_CYRILLIC;
  589. } else if (isGreek(c)) {
  590. return SCRIPT_GREEK;
  591. } else if (isHan(c)) {
  592. return SCRIPT_HAN;
  593. } else if (isBopomofo(c)) {
  594. return SCRIPT_BOPOMOFO;
  595. } else if (isKatakana(c)) {
  596. return SCRIPT_KATAKANA;
  597. } else if (isHiragana(c)) {
  598. return SCRIPT_HIRAGANA;
  599. } else if (isHangul(c)) {
  600. return SCRIPT_HANGUL;
  601. } else if (isArabic(c)) {
  602. return SCRIPT_ARABIC;
  603. } else if (isHebrew(c)) {
  604. return SCRIPT_HEBREW;
  605. } else if (isMongolian(c)) {
  606. return SCRIPT_MONGOLIAN;
  607. } else if (isGeorgian(c)) {
  608. return SCRIPT_GEORGIAN;
  609. } else if (isGurmukhi(c)) {
  610. return useV2IndicRules(SCRIPT_GURMUKHI);
  611. } else if (isDevanagari(c)) {
  612. return useV2IndicRules(SCRIPT_DEVANAGARI);
  613. } else if (isGujarati(c)) {
  614. return useV2IndicRules(SCRIPT_GUJARATI);
  615. } else if (isBengali(c)) {
  616. return useV2IndicRules(SCRIPT_BENGALI);
  617. } else if (isOriya(c)) {
  618. return useV2IndicRules(SCRIPT_ORIYA);
  619. } else if (isTibetan(c)) {
  620. return SCRIPT_TIBETAN;
  621. } else if (isTelugu(c)) {
  622. return useV2IndicRules(SCRIPT_TELUGU);
  623. } else if (isKannada(c)) {
  624. return useV2IndicRules(SCRIPT_KANNADA);
  625. } else if (isTamil(c)) {
  626. return useV2IndicRules(SCRIPT_TAMIL);
  627. } else if (isMalayalam(c)) {
  628. return useV2IndicRules(SCRIPT_MALAYALAM);
  629. } else if (isSinhalese(c)) {
  630. return SCRIPT_SINHALESE;
  631. } else if (isBurmese(c)) {
  632. return SCRIPT_BURMESE;
  633. } else if (isThai(c)) {
  634. return SCRIPT_THAI;
  635. } else if (isKhmer(c)) {
  636. return SCRIPT_KHMER;
  637. } else if (isLao(c)) {
  638. return SCRIPT_LAO;
  639. } else if (isEthiopic(c)) {
  640. return SCRIPT_ETHIOPIC;
  641. } else {
  642. return SCRIPT_UNDETERMINED;
  643. }
  644. }
  645. /**
  646. * Obtain the V2 indic script code corresponding to V1 indic script code SC if
  647. * and only iff V2 indic rules apply; otherwise return SC.
  648. * @param sc a V1 indic script code
  649. * @return either SC or the V2 flavor of SC if V2 indic rules apply
  650. */
  651. public static int useV2IndicRules(int sc) {
  652. if (USE_V2_INDIC) {
  653. return (sc < 1000) ? (sc + 1000) : sc;
  654. } else {
  655. return sc;
  656. }
  657. }
  658. /**
  659. * Obtain the script codes of each character in a character sequence. If script
  660. * is not or cannot be determined for some character, then the script code 998
  661. * ('zyyy') is returned.
  662. * @param cs the character sequence
  663. * @return a (possibly empty) array of script codes
  664. */
  665. public static int[] scriptsOf(CharSequence cs) {
  666. Set s = new HashSet();
  667. for (int i = 0, n = cs.length(); i < n; i++) {
  668. s.add(scriptOf(cs.charAt(i)));
  669. }
  670. int[] sa = new int [ s.size() ];
  671. int ns = 0;
  672. for (Object value : s) {
  673. sa[ns++] = (Integer) value;
  674. }
  675. Arrays.sort(sa);
  676. return sa;
  677. }
  678. /**
  679. * Determine the dominant script of a character sequence.
  680. * @param cs the character sequence
  681. * @return the dominant script or SCRIPT_UNDETERMINED
  682. */
  683. public static int dominantScript(CharSequence cs) {
  684. Map m = new HashMap();
  685. for (int i = 0, n = cs.length(); i < n; i++) {
  686. int c = cs.charAt(i);
  687. int s = scriptOf(c);
  688. Integer k = s;
  689. Integer v = (Integer) m.get(k);
  690. if (v != null) {
  691. m.put(k, v + 1);
  692. } else {
  693. m.put(k, 0);
  694. }
  695. }
  696. int sMax = -1;
  697. int cMax = -1;
  698. for (Object o : m.entrySet()) {
  699. Map.Entry e = (Map.Entry) o;
  700. Integer k = (Integer) e.getKey();
  701. int s = k;
  702. switch (s) {
  703. case SCRIPT_UNDETERMINED:
  704. case SCRIPT_UNCODED:
  705. break;
  706. default:
  707. Integer v = (Integer) e.getValue();
  708. assert v != null;
  709. int c = v;
  710. if (c > cMax) {
  711. cMax = c;
  712. sMax = s;
  713. }
  714. break;
  715. }
  716. }
  717. if (sMax < 0) {
  718. sMax = SCRIPT_UNDETERMINED;
  719. }
  720. return sMax;
  721. }
  722. /**
  723. * Determine if script tag denotes an 'Indic' script, where a
  724. * script is an 'Indic' script if it is intended to be processed by
  725. * the generic 'Indic' Script Processor.
  726. * @param script a script tag
  727. * @return true if script tag is a designated 'Indic' script
  728. */
  729. public static boolean isIndicScript(String script) {
  730. return isIndicScript(scriptCodeFromTag(script));
  731. }
  732. /**
  733. * Determine if script tag denotes an 'Indic' script, where a
  734. * script is an 'Indic' script if it is intended to be processed by
  735. * the generic 'Indic' Script Processor.
  736. * @param script a script code
  737. * @return true if script code is a designated 'Indic' script
  738. */
  739. public static boolean isIndicScript(int script) {
  740. switch (script) {
  741. case SCRIPT_BENGALI:
  742. case SCRIPT_BENGALI_2:
  743. case SCRIPT_BURMESE:
  744. case SCRIPT_DEVANAGARI:
  745. case SCRIPT_DEVANAGARI_2:
  746. case SCRIPT_GUJARATI:
  747. case SCRIPT_GUJARATI_2:
  748. case SCRIPT_GURMUKHI:
  749. case SCRIPT_GURMUKHI_2:
  750. case SCRIPT_KANNADA:
  751. case SCRIPT_KANNADA_2:
  752. case SCRIPT_MALAYALAM:
  753. case SCRIPT_MALAYALAM_2:
  754. case SCRIPT_ORIYA:
  755. case SCRIPT_ORIYA_2:
  756. case SCRIPT_TAMIL:
  757. case SCRIPT_TAMIL_2:
  758. case SCRIPT_TELUGU:
  759. case SCRIPT_TELUGU_2:
  760. case SCRIPT_KHMER:
  761. return true;
  762. default:
  763. return false;
  764. }
  765. }
  766. /**
  767. * Determine the script tag associated with an internal script code.
  768. * @param code the script code
  769. * @return a script tag
  770. */
  771. public static String scriptTagFromCode(int code) {
  772. Map<Integer, String> m = getScriptTagsMap();
  773. if (m != null) {
  774. String tag;
  775. if ((tag = m.get(code)) != null) {
  776. return tag;
  777. } else {
  778. return "";
  779. }
  780. } else {
  781. return "";
  782. }
  783. }
  784. /**
  785. * Determine the internal script code associated with a script tag.
  786. * @param tag the script tag
  787. * @return a script code
  788. */
  789. public static int scriptCodeFromTag(String tag) {
  790. Map<String, Integer> m = getScriptCodeMap();
  791. if (m != null) {
  792. Integer c;
  793. if ((c = m.get(tag)) != null) {
  794. return (int) c;
  795. } else {
  796. return SCRIPT_UNDETERMINED;
  797. }
  798. } else {
  799. return SCRIPT_UNDETERMINED;
  800. }
  801. }
  /** Script code to script tag lookup; null until makeScriptMaps() has assigned it. */
  802. private static Map<Integer, String> scriptTagsMap;
  /** Script tag to script code lookup; null until makeScriptMaps() has assigned it. */
  803. private static Map<String, Integer> scriptCodeMap;
  804. private static void putScriptTag(Map tm, Map cm, int code, String tag) {
  805. assert tag != null;
  806. assert tag.length() != 0;
  807. assert code >= 0;
  808. assert code < 2000;
  809. tm.put(code, tag);
  810. cm.put(tag, code);
  811. }
  812. private static void makeScriptMaps() {
  813. HashMap<Integer, String> tm = new HashMap<Integer, String>();
  814. HashMap<String, Integer> cm = new HashMap<String, Integer>();
  815. putScriptTag(tm, cm, SCRIPT_HEBREW, "hebr");
  816. putScriptTag(tm, cm, SCRIPT_MONGOLIAN, "mong");
  817. putScriptTag(tm, cm, SCRIPT_ARABIC, "arab");
  818. putScriptTag(tm, cm, SCRIPT_GREEK, "grek");
  819. putScriptTag(tm, cm, SCRIPT_LATIN, "latn");
  820. putScriptTag(tm, cm, SCRIPT_CYRILLIC, "cyrl");
  821. putScriptTag(tm, cm, SCRIPT_GEORGIAN, "geor");
  822. putScriptTag(tm, cm, SCRIPT_BOPOMOFO, "bopo");
  823. putScriptTag(tm, cm, SCRIPT_HANGUL, "hang");
  824. putScriptTag(tm, cm, SCRIPT_GURMUKHI, "guru");
  825. putScriptTag(tm, cm, SCRIPT_GURMUKHI_2, "gur2");
  826. putScriptTag(tm, cm, SCRIPT_DEVANAGARI, "deva");
  827. putScriptTag(tm, cm, SCRIPT_DEVANAGARI_2, "dev2");
  828. putScriptTag(tm, cm, SCRIPT_GUJARATI, "gujr");
  829. putScriptTag(tm, cm, SCRIPT_GUJARATI_2, "gjr2");
  830. putScriptTag(tm, cm, SCRIPT_BENGALI, "beng");
  831. putScriptTag(tm, cm, SCRIPT_BENGALI_2, "bng2");
  832. putScriptTag(tm, cm, SCRIPT_ORIYA, "orya");
  833. putScriptTag(tm, cm, SCRIPT_ORIYA_2, "ory2");
  834. putScriptTag(tm, cm, SCRIPT_TIBETAN, "tibt");
  835. putScriptTag(tm, cm, SCRIPT_TELUGU, "telu");
  836. putScriptTag(tm, cm, SCRIPT_TELUGU_2, "tel2");
  837. putScriptTag(tm, cm, SCRIPT_KANNADA, "knda");
  838. putScriptTag(tm, cm, SCRIPT_KANNADA_2, "knd2");
  839. putScriptTag(tm, cm, SCRIPT_TAMIL, "taml");
  840. putScriptTag(tm, cm, SCRIPT_TAMIL_2, "tml2");
  841. putScriptTag(tm, cm, SCRIPT_MALAYALAM, "mlym");
  842. putScriptTag(tm, cm, SCRIPT_MALAYALAM_2, "mlm2");
  843. putScriptTag(tm, cm, SCRIPT_SINHALESE, "sinh");
  844. putScriptTag(tm, cm, SCRIPT_BURMESE, "mymr");
  845. putScriptTag(tm, cm, SCRIPT_THAI, "thai");
  846. putScriptTag(tm, cm, SCRIPT_KHMER, "khmr");
  847. putScriptTag(tm, cm, SCRIPT_LAO, "laoo");
  848. putScriptTag(tm, cm, SCRIPT_HIRAGANA, "hira");
  849. putScriptTag(tm, cm, SCRIPT_ETHIOPIC, "ethi");
  850. putScriptTag(tm, cm, SCRIPT_HAN, "hani");
  851. putScriptTag(tm, cm, SCRIPT_KATAKANA, "kana");
  852. putScriptTag(tm, cm, SCRIPT_MATH, "zmth");
  853. putScriptTag(tm, cm, SCRIPT_SYMBOL, "zsym");
  854. putScriptTag(tm, cm, SCRIPT_UNDETERMINED, "zyyy");
  855. putScriptTag(tm, cm, SCRIPT_UNCODED, "zzzz");
  856. scriptTagsMap = tm;
  857. scriptCodeMap = cm;
  858. }
  859. private static Map<Integer, String> getScriptTagsMap() {
  860. if (scriptTagsMap == null) {
  861. makeScriptMaps();
  862. }
  863. return scriptTagsMap;
  864. }
  865. private static Map<String, Integer> getScriptCodeMap() {
  866. if (scriptCodeMap == null) {
  867. makeScriptMaps();
  868. }
  869. return scriptCodeMap;
  870. }
  871. }