12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926 |
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /* $Id$ */
-
- package org.apache.fop.complexscripts.util;
-
- import java.util.Arrays;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.Iterator;
- import java.util.Map;
- import java.util.Set;
-
- import org.apache.fop.util.CharUtilities;
-
- /**
- * <p>Script related utilities.</p>
- *
- * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
- */
- public final class CharScript {
-
- // CSOFF: LineLength
-
- //
- // The following script codes are based on ISO 15924. Codes less than 1000 are
- // official assignments from 15924; those equal to or greater than 1000 are FOP
- // implementation specific.
- //
- /** hebrew script constant */
- public static final int SCRIPT_HEBREW = 125; // 'hebr'
- /** mongolian script constant */
- public static final int SCRIPT_MONGOLIAN = 145; // 'mong'
- /** arabic script constant */
- public static final int SCRIPT_ARABIC = 160; // 'arab'
- /** greek script constant */
- public static final int SCRIPT_GREEK = 200; // 'grek'
- /** latin script constant */
- public static final int SCRIPT_LATIN = 215; // 'latn'
- /** cyrillic script constant */
- public static final int SCRIPT_CYRILLIC = 220; // 'cyrl'
- /** georgian script constant */
- public static final int SCRIPT_GEORGIAN = 240; // 'geor'
- /** bopomofo script constant */
- public static final int SCRIPT_BOPOMOFO = 285; // 'bopo'
- /** hangul script constant */
- public static final int SCRIPT_HANGUL = 286; // 'hang'
- /** gurmukhi script constant */
- public static final int SCRIPT_GURMUKHI = 310; // 'guru'
- /** gurmukhi 2 script constant */
- public static final int SCRIPT_GURMUKHI_2 = 1310; // 'gur2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** devanagari script constant */
- public static final int SCRIPT_DEVANAGARI = 315; // 'deva'
- /** devanagari 2 script constant */
- public static final int SCRIPT_DEVANAGARI_2 = 1315; // 'dev2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** gujarati script constant */
- public static final int SCRIPT_GUJARATI = 320; // 'gujr'
- /** gujarati 2 script constant */
- public static final int SCRIPT_GUJARATI_2 = 1320; // 'gjr2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** bengali script constant */
- public static final int SCRIPT_BENGALI = 326; // 'beng'
- /** bengali 2 script constant */
- public static final int SCRIPT_BENGALI_2 = 1326; // 'bng2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** oriya script constant */
- public static final int SCRIPT_ORIYA = 327; // 'orya'
- /** oriya 2 script constant */
- public static final int SCRIPT_ORIYA_2 = 1327; // 'ory2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** tibetan script constant */
- public static final int SCRIPT_TIBETAN = 330; // 'tibt'
- /** telugu script constant */
- public static final int SCRIPT_TELUGU = 340; // 'telu'
- /** telugu 2 script constant */
- public static final int SCRIPT_TELUGU_2 = 1340; // 'tel2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** kannada script constant */
- public static final int SCRIPT_KANNADA = 345; // 'knda'
- /** kannada 2 script constant */
- public static final int SCRIPT_KANNADA_2 = 1345; // 'knd2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** tamil script constant */
- public static final int SCRIPT_TAMIL = 346; // 'taml'
- /** tamil 2 script constant */
- public static final int SCRIPT_TAMIL_2 = 1346; // 'tml2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** malayalam script constant */
- public static final int SCRIPT_MALAYALAM = 347; // 'mlym'
- /** malayalam 2 script constant */
- public static final int SCRIPT_MALAYALAM_2 = 1347; // 'mlm2' -- MSFT (pseudo) script tag for variant shaping semantics
- /** sinhalese script constant */
- public static final int SCRIPT_SINHALESE = 348; // 'sinh'
- /** burmese script constant */
- public static final int SCRIPT_BURMESE = 350; // 'mymr'
- /** thai script constant */
- public static final int SCRIPT_THAI = 352; // 'thai'
- /** khmer script constant */
- public static final int SCRIPT_KHMER = 355; // 'khmr'
- /** lao script constant */
- public static final int SCRIPT_LAO = 356; // 'laoo'
- /** hiragana script constant */
- public static final int SCRIPT_HIRAGANA = 410; // 'hira'
- /** ethiopic script constant */
- public static final int SCRIPT_ETHIOPIC = 430; // 'ethi'
- /** han script constant */
- public static final int SCRIPT_HAN = 500; // 'hani'
- /** katakana script constant */
- public static final int SCRIPT_KATAKANA = 410; // 'kana'
- /** math script constant */
- public static final int SCRIPT_MATH = 995; // 'zmth'
- /** symbol script constant */
- public static final int SCRIPT_SYMBOL = 996; // 'zsym'
- /** undetermined script constant */
- public static final int SCRIPT_UNDETERMINED = 998; // 'zyyy'
- /** uncoded script constant */
- public static final int SCRIPT_UNCODED = 999; // 'zzzz'
-
/**
 * A static (class) parameter indicating whether V2 indic shaping
 * rules apply or not, with default being <code>true</code>.
 */
private static final boolean USE_V2_INDIC = true;

/**
 * Private constructor: this class only offers static members and is
 * not meant to be instantiated.
 */
private CharScript() {
}
-
- /**
- * Determine if character c is punctuation.
- * @param c a character represented as a unicode scalar value
- * @return true if character is punctuation
- */
- public static boolean isPunctuation(int c) {
- if ((c >= 0x0021) && (c <= 0x002F)) { // basic latin punctuation
- return true;
- } else if ((c >= 0x003A) && (c <= 0x0040)) { // basic latin punctuation
- return true;
- } else if ((c >= 0x005F) && (c <= 0x0060)) { // basic latin punctuation
- return true;
- } else if ((c >= 0x007E) && (c <= 0x007E)) { // basic latin punctuation
- return true;
- } else if ((c >= 0x007E) && (c <= 0x007E)) { // basic latin punctuation
- return true;
- } else if ((c >= 0x00A1) && (c <= 0x00BF)) { // latin supplement punctuation
- return true;
- } else if ((c >= 0x00D7) && (c <= 0x00D7)) { // latin supplement punctuation
- return true;
- } else if ((c >= 0x00F7) && (c <= 0x00F7)) { // latin supplement punctuation
- return true;
- } else if ((c >= 0x2000) && (c <= 0x206F)) { // general punctuation
- return true;
- } else { // [TBD] - not complete
- return false;
- }
- }
-
- /**
- * Determine if character c is a digit.
- * @param c a character represented as a unicode scalar value
- * @return true if character is a digit
- */
- public static boolean isDigit(int c) {
- if ((c >= 0x0030) && (c <= 0x0039)) { // basic latin digits
- return true;
- } else { // [TBD] - not complete
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the hebrew script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to hebrew script
- */
- public static boolean isHebrew(int c) {
- if ((c >= 0x0590) && (c <= 0x05FF)) { // hebrew block
- return true;
- } else if ((c >= 0xFB00) && (c <= 0xFB4F)) { // hebrew presentation forms block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the mongolian script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to mongolian script
- */
- public static boolean isMongolian(int c) {
- if ((c >= 0x1800) && (c <= 0x18AF)) { // mongolian block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the arabic script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to arabic script
- */
- public static boolean isArabic(int c) {
- if ((c >= 0x0600) && (c <= 0x06FF)) { // arabic block
- return true;
- } else if ((c >= 0x0750) && (c <= 0x077F)) { // arabic supplement block
- return true;
- } else if ((c >= 0xFB50) && (c <= 0xFDFF)) { // arabic presentation forms a block
- return true;
- } else if ((c >= 0xFE70) && (c <= 0xFEFF)) { // arabic presentation forms b block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the greek script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to greek script
- */
- public static boolean isGreek(int c) {
- if ((c >= 0x0370) && (c <= 0x03FF)) { // greek (and coptic) block
- return true;
- } else if ((c >= 0x1F00) && (c <= 0x1FFF)) { // greek extended block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the latin script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to latin script
- */
- public static boolean isLatin(int c) {
- if ((c >= 0x0041) && (c <= 0x005A)) { // basic latin upper case
- return true;
- } else if ((c >= 0x0061) && (c <= 0x007A)) { // basic latin lower case
- return true;
- } else if ((c >= 0x00C0) && (c <= 0x00D6)) { // latin supplement upper case
- return true;
- } else if ((c >= 0x00D8) && (c <= 0x00DF)) { // latin supplement upper case
- return true;
- } else if ((c >= 0x00E0) && (c <= 0x00F6)) { // latin supplement lower case
- return true;
- } else if ((c >= 0x00F8) && (c <= 0x00FF)) { // latin supplement lower case
- return true;
- } else if ((c >= 0x0100) && (c <= 0x017F)) { // latin extended a
- return true;
- } else if ((c >= 0x0180) && (c <= 0x024F)) { // latin extended b
- return true;
- } else if ((c >= 0x1E00) && (c <= 0x1EFF)) { // latin extended additional
- return true;
- } else if ((c >= 0x2C60) && (c <= 0x2C7F)) { // latin extended c
- return true;
- } else if ((c >= 0xA720) && (c <= 0xA7FF)) { // latin extended d
- return true;
- } else if ((c >= 0xFB00) && (c <= 0xFB0F)) { // latin ligatures
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the cyrillic script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to cyrillic script
- */
- public static boolean isCyrillic(int c) {
- if ((c >= 0x0400) && (c <= 0x04FF)) { // cyrillic block
- return true;
- } else if ((c >= 0x0500) && (c <= 0x052F)) { // cyrillic supplement block
- return true;
- } else if ((c >= 0x2DE0) && (c <= 0x2DFF)) { // cyrillic extended-a block
- return true;
- } else if ((c >= 0xA640) && (c <= 0xA69F)) { // cyrillic extended-b block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the georgian script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to georgian script
- */
- public static boolean isGeorgian(int c) {
- if ((c >= 0x10A0) && (c <= 0x10FF)) { // georgian block
- return true;
- } else if ((c >= 0x2D00) && (c <= 0x2D2F)) { // georgian supplement block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the hangul script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to hangul script
- */
- public static boolean isHangul(int c) {
- if ((c >= 0x1100) && (c <= 0x11FF)) { // hangul jamo
- return true;
- } else if ((c >= 0x3130) && (c <= 0x318F)) { // hangul compatibility jamo
- return true;
- } else if ((c >= 0xA960) && (c <= 0xA97F)) { // hangul jamo extended a
- return true;
- } else if ((c >= 0xAC00) && (c <= 0xD7A3)) { // hangul syllables
- return true;
- } else if ((c >= 0xD7B0) && (c <= 0xD7FF)) { // hangul jamo extended a
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the gurmukhi script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to gurmukhi script
- */
- public static boolean isGurmukhi(int c) {
- if ((c >= 0x0A00) && (c <= 0x0A7F)) { // gurmukhi block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the devanagari script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to devanagari script
- */
- public static boolean isDevanagari(int c) {
- if ((c >= 0x0900) && (c <= 0x097F)) { // devangari block
- return true;
- } else if ((c >= 0xA8E0) && (c <= 0xA8FF)) { // devangari extended block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the gujarati script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to gujarati script
- */
- public static boolean isGujarati(int c) {
- if ((c >= 0x0A80) && (c <= 0x0AFF)) { // gujarati block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the bengali script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to bengali script
- */
- public static boolean isBengali(int c) {
- if ((c >= 0x0980) && (c <= 0x09FF)) { // bengali block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the oriya script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to oriya script
- */
- public static boolean isOriya(int c) {
- if ((c >= 0x0B00) && (c <= 0x0B7F)) { // oriya block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the tibetan script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to tibetan script
- */
- public static boolean isTibetan(int c) {
- if ((c >= 0x0F00) && (c <= 0x0FFF)) { // tibetan block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the telugu script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to telugu script
- */
- public static boolean isTelugu(int c) {
- if ((c >= 0x0C00) && (c <= 0x0C7F)) { // telugu block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the kannada script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to kannada script
- */
- public static boolean isKannada(int c) {
- if ((c >= 0x0C00) && (c <= 0x0C7F)) { // kannada block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the tamil script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to tamil script
- */
- public static boolean isTamil(int c) {
- if ((c >= 0x0B80) && (c <= 0x0BFF)) { // tamil block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the malayalam script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to malayalam script
- */
- public static boolean isMalayalam(int c) {
- if ((c >= 0x0D00) && (c <= 0x0D7F)) { // malayalam block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the sinhalese script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to sinhalese script
- */
- public static boolean isSinhalese(int c) {
- if ((c >= 0x0D80) && (c <= 0x0DFF)) { // sinhala block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the burmese script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to burmese script
- */
- public static boolean isBurmese(int c) {
- if ((c >= 0x1000) && (c <= 0x109F)) { // burmese (myanmar) block
- return true;
- } else if ((c >= 0xAA60) && (c <= 0xAA7F)) { // burmese (myanmar) extended block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the thai script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to thai script
- */
- public static boolean isThai(int c) {
- if ((c >= 0x0E00) && (c <= 0x0E7F)) { // thai block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the khmer script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to khmer script
- */
- public static boolean isKhmer(int c) {
- if ((c >= 0x1780) && (c <= 0x17FF)) { // khmer block
- return true;
- } else if ((c >= 0x19E0) && (c <= 0x19FF)) { // khmer symbols block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the lao script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to lao script
- */
- public static boolean isLao(int c) {
- if ((c >= 0x0E80) && (c <= 0x0EFF)) { // lao block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the ethiopic (amharic) script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to ethiopic (amharic) script
- */
- public static boolean isEthiopic(int c) {
- if ((c >= 0x1200) && (c <= 0x137F)) { // ethiopic block
- return true;
- } else if ((c >= 0x1380) && (c <= 0x139F)) { // ethoipic supplement block
- return true;
- } else if ((c >= 0x2D80) && (c <= 0x2DDF)) { // ethoipic extended block
- return true;
- } else if ((c >= 0xAB00) && (c <= 0xAB2F)) { // ethoipic extended-a block
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the han (unified cjk) script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to han (unified cjk) script
- */
- public static boolean isHan(int c) {
- if ((c >= 0x3400) && (c <= 0x4DBF)) {
- return true; // cjk unified ideographs extension a
- } else if ((c >= 0x4E00) && (c <= 0x9FFF)) {
- return true; // cjk unified ideographs
- } else if ((c >= 0xF900) && (c <= 0xFAFF)) {
- return true; // cjk compatibility ideographs
- } else if ((c >= 0x20000) && (c <= 0x2A6DF)) {
- return true; // cjk unified ideographs extension b
- } else if ((c >= 0x2A700) && (c <= 0x2B73F)) {
- return true; // cjk unified ideographs extension c
- } else if ((c >= 0x2F800) && (c <= 0x2FA1F)) {
- return true; // cjk compatibility ideographs supplement
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the bopomofo script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to bopomofo script
- */
- public static boolean isBopomofo(int c) {
- if ((c >= 0x3100) && (c <= 0x312F)) {
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the hiragana script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to hiragana script
- */
- public static boolean isHiragana(int c) {
- if ((c >= 0x3040) && (c <= 0x309F)) {
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Determine if character c belong to the katakana script.
- * @param c a character represented as a unicode scalar value
- * @return true if character belongs to katakana script
- */
- public static boolean isKatakana(int c) {
- if ((c >= 0x30A0) && (c <= 0x30FF)) {
- return true;
- } else if ((c >= 0x31F0) && (c <= 0x31FF)) {
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Obtain ISO15924 numeric script code of character. If script is not or cannot be determined,
- * then the script code 998 ('zyyy') is returned.
- * @param c the character to obtain script
- * @return an ISO15924 script code
- */
- public static int scriptOf(int c) { // [TBD] - needs optimization!!!
- if (CharUtilities.isAnySpace(c)) {
- return SCRIPT_UNDETERMINED;
- } else if (isPunctuation(c)) {
- return SCRIPT_UNDETERMINED;
- } else if (isDigit(c)) {
- return SCRIPT_UNDETERMINED;
- } else if (isLatin(c)) {
- return SCRIPT_LATIN;
- } else if (isCyrillic(c)) {
- return SCRIPT_CYRILLIC;
- } else if (isGreek(c)) {
- return SCRIPT_GREEK;
- } else if (isHan(c)) {
- return SCRIPT_HAN;
- } else if (isBopomofo(c)) {
- return SCRIPT_BOPOMOFO;
- } else if (isKatakana(c)) {
- return SCRIPT_KATAKANA;
- } else if (isHiragana(c)) {
- return SCRIPT_HIRAGANA;
- } else if (isHangul(c)) {
- return SCRIPT_HANGUL;
- } else if (isArabic(c)) {
- return SCRIPT_ARABIC;
- } else if (isHebrew(c)) {
- return SCRIPT_HEBREW;
- } else if (isMongolian(c)) {
- return SCRIPT_MONGOLIAN;
- } else if (isGeorgian(c)) {
- return SCRIPT_GEORGIAN;
- } else if (isGurmukhi(c)) {
- return useV2IndicRules(SCRIPT_GURMUKHI);
- } else if (isDevanagari(c)) {
- return useV2IndicRules(SCRIPT_DEVANAGARI);
- } else if (isGujarati(c)) {
- return useV2IndicRules(SCRIPT_GUJARATI);
- } else if (isBengali(c)) {
- return useV2IndicRules(SCRIPT_BENGALI);
- } else if (isOriya(c)) {
- return useV2IndicRules(SCRIPT_ORIYA);
- } else if (isTibetan(c)) {
- return SCRIPT_TIBETAN;
- } else if (isTelugu(c)) {
- return useV2IndicRules(SCRIPT_TELUGU);
- } else if (isKannada(c)) {
- return useV2IndicRules(SCRIPT_KANNADA);
- } else if (isTamil(c)) {
- return useV2IndicRules(SCRIPT_TAMIL);
- } else if (isMalayalam(c)) {
- return useV2IndicRules(SCRIPT_MALAYALAM);
- } else if (isSinhalese(c)) {
- return SCRIPT_SINHALESE;
- } else if (isBurmese(c)) {
- return SCRIPT_BURMESE;
- } else if (isThai(c)) {
- return SCRIPT_THAI;
- } else if (isKhmer(c)) {
- return SCRIPT_KHMER;
- } else if (isLao(c)) {
- return SCRIPT_LAO;
- } else if (isEthiopic(c)) {
- return SCRIPT_ETHIOPIC;
- } else {
- return SCRIPT_UNDETERMINED;
- }
- }
-
- /**
- * Obtain the V2 indic script code corresponding to V1 indic script code SC if
- * and only iff V2 indic rules apply; otherwise return SC.
- * @param sc a V1 indic script code
- * @return either SC or the V2 flavor of SC if V2 indic rules apply
- */
- public static int useV2IndicRules(int sc) {
- if (USE_V2_INDIC) {
- return (sc < 1000) ? (sc + 1000) : sc;
- } else {
- return sc;
- }
- }
-
- /**
- * Obtain the script codes of each character in a character sequence. If script
- * is not or cannot be determined for some character, then the script code 998
- * ('zyyy') is returned.
- * @param cs the character sequence
- * @return a (possibly empty) array of script codes
- */
- public static int[] scriptsOf(CharSequence cs) {
- Set s = new HashSet();
- for (int i = 0, n = cs.length(); i < n; i++) {
- s.add(Integer.valueOf(scriptOf(cs.charAt(i))));
- }
- int[] sa = new int [ s.size() ];
- int ns = 0;
- for (Iterator it = s.iterator(); it.hasNext();) {
- sa [ ns++ ] = ((Integer) it.next()) .intValue();
- }
- Arrays.sort(sa);
- return sa;
- }
-
- /**
- * Determine the dominant script of a character sequence.
- * @param cs the character sequence
- * @return the dominant script or SCRIPT_UNDETERMINED
- */
- public static int dominantScript(CharSequence cs) {
- Map m = new HashMap();
- for (int i = 0, n = cs.length(); i < n; i++) {
- int c = cs.charAt(i);
- int s = scriptOf(c);
- Integer k = Integer.valueOf(s);
- Integer v = (Integer) m.get(k);
- if (v != null) {
- m.put(k, Integer.valueOf(v.intValue() + 1));
- } else {
- m.put(k, Integer.valueOf(0));
- }
- }
- int sMax = -1;
- int cMax = -1;
- for (Iterator it = m.entrySet().iterator(); it.hasNext();) {
- Map.Entry e = (Map.Entry) it.next();
- Integer k = (Integer) e.getKey();
- int s = k.intValue();
- switch (s) {
- case SCRIPT_UNDETERMINED:
- case SCRIPT_UNCODED:
- break;
- default:
- Integer v = (Integer) e.getValue();
- assert v != null;
- int c = v.intValue();
- if (c > cMax) {
- cMax = c;
- sMax = s;
- }
- break;
- }
- }
- if (sMax < 0) {
- sMax = SCRIPT_UNDETERMINED;
- }
- return sMax;
- }
-
- /**
- * Determine if script tag denotes an 'Indic' script, where a
- * script is an 'Indic' script if it is intended to be processed by
- * the generic 'Indic' Script Processor.
- * @param script a script tag
- * @return true if script tag is a designated 'Indic' script
- */
- public static boolean isIndicScript(String script) {
- return isIndicScript(scriptCodeFromTag(script));
- }
-
- /**
- * Determine if script tag denotes an 'Indic' script, where a
- * script is an 'Indic' script if it is intended to be processed by
- * the generic 'Indic' Script Processor.
- * @param script a script code
- * @return true if script code is a designated 'Indic' script
- */
- public static boolean isIndicScript(int script) {
- switch (script) {
- case SCRIPT_BENGALI:
- case SCRIPT_BENGALI_2:
- case SCRIPT_BURMESE:
- case SCRIPT_DEVANAGARI:
- case SCRIPT_DEVANAGARI_2:
- case SCRIPT_GUJARATI:
- case SCRIPT_GUJARATI_2:
- case SCRIPT_GURMUKHI:
- case SCRIPT_GURMUKHI_2:
- case SCRIPT_KANNADA:
- case SCRIPT_KANNADA_2:
- case SCRIPT_MALAYALAM:
- case SCRIPT_MALAYALAM_2:
- case SCRIPT_ORIYA:
- case SCRIPT_ORIYA_2:
- case SCRIPT_TAMIL:
- case SCRIPT_TAMIL_2:
- case SCRIPT_TELUGU:
- case SCRIPT_TELUGU_2:
- return true;
- default:
- return false;
- }
- }
-
- /**
- * Determine the script tag associated with an internal script code.
- * @param code the script code
- * @return a script tag
- */
- public static String scriptTagFromCode(int code) {
- Map<Integer, String> m = getScriptTagsMap();
- if (m != null) {
- String tag;
- if ((tag = m.get(Integer.valueOf(code))) != null) {
- return tag;
- } else {
- return "";
- }
- } else {
- return "";
- }
- }
-
- /**
- * Determine the internal script code associated with a script tag.
- * @param tag the script tag
- * @return a script code
- */
- public static int scriptCodeFromTag(String tag) {
- Map<String, Integer> m = getScriptCodeMap();
- if (m != null) {
- Integer c;
- if ((c = m.get(tag)) != null) {
- return (int) c;
- } else {
- return SCRIPT_UNDETERMINED;
- }
- } else {
- return SCRIPT_UNDETERMINED;
- }
- }
-
- private static Map<Integer, String> scriptTagsMap = null;
- private static Map<String, Integer> scriptCodeMap = null;
-
- private static void putScriptTag(Map tm, Map cm, int code, String tag) {
- assert tag != null;
- assert tag.length() != 0;
- assert code >= 0;
- assert code < 2000;
- tm.put(Integer.valueOf(code), tag);
- cm.put(tag, Integer.valueOf(code));
- }
-
- private static void makeScriptMaps() {
- HashMap<Integer, String> tm = new HashMap<Integer, String>();
- HashMap<String, Integer> cm = new HashMap<String, Integer>();
- putScriptTag(tm, cm, SCRIPT_HEBREW, "hebr");
- putScriptTag(tm, cm, SCRIPT_MONGOLIAN, "mong");
- putScriptTag(tm, cm, SCRIPT_ARABIC, "arab");
- putScriptTag(tm, cm, SCRIPT_GREEK, "grek");
- putScriptTag(tm, cm, SCRIPT_LATIN, "latn");
- putScriptTag(tm, cm, SCRIPT_CYRILLIC, "cyrl");
- putScriptTag(tm, cm, SCRIPT_GEORGIAN, "geor");
- putScriptTag(tm, cm, SCRIPT_BOPOMOFO, "bopo");
- putScriptTag(tm, cm, SCRIPT_HANGUL, "hang");
- putScriptTag(tm, cm, SCRIPT_GURMUKHI, "guru");
- putScriptTag(tm, cm, SCRIPT_GURMUKHI_2, "gur2");
- putScriptTag(tm, cm, SCRIPT_DEVANAGARI, "deva");
- putScriptTag(tm, cm, SCRIPT_DEVANAGARI_2, "dev2");
- putScriptTag(tm, cm, SCRIPT_GUJARATI, "gujr");
- putScriptTag(tm, cm, SCRIPT_GUJARATI_2, "gjr2");
- putScriptTag(tm, cm, SCRIPT_BENGALI, "beng");
- putScriptTag(tm, cm, SCRIPT_BENGALI_2, "bng2");
- putScriptTag(tm, cm, SCRIPT_ORIYA, "orya");
- putScriptTag(tm, cm, SCRIPT_ORIYA_2, "ory2");
- putScriptTag(tm, cm, SCRIPT_TIBETAN, "tibt");
- putScriptTag(tm, cm, SCRIPT_TELUGU, "telu");
- putScriptTag(tm, cm, SCRIPT_TELUGU_2, "tel2");
- putScriptTag(tm, cm, SCRIPT_KANNADA, "knda");
- putScriptTag(tm, cm, SCRIPT_KANNADA_2, "knd2");
- putScriptTag(tm, cm, SCRIPT_TAMIL, "taml");
- putScriptTag(tm, cm, SCRIPT_TAMIL_2, "tml2");
- putScriptTag(tm, cm, SCRIPT_MALAYALAM, "mlym");
- putScriptTag(tm, cm, SCRIPT_MALAYALAM_2, "mlm2");
- putScriptTag(tm, cm, SCRIPT_SINHALESE, "sinh");
- putScriptTag(tm, cm, SCRIPT_BURMESE, "mymr");
- putScriptTag(tm, cm, SCRIPT_THAI, "thai");
- putScriptTag(tm, cm, SCRIPT_KHMER, "khmr");
- putScriptTag(tm, cm, SCRIPT_LAO, "laoo");
- putScriptTag(tm, cm, SCRIPT_HIRAGANA, "hira");
- putScriptTag(tm, cm, SCRIPT_ETHIOPIC, "ethi");
- putScriptTag(tm, cm, SCRIPT_HAN, "hani");
- putScriptTag(tm, cm, SCRIPT_KATAKANA, "kana");
- putScriptTag(tm, cm, SCRIPT_MATH, "zmth");
- putScriptTag(tm, cm, SCRIPT_SYMBOL, "zsym");
- putScriptTag(tm, cm, SCRIPT_UNDETERMINED, "zyyy");
- putScriptTag(tm, cm, SCRIPT_UNCODED, "zzzz");
- scriptTagsMap = tm;
- scriptCodeMap = cm;
- }
-
- private static Map<Integer, String> getScriptTagsMap() {
- if (scriptTagsMap == null) {
- makeScriptMaps();
- }
- return scriptTagsMap;
- }
-
- private static Map<String, Integer> getScriptCodeMap() {
- if (scriptCodeMap == null) {
- makeScriptMaps();
- }
- return scriptCodeMap;
- }
-
- }
|