diff options
author | Glenn Adams <gadams@apache.org> | 2012-02-26 02:29:01 +0000 |
---|---|---|
committer | Glenn Adams <gadams@apache.org> | 2012-02-26 02:29:01 +0000 |
commit | d6d8e57b17eb2e36631115517afa003ad3afa1a1 (patch) | |
tree | bf355ee4643080bf13b8f9fa5a1b14002e968561 /src/java/org/apache/fop/complexscripts/util | |
parent | fa6dc48793a4eb7476282141c1314f1198371a67 (diff) | |
download | xmlgraphics-fop-d6d8e57b17eb2e36631115517afa003ad3afa1a1.tar.gz xmlgraphics-fop-d6d8e57b17eb2e36631115517afa003ad3afa1a1.zip |
apply complex scripts patch
git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@1293736 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/java/org/apache/fop/complexscripts/util')
9 files changed, 4618 insertions, 0 deletions
diff --git a/src/java/org/apache/fop/complexscripts/util/CharMirror.java b/src/java/org/apache/fop/complexscripts/util/CharMirror.java new file mode 100644 index 000000000..bb1d1587f --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/util/CharMirror.java @@ -0,0 +1,715 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.complexscripts.util; + +import java.util.Arrays; + +/** + * Mirror related utilities. + * @author Glenn Adams + */ +public final class CharMirror { + + private CharMirror() { + } + + /** + * Mirror characters that are designated as having the bidi mirrorred property. 
+ * @param s a string whose characters are to be mirrored + * @return the resulting string + */ + public static String mirror ( String s ) { + StringBuffer sb = new StringBuffer ( s ); + for ( int i = 0, n = sb.length(); i < n; i++ ) { + sb.setCharAt ( i, (char) mirror ( sb.charAt ( i ) ) ); + } + return sb.toString(); + } + + private static int[] mirroredCharacters = { + 0x0028, + 0x0029, + 0x003C, + 0x003E, + 0x005B, + 0x005D, + 0x007B, + 0x007D, + 0x00AB, + 0x00BB, + 0x0F3A, + 0x0F3B, + 0x0F3C, + 0x0F3D, + 0x169B, + 0x169C, + 0x2039, + 0x203A, + 0x2045, + 0x2046, + 0x207D, + 0x207E, + 0x208D, + 0x208E, + 0x2208, + 0x2209, + 0x220A, + 0x220B, + 0x220C, + 0x220D, + 0x2215, + 0x223C, + 0x223D, + 0x2243, + 0x2252, + 0x2253, + 0x2254, + 0x2255, + 0x2264, + 0x2265, + 0x2266, + 0x2267, + 0x2268, + 0x2269, + 0x226A, + 0x226B, + 0x226E, + 0x226F, + 0x2270, + 0x2271, + 0x2272, + 0x2273, + 0x2274, + 0x2275, + 0x2276, + 0x2277, + 0x2278, + 0x2279, + 0x227A, + 0x227B, + 0x227C, + 0x227D, + 0x227E, + 0x227F, + 0x2280, + 0x2281, + 0x2282, + 0x2283, + 0x2284, + 0x2285, + 0x2286, + 0x2287, + 0x2288, + 0x2289, + 0x228A, + 0x228B, + 0x228F, + 0x2290, + 0x2291, + 0x2292, + 0x2298, + 0x22A2, + 0x22A3, + 0x22A6, + 0x22A8, + 0x22A9, + 0x22AB, + 0x22B0, + 0x22B1, + 0x22B2, + 0x22B3, + 0x22B4, + 0x22B5, + 0x22B6, + 0x22B7, + 0x22C9, + 0x22CA, + 0x22CB, + 0x22CC, + 0x22CD, + 0x22D0, + 0x22D1, + 0x22D6, + 0x22D7, + 0x22D8, + 0x22D9, + 0x22DA, + 0x22DB, + 0x22DC, + 0x22DD, + 0x22DE, + 0x22DF, + 0x22E0, + 0x22E1, + 0x22E2, + 0x22E3, + 0x22E4, + 0x22E5, + 0x22E6, + 0x22E7, + 0x22E8, + 0x22E9, + 0x22EA, + 0x22EB, + 0x22EC, + 0x22ED, + 0x22F0, + 0x22F1, + 0x22F2, + 0x22F3, + 0x22F4, + 0x22F6, + 0x22F7, + 0x22FA, + 0x22FB, + 0x22FC, + 0x22FD, + 0x22FE, + 0x2308, + 0x2309, + 0x230A, + 0x230B, + 0x2329, + 0x232A, + 0x2768, + 0x2769, + 0x276A, + 0x276B, + 0x276C, + 0x276D, + 0x276E, + 0x276F, + 0x2770, + 0x2771, + 0x2772, + 0x2773, + 0x2774, + 0x2775, + 0x27C3, + 0x27C4, + 0x27C5, + 0x27C6, + 
0x27C8, + 0x27C9, + 0x27D5, + 0x27D6, + 0x27DD, + 0x27DE, + 0x27E2, + 0x27E3, + 0x27E4, + 0x27E5, + 0x27E6, + 0x27E7, + 0x27E8, + 0x27E9, + 0x27EA, + 0x27EB, + 0x27EC, + 0x27ED, + 0x27EE, + 0x27EF, + 0x2983, + 0x2984, + 0x2985, + 0x2986, + 0x2987, + 0x2988, + 0x2989, + 0x298A, + 0x298B, + 0x298C, + 0x298D, + 0x298E, + 0x298F, + 0x2990, + 0x2991, + 0x2992, + 0x2993, + 0x2994, + 0x2995, + 0x2996, + 0x2997, + 0x2998, + 0x29B8, + 0x29C0, + 0x29C1, + 0x29C4, + 0x29C5, + 0x29CF, + 0x29D0, + 0x29D1, + 0x29D2, + 0x29D4, + 0x29D5, + 0x29D8, + 0x29D9, + 0x29DA, + 0x29DB, + 0x29F5, + 0x29F8, + 0x29F9, + 0x29FC, + 0x29FD, + 0x2A2B, + 0x2A2C, + 0x2A2D, + 0x2A2E, + 0x2A34, + 0x2A35, + 0x2A3C, + 0x2A3D, + 0x2A64, + 0x2A65, + 0x2A79, + 0x2A7A, + 0x2A7D, + 0x2A7E, + 0x2A7F, + 0x2A80, + 0x2A81, + 0x2A82, + 0x2A83, + 0x2A84, + 0x2A8B, + 0x2A8C, + 0x2A91, + 0x2A92, + 0x2A93, + 0x2A94, + 0x2A95, + 0x2A96, + 0x2A97, + 0x2A98, + 0x2A99, + 0x2A9A, + 0x2A9B, + 0x2A9C, + 0x2AA1, + 0x2AA2, + 0x2AA6, + 0x2AA7, + 0x2AA8, + 0x2AA9, + 0x2AAA, + 0x2AAB, + 0x2AAC, + 0x2AAD, + 0x2AAF, + 0x2AB0, + 0x2AB3, + 0x2AB4, + 0x2AC3, + 0x2AC4, + 0x2AC5, + 0x2AC6, + 0x2ACD, + 0x2ACE, + 0x2ACF, + 0x2AD0, + 0x2AD1, + 0x2AD2, + 0x2AD3, + 0x2AD4, + 0x2AD5, + 0x2AD6, + 0x2ADE, + 0x2AE3, + 0x2E02, + 0x2E03, + 0x2E04, + 0x2E05, + 0x2E09, + 0x2E0A, + 0x2E0C, + 0x2E0D, + 0x2E1C, + 0x2E1D, + 0x2E20, + 0x2E21, + 0x2E22, + 0x2E23, + 0x2E24, + 0x2E25, + 0x2E26, + 0x300E, + 0x300F, + 0x3010, + 0x3011, + 0x3014, + 0x3015, + 0x3016, + 0x3017, + 0x3018, + 0x3019, + 0x301A, + 0x301B, + 0xFE59, + 0xFE5A, + 0xFF3B, + 0xFF3D, + 0xFF5B, + 0xFF5D, + 0xFF5F, + 0xFF60, + 0xFF62, + 0xFF63 + }; + + private static int[] mirroredCharactersMapping = { + 0x0029, + 0x0028, + 0x003E, + 0x003C, + 0x005D, + 0x005B, + 0x007D, + 0x007B, + 0x00BB, + 0x00AB, + 0x0F3B, + 0x0F3A, + 0x0F3D, + 0x0F3C, + 0x169C, + 0x169B, + 0x203A, + 0x2039, + 0x2046, + 0x2045, + 0x207E, + 0x207D, + 0x208E, + 0x208D, + 0x220B, + 0x220C, + 0x220D, + 0x2208, + 0x2209, + 
0x220A, + 0x29F5, + 0x223D, + 0x223C, + 0x22CD, + 0x2253, + 0x2252, + 0x2255, + 0x2254, + 0x2265, + 0x2264, + 0x2267, + 0x2266, + 0x2269, + 0x2268, + 0x226B, + 0x226A, + 0x226F, + 0x226E, + 0x2271, + 0x2270, + 0x2273, + 0x2272, + 0x2275, + 0x2274, + 0x2277, + 0x2276, + 0x2279, + 0x2278, + 0x227B, + 0x227A, + 0x227D, + 0x227C, + 0x227F, + 0x227E, + 0x2281, + 0x2280, + 0x2283, + 0x2282, + 0x2285, + 0x2284, + 0x2287, + 0x2286, + 0x2289, + 0x2288, + 0x228B, + 0x228A, + 0x2290, + 0x228F, + 0x2292, + 0x2291, + 0x29B8, + 0x22A3, + 0x22A2, + 0x2ADE, + 0x2AE4, + 0x2AE3, + 0x2AE5, + 0x22B1, + 0x22B0, + 0x22B3, + 0x22B2, + 0x22B5, + 0x22B4, + 0x22B7, + 0x22B6, + 0x22CA, + 0x22C9, + 0x22CC, + 0x22CB, + 0x2243, + 0x22D1, + 0x22D0, + 0x22D7, + 0x22D6, + 0x22D9, + 0x22D8, + 0x22DB, + 0x22DA, + 0x22DD, + 0x22DC, + 0x22DF, + 0x22DE, + 0x22E1, + 0x22E0, + 0x22E3, + 0x22E2, + 0x22E5, + 0x22E4, + 0x22E7, + 0x22E6, + 0x22E9, + 0x22E8, + 0x22EB, + 0x22EA, + 0x22ED, + 0x22EC, + 0x22F1, + 0x22F0, + 0x22FA, + 0x22FB, + 0x22FC, + 0x22FD, + 0x22FE, + 0x22F2, + 0x22F3, + 0x22F4, + 0x22F6, + 0x22F7, + 0x2309, + 0x2308, + 0x230B, + 0x230A, + 0x232A, + 0x2329, + 0x2769, + 0x2768, + 0x276B, + 0x276A, + 0x276D, + 0x276C, + 0x276F, + 0x276E, + 0x2771, + 0x2770, + 0x2773, + 0x2772, + 0x2775, + 0x2774, + 0x27C4, + 0x27C3, + 0x27C6, + 0x27C5, + 0x27C9, + 0x27C8, + 0x27D6, + 0x27D5, + 0x27DE, + 0x27DD, + 0x27E3, + 0x27E2, + 0x27E5, + 0x27E4, + 0x27E7, + 0x27E6, + 0x27E9, + 0x27E8, + 0x27EB, + 0x27EA, + 0x27ED, + 0x27EC, + 0x27EF, + 0x27EE, + 0x2984, + 0x2983, + 0x2986, + 0x2985, + 0x2988, + 0x2987, + 0x298A, + 0x2989, + 0x298C, + 0x298B, + 0x2990, + 0x298F, + 0x298E, + 0x298D, + 0x2992, + 0x2991, + 0x2994, + 0x2993, + 0x2996, + 0x2995, + 0x2998, + 0x2997, + 0x2298, + 0x29C1, + 0x29C0, + 0x29C5, + 0x29C4, + 0x29D0, + 0x29CF, + 0x29D2, + 0x29D1, + 0x29D5, + 0x29D4, + 0x29D9, + 0x29D8, + 0x29DB, + 0x29DA, + 0x2215, + 0x29F9, + 0x29F8, + 0x29FD, + 0x29FC, + 0x2A2C, + 0x2A2B, + 0x2A2E, + 0x2A2D, + 0x2A35, + 
0x2A34, + 0x2A3D, + 0x2A3C, + 0x2A65, + 0x2A64, + 0x2A7A, + 0x2A79, + 0x2A7E, + 0x2A7D, + 0x2A80, + 0x2A7F, + 0x2A82, + 0x2A81, + 0x2A84, + 0x2A83, + 0x2A8C, + 0x2A8B, + 0x2A92, + 0x2A91, + 0x2A94, + 0x2A93, + 0x2A96, + 0x2A95, + 0x2A98, + 0x2A97, + 0x2A9A, + 0x2A99, + 0x2A9C, + 0x2A9B, + 0x2AA2, + 0x2AA1, + 0x2AA7, + 0x2AA6, + 0x2AA9, + 0x2AA8, + 0x2AAB, + 0x2AAA, + 0x2AAD, + 0x2AAC, + 0x2AB0, + 0x2AAF, + 0x2AB4, + 0x2AB3, + 0x2AC4, + 0x2AC3, + 0x2AC6, + 0x2AC5, + 0x2ACE, + 0x2ACD, + 0x2AD0, + 0x2ACF, + 0x2AD2, + 0x2AD1, + 0x2AD4, + 0x2AD3, + 0x2AD6, + 0x2AD5, + 0x22A6, + 0x22A9, + 0x2E03, + 0x2E02, + 0x2E05, + 0x2E04, + 0x2E0A, + 0x2E09, + 0x2E0D, + 0x2E0C, + 0x2E1D, + 0x2E1C, + 0x2E21, + 0x2E20, + 0x2E23, + 0x2E22, + 0x2E25, + 0x2E24, + 0x2E27, + 0x300F, + 0x300E, + 0x3011, + 0x3010, + 0x3015, + 0x3014, + 0x3017, + 0x3016, + 0x3019, + 0x3018, + 0x301B, + 0x301A, + 0xFE5A, + 0xFE59, + 0xFF3D, + 0xFF3B, + 0xFF5D, + 0xFF5B, + 0xFF60, + 0xFF5F, + 0xFF63, + 0xFF62 + }; + + private static int mirror ( int c ) { + int i = Arrays.binarySearch ( mirroredCharacters, c ); + if ( i < 0 ) { + return c; + } else { + return mirroredCharactersMapping [ i ]; + } + } + +} diff --git a/src/java/org/apache/fop/complexscripts/util/CharScript.java b/src/java/org/apache/fop/complexscripts/util/CharScript.java new file mode 100644 index 000000000..bcce31327 --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/util/CharScript.java @@ -0,0 +1,930 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.complexscripts.util; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.fop.util.CharUtilities; + +// CSOFF: AvoidNestedBlocksCheck +// CSOFF: InnerAssignmentCheck +// CSOFF: LineLengthCheck +// CSOFF: SimplifyBooleanReturnCheck +// CSOFF: WhitespaceAfterCheck + +/** + * Script related utilities. + * @author Glenn Adams + */ +public final class CharScript { + + // + // The following script codes are based on ISO 15924. Codes less than 1000 are + // official assignments from 15924; those equal to or greater than 1000 are FOP + // implementation specific. 
//
/** hebrew script constant */
public static final int SCRIPT_HEBREW = 125;            // 'hebr'
/** mongolian script constant */
public static final int SCRIPT_MONGOLIAN = 145;         // 'mong'
/** arabic script constant */
public static final int SCRIPT_ARABIC = 160;            // 'arab'
/** greek script constant */
public static final int SCRIPT_GREEK = 200;             // 'grek'
/** latin script constant */
public static final int SCRIPT_LATIN = 215;             // 'latn'
/** cyrillic script constant */
public static final int SCRIPT_CYRILLIC = 220;          // 'cyrl'
/** georgian script constant */
public static final int SCRIPT_GEORGIAN = 240;          // 'geor'
/** bopomofo script constant */
public static final int SCRIPT_BOPOMOFO = 285;          // 'bopo'
/** hangul script constant */
public static final int SCRIPT_HANGUL = 286;            // 'hang'
/** gurmukhi script constant */
public static final int SCRIPT_GURMUKHI = 310;          // 'guru'
/** gurmukhi 2 script constant */
public static final int SCRIPT_GURMUKHI_2 = 1310;       // 'gur2' -- MSFT (pseudo) script tag for variant shaping semantics
/** devanagari script constant */
public static final int SCRIPT_DEVANAGARI = 315;        // 'deva'
/** devanagari 2 script constant */
public static final int SCRIPT_DEVANAGARI_2 = 1315;     // 'dev2' -- MSFT (pseudo) script tag for variant shaping semantics
/** gujarati script constant */
public static final int SCRIPT_GUJARATI = 320;          // 'gujr'
/** gujarati 2 script constant */
public static final int SCRIPT_GUJARATI_2 = 1320;       // 'gjr2' -- MSFT (pseudo) script tag for variant shaping semantics
/** bengali script constant */
public static final int SCRIPT_BENGALI = 325;           // 'beng' (ISO 15924 assigns 325; 326 is a different script)
/** bengali 2 script constant */
public static final int SCRIPT_BENGALI_2 = 1325;        // 'bng2' -- MSFT (pseudo) script tag for variant shaping semantics
/** oriya script constant */
public static final int SCRIPT_ORIYA = 327;             // 'orya'
/** oriya 2 script constant */
public static final int SCRIPT_ORIYA_2 = 1327;          // 'ory2' -- MSFT (pseudo) script tag for variant shaping semantics
/** tibetan script constant */
public static final int SCRIPT_TIBETAN = 330;           // 'tibt'
/** telugu script constant */
public static final int SCRIPT_TELUGU = 340;            // 'telu'
/** telugu 2 script constant */
public static final int SCRIPT_TELUGU_2 = 1340;         // 'tel2' -- MSFT (pseudo) script tag for variant shaping semantics
/** kannada script constant */
public static final int SCRIPT_KANNADA = 345;           // 'knda'
/** kannada 2 script constant */
public static final int SCRIPT_KANNADA_2 = 1345;        // 'knd2' -- MSFT (pseudo) script tag for variant shaping semantics
/** tamil script constant */
public static final int SCRIPT_TAMIL = 346;             // 'taml'
/** tamil 2 script constant */
public static final int SCRIPT_TAMIL_2 = 1346;          // 'tml2' -- MSFT (pseudo) script tag for variant shaping semantics
/** malayalam script constant */
public static final int SCRIPT_MALAYALAM = 347;         // 'mlym'
/** malayalam 2 script constant */
public static final int SCRIPT_MALAYALAM_2 = 1347;      // 'mlm2' -- MSFT (pseudo) script tag for variant shaping semantics
/** sinhalese script constant */
public static final int SCRIPT_SINHALESE = 348;         // 'sinh'
/** burmese script constant */
public static final int SCRIPT_BURMESE = 350;           // 'mymr'
/** thai script constant */
public static final int SCRIPT_THAI = 352;              // 'thai'
/** khmer script constant */
public static final int SCRIPT_KHMER = 355;             // 'khmr'
/** lao script constant */
public static final int SCRIPT_LAO = 356;               // 'laoo'
/** hiragana script constant */
public static final int SCRIPT_HIRAGANA = 410;          // 'hira'
/** ethiopic script constant */
public static final int SCRIPT_ETHIOPIC = 430;          // 'ethi'
/** han script constant */
public static final int SCRIPT_HAN = 500;               // 'hani'
/** katakana script constant */
public static final int SCRIPT_KATAKANA = 411;          // 'kana' (must differ from hiragana, which is 410)
/** math script constant */
public static final int SCRIPT_MATH = 995;              // 'zmth'
/** symbol script constant */
public static final int SCRIPT_SYMBOL = 996;            // 'zsym'
/** undetermined script constant */
public static final int SCRIPT_UNDETERMINED = 998;      // 'zyyy'
/** uncoded script constant */
public static final int SCRIPT_UNCODED = 999;           // 'zzzz'

/**
 * A static (class) parameter indicating whether V2 indic shaping
 * rules apply or not, with default being <code>true</code>.
 */
private static final boolean useV2Indic = true; // CSOK: ConstantNameCheck

/** Utility class; not instantiable. */
private CharScript() {
}

/**
 * Determine if character c is punctuation. Only the ranges listed in the
 * body are recognized; the table is not complete.
 * @param c a character represented as a unicode scalar value
 * @return true if character is punctuation
 */
public static boolean isPunctuation ( int c ) {
    // [TBD] - not complete
    return ( ( c >= 0x0021 ) && ( c <= 0x002F ) )       // basic latin punctuation
        || ( ( c >= 0x003A ) && ( c <= 0x0040 ) )       // basic latin punctuation
        || ( ( c >= 0x005F ) && ( c <= 0x0060 ) )       // basic latin punctuation
        || ( c == 0x007E )                              // basic latin punctuation
        || ( ( c >= 0x00A1 ) && ( c <= 0x00BF ) )       // latin supplement punctuation
        || ( c == 0x00D7 )                              // latin supplement punctuation
        || ( c == 0x00F7 )                              // latin supplement punctuation
        || ( ( c >= 0x2000 ) && ( c <= 0x206F ) );      // general punctuation
}

/**
 * Determine if character c is a digit.
 */
+ * @param c a character represented as a unicode scalar value + * @return true if character is a digit + */ + public static boolean isDigit ( int c ) { + if ( ( c >= 0x0030 ) && ( c <= 0x0039 ) ) { // basic latin digits + return true; + } else { // [TBD] - not complete + return false; + } + } + + /** + * Determine if character c belong to the hebrew script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to hebrew script + */ + public static boolean isHebrew ( int c ) { + if ( ( c >= 0x0590 ) && ( c <= 0x05FF ) ) { // hebrew block + return true; + } else if ( ( c >= 0xFB00 ) && ( c <= 0xFB4F ) ) { // hebrew presentation forms block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the mongolian script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to mongolian script + */ + public static boolean isMongolian ( int c ) { + if ( ( c >= 0x1800 ) && ( c <= 0x18AF ) ) { // mongolian block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the arabic script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to arabic script + */ + public static boolean isArabic ( int c ) { + if ( ( c >= 0x0600 ) && ( c <= 0x06FF ) ) { // arabic block + return true; + } else if ( ( c >= 0x0750 ) && ( c <= 0x077F ) ) { // arabic supplement block + return true; + } else if ( ( c >= 0xFB50 ) && ( c <= 0xFDFF ) ) { // arabic presentation forms a block + return true; + } else if ( ( c >= 0xFE70 ) && ( c <= 0xFEFF ) ) { // arabic presentation forms b block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the greek script. 
+ * @param c a character represented as a unicode scalar value + * @return true if character belongs to greek script + */ + public static boolean isGreek ( int c ) { + if ( ( c >= 0x0370 ) && ( c <= 0x03FF ) ) { // greek (and coptic) block + return true; + } else if ( ( c >= 0x1F00 ) && ( c <= 0x1FFF ) ) { // greek extended block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the latin script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to latin script + */ + public static boolean isLatin ( int c ) { + if ( ( c >= 0x0041 ) && ( c <= 0x005A ) ) { // basic latin upper case + return true; + } else if ( ( c >= 0x0061 ) && ( c <= 0x007A ) ) { // basic latin lower case + return true; + } else if ( ( c >= 0x00C0 ) && ( c <= 0x00D6 ) ) { // latin supplement upper case + return true; + } else if ( ( c >= 0x00D8 ) && ( c <= 0x00DF ) ) { // latin supplement upper case + return true; + } else if ( ( c >= 0x00E0 ) && ( c <= 0x00F6 ) ) { // latin supplement lower case + return true; + } else if ( ( c >= 0x00F8 ) && ( c <= 0x00FF ) ) { // latin supplement lower case + return true; + } else if ( ( c >= 0x0100 ) && ( c <= 0x017F ) ) { // latin extended a + return true; + } else if ( ( c >= 0x0180 ) && ( c <= 0x024F ) ) { // latin extended b + return true; + } else if ( ( c >= 0x1E00 ) && ( c <= 0x1EFF ) ) { // latin extended additional + return true; + } else if ( ( c >= 0x2C60 ) && ( c <= 0x2C7F ) ) { // latin extended c + return true; + } else if ( ( c >= 0xA720 ) && ( c <= 0xA7FF ) ) { // latin extended d + return true; + } else if ( ( c >= 0xFB00 ) && ( c <= 0xFB0F ) ) { // latin ligatures + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the cyrillic script. 
+ * @param c a character represented as a unicode scalar value + * @return true if character belongs to cyrillic script + */ + public static boolean isCyrillic ( int c ) { + if ( ( c >= 0x0400 ) && ( c <= 0x04FF ) ) { // cyrillic block + return true; + } else if ( ( c >= 0x0500 ) && ( c <= 0x052F ) ) { // cyrillic supplement block + return true; + } else if ( ( c >= 0x2DE0 ) && ( c <= 0x2DFF ) ) { // cyrillic extended-a block + return true; + } else if ( ( c >= 0xA640 ) && ( c <= 0xA69F ) ) { // cyrillic extended-b block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the georgian script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to georgian script + */ + public static boolean isGeorgian ( int c ) { + if ( ( c >= 0x10A0 ) && ( c <= 0x10FF ) ) { // georgian block + return true; + } else if ( ( c >= 0x2D00 ) && ( c <= 0x2D2F ) ) { // georgian supplement block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the hangul script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to hangul script + */ + public static boolean isHangul ( int c ) { + if ( ( c >= 0x1100 ) && ( c <= 0x11FF ) ) { // hangul jamo + return true; + } else if ( ( c >= 0x3130 ) && ( c <= 0x318F ) ) { // hangul compatibility jamo + return true; + } else if ( ( c >= 0xA960 ) && ( c <= 0xA97F ) ) { // hangul jamo extended a + return true; + } else if ( ( c >= 0xAC00 ) && ( c <= 0xD7A3 ) ) { // hangul syllables + return true; + } else if ( ( c >= 0xD7B0 ) && ( c <= 0xD7FF ) ) { // hangul jamo extended a + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the gurmukhi script. 
+ * @param c a character represented as a unicode scalar value + * @return true if character belongs to gurmukhi script + */ + public static boolean isGurmukhi ( int c ) { + if ( ( c >= 0x0A00 ) && ( c <= 0x0A7F ) ) { // gurmukhi block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the devanagari script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to devanagari script + */ + public static boolean isDevanagari ( int c ) { + if ( ( c >= 0x0900 ) && ( c <= 0x097F ) ) { // devangari block + return true; + } else if ( ( c >= 0xA8E0 ) && ( c <= 0xA8FF ) ) { // devangari extended block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the gujarati script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to gujarati script + */ + public static boolean isGujarati ( int c ) { + if ( ( c >= 0x0A80 ) && ( c <= 0x0AFF ) ) { // gujarati block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the bengali script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to bengali script + */ + public static boolean isBengali ( int c ) { + if ( ( c >= 0x0980 ) && ( c <= 0x09FF ) ) { // bengali block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the oriya script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to oriya script + */ + public static boolean isOriya ( int c ) { + if ( ( c >= 0x0B00 ) && ( c <= 0x0B7F ) ) { // oriya block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the tibetan script. 
+ * @param c a character represented as a unicode scalar value + * @return true if character belongs to tibetan script + */ + public static boolean isTibetan ( int c ) { + if ( ( c >= 0x0F00 ) && ( c <= 0x0FFF ) ) { // tibetan block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the telugu script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to telugu script + */ + public static boolean isTelugu ( int c ) { + if ( ( c >= 0x0C00 ) && ( c <= 0x0C7F ) ) { // telugu block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the kannada script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to kannada script + */ + public static boolean isKannada ( int c ) { + if ( ( c >= 0x0C00 ) && ( c <= 0x0C7F ) ) { // kannada block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the tamil script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to tamil script + */ + public static boolean isTamil ( int c ) { + if ( ( c >= 0x0B80 ) && ( c <= 0x0BFF ) ) { // tamil block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the malayalam script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to malayalam script + */ + public static boolean isMalayalam ( int c ) { + if ( ( c >= 0x0D00 ) && ( c <= 0x0D7F ) ) { // malayalam block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the sinhalese script. 
+ * @param c a character represented as a unicode scalar value + * @return true if character belongs to sinhalese script + */ + public static boolean isSinhalese ( int c ) { + if ( ( c >= 0x0D80 ) && ( c <= 0x0DFF ) ) { // sinhala block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the burmese script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to burmese script + */ + public static boolean isBurmese ( int c ) { + if ( ( c >= 0x1000 ) && ( c <= 0x109F ) ) { // burmese (myanmar) block + return true; + } else if ( ( c >= 0xAA60 ) && ( c <= 0xAA7F ) ) { // burmese (myanmar) extended block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the thai script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to thai script + */ + public static boolean isThai ( int c ) { + if ( ( c >= 0x0E00 ) && ( c <= 0x0E7F ) ) { // thai block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the khmer script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to khmer script + */ + public static boolean isKhmer ( int c ) { + if ( ( c >= 0x1780 ) && ( c <= 0x17FF ) ) { // khmer block + return true; + } else if ( ( c >= 0x19E0 ) && ( c <= 0x19FF ) ) { // khmer symbols block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the lao script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to lao script + */ + public static boolean isLao ( int c ) { + if ( ( c >= 0x0E80 ) && ( c <= 0x0EFF ) ) { // lao block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the ethiopic (amharic) script. 
+ * @param c a character represented as a unicode scalar value + * @return true if character belongs to ethiopic (amharic) script + */ + public static boolean isEthiopic ( int c ) { + if ( ( c >= 0x1200 ) && ( c <= 0x137F ) ) { // ethiopic block + return true; + } else if ( ( c >= 0x1380 ) && ( c <= 0x139F ) ) { // ethoipic supplement block + return true; + } else if ( ( c >= 0x2D80 ) && ( c <= 0x2DDF ) ) { // ethoipic extended block + return true; + } else if ( ( c >= 0xAB00 ) && ( c <= 0xAB2F ) ) { // ethoipic extended-a block + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the han (unified cjk) script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to han (unified cjk) script + */ + public static boolean isHan ( int c ) { + if ( ( c >= 0x3400 ) && ( c <= 0x4DBF ) ) { + return true; // cjk unified ideographs extension a + } else if ( ( c >= 0x4E00 ) && ( c <= 0x9FFF ) ) { + return true; // cjk unified ideographs + } else if ( ( c >= 0xF900 ) && ( c <= 0xFAFF ) ) { + return true; // cjk compatibility ideographs + } else if ( ( c >= 0x20000 ) && ( c <= 0x2A6DF ) ) { + return true; // cjk unified ideographs extension b + } else if ( ( c >= 0x2A700 ) && ( c <= 0x2B73F ) ) { + return true; // cjk unified ideographs extension c + } else if ( ( c >= 0x2F800 ) && ( c <= 0x2FA1F ) ) { + return true; // cjk compatibility ideographs supplement + } else { + return false; + } + } + + /** + * Determine if character c belong to the bopomofo script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to bopomofo script + */ + public static boolean isBopomofo ( int c ) { + if ( ( c >= 0x3100 ) && ( c <= 0x312F ) ) { + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the hiragana script. 
+ * @param c a character represented as a unicode scalar value + * @return true if character belongs to hiragana script + */ + public static boolean isHiragana ( int c ) { + if ( ( c >= 0x3040 ) && ( c <= 0x309F ) ) { + return true; + } else { + return false; + } + } + + /** + * Determine if character c belong to the katakana script. + * @param c a character represented as a unicode scalar value + * @return true if character belongs to katakana script + */ + public static boolean isKatakana ( int c ) { + if ( ( c >= 0x30A0 ) && ( c <= 0x30FF ) ) { + return true; + } else if ( ( c >= 0x31F0 ) && ( c <= 0x31FF ) ) { + return true; + } else { + return false; + } + } + + /** + * Obtain ISO15924 numeric script code of character. If script is not or cannot be determined, + * then the script code 998 ('zyyy') is returned. + * @param c the character to obtain script + * @return an ISO15924 script code + */ + public static int scriptOf ( int c ) { // [TBD] - needs optimization!!! + if ( CharUtilities.isAnySpace ( c ) ) { + return SCRIPT_UNDETERMINED; + } else if ( isPunctuation ( c ) ) { + return SCRIPT_UNDETERMINED; + } else if ( isDigit ( c ) ) { + return SCRIPT_UNDETERMINED; + } else if ( isLatin ( c ) ) { + return SCRIPT_LATIN; + } else if ( isCyrillic ( c ) ) { + return SCRIPT_CYRILLIC; + } else if ( isGreek ( c ) ) { + return SCRIPT_GREEK; + } else if ( isHan ( c ) ) { + return SCRIPT_HAN; + } else if ( isBopomofo ( c ) ) { + return SCRIPT_BOPOMOFO; + } else if ( isKatakana ( c ) ) { + return SCRIPT_KATAKANA; + } else if ( isHiragana ( c ) ) { + return SCRIPT_HIRAGANA; + } else if ( isHangul ( c ) ) { + return SCRIPT_HANGUL; + } else if ( isArabic ( c ) ) { + return SCRIPT_ARABIC; + } else if ( isHebrew ( c ) ) { + return SCRIPT_HEBREW; + } else if ( isMongolian ( c ) ) { + return SCRIPT_MONGOLIAN; + } else if ( isGeorgian ( c ) ) { + return SCRIPT_GEORGIAN; + } else if ( isGurmukhi ( c ) ) { + return useV2IndicRules ( SCRIPT_GURMUKHI ); + } else if ( isDevanagari ( c 
) ) { + return useV2IndicRules ( SCRIPT_DEVANAGARI ); + } else if ( isGujarati ( c ) ) { + return useV2IndicRules ( SCRIPT_GUJARATI ); + } else if ( isBengali ( c ) ) { + return useV2IndicRules ( SCRIPT_BENGALI ); + } else if ( isOriya ( c ) ) { + return useV2IndicRules ( SCRIPT_ORIYA ); + } else if ( isTibetan ( c ) ) { + return SCRIPT_TIBETAN; + } else if ( isTelugu ( c ) ) { + return useV2IndicRules ( SCRIPT_TELUGU ); + } else if ( isKannada ( c ) ) { + return useV2IndicRules ( SCRIPT_KANNADA ); + } else if ( isTamil ( c ) ) { + return useV2IndicRules ( SCRIPT_TAMIL ); + } else if ( isMalayalam ( c ) ) { + return useV2IndicRules ( SCRIPT_MALAYALAM ); + } else if ( isSinhalese ( c ) ) { + return SCRIPT_SINHALESE; + } else if ( isBurmese ( c ) ) { + return SCRIPT_BURMESE; + } else if ( isThai ( c ) ) { + return SCRIPT_THAI; + } else if ( isKhmer ( c ) ) { + return SCRIPT_KHMER; + } else if ( isLao ( c ) ) { + return SCRIPT_LAO; + } else if ( isEthiopic ( c ) ) { + return SCRIPT_ETHIOPIC; + } else { + return SCRIPT_UNDETERMINED; + } + } + + /** + * Obtain the V2 indic script code corresponding to V1 indic script code SC if + * and only iff V2 indic rules apply; otherwise return SC. + * @param sc a V1 indic script code + * @return either SC or the V2 flavor of SC if V2 indic rules apply + */ + public static int useV2IndicRules ( int sc ) { + if ( useV2Indic ) { + return ( sc < 1000 ) ? ( sc + 1000 ) : sc; + } else { + return sc; + } + } + + /** + * Obtain the script codes of each character in a character sequence. If script + * is not or cannot be determined for some character, then the script code 998 + * ('zyyy') is returned. 
+ * @param cs the character sequence + * @return a (possibly empty) array of script codes + */ + public static int[] scriptsOf ( CharSequence cs ) { + Set s = new HashSet(); + for ( int i = 0, n = cs.length(); i < n; i++ ) { + s.add ( Integer.valueOf ( scriptOf ( cs.charAt ( i ) ) ) ); + } + int[] sa = new int [ s.size() ]; + int ns = 0; + for ( Iterator it = s.iterator(); it.hasNext();) { + sa [ ns++ ] = ( (Integer) it.next() ) .intValue(); + } + Arrays.sort ( sa ); + return sa; + } + + /** + * Determine the dominant script of a character sequence. + * @param cs the character sequence + * @return the dominant script or SCRIPT_UNDETERMINED + */ + public static int dominantScript ( CharSequence cs ) { + Map m = new HashMap(); + for ( int i = 0, n = cs.length(); i < n; i++ ) { + int c = cs.charAt ( i ); + int s = scriptOf ( c ); + Integer k = Integer.valueOf ( s ); + Integer v = (Integer) m.get ( k ); + if ( v != null ) { + m.put ( k, Integer.valueOf ( v.intValue() + 1 ) ); + } else { + m.put ( k, Integer.valueOf ( 0 ) ); + } + } + int sMax = -1; + int cMax = -1; + for ( Iterator it = m.entrySet().iterator(); it.hasNext();) { + Map.Entry e = (Map.Entry) it.next(); + Integer k = (Integer) e.getKey(); + int s = k.intValue(); + switch ( s ) { + case SCRIPT_UNDETERMINED: + case SCRIPT_UNCODED: + break; + default: + { + Integer v = (Integer) e.getValue(); + assert v != null; + int c = v.intValue(); + if ( c > cMax ) { + cMax = c; sMax = s; + } + break; + } + } + } + if ( sMax < 0 ) { + sMax = SCRIPT_UNDETERMINED; + } + return sMax; + } + + /** + * Determine if script tag denotes an 'Indic' script, where a + * script is an 'Indic' script if it is intended to be processed by + * the generic 'Indic' Script Processor. 
+ * @param script a script tag + * @return true if script tag is a designated 'Indic' script + */ + public static boolean isIndicScript ( String script ) { + return isIndicScript ( scriptCodeFromTag ( script ) ); + } + + /** + * Determine if script tag denotes an 'Indic' script, where a + * script is an 'Indic' script if it is intended to be processed by + * the generic 'Indic' Script Processor. + * @param script a script code + * @return true if script code is a designated 'Indic' script + */ + public static boolean isIndicScript ( int script ) { + switch ( script ) { + case SCRIPT_BENGALI: + case SCRIPT_BENGALI_2: + case SCRIPT_BURMESE: + case SCRIPT_DEVANAGARI: + case SCRIPT_DEVANAGARI_2: + case SCRIPT_GUJARATI: + case SCRIPT_GUJARATI_2: + case SCRIPT_GURMUKHI: + case SCRIPT_GURMUKHI_2: + case SCRIPT_KANNADA: + case SCRIPT_KANNADA_2: + case SCRIPT_MALAYALAM: + case SCRIPT_MALAYALAM_2: + case SCRIPT_ORIYA: + case SCRIPT_ORIYA_2: + case SCRIPT_TAMIL: + case SCRIPT_TAMIL_2: + case SCRIPT_TELUGU: + case SCRIPT_TELUGU_2: + return true; + default: + return false; + } + } + + /** + * Determine the script tag associated with an internal script code. + * @param code the script code + * @return a script tag + */ + public static String scriptTagFromCode ( int code ) { + Map<Integer,String> m = getScriptTagsMap(); + if ( m != null ) { + String tag; + if ( ( tag = m.get ( Integer.valueOf ( code ) ) ) != null ) { + return tag; + } else { + return ""; + } + } else { + return ""; + } + } + + /** + * Determine the internal script code associated with a script tag. 
+ * @param tag the script tag + * @return a script code + */ + public static int scriptCodeFromTag ( String tag ) { + Map<String,Integer> m = getScriptCodeMap(); + if ( m != null ) { + Integer c; + if ( ( c = m.get ( tag ) ) != null ) { + return (int) c; + } else { + return SCRIPT_UNDETERMINED; + } + } else { + return SCRIPT_UNDETERMINED; + } + } + + private static Map<Integer,String> scriptTagsMap = null; + private static Map<String,Integer> scriptCodeMap = null; + + private static void putScriptTag ( Map tm, Map cm, int code, String tag ) { + assert tag != null; + assert tag.length() != 0; + assert code >= 0; + assert code < 2000; + tm.put ( Integer.valueOf ( code ), tag ); + cm.put ( tag, Integer.valueOf ( code ) ); + } + + private static void makeScriptMaps() { + HashMap<Integer,String> tm = new HashMap<Integer,String>(); + HashMap<String,Integer> cm = new HashMap<String,Integer>(); + putScriptTag ( tm, cm, SCRIPT_HEBREW, "hebr" ); + putScriptTag ( tm, cm, SCRIPT_MONGOLIAN, "mong" ); + putScriptTag ( tm, cm, SCRIPT_ARABIC, "arab" ); + putScriptTag ( tm, cm, SCRIPT_GREEK, "grek" ); + putScriptTag ( tm, cm, SCRIPT_LATIN, "latn" ); + putScriptTag ( tm, cm, SCRIPT_CYRILLIC, "cyrl" ); + putScriptTag ( tm, cm, SCRIPT_GEORGIAN, "geor" ); + putScriptTag ( tm, cm, SCRIPT_BOPOMOFO, "bopo" ); + putScriptTag ( tm, cm, SCRIPT_HANGUL, "hang" ); + putScriptTag ( tm, cm, SCRIPT_GURMUKHI, "guru" ); + putScriptTag ( tm, cm, SCRIPT_GURMUKHI_2, "gur2" ); + putScriptTag ( tm, cm, SCRIPT_DEVANAGARI, "deva" ); + putScriptTag ( tm, cm, SCRIPT_DEVANAGARI_2, "dev2" ); + putScriptTag ( tm, cm, SCRIPT_GUJARATI, "gujr" ); + putScriptTag ( tm, cm, SCRIPT_GUJARATI_2, "gjr2" ); + putScriptTag ( tm, cm, SCRIPT_BENGALI, "beng" ); + putScriptTag ( tm, cm, SCRIPT_BENGALI_2, "bng2" ); + putScriptTag ( tm, cm, SCRIPT_ORIYA, "orya" ); + putScriptTag ( tm, cm, SCRIPT_ORIYA_2, "ory2" ); + putScriptTag ( tm, cm, SCRIPT_TIBETAN, "tibt" ); + putScriptTag ( tm, cm, SCRIPT_TELUGU, "telu" ); + putScriptTag 
( tm, cm, SCRIPT_TELUGU_2, "tel2" ); + putScriptTag ( tm, cm, SCRIPT_KANNADA, "knda" ); + putScriptTag ( tm, cm, SCRIPT_KANNADA_2, "knd2" ); + putScriptTag ( tm, cm, SCRIPT_TAMIL, "taml" ); + putScriptTag ( tm, cm, SCRIPT_TAMIL_2, "tml2" ); + putScriptTag ( tm, cm, SCRIPT_MALAYALAM, "mlym" ); + putScriptTag ( tm, cm, SCRIPT_MALAYALAM_2, "mlm2" ); + putScriptTag ( tm, cm, SCRIPT_SINHALESE, "sinh" ); + putScriptTag ( tm, cm, SCRIPT_BURMESE, "mymr" ); + putScriptTag ( tm, cm, SCRIPT_THAI, "thai" ); + putScriptTag ( tm, cm, SCRIPT_KHMER, "khmr" ); + putScriptTag ( tm, cm, SCRIPT_LAO, "laoo" ); + putScriptTag ( tm, cm, SCRIPT_HIRAGANA, "hira" ); + putScriptTag ( tm, cm, SCRIPT_ETHIOPIC, "ethi" ); + putScriptTag ( tm, cm, SCRIPT_HAN, "hani" ); + putScriptTag ( tm, cm, SCRIPT_KATAKANA, "kana" ); + putScriptTag ( tm, cm, SCRIPT_MATH, "zmth" ); + putScriptTag ( tm, cm, SCRIPT_SYMBOL, "zsym" ); + putScriptTag ( tm, cm, SCRIPT_UNDETERMINED, "zyyy" ); + putScriptTag ( tm, cm, SCRIPT_UNCODED, "zzzz" ); + scriptTagsMap = tm; + scriptCodeMap = cm; + } + + private static Map<Integer,String> getScriptTagsMap() { + if ( scriptTagsMap == null ) { + makeScriptMaps(); + } + return scriptTagsMap; + } + + private static Map<String,Integer> getScriptCodeMap() { + if ( scriptCodeMap == null ) { + makeScriptMaps(); + } + return scriptCodeMap; + } + +} diff --git a/src/java/org/apache/fop/complexscripts/util/DiscontinuousAssociationException.java b/src/java/org/apache/fop/complexscripts/util/DiscontinuousAssociationException.java new file mode 100644 index 000000000..daade8ca6 --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/util/DiscontinuousAssociationException.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.complexscripts.util; + +/** + * Exception thrown during when attempting to map glyphs to associated characters + * in the case that the associated characters do not represent a compact interval. + * @author Glenn Adams + */ +public class DiscontinuousAssociationException extends RuntimeException { + /** + * Instantiate discontinuous association exception + */ + public DiscontinuousAssociationException() { + super(); + } + /** + * Instantiate discontinuous association exception + * @param message a message string + */ + public DiscontinuousAssociationException(String message) { + super(message); + } +} diff --git a/src/java/org/apache/fop/complexscripts/util/GlyphContextTester.java b/src/java/org/apache/fop/complexscripts/util/GlyphContextTester.java new file mode 100644 index 000000000..6bdeb2298 --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/util/GlyphContextTester.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.complexscripts.util; + +// CSOFF: LineLengthCheck + +/** + * Interface for testing the originating (source) character context of a glyph sequence. + * @author Glenn Adams + */ +public interface GlyphContextTester { + + /** + * Perform a test on a glyph sequence in a specific (originating) character context. + * @param script governing script + * @param language governing language + * @param feature governing feature + * @param gs glyph sequence to test + * @param index index into glyph sequence to test + * @param flags that apply to lookup in scope + * @return true if test is satisfied + */ + boolean test ( String script, String language, String feature, GlyphSequence gs, int index, int flags ); + +} diff --git a/src/java/org/apache/fop/complexscripts/util/GlyphSequence.java b/src/java/org/apache/fop/complexscripts/util/GlyphSequence.java new file mode 100644 index 000000000..0e256241d --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/util/GlyphSequence.java @@ -0,0 +1,1075 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.complexscripts.util; + +import java.nio.IntBuffer; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.fop.util.CharUtilities; + +// CSOFF: InnerAssignmentCheck +// CSOFF: LineLengthCheck +// CSOFF: WhitespaceAfterCheck +// CSOFF: NoWhitespaceAfterCheck + +/** + * A GlyphSequence encapsulates a sequence of character codes, a sequence of glyph codes, + * and a sequence of character associations, where, for each glyph in the sequence of glyph + * codes, there is a corresponding character association. Character associations server to + * relate the glyph codes in a glyph sequence to the specific characters in an original + * character code sequence with which the glyph codes are associated. + * @author Glenn Adams + */ +public class GlyphSequence implements Cloneable { + + /** default character buffer capacity in case new character buffer is created */ + private static final int DEFAULT_CHARS_CAPACITY = 8; + + /** character buffer */ + private IntBuffer characters; + /** glyph buffer */ + private IntBuffer glyphs; + /** association list */ + private List associations; + /** predications flag */ + private boolean predications; + + /** + * Instantiate a glyph sequence, reusing (i.e., not copying) the referenced + * character and glyph buffers and associations. If characters is null, then + * an empty character buffer is created. If glyphs is null, then a glyph buffer + * is created whose capacity is that of the character buffer. 
If associations is + * null, then identity associations are created. + * @param characters a (possibly null) buffer of associated (originating) characters + * @param glyphs a (possibly null) buffer of glyphs + * @param associations a (possibly null) array of glyph to character associations + * @param predications true if predications are enabled + */ + public GlyphSequence ( IntBuffer characters, IntBuffer glyphs, List associations, boolean predications ) { + if ( characters == null ) { + characters = IntBuffer.allocate ( DEFAULT_CHARS_CAPACITY ); + } + if ( glyphs == null ) { + glyphs = IntBuffer.allocate ( characters.capacity() ); + } + if ( associations == null ) { + associations = makeIdentityAssociations ( characters.limit(), glyphs.limit() ); + } + this.characters = characters; + this.glyphs = glyphs; + this.associations = associations; + this.predications = predications; + } + + /** + * Instantiate a glyph sequence, reusing (i.e., not copying) the referenced + * character and glyph buffers and associations. If characters is null, then + * an empty character buffer is created. If glyphs is null, then a glyph buffer + * is created whose capacity is that of the character buffer. If associations is + * null, then identity associations are created. + * @param characters a (possibly null) buffer of associated (originating) characters + * @param glyphs a (possibly null) buffer of glyphs + * @param associations a (possibly null) array of glyph to character associations + */ + public GlyphSequence ( IntBuffer characters, IntBuffer glyphs, List associations ) { + this ( characters, glyphs, associations, false ); + } + + /** + * Instantiate a glyph sequence using an existing glyph sequence, where the new glyph sequence shares + * the character array of the existing sequence (but not the buffer object), and creates new copies + * of glyphs buffer and association list. 
+ * @param gs an existing glyph sequence + */ + public GlyphSequence ( GlyphSequence gs ) { + this ( gs.characters.duplicate(), copyBuffer ( gs.glyphs ), copyAssociations ( gs.associations ), gs.predications ); + } + + /** + * Instantiate a glyph sequence using an existing glyph sequence, where the new glyph sequence shares + * the character array of the existing sequence (but not the buffer object), but uses the specified + * backtrack, input, and lookahead glyph arrays to populate the glyphs, and uses the specified + * of glyphs buffer and association list. + * backtrack, input, and lookahead association arrays to populate the associations. + * @param gs an existing glyph sequence + * @param bga backtrack glyph array + * @param iga input glyph array + * @param lga lookahead glyph array + * @param bal backtrack association list + * @param ial input association list + * @param lal lookahead association list + */ + public GlyphSequence ( GlyphSequence gs, int[] bga, int[] iga, int[] lga, CharAssociation[] bal, CharAssociation[] ial, CharAssociation[] lal ) { + this ( gs.characters.duplicate(), concatGlyphs ( bga, iga, lga ), concatAssociations ( bal, ial, lal ), gs.predications ); + } + + /** + * Obtain reference to underlying character buffer. + * @return character buffer reference + */ + public IntBuffer getCharacters() { + return characters; + } + + /** + * Obtain array of characters. If <code>copy</code> is true, then + * a newly instantiated array is returned, otherwise a reference to + * the underlying buffer's array is returned. N.B. in case a reference + * to the undelying buffer's array is returned, the length + * of the array is not necessarily the number of characters in array. + * To determine the number of characters, use {@link #getCharacterCount}. 
+ * @param copy true if to return a newly instantiated array of characters + * @return array of characters + */ + public int[] getCharacterArray ( boolean copy ) { + if ( copy ) { + return toArray ( characters ); + } else { + return characters.array(); + } + } + + /** + * Obtain the number of characters in character array, where + * each character constitutes a unicode scalar value. + * @return number of characters available in character array + */ + public int getCharacterCount() { + return characters.limit(); + } + + /** + * Obtain glyph id at specified index. + * @param index to obtain glyph + * @return the glyph identifier of glyph at specified index + * @throws IndexOutOfBoundsException if index is less than zero + * or exceeds last valid position + */ + public int getGlyph ( int index ) throws IndexOutOfBoundsException { + return glyphs.get ( index ); + } + + /** + * Set glyph id at specified index. + * @param index to set glyph + * @param gi glyph index + * @throws IndexOutOfBoundsException if index is greater or equal to + * the limit of the underlying glyph buffer + */ + public void setGlyph ( int index, int gi ) throws IndexOutOfBoundsException { + if ( gi > 65535 ) { + gi = 65535; + } + glyphs.put ( index, gi ); + } + + /** + * Obtain reference to underlying glyph buffer. + * @return glyph buffer reference + */ + public IntBuffer getGlyphs() { + return glyphs; + } + + /** + * Obtain count glyphs starting at offset. If <code>count</code> is + * negative, then it is treated as if the number of available glyphs + * were specified. 
+ * @param offset into glyph sequence + * @param count of glyphs to obtain starting at offset, or negative, + * indicating all avaialble glyphs starting at offset + * @return glyph array + */ + public int[] getGlyphs ( int offset, int count ) { + int ng = getGlyphCount(); + if ( offset < 0 ) { + offset = 0; + } else if ( offset > ng ) { + offset = ng; + } + if ( count < 0 ) { + count = ng - offset; + } + int[] ga = new int [ count ]; + for ( int i = offset, n = offset + count, k = 0; i < n; i++ ) { + if ( k < ga.length ) { + ga [ k++ ] = glyphs.get ( i ); + } + } + return ga; + } + + /** + * Obtain array of glyphs. If <code>copy</code> is true, then + * a newly instantiated array is returned, otherwise a reference to + * the underlying buffer's array is returned. N.B. in case a reference + * to the undelying buffer's array is returned, the length + * of the array is not necessarily the number of glyphs in array. + * To determine the number of glyphs, use {@link #getGlyphCount}. + * @param copy true if to return a newly instantiated array of glyphs + * @return array of glyphs + */ + public int[] getGlyphArray ( boolean copy ) { + if ( copy ) { + return toArray ( glyphs ); + } else { + return glyphs.array(); + } + } + + /** + * Obtain the number of glyphs in glyphs array, where + * each glyph constitutes a font specific glyph index. + * @return number of glyphs available in character array + */ + public int getGlyphCount() { + return glyphs.limit(); + } + + /** + * Obtain association at specified index. + * @param index into associations array + * @return glyph to character associations at specified index + * @throws IndexOutOfBoundsException if index is less than zero + * or exceeds last valid position + */ + public CharAssociation getAssociation ( int index ) throws IndexOutOfBoundsException { + return (CharAssociation) associations.get ( index ); + } + + /** + * Obtain reference to underlying associations list. 
+ * @return associations list + */ + public List getAssociations() { + return associations; + } + + /** + * Obtain count associations starting at offset. + * @param offset into glyph sequence + * @param count of associations to obtain starting at offset, or negative, + * indicating all avaialble associations starting at offset + * @return associations + */ + public CharAssociation[] getAssociations ( int offset, int count ) { + int ng = getGlyphCount(); + if ( offset < 0 ) { + offset = 0; + } else if ( offset > ng ) { + offset = ng; + } + if ( count < 0 ) { + count = ng - offset; + } + CharAssociation[] aa = new CharAssociation [ count ]; + for ( int i = offset, n = offset + count, k = 0; i < n; i++ ) { + if ( k < aa.length ) { + aa [ k++ ] = (CharAssociation) associations.get ( i ); + } + } + return aa; + } + + /** + * Enable or disable predications. + * @param enable true if predications are to be enabled; otherwise false to disable + */ + public void setPredications ( boolean enable ) { + this.predications = enable; + } + + /** + * Obtain predications state. + * @return true if predications are enabled + */ + public boolean getPredications() { + return this.predications; + } + + /** + * Set predication <KEY,VALUE> at glyph sequence OFFSET. + * @param offset offset (index) into glyph sequence + * @param key predication key + * @param value predication value + */ + public void setPredication ( int offset, String key, Object value ) { + if ( predications ) { + CharAssociation[] aa = getAssociations ( offset, 1 ); + CharAssociation ca = aa[0]; + ca.setPredication ( key, value ); + } + } + + /** + * Get predication KEY at glyph sequence OFFSET. 
+ * @param offset offset (index) into glyph sequence + * @param key predication key + * @return predication KEY at OFFSET or null if none exists + */ + public Object getPredication ( int offset, String key ) { + if ( predications ) { + CharAssociation[] aa = getAssociations ( offset, 1 ); + CharAssociation ca = aa[0]; + return ca.getPredication ( key ); + } else { + return null; + } + } + + /** + * Compare glyphs. + * @param gb buffer containing glyph indices with which this glyph sequence's glyphs are to be compared + * @return zero if glyphs are the same, otherwise returns 1 or -1 according to whether this glyph sequence's + * glyphs are lexicographically greater or lesser than the glyphs in the specified string buffer + */ + public int compareGlyphs ( IntBuffer gb ) { + int ng = getGlyphCount(); + for ( int i = 0, n = gb.limit(); i < n; i++ ) { + if ( i < ng ) { + int g1 = glyphs.get ( i ); + int g2 = gb.get ( i ); + if ( g1 > g2 ) { + return 1; + } else if ( g1 < g2 ) { + return -1; + } + } else { + return -1; // this gb is a proper prefix of specified gb + } + } + return 0; // same lengths with no difference + } + + /** {@inheritDoc} */ + public Object clone() { + try { + GlyphSequence gs = (GlyphSequence) super.clone(); + gs.characters = copyBuffer ( characters ); + gs.glyphs = copyBuffer ( glyphs ); + gs.associations = copyAssociations ( associations ); + return gs; + } catch ( CloneNotSupportedException e ) { + return null; + } + } + + /** {@inheritDoc} */ + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append ( '{' ); + sb.append ( "chars = [" ); + sb.append ( characters ); + sb.append ( "], glyphs = [" ); + sb.append ( glyphs ); + sb.append ( "], associations = [" ); + sb.append ( associations ); + sb.append ( "]" ); + sb.append ( '}' ); + return sb.toString(); + } + + /** + * Determine if two arrays of glyphs are identical. 
+ * @param ga1 first glyph array + * @param ga2 second glyph array + * @return true if arrays are botth null or both non-null and have identical elements + */ + public static boolean sameGlyphs ( int[] ga1, int[] ga2 ) { + if ( ga1 == ga2 ) { + return true; + } else if ( ( ga1 == null ) || ( ga2 == null ) ) { + return false; + } else if ( ga1.length != ga2.length ) { + return false; + } else { + for ( int i = 0, n = ga1.length; i < n; i++ ) { + if ( ga1[i] != ga2[i] ) { + return false; + } + } + return true; + } + } + + /** + * Concatenante glyph arrays. + * @param bga backtrack glyph array + * @param iga input glyph array + * @param lga lookahead glyph array + * @return new integer buffer containing concatenated glyphs + */ + public static IntBuffer concatGlyphs ( int[] bga, int[] iga, int[] lga ) { + int ng = 0; + if ( bga != null ) { + ng += bga.length; + } + if ( iga != null ) { + ng += iga.length; + } + if ( lga != null ) { + ng += lga.length; + } + IntBuffer gb = IntBuffer.allocate ( ng ); + if ( bga != null ) { + gb.put ( bga ); + } + if ( iga != null ) { + gb.put ( iga ); + } + if ( lga != null ) { + gb.put ( lga ); + } + gb.flip(); + return gb; + } + + /** + * Concatenante association arrays. 
+ * @param baa backtrack association array + * @param iaa input association array + * @param laa lookahead association array + * @return new list containing concatenated associations + */ + public static List concatAssociations ( CharAssociation[] baa, CharAssociation[] iaa, CharAssociation[] laa ) { + int na = 0; + if ( baa != null ) { + na += baa.length; + } + if ( iaa != null ) { + na += iaa.length; + } + if ( laa != null ) { + na += laa.length; + } + if ( na > 0 ) { + List gl = new ArrayList ( na ); + if ( baa != null ) { + for ( int i = 0; i < baa.length; i++ ) { + gl.add ( baa[i] ); + } + } + if ( iaa != null ) { + for ( int i = 0; i < iaa.length; i++ ) { + gl.add ( iaa[i] ); + } + } + if ( laa != null ) { + for ( int i = 0; i < laa.length; i++ ) { + gl.add ( laa[i] ); + } + } + return gl; + } else { + return null; + } + } + + /** + * Join (concatenate) glyph sequences. + * @param gs original glyph sequence from which to reuse character array reference + * @param sa array of glyph sequences, whose glyph arrays and association lists are to be concatenated + * @return new glyph sequence referring to character array of GS and concatenated glyphs and associations of SA + */ + public static GlyphSequence join ( GlyphSequence gs, GlyphSequence[] sa ) { + assert sa != null; + int tg = 0; + int ta = 0; + for ( int i = 0, n = sa.length; i < n; i++ ) { + GlyphSequence s = sa [ i ]; + IntBuffer ga = s.getGlyphs(); + assert ga != null; + int ng = ga.limit(); + List al = s.getAssociations(); + assert al != null; + int na = al.size(); + assert na == ng; + tg += ng; + ta += na; + } + IntBuffer uga = IntBuffer.allocate ( tg ); + ArrayList ual = new ArrayList ( ta ); + for ( int i = 0, n = sa.length; i < n; i++ ) { + GlyphSequence s = sa [ i ]; + uga.put ( s.getGlyphs() ); + ual.addAll ( s.getAssociations() ); + } + return new GlyphSequence ( gs.getCharacters(), uga, ual, gs.getPredications() ); + } + + /** + * Reorder sequence such that [SOURCE,SOURCE+COUNT) is moved just 
prior to TARGET. + * @param gs input sequence + * @param source index of sub-sequence to reorder + * @param count length of sub-sequence to reorder + * @param target index to which source sub-sequence is to be moved + * @return reordered sequence (or original if no reordering performed) + */ + public static GlyphSequence reorder ( GlyphSequence gs, int source, int count, int target ) { + if ( source != target ) { + int ng = gs.getGlyphCount(); + int[] ga = gs.getGlyphArray ( false ); + int[] nga = new int [ ng ]; + GlyphSequence.CharAssociation[] aa = gs.getAssociations ( 0, ng ); + GlyphSequence.CharAssociation[] naa = new GlyphSequence.CharAssociation [ ng ]; + if ( source < target ) { + int t = 0; + for ( int s = 0, e = source; s < e; s++, t++ ) { + nga[t] = ga[s]; + naa[t] = aa[s]; + } + for ( int s = source + count, e = target; s < e; s++, t++ ) { + nga[t] = ga[s]; + naa[t] = aa[s]; + } + for ( int s = source, e = source + count; s < e; s++, t++ ) { + nga[t] = ga[s]; + naa[t] = aa[s]; + } + for ( int s = target, e = ng; s < e; s++, t++ ) { + nga[t] = ga[s]; + naa[t] = aa[s]; + } + } else { + int t = 0; + for ( int s = 0, e = target; s < e; s++, t++ ) { + nga[t] = ga[s]; + naa[t] = aa[s]; + } + for ( int s = source, e = source + count; s < e; s++, t++ ) { + nga[t] = ga[s]; + naa[t] = aa[s]; + } + for ( int s = target, e = source; s < e; s++, t++ ) { + nga[t] = ga[s]; + naa[t] = aa[s]; + } + for ( int s = source + count, e = ng; s < e; s++, t++ ) { + nga[t] = ga[s]; + naa[t] = aa[s]; + } + } + return new GlyphSequence ( gs, null, nga, null, null, naa, null ); + } else { + return gs; + } + } + + private static int[] toArray ( IntBuffer ib ) { + if ( ib != null ) { + int n = ib.limit(); + int[] ia = new int[n]; + ib.get ( ia, 0, n ); + return ia; + } else { + return new int[0]; + } + } + + private static List makeIdentityAssociations ( int numChars, int numGlyphs ) { + int nc = numChars; + int ng = numGlyphs; + List av = new ArrayList ( ng ); + for ( int i = 0, n 
= ng; i < n; i++ ) { + int k = ( i > nc ) ? nc : i; + av.add ( new CharAssociation ( i, ( k == nc ) ? 0 : 1 ) ); + } + return av; + } + + private static IntBuffer copyBuffer ( IntBuffer ib ) { + if ( ib != null ) { + int[] ia = new int [ ib.capacity() ]; + int p = ib.position(); + int l = ib.limit(); + System.arraycopy ( ib.array(), 0, ia, 0, ia.length ); + return IntBuffer.wrap ( ia, p, l - p ); + } else { + return null; + } + } + + private static List copyAssociations ( List ca ) { + if ( ca != null ) { + return new ArrayList ( ca ); + } else { + return ca; + } + } + + /** + * A structure class encapsulating an interval of characters + * expressed as an offset and count of Unicode scalar values (in + * an IntBuffer). A <code>CharAssociation</code> is used to + * maintain a backpointer from a glyph to one or more character + * intervals from which the glyph was derived. + * + * Each glyph in a glyph sequence is associated with a single + * <code>CharAssociation</code> instance. + * + * A <code>CharAssociation</code> instance is additionally (and + * optionally) used to record predication information about the + * glyph, such as whether the glyph was produced by the + * application of a specific substitution table or whether its + * position was adjusted by a specific poisitioning table. + */ + public static class CharAssociation implements Cloneable { + + // instance state + private final int offset; + private final int count; + private final int[] subIntervals; + private Map<String,Object> predications; + + // class state + private static volatile Map<String,PredicationMerger> predicationMergers; + + interface PredicationMerger { + Object merge ( String key, Object v1, Object v2 ); + } + + /** + * Instantiate a character association. 
+ * @param offset into array of Unicode scalar values (in associated IntBuffer) + * @param count of Unicode scalar values (in associated IntBuffer) + * @param subIntervals if disjoint, then array of sub-intervals, otherwise null; even + * members of array are sub-interval starts, and odd members are sub-interval + * ends (exclusive) + */ + public CharAssociation ( int offset, int count, int[] subIntervals ) { + this.offset = offset; + this.count = count; + this.subIntervals = ( ( subIntervals != null ) && ( subIntervals.length > 2 ) ) ? subIntervals : null; + } + + /** + * Instantiate a non-disjoint character association. + * @param offset into array of UTF-16 code elements (in associated CharSequence) + * @param count of UTF-16 character code elements (in associated CharSequence) + */ + public CharAssociation ( int offset, int count ) { + this ( offset, count, null ); + } + + /** + * Instantiate a non-disjoint character association. + * @param subIntervals if disjoint, then array of sub-intervals, otherwise null; even + * members of array are sub-interval starts, and odd members are sub-interval + * ends (exclusive) + */ + public CharAssociation ( int[] subIntervals ) { + this ( getSubIntervalsStart ( subIntervals ), getSubIntervalsLength ( subIntervals ), subIntervals ); + } + + /** @return offset (start of association interval) */ + public int getOffset() { + return offset; + } + + /** @return count (number of characer codes in association) */ + public int getCount() { + return count; + } + + /** @return start of association interval */ + public int getStart() { + return getOffset(); + } + + /** @return end of association interval */ + public int getEnd() { + return getOffset() + getCount(); + } + + /** @return true if association is disjoint */ + public boolean isDisjoint() { + return subIntervals != null; + } + + /** @return subintervals of disjoint association */ + public int[] getSubIntervals() { + return subIntervals; + } + + /** @return count of 
subintervals of disjoint association */ + public int getSubIntervalCount() { + return ( subIntervals != null ) ? ( subIntervals.length / 2 ) : 0; + } + + /** + * @param offset of interval in sequence + * @param count length of interval + * @return true if this association is contained within [offset,offset+count) + */ + public boolean contained ( int offset, int count ) { + int s = offset; + int e = offset + count; + if ( ! isDisjoint() ) { + int s0 = getStart(); + int e0 = getEnd(); + return ( s0 >= s ) && ( e0 <= e ); + } else { + int ns = getSubIntervalCount(); + for ( int i = 0; i < ns; i++ ) { + int s0 = subIntervals [ 2 * i + 0 ]; + int e0 = subIntervals [ 2 * i + 1 ]; + if ( ( s0 >= s ) && ( e0 <= e ) ) { + return true; + } + } + return false; + } + } + + /** + * Set predication <KEY,VALUE>. + * @param key predication key + * @param value predication value + */ + public void setPredication ( String key, Object value ) { + if ( predications == null ) { + predications = new HashMap<String,Object>(); + } + if ( predications != null ) { + predications.put ( key, value ); + } + } + + /** + * Get predication KEY. + * @param key predication key + * @return predication KEY at OFFSET or null if none exists + */ + public Object getPredication ( String key ) { + if ( predications != null ) { + return predications.get ( key ); + } else { + return null; + } + } + + /** + * Merge predication <KEY,VALUE>. + * @param key predication key + * @param value predication value + */ + public void mergePredication ( String key, Object value ) { + if ( predications == null ) { + predications = new HashMap<String,Object>(); + } + if ( predications != null ) { + if ( predications.containsKey ( key ) ) { + Object v1 = predications.get ( key ); + Object v2 = value; + predications.put ( key, mergePredicationValues ( key, v1, v2 ) ); + } else { + predications.put ( key, value ); + } + } + } + + /** + * Merge predication values V1 and V2 on KEY. 
Uses registered <code>PredicationMerger</code> + * if one exists, otherwise uses V2 if non-null, otherwise uses V1. + * @param key predication key + * @param v1 first (original) predication value + * @param v2 second (to be merged) predication value + * @return merged value + */ + public static Object mergePredicationValues ( String key, Object v1, Object v2 ) { + PredicationMerger pm = getPredicationMerger ( key ); + if ( pm != null ) { + return pm.merge ( key, v1, v2 ); + } else if ( v2 != null ) { + return v2; + } else { + return v1; + } + } + + /** + * Merge predications from another CA. + * @param ca from which to merge + */ + public void mergePredications ( CharAssociation ca ) { + if ( ca.predications != null ) { + for ( Map.Entry<String,Object> e : ca.predications.entrySet() ) { + mergePredication ( e.getKey(), e.getValue() ); + } + } + } + + /** {@inheritDoc} */ + public Object clone() { + try { + CharAssociation ca = (CharAssociation) super.clone(); + if ( predications != null ) { + ca.predications = new HashMap<String,Object> ( predications ); + } + return ca; + } catch ( CloneNotSupportedException e ) { + return null; + } + } + + /** + * Register predication merger PM for KEY. + * @param key for predication merger + * @param pm predication merger + */ + public static void setPredicationMerger ( String key, PredicationMerger pm ) { + if ( predicationMergers == null ) { + predicationMergers = new HashMap<String,PredicationMerger>(); + } + if ( predicationMergers != null ) { + predicationMergers.put ( key, pm ); + } + } + + /** + * Obtain predication merger for KEY. + * @param key for predication merger + * @return predication merger or null if none exists + */ + public static PredicationMerger getPredicationMerger ( String key ) { + if ( predicationMergers != null ) { + return predicationMergers.get ( key ); + } else { + return null; + } + } + + /** + * Replicate association to form <code>repeat</code> new associations. 
+ * @param a association to replicate + * @param repeat count + * @return array of replicated associations + */ + public static CharAssociation[] replicate ( CharAssociation a, int repeat ) { + CharAssociation[] aa = new CharAssociation [ repeat ]; + for ( int i = 0, n = aa.length; i < n; i++ ) { + aa [ i ] = (CharAssociation) a.clone(); + } + return aa; + } + + /** + * Join (merge) multiple associations into a single, potentially disjoint + * association. + * @param aa array of associations to join + * @return (possibly disjoint) association containing joined associations + */ + public static CharAssociation join ( CharAssociation[] aa ) { + CharAssociation ca; + // extract sorted intervals + int[] ia = extractIntervals ( aa ); + if ( ( ia == null ) || ( ia.length == 0 ) ) { + ca = new CharAssociation ( 0, 0 ); + } else if ( ia.length == 2 ) { + int s = ia[0]; + int e = ia[1]; + ca = new CharAssociation ( s, e - s ); + } else { + ca = new CharAssociation ( mergeIntervals ( ia ) ); + } + return mergePredicates ( ca, aa ); + } + + private static CharAssociation mergePredicates ( CharAssociation ca, CharAssociation[] aa ) { + for ( CharAssociation a : aa ) { + ca.mergePredications ( a ); + } + return ca; + } + + private static int getSubIntervalsStart ( int[] ia ) { + int us = Integer.MAX_VALUE; + int ue = Integer.MIN_VALUE; + if ( ia != null ) { + for ( int i = 0, n = ia.length; i < n; i += 2 ) { + int s = ia [ i + 0 ]; + int e = ia [ i + 1 ]; + if ( s < us ) { + us = s; + } + if ( e > ue ) { + ue = e; + } + } + if ( ue < 0 ) { + ue = 0; + } + if ( us > ue ) { + us = ue; + } + } + return us; + } + + private static int getSubIntervalsLength ( int[] ia ) { + int us = Integer.MAX_VALUE; + int ue = Integer.MIN_VALUE; + if ( ia != null ) { + for ( int i = 0, n = ia.length; i < n; i += 2 ) { + int s = ia [ i + 0 ]; + int e = ia [ i + 1 ]; + if ( s < us ) { + us = s; + } + if ( e > ue ) { + ue = e; + } + } + if ( ue < 0 ) { + ue = 0; + } + if ( us > ue ) { + us = ue; + } + 
} + return ue - us; + } + + /** + * Extract sorted sub-intervals. + */ + private static int[] extractIntervals ( CharAssociation[] aa ) { + int ni = 0; + for ( int i = 0, n = aa.length; i < n; i++ ) { + CharAssociation a = aa [ i ]; + if ( a.isDisjoint() ) { + ni += a.getSubIntervalCount(); + } else { + ni += 1; + } + } + int[] sa = new int [ ni ]; + int[] ea = new int [ ni ]; + for ( int i = 0, k = 0; i < aa.length; i++ ) { + CharAssociation a = aa [ i ]; + if ( a.isDisjoint() ) { + int[] da = a.getSubIntervals(); + for ( int j = 0; j < da.length; j += 2 ) { + sa [ k ] = da [ j + 0 ]; + ea [ k ] = da [ j + 1 ]; + k++; + } + } else { + sa [ k ] = a.getStart(); + ea [ k ] = a.getEnd(); + k++; + } + } + return sortIntervals ( sa, ea ); + } + + private static final int[] sortIncrements16 // CSOK: ConstantNameCheck + = { 1391376, 463792, 198768, 86961, 33936, 13776, 4592, 1968, 861, 336, 112, 48, 21, 7, 3, 1 }; + + private static final int[] sortIncrements03 // CSOK: ConstantNameCheck + = { 7, 3, 1 }; + + /** + * Sort sub-intervals using modified Shell Sort. + */ + private static int[] sortIntervals ( int[] sa, int[] ea ) { + assert sa != null; + assert ea != null; + assert sa.length == ea.length; + int ni = sa.length; + int[] incr = ( ni < 21 ) ? sortIncrements03 : sortIncrements16; + for ( int k = 0; k < incr.length; k++ ) { + for ( int h = incr [ k ], i = h, n = ni, j; i < n; i++ ) { + int s1 = sa [ i ]; + int e1 = ea [ i ]; + for ( j = i; j >= h; j -= h) { + int s2 = sa [ j - h ]; + int e2 = ea [ j - h ]; + if ( s2 > s1 ) { + sa [ j ] = s2; + ea [ j ] = e2; + } else if ( ( s2 == s1 ) && ( e2 > e1 ) ) { + sa [ j ] = s2; + ea [ j ] = e2; + } else { + break; + } + } + sa [ j ] = s1; + ea [ j ] = e1; + } + } + int[] ia = new int [ ni * 2 ]; + for ( int i = 0; i < ni; i++ ) { + ia [ ( i * 2 ) + 0 ] = sa [ i ]; + ia [ ( i * 2 ) + 1 ] = ea [ i ]; + } + return ia; + } + + /** + * Merge overlapping and abutting sub-intervals. 
+ */ + private static int[] mergeIntervals ( int[] ia ) { + int ni = ia.length; + int i, n, nm, is, ie; + // count merged sub-intervals + for ( i = 0, n = ni, nm = 0, is = ie = -1; i < n; i += 2 ) { + int s = ia [ i + 0 ]; + int e = ia [ i + 1 ]; + if ( ( ie < 0 ) || ( s > ie ) ) { + is = s; + ie = e; + nm++; + } else if ( s >= is ) { + if ( e > ie ) { + ie = e; + } + } + } + int[] mi = new int [ nm * 2 ]; + // populate merged sub-intervals + for ( i = 0, n = ni, nm = 0, is = ie = -1; i < n; i += 2 ) { + int s = ia [ i + 0 ]; + int e = ia [ i + 1 ]; + int k = nm * 2; + if ( ( ie < 0 ) || ( s > ie ) ) { + is = s; + ie = e; + mi [ k + 0 ] = is; + mi [ k + 1 ] = ie; + nm++; + } else if ( s >= is ) { + if ( e > ie ) { + ie = e; + } + mi [ k - 1 ] = ie; + } + } + return mi; + } + + } + +} diff --git a/src/java/org/apache/fop/complexscripts/util/GlyphTester.java b/src/java/org/apache/fop/complexscripts/util/GlyphTester.java new file mode 100644 index 000000000..48d0444a0 --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/util/GlyphTester.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Interface for testing glyph properties according to glyph identifier.
 * @author Glenn Adams
 */
public interface GlyphTester {

    /**
     * Perform a test on a glyph identifier.
     * @param gi glyph identifier
     * @param flags that apply to lookup in scope
     * @return true if test is satisfied
     */
    boolean test ( int gi, int flags );

}
+ * + * This algorithm differs from that specified in XSLT 1.0 in the following + * ways: + * <ul> + * <li>input numbers are greater than or equal to zero rather than greater than zero;</li> + * <li>introduces format tokens { w, W, Ww };</li> + * <li>introduces ordinal parameter to generate ordinal numbers;</li> + * </ul> + * + * Implementation Defaults and Limitations + * <ul> + * <li>If language parameter is unspecified (null or empty string), then the value + * of DEFAULT_LANGUAGE is used, which is defined below as "eng" (English).</li> + * <li>Only English, French, and Spanish word numerals are supported, and only if less than one trillion (1,000,000,000,000).</li> + * <li>Ordinal word numerals are supported for French and Spanish only when less than or equal to ten (10).</li> + * </ul> + * + * Implementation Notes + * <ul> + * <li>In order to handle format tokens outside the Unicode BMP, all processing is + * done in Unicode Scalar Values represented with Integer and Integer[] + * types. Without affecting behavior, this may be subsequently optimized to + * use int and int[] types.</li> + * <li>In order to communicate various sub-parameters, including ordinalization, a <em>features</em> string + * is employed, which consists of comma-separated name and optional value tokens, where name and value + * are separated by an equals '=' sign.</li> + * <li>Ordinal numbers are selected by specifying a word based format token in combination with an 'ordinal' feature with no value, in which case + * the features 'male' and 'female' may be used to specify gender for gender sensitive languages. 
For example, the feature string "ordinal,female" + * selects female ordinals.</li> + * </ul> + * + * @author Glenn Adams + */ +public class NumberConverter { + + /** alphabetical */ + public static final int LETTER_VALUE_ALPHABETIC = 1; + /** traditional */ + public static final int LETTER_VALUE_TRADITIONAL = 2; + + /** no token type */ + private static final int TOKEN_NONE = 0; + /** alhphanumeric token type */ + private static final int TOKEN_ALPHANUMERIC = 1; + /** nonalphanumeric token type */ + private static final int TOKEN_NONALPHANUMERIC = 2; + /** default token */ + private static final Integer[] DEFAULT_TOKEN = new Integer[] { (int) '1' }; + /** default separator */ + private static final Integer[] DEFAULT_SEPARATOR = new Integer[] { (int) '.' }; + /** default language */ + private static final String DEFAULT_LANGUAGE = "eng"; + + /** prefix token */ + private Integer[] prefix; + /** suffix token */ + private Integer[] suffix; + /** sequence of tokens, as parsed from format */ + private Integer[][] tokens; + /** sequence of separators, as parsed from format */ + private Integer[][] separators; + /** grouping separator */ + private int groupingSeparator; + /** grouping size */ + private int groupingSize; + /** letter value */ + private int letterValue; + /** letter value system */ + private String features; + /** language */ + private String language; + /** country */ + private String country; + + /** + * Construct parameterized number converter. 
/**
 * Construct parameterized number converter.
 * @param format format for the page number (may be null or empty, which is treated as null)
 * @param groupingSeparator grouping separator (if zero, then no grouping separator applies)
 * @param groupingSize grouping size (if zero or negative, then no grouping size applies)
 * @param letterValue letter value (must be one of the above letter value enumeration values)
 * @param features features (feature sub-parameters)
 * @param language language code (may be null); stored in lower case
 * @param country country code (may be null); stored in lower case
 * @throws IllegalArgumentException if format is not a valid UTF-16 string (e.g., has unpaired surrogate)
 */
public NumberConverter(String format, int groupingSeparator, int groupingSize, int letterValue, String features, String language, String country)
    throws IllegalArgumentException {
    this.groupingSeparator = groupingSeparator;
    this.groupingSize = groupingSize;
    this.letterValue = letterValue;
    this.features = features;
    // Language and country codes are ASCII identifiers: lower-case them with a
    // locale-neutral mapping so that, e.g., "FI" does not become "f\u0131" when
    // the JVM default locale is Turkish, which would defeat later code matching.
    this.language = (language != null) ? language.toLowerCase(java.util.Locale.ROOT) : null;
    this.country = (country != null) ? country.toLowerCase(java.util.Locale.ROOT) : null;
    parseFormatTokens(format);
}
+ * @param numbers list of numbers to convert + * @return string representing converted list of numbers + */ + public String convert ( List<Long> numbers ) { + List<Integer> scalars = new ArrayList<Integer>(); + if ( prefix != null ) { + appendScalars ( scalars, prefix ); + } + convertNumbers ( scalars, numbers ); + if ( suffix != null ) { + appendScalars ( scalars, suffix ); + } + return scalarsToString ( scalars ); + } + + private void parseFormatTokens ( String format ) throws IllegalArgumentException { + List<Integer[]> tokens = new ArrayList<Integer[]>(); + List<Integer[]> separators = new ArrayList<Integer[]>(); + if ( ( format == null ) || ( format.length() == 0 ) ) { + format = "1"; + } + int tokenType = TOKEN_NONE; + List<Integer> token = new ArrayList<Integer>(); + Integer[] ca = UTF32.toUTF32 ( format, 0, true ); + for ( int i = 0, n = ca.length; i < n; i++ ) { + int c = ca[i]; + int tokenTypeNew = isAlphaNumeric ( c ) ? TOKEN_ALPHANUMERIC : TOKEN_NONALPHANUMERIC; + if ( tokenTypeNew != tokenType ) { + if ( token.size() > 0 ) { + if ( tokenType == TOKEN_ALPHANUMERIC ) { + tokens.add ( token.toArray ( new Integer [ token.size() ] ) ); + } else { + separators.add ( token.toArray ( new Integer [ token.size() ] ) ); + } + token.clear(); + } + tokenType = tokenTypeNew; + } + token.add ( c ); + } + if ( token.size() > 0 ) { + if ( tokenType == TOKEN_ALPHANUMERIC ) { + tokens.add ( token.toArray ( new Integer [ token.size() ] ) ); + } else { + separators.add ( token.toArray ( new Integer [ token.size() ] ) ); + } + } + if ( ! separators.isEmpty() ) { + this.prefix = separators.remove ( 0 ); + } + if ( ! 
separators.isEmpty() ) { + this.suffix = separators.remove ( separators.size() - 1 ); + } + this.separators = separators.toArray ( new Integer [ separators.size() ] [] ); + this.tokens = tokens.toArray ( new Integer [ tokens.size() ] [] ); + } + + private static boolean isAlphaNumeric ( int c ) { + switch ( Character.getType ( c ) ) { + case Character.DECIMAL_DIGIT_NUMBER: // Nd + case Character.LETTER_NUMBER: // Nl + case Character.OTHER_NUMBER: // No + case Character.UPPERCASE_LETTER: // Lu + case Character.LOWERCASE_LETTER: // Ll + case Character.TITLECASE_LETTER: // Lt + case Character.MODIFIER_LETTER: // Lm + case Character.OTHER_LETTER: // Lo + return true; + default: + return false; + } + } + + private void convertNumbers ( List<Integer> scalars, List<Long> numbers ) { + Integer[] tknLast = DEFAULT_TOKEN; + int tknIndex = 0; + int tknCount = tokens.length; + int sepIndex = 0; + int sepCount = separators.length; + int numIndex = 0; + for ( Long number : numbers ) { + Integer[] sep = null; + Integer[] tkn; + if ( tknIndex < tknCount ) { + if ( numIndex > 0 ) { + if ( sepIndex < sepCount ) { + sep = separators [ sepIndex++ ]; + } else { + sep = DEFAULT_SEPARATOR; + } + } + tkn = tokens [ tknIndex++ ]; + } else { + tkn = tknLast; + } + appendScalars ( scalars, convertNumber ( number, sep, tkn ) ); + tknLast = tkn; + numIndex++; + } + } + + private Integer[] convertNumber ( long number, Integer[] separator, Integer[] token ) { + List<Integer> sl = new ArrayList<Integer>(); + if ( separator != null ) { + appendScalars ( sl, separator ); + } + if ( token != null ) { + appendScalars ( sl, formatNumber ( number, token ) ); + } + return sl.toArray ( new Integer [ sl.size() ] ); + } + + private Integer[] formatNumber ( long number, Integer[] token ) { + Integer[] fn = null; + assert token.length > 0; + if ( number < 0 ) { + throw new IllegalArgumentException ( "number must be non-negative" ); + } else if ( token.length == 1 ) { + int s = token[0].intValue(); + switch ( 
s ) { + case (int) '1': + { + fn = formatNumberAsDecimal ( number, (int) '1', 1 ); + break; + } + case (int) 'W': + case (int) 'w': + { + fn = formatNumberAsWord ( number, ( s == (int) 'W' ) ? Character.UPPERCASE_LETTER : Character.LOWERCASE_LETTER ); + break; + } + case (int) 'A': // handled as numeric sequence + case (int) 'a': // handled as numeric sequence + case (int) 'I': // handled as numeric special + case (int) 'i': // handled as numeric special + default: + { + if ( isStartOfDecimalSequence ( s ) ) { + fn = formatNumberAsDecimal ( number, s, 1 ); + } else if ( isStartOfAlphabeticSequence ( s ) ) { + fn = formatNumberAsSequence ( number, s, getSequenceBase ( s ), null ); + } else if ( isStartOfNumericSpecial ( s ) ) { + fn = formatNumberAsSpecial ( number, s ); + } else { + fn = null; + } + break; + } + } + } else if ( ( token.length == 2 ) && ( token[0] == (int) 'W' ) && ( token[1] == (int) 'w' ) ) { + fn = formatNumberAsWord ( number, Character.TITLECASE_LETTER ); + } else if ( isPaddedOne ( token ) ) { + int s = token [ token.length - 1 ].intValue(); + fn = formatNumberAsDecimal ( number, s, token.length ); + } else { + throw new IllegalArgumentException ( "invalid format token: \"" + UTF32.fromUTF32 ( token ) + "\"" ); + } + if ( fn == null ) { + fn = formatNumber ( number, DEFAULT_TOKEN ); + } + assert fn != null; + return fn; + } + + /** + * Format NUMBER as decimal using characters denoting digits that start at ONE, + * adding one or more (zero) padding characters as needed to fill out field WIDTH. 
+ * @param number to be formatted + * @param one unicode scalar value denoting numeric value 1 + * @param width non-negative integer denoting field width of number, possible including padding + * @return formatted number as array of unicode scalars + */ + private Integer[] formatNumberAsDecimal ( long number, int one, int width ) { + assert Character.getNumericValue ( one ) == 1; + assert Character.getNumericValue ( one - 1 ) == 0; + assert Character.getNumericValue ( one + 8 ) == 9; + List<Integer> sl = new ArrayList<Integer>(); + int zero = one - 1; + while ( number > 0 ) { + long digit = number % 10; + sl.add ( 0, zero + (int) digit ); + number = number / 10; + } + while ( width > sl.size() ) { + sl.add ( 0, zero ); + } + if ( ( groupingSize != 0 ) && ( groupingSeparator != 0 ) ) { + sl = performGrouping ( sl, groupingSize, groupingSeparator ); + } + return sl.toArray ( new Integer [ sl.size() ] ); + } + + private static List<Integer> performGrouping ( List<Integer> sl, int groupingSize, int groupingSeparator ) { + assert groupingSize > 0; + assert groupingSeparator != 0; + if ( sl.size() > groupingSize ) { + List<Integer> gl = new ArrayList<Integer>(); + for ( int i = 0, n = sl.size(), g = 0; i < n; i++ ) { + int k = n - i - 1; + if ( g == groupingSize ) { + gl.add ( 0, groupingSeparator ); + g = 1; + } else { + g++; + } + gl.add ( 0, sl.get ( k ) ); + } + return gl; + } else { + return sl; + } + } + + + /** + * Format NUMBER as using sequence of characters that start at ONE, and + * having BASE radix. 
+ * @param number to be formatted + * @param one unicode scalar value denoting start of sequence (numeric value 1) + * @param base number of elements in sequence + * @param map if non-null, then maps sequences indices to unicode scalars + * @return formatted number as array of unicode scalars + */ + private Integer[] formatNumberAsSequence ( long number, int one, int base, int[] map ) { + assert base > 1; + assert ( map == null ) || ( map.length >= base ); + List<Integer> sl = new ArrayList<Integer>(); + if ( number == 0 ) { + return null; + } else { + long n = number; + while ( n > 0 ) { + int d = (int) ( ( n - 1 ) % (long) base ); + int s = ( map != null ) ? map [ d ] : ( one + d ); + sl.add ( 0, s ); + n = ( n - 1 ) / base; + } + return sl.toArray ( new Integer [ sl.size() ] ); + } + } + + /** + * Format NUMBER as using special system that starts at ONE. + * @param number to be formatted + * @param one unicode scalar value denoting start of system (numeric value 1) + * @return formatted number as array of unicode scalars + */ + private Integer[] formatNumberAsSpecial ( long number, int one ) { + SpecialNumberFormatter f = getSpecialFormatter ( one, letterValue, features, language, country ); + if ( f != null ) { + return f.format ( number, one, letterValue, features, language, country ); + } else { + return null; + } + } + + /** + * Format NUMBER as word according to TYPE, which must be either + * Character.UPPERCASE_LETTER, Character.LOWERCASE_LETTER, or + * Character.TITLECASE_LETTER. Makes use of this.language to + * determine language of word. 
+ * @param number to be formatted + * @param caseType unicode character type for case conversion + * @return formatted number as array of unicode scalars + */ + private Integer[] formatNumberAsWord ( long number, int caseType ) { + SpecialNumberFormatter f = null; + if ( isLanguage ( "eng" ) ) { + f = new EnglishNumberAsWordFormatter ( caseType ); + } else if ( isLanguage ( "spa" ) ) { + f = new SpanishNumberAsWordFormatter ( caseType ); + } else if ( isLanguage ( "fra" ) ) { + f = new FrenchNumberAsWordFormatter ( caseType ); + } else { + f = new EnglishNumberAsWordFormatter ( caseType ); + } + return f.format ( number, 0, letterValue, features, language, country ); + } + + private boolean isLanguage ( String iso3Code ) { + if ( language == null ) { + return false; + } else if ( language.equals ( iso3Code ) ) { + return true; + } else { + return isSameLanguage ( iso3Code, language ); + } + } + + private static String[][] equivalentLanguages = { + { "eng", "en" }, + { "fra", "fre", "fr" }, + { "spa", "es" }, + }; + + private static boolean isSameLanguage ( String i3c, String lc ) { + for ( String[] el : equivalentLanguages ) { + assert el.length >= 2; + if ( el[0].equals ( i3c ) ) { + for ( int i = 0, n = el.length; i < n; i++ ) { + if ( el[i].equals ( lc ) ) { + return true; + } + } + return false; + } + } + return false; + } + + private static boolean hasFeature ( String features, String feature ) { + if ( features != null ) { + assert feature != null; + assert feature.length() != 0; + String[] fa = features.split(","); + for ( String f : fa ) { + String[] fp = f.split("="); + assert fp.length > 0; + String fn = fp[0]; + String fv = ( fp.length > 1 ) ? 
fp[1] : ""; + if ( fn.equals ( feature ) ) { + return true; + } + } + } + return false; + } + + /* not yet used + private static String getFeatureValue ( String features, String feature ) { + if ( features != null ) { + assert feature != null; + assert feature.length() != 0; + String[] fa = features.split(","); + for ( String f : fa ) { + String[] fp = f.split("="); + assert fp.length > 0; + String fn = fp[0]; + String fv = ( fp.length > 1 ) ? fp[1] : ""; + if ( fn.equals ( feature ) ) { + return fv; + } + } + } + return ""; + } + */ + + private static void appendScalars ( List<Integer> scalars, Integer[] sa ) { + for ( Integer s : sa ) { + scalars.add ( s ); + } + } + + private static String scalarsToString ( List<Integer> scalars ) { + Integer[] sa = scalars.toArray ( new Integer [ scalars.size() ] ); + return UTF32.fromUTF32 ( sa ); + } + + private static boolean isPaddedOne ( Integer[] token ) { + if ( getDecimalValue ( token [ token.length - 1 ] ) != 1 ) { + return false; + } else { + for ( int i = 0, n = token.length - 1; i < n; i++ ) { + if ( getDecimalValue ( token [ i ] ) != 0 ) { + return false; + } + } + return true; + } + } + + private static int getDecimalValue ( Integer scalar ) { + int s = scalar.intValue(); + if ( Character.getType ( s ) == Character.DECIMAL_DIGIT_NUMBER ) { + return Character.getNumericValue ( s ); + } else { + return -1; + } + } + + private static boolean isStartOfDecimalSequence ( int s ) { + return ( Character.getNumericValue ( s ) == 1 ) + && ( Character.getNumericValue ( s - 1 ) == 0 ) + && ( Character.getNumericValue ( s + 8 ) == 9 ); + } + + private static int[][] supportedAlphabeticSequences = { + { 'A', 26 }, // A...Z + { 'a', 26 }, // a...z + }; + + private static boolean isStartOfAlphabeticSequence ( int s ) { + for ( int[] ss : supportedAlphabeticSequences ) { + assert ss.length >= 2; + if ( ss[0] == s ) { + return true; + } + } + return false; + } + + private static int getSequenceBase ( int s ) { + for ( int[] ss : 
// Start scalars of the special numeral systems tested by isStartOfNumericSpecial;
// each row holds a single scalar value, namely the format token character that
// selects the system.
private static int[][] supportedSpecials = {
    { 'I' },                    // latin - uppercase roman numerals
    { 'i' },                    // latin - lowercase roman numerals
    { '\u0391' },               // greek - uppercase isopsephy numerals
    { '\u03B1' },               // greek - lowercase isopsephy numerals
    { '\u05D0' },               // hebrew - gematria numerals
    { '\u0623' },               // arabic - abjadi numerals
    { '\u0627' },               // arabic - either abjadi or hijai alphabetic sequence
    { '\u0E01' },               // thai - default alphabetic sequence
    { '\u3042' },               // kana - hiragana (gojuon) - default alphabetic sequence
    { '\u3044' },               // kana - hiragana (iroha)
    { '\u30A2' },               // kana - katakana (gojuon) - default alphabetic sequence
    { '\u30A4' },               // kana - katakana (iroha)
};
KanaNumeralsFormatter(); + } else if ( one == (int) '\u30A4' ) { + return new KanaNumeralsFormatter(); + } else { + return null; + } + } + + private static Integer[] toUpperCase ( Integer[] sa ) { + assert sa != null; + for ( int i = 0, n = sa.length; i < n; i++ ) { + Integer s = sa [ i ]; + sa [ i ] = Character.toUpperCase ( s ); + } + return sa; + } + + private static Integer[] toLowerCase ( Integer[] sa ) { + assert sa != null; + for ( int i = 0, n = sa.length; i < n; i++ ) { + Integer s = sa [ i ]; + sa [ i ] = Character.toLowerCase ( s ); + } + return sa; + } + + /* not yet used + private static Integer[] toTitleCase ( Integer[] sa ) { + assert sa != null; + if ( sa.length > 0 ) { + sa [ 0 ] = Character.toTitleCase ( sa [ 0 ] ); + } + return sa; + } + */ + + private static List<String> convertWordCase ( List<String> words, int caseType ) { + List<String> wl = new ArrayList<String>(); + for ( String w : words ) { + wl.add ( convertWordCase ( w, caseType ) ); + } + return wl; + } + + private static String convertWordCase ( String word, int caseType ) { + if ( caseType == Character.UPPERCASE_LETTER ) { + return word.toUpperCase(); + } else if ( caseType == Character.LOWERCASE_LETTER ) { + return word.toLowerCase(); + } else if ( caseType == Character.TITLECASE_LETTER ) { + StringBuffer sb = new StringBuffer(); + for ( int i = 0, n = word.length(); i < n; i++ ) { + String s = word.substring ( i, i + 1 ); + if ( i == 0 ) { + sb.append ( s.toUpperCase() ); + } else { + sb.append ( s.toLowerCase() ); + } + } + return sb.toString(); + } else { + return word; + } + } + + private static String joinWords ( List<String> words, String separator ) { + StringBuffer sb = new StringBuffer(); + for ( String w : words ) { + if ( sb.length() > 0 ) { + sb.append ( separator ); + } + sb.append ( w ); + } + return sb.toString(); + } + + /** + * Special number formatter. + */ + interface SpecialNumberFormatter { + /** + * Format number with special numeral system. 
+ * @param number to be formatted + * @param one unicode scalar value denoting numeric value 1 + * @param letterValue letter value (must be one of the above letter value enumeration values) + * @param features features (feature sub-parameters) + * @param language denotes applicable language + * @param country denotes applicable country + * @return formatted number as array of unicode scalars + */ + Integer[] format ( long number, int one, int letterValue, String features, String language, String country ); + } + + /** + * English Word Numerals + */ + private static String[] englishWordOnes = { "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine" }; + private static String[] englishWordTeens = { "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen" }; + private static String[] englishWordTens = { "", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety" }; + private static String[] englishWordOthers = { "hundred", "thousand", "million", "billion" }; + private static String[] englishWordOnesOrd = { "none", "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth" }; + private static String[] englishWordTeensOrd = { "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth" }; + private static String[] englishWordTensOrd = { "", "tenth", "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetith" }; + private static String[] englishWordOthersOrd = { "hundredth", "thousandth", "millionth", "billionth" }; + private static class EnglishNumberAsWordFormatter implements SpecialNumberFormatter { + private int caseType = Character.UPPERCASE_LETTER; + EnglishNumberAsWordFormatter ( int caseType ) { + this.caseType = caseType; + } + @Override + public Integer[] format ( long number, int one, int letterValue, String features, String 
language, String country ) { + List<String> wl = new ArrayList<String>(); + if ( number >= 1000000000000L ) { + return null; + } else { + boolean ordinal = hasFeature ( features, "ordinal" ); + if ( number == 0 ) { + wl.add ( englishWordOnes [ 0 ] ); + } else if ( ordinal && ( number < 10 ) ) { + wl.add ( englishWordOnesOrd [ (int) number ] ); + } else { + int ones = (int) ( number % 1000 ); + int thousands = (int) ( ( number / 1000 ) % 1000 ); + int millions = (int) ( ( number / 1000000 ) % 1000 ); + int billions = (int) ( ( number / 1000000000 ) % 1000 ); + if ( billions > 0 ) { + wl = formatOnesInThousand ( wl, billions ); + if ( ordinal && ( ( number % 1000000000 ) == 0 ) ) { + wl.add ( englishWordOthersOrd[3] ); + } else { + wl.add ( englishWordOthers[3] ); + } + } + if ( millions > 0 ) { + wl = formatOnesInThousand ( wl, millions ); + if ( ordinal && ( ( number % 1000000 ) == 0 ) ) { + wl.add ( englishWordOthersOrd[2] ); + } else { + wl.add ( englishWordOthers[2] ); + } + } + if ( thousands > 0 ) { + wl = formatOnesInThousand ( wl, thousands ); + if ( ordinal && ( ( number % 1000 ) == 0 ) ) { + wl.add ( englishWordOthersOrd[1] ); + } else { + wl.add ( englishWordOthers[1] ); + } + } + if ( ones > 0 ) { + wl = formatOnesInThousand ( wl, ones, ordinal ); + } + } + wl = convertWordCase ( wl, caseType ); + return UTF32.toUTF32 ( joinWords ( wl, " " ), 0, true ); + } + } + private List<String> formatOnesInThousand ( List<String> wl, int number ) { + return formatOnesInThousand ( wl, number, false ); + } + private List<String> formatOnesInThousand ( List<String> wl, int number, boolean ordinal ) { + assert number < 1000; + int ones = number % 10; + int tens = ( number / 10 ) % 10; + int hundreds = ( number / 100 ) % 10; + if ( hundreds > 0 ) { + wl.add ( englishWordOnes [ hundreds ] ); + if ( ordinal && ( ( number % 100 ) == 0 ) ) { + wl.add ( englishWordOthersOrd[0] ); + } else { + wl.add ( englishWordOthers[0] ); + } + } + if ( tens > 0 ) { + if ( tens == 1 ) { + 
if ( ordinal ) { + wl.add ( englishWordTeensOrd [ ones ] ); + } else { + wl.add ( englishWordTeens [ ones ] ); + } + } else { + if ( ordinal && ( ones == 0 ) ) { + wl.add ( englishWordTensOrd [ tens ] ); + } else { + wl.add ( englishWordTens [ tens ] ); + } + if ( ones > 0 ) { + if ( ordinal ) { + wl.add ( englishWordOnesOrd [ ones ] ); + } else { + wl.add ( englishWordOnes [ ones ] ); + } + } + } + } else if ( ones > 0 ) { + if ( ordinal ) { + wl.add ( englishWordOnesOrd [ ones ] ); + } else { + wl.add ( englishWordOnes [ ones ] ); + } + } + return wl; + } + } + + /** + * French Word Numerals + */ + private static String[] frenchWordOnes = { "z\u00e9ro", "un", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf" }; + private static String[] frenchWordTeens = { "dix", "onze", "douze", "treize", "quatorze", "quinze", "seize", "dix-sept", "dix-huit", "dix-neuf" }; + private static String[] frenchWordTens = { "", "dix", "vingt", "trente", "quarante", "cinquante", "soixante", "soixante-dix", "quatre-vingt", "quatre-vingt-dix" }; + private static String[] frenchWordOthers = { "cent", "cents", "mille", "million", "millions", "milliard", "milliards" }; + private static String[] frenchWordOnesOrdMale = { "premier", "deuxi\u00e8me", "troisi\u00e8me", "quatri\u00e8me", "cinqui\u00e8me", "sixi\u00e8me", "septi\u00e8me", "huiti\u00e8me", "neuvi\u00e8me", "dixi\u00e8me" }; + private static String[] frenchWordOnesOrdFemale = { "premi\u00e8re", "deuxi\u00e8me", "troisi\u00e8me", "quatri\u00e8me", "cinqui\u00e8me", "sixi\u00e8me", "septi\u00e8me", "huiti\u00e8me", "neuvi\u00e8me", "dixi\u00e8me" }; + private static class FrenchNumberAsWordFormatter implements SpecialNumberFormatter { + private int caseType = Character.UPPERCASE_LETTER; + FrenchNumberAsWordFormatter ( int caseType ) { + this.caseType = caseType; + } + @Override + public Integer[] format ( long number, int one, int letterValue, String features, String language, String country ) { + List<String> wl = new 
ArrayList<String>(); + if ( number >= 1000000000000L ) { + return null; + } else { + boolean ordinal = hasFeature ( features, "ordinal" ); + if ( number == 0 ) { + wl.add ( frenchWordOnes [ 0 ] ); + } else if ( ordinal && ( number <= 10 ) ) { + boolean female = hasFeature ( features, "female" ); + if ( female ) { + wl.add ( frenchWordOnesOrdFemale [ (int) number ] ); + } else { + wl.add ( frenchWordOnesOrdMale [ (int) number ] ); + } + } else { + int ones = (int) ( number % 1000 ); + int thousands = (int) ( ( number / 1000 ) % 1000 ); + int millions = (int) ( ( number / 1000000 ) % 1000 ); + int billions = (int) ( ( number / 1000000000 ) % 1000 ); + if ( billions > 0 ) { + wl = formatOnesInThousand ( wl, billions ); + if ( billions == 1 ) { + wl.add ( frenchWordOthers[5] ); + } else { + wl.add ( frenchWordOthers[6] ); + } + } + if ( millions > 0 ) { + wl = formatOnesInThousand ( wl, millions ); + if ( millions == 1 ) { + wl.add ( frenchWordOthers[3] ); + } else { + wl.add ( frenchWordOthers[4] ); + } + } + if ( thousands > 0 ) { + if ( thousands > 1 ) { + wl = formatOnesInThousand ( wl, thousands ); + } + wl.add ( frenchWordOthers[2] ); + } + if ( ones > 0 ) { + wl = formatOnesInThousand ( wl, ones ); + } + } + wl = convertWordCase ( wl, caseType ); + return UTF32.toUTF32 ( joinWords ( wl, " " ), 0, true ); + } + } + private List<String> formatOnesInThousand ( List<String> wl, int number ) { + assert number < 1000; + int ones = number % 10; + int tens = ( number / 10 ) % 10; + int hundreds = ( number / 100 ) % 10; + if ( hundreds > 0 ) { + if ( hundreds > 1 ) { + wl.add ( frenchWordOnes [ hundreds ] ); + } + if ( ( hundreds > 1 ) && ( tens == 0 ) && ( ones == 0 ) ) { + wl.add ( frenchWordOthers[1] ); + } else { + wl.add ( frenchWordOthers[0] ); + } + } + if ( tens > 0 ) { + if ( tens == 1 ) { + wl.add ( frenchWordTeens [ ones ] ); + } else if ( tens < 7 ) { + if ( ones == 1 ) { + wl.add ( frenchWordTens [ tens ] ); + wl.add ( "et" ); + wl.add ( frenchWordOnes [ 
ones ] ); + } else { + StringBuffer sb = new StringBuffer(); + sb.append ( frenchWordTens [ tens ] ); + if ( ones > 0 ) { + sb.append ( '-' ); + sb.append ( frenchWordOnes [ ones ] ); + } + wl.add ( sb.toString() ); + } + } else if ( tens == 7 ) { + if ( ones == 1 ) { + wl.add ( frenchWordTens [ 6 ] ); + wl.add ( "et" ); + wl.add ( frenchWordTeens [ ones ] ); + } else { + StringBuffer sb = new StringBuffer(); + sb.append ( frenchWordTens [ 6 ] ); + sb.append ( '-' ); + sb.append ( frenchWordTeens [ ones ] ); + wl.add ( sb.toString() ); + } + } else if ( tens == 8 ) { + StringBuffer sb = new StringBuffer(); + sb.append ( frenchWordTens [ tens ] ); + if ( ones > 0 ) { + sb.append ( '-' ); + sb.append ( frenchWordOnes [ ones ] ); + } else { + sb.append ( 's' ); + } + wl.add ( sb.toString() ); + } else if ( tens == 9 ) { + StringBuffer sb = new StringBuffer(); + sb.append ( frenchWordTens [ 8 ] ); + sb.append ( '-' ); + sb.append ( frenchWordTeens [ ones ] ); + wl.add ( sb.toString() ); + } + } else if ( ones > 0 ) { + wl.add ( frenchWordOnes [ ones ] ); + } + return wl; + } + } + + /** + * Spanish Word Numerals + */ + private static String[] spanishWordOnes = { "cero", "uno", "dos", "tres", "cuatro", "cinco", "seise", "siete", "ocho", "nueve" }; + private static String[] spanishWordTeens = { "diez", "once", "doce", "trece", "catorce", "quince", "diecis\u00e9is", "diecisiete", "dieciocho", "diecinueve" }; + private static String[] spanishWordTweens = { "veinte", "veintiuno", "veintid\u00f3s", "veintitr\u00e9s", "veinticuatro", "veinticinco", "veintis\u00e9is", "veintisiete", "veintiocho", "veintinueve" }; + private static String[] spanishWordTens = { "", "diez", "veinte", "treinta", "cuarenta", "cincuenta", "sesenta", "setenta", "ochenta", "noventa" }; + private static String[] spanishWordHundreds = { "", "ciento", "doscientos", "trescientos", "cuatrocientos", "quinientos", "seiscientos", "setecientos", "ochocientos", "novecientos" }; + private static String[] 
spanishWordOthers = { "un", "cien", "mil", "mill\u00f3n", "millones" }; + private static String[] spanishWordOnesOrdMale = { "ninguno", "primero", "segundo", "tercero", "cuarto", "quinto", "sexto", "s\u00e9ptimo", "octavo", "novento", "d\u00e9cimo" }; + private static String[] spanishWordOnesOrdFemale = { "ninguna", "primera", "segunda", "tercera", "cuarta", "quinta", "sexta", "s\u00e9ptima", "octava", "noventa", "d\u00e9cima" }; + private static class SpanishNumberAsWordFormatter implements SpecialNumberFormatter { + private int caseType = Character.UPPERCASE_LETTER; + SpanishNumberAsWordFormatter ( int caseType ) { + this.caseType = caseType; + } + @Override + public Integer[] format ( long number, int one, int letterValue, String features, String language, String country ) { + List<String> wl = new ArrayList<String>(); + if ( number >= 1000000000000L ) { + return null; + } else { + boolean ordinal = hasFeature ( features, "ordinal" ); + if ( number == 0 ) { + wl.add ( spanishWordOnes [ 0 ] ); + } else if ( ordinal && ( number <= 10 ) ) { + boolean female = hasFeature ( features, "female" ); + if ( female ) { + wl.add ( spanishWordOnesOrdFemale [ (int) number ] ); + } else { + wl.add ( spanishWordOnesOrdMale [ (int) number ] ); + } + } else { + int ones = (int) ( number % 1000 ); + int thousands = (int) ( ( number / 1000 ) % 1000 ); + int millions = (int) ( ( number / 1000000 ) % 1000 ); + int billions = (int) ( ( number / 1000000000 ) % 1000 ); + if ( billions > 0 ) { + if ( billions > 1 ) { + wl = formatOnesInThousand ( wl, billions ); + } + wl.add ( spanishWordOthers[2] ); + wl.add ( spanishWordOthers[4] ); + } + if ( millions > 0 ) { + if ( millions == 1 ) { + wl.add ( spanishWordOthers[0] ); + } else { + wl = formatOnesInThousand ( wl, millions ); + } + if ( millions > 1 ) { + wl.add ( spanishWordOthers[4] ); + } else { + wl.add ( spanishWordOthers[3] ); + } + } + if ( thousands > 0 ) { + if ( thousands > 1 ) { + wl = formatOnesInThousand ( wl, thousands ); 
+ } + wl.add ( spanishWordOthers[2] ); + } + if ( ones > 0 ) { + wl = formatOnesInThousand ( wl, ones ); + } + } + wl = convertWordCase ( wl, caseType ); + return UTF32.toUTF32 ( joinWords ( wl, " " ), 0, true ); + } + } + private List<String> formatOnesInThousand ( List<String> wl, int number ) { + assert number < 1000; + int ones = number % 10; + int tens = ( number / 10 ) % 10; + int hundreds = ( number / 100 ) % 10; + if ( hundreds > 0 ) { + if ( ( hundreds == 1 ) && ( tens == 0 ) && ( ones == 0 ) ) { + wl.add ( spanishWordOthers[1] ); + } else { + wl.add ( spanishWordHundreds [ hundreds ] ); + } + } + if ( tens > 0 ) { + if ( tens == 1 ) { + wl.add ( spanishWordTeens [ ones ] ); + } else if ( tens == 2 ) { + wl.add ( spanishWordTweens [ ones ] ); + } else { + wl.add ( spanishWordTens [ tens ] ); + if ( ones > 0 ) { + wl.add ( "y" ); + wl.add ( spanishWordOnes [ ones ] ); + } + } + } else if ( ones > 0 ) { + wl.add ( spanishWordOnes [ ones ] ); + } + return wl; + } + } + + /** + * Roman (Latin) Numerals + */ + private static int[] romanMapping = { + 100000, + 90000, + 50000, + 40000, + 10000, + 9000, + 5000, + 4000, + 1000, + 900, + 500, + 400, + 100, + 90, + 50, + 40, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1 + }; + private static String[] romanStandardForms = { + null, + null, + null, + null, + null, + null, + null, + null, + "m", + "cm", + "d", + "cd", + "c", + "xc", + "l", + "xl", + "x", + "ix", + null, + null, + null, + "v", + "iv", + null, + null, + "i" + }; + private static String[] romanLargeForms = { + "\u2188", + "\u2182\u2188", + "\u2187", + "\u2182\u2187", + "\u2182", + "\u2180\u2182", + "\u2181", + "\u2180\u2181", + "m", + "cm", + "d", + "cd", + "c", + "xc", + "l", + "xl", + "x", + "ix", + null, + null, + null, + "v", + "iv", + null, + null, + "i" + }; + private static String[] romanNumberForms = { + "\u2188", + "\u2182\u2188", + "\u2187", + "\u2182\u2187", + "\u2182", + "\u2180\u2182", + "\u2181", + "\u2180\u2181", + "\u216F", + 
"\u216D\u216F", + "\u216E", + "\u216D\u216E", + "\u216D", + "\u2169\u216D", + "\u216C", + "\u2169\u216C", + "\u2169", + "\u2168", + "\u2167", + "\u2166", + "\u2165", + "\u2164", + "\u2163", + "\u2162", + "\u2161", + "\u2160" + }; + private static class RomanNumeralsFormatter implements SpecialNumberFormatter { + @Override + public Integer[] format ( long number, int one, int letterValue, String features, String language, String country ) { + List<Integer> sl = new ArrayList<Integer>(); + if ( number == 0 ) { + return null; + } else { + String[] forms; + int maxNumber; + if ( hasFeature ( features, "unicode-number-forms" ) ) { + forms = romanNumberForms; + maxNumber = 199999; + } else if ( hasFeature ( features, "large" ) ) { + forms = romanLargeForms; + maxNumber = 199999; + } else { + forms = romanStandardForms; + maxNumber = 4999; + } + if ( number > maxNumber ) { + return null; + } else { + while ( number > 0 ) { + for ( int i = 0, n = romanMapping.length; i < n; i++ ) { + int d = romanMapping [ i ]; + if ( ( number >= d ) && ( forms [ i ] != null ) ) { + appendScalars ( sl, UTF32.toUTF32 ( forms [ i ], 0, true ) ); + number = number - d; + break; + } + } + } + if ( one == (int) 'I' ) { + return toUpperCase ( sl.toArray ( new Integer [ sl.size() ] ) ); + } else if ( one == (int) 'i' ) { + return toLowerCase ( sl.toArray ( new Integer [ sl.size() ] ) ); + } else { + return null; + } + } + } + } + } + + /** + * Isopsephry (Greek) Numerals + */ + private static class IsopsephryNumeralsFormatter implements SpecialNumberFormatter { + @Override + public Integer[] format ( long number, int one, int letterValue, String features, String language, String country ) { + return null; + } + } + + /** + * Gematria (Hebrew) Numerals + */ + private static int[] hebrewGematriaAlphabeticMap = { + // ones + 0x05D0, // ALEF + 0x05D1, // BET + 0x05D2, // GIMEL + 0x05D3, // DALET + 0x05D4, // HE + 0x05D5, // VAV + 0x05D6, // ZAYIN + 0x05D7, // HET + 0x05D8, // TET + // tens + 0x05D9, 
// YOD + 0x05DB, // KAF + 0x05DC, // LAMED + 0x05DE, // MEM + 0x05E0, // NUN + 0x05E1, // SAMEKH + 0x05E2, // AYIN + 0x05E4, // PE + 0x05E6, // TSADHI + // hundreds + 0x05E7, // QOF + 0x05E8, // RESH + 0x05E9, // SHIN + 0x05EA, // TAV + 0x05DA, // FINAL KAF + 0x05DD, // FINAL MEM + 0x05DF, // FINAL NUN + 0x05E3, // FINAL PE + 0x05E5, // FINAL TSADHI + }; + private class GematriaNumeralsFormatter implements SpecialNumberFormatter { + @Override + public Integer[] format ( long number, int one, int letterValue, String features, String language, String country ) { + if ( one == 0x05D0 ) { + if ( letterValue == LETTER_VALUE_ALPHABETIC ) { + return formatNumberAsSequence ( number, one, hebrewGematriaAlphabeticMap.length, hebrewGematriaAlphabeticMap ); + } else if ( letterValue == LETTER_VALUE_TRADITIONAL ) { + if ( ( number == 0 ) || ( number > 1999 ) ) { + return null; + } else { + return formatAsGematriaNumber ( number, features, language, country ); + } + } else { + return null; + } + } else { + return null; + } + } + private Integer[] formatAsGematriaNumber ( long number, String features, String language, String country ) { + List<Integer> sl = new ArrayList<Integer>(); + assert hebrewGematriaAlphabeticMap.length == 27; + assert hebrewGematriaAlphabeticMap[0] == 0x05D0; // ALEF + assert hebrewGematriaAlphabeticMap[21] == 0x05EA; // TAV + assert number != 0; + assert number < 2000; + int[] map = hebrewGematriaAlphabeticMap; + int thousands = (int) ( ( number / 1000 ) % 10 ); + int hundreds = (int) ( ( number / 100 ) % 10 ); + int tens = (int) ( ( number / 10 ) % 10 ); + int ones = (int) ( ( number / 1 ) % 10 ); + if ( thousands > 0 ) { + sl.add ( map [ 0 + ( thousands - 1 ) ] ); + sl.add ( 0x05F3 ); + } + if ( hundreds > 0 ) { + assert hundreds < 10; + if ( hundreds < 5 ) { + sl.add ( map [ 18 + ( hundreds - 1 ) ] ); + } else if ( hundreds < 9 ) { + sl.add ( map [ 18 + ( 4 - 1 ) ] ); + sl.add ( 0x05F4 ); + sl.add ( map [ 18 + ( hundreds - 5 ) ] ); + } else if ( 
hundreds == 9 ) { + sl.add ( map [ 18 + ( 4 - 1 ) ] ); + sl.add ( map [ 18 + ( 4 - 1 ) ] ); + sl.add ( 0x05F4 ); + sl.add ( map [ 18 + ( hundreds - 9 ) ] ); + } + } + if ( number == 15 ) { + sl.add ( map [ 9 - 1] ); + sl.add ( 0x05F4 ); + sl.add ( map [ 6 - 1] ); + } else if ( number == 16 ) { + sl.add ( map [ 9 - 1 ] ); + sl.add ( 0x05F4 ); + sl.add ( map [ 7 - 1 ] ); + } else { + if ( tens > 0 ) { + assert tens < 10; + sl.add ( map [ 9 + ( tens - 1 ) ] ); + } + if ( ones > 0 ) { + assert ones < 10; + sl.add ( map [ 0 + ( ones - 1 ) ] ); + } + } + return sl.toArray ( new Integer [ sl.size() ] ); + } + } + + /** + * Arabic Numerals + */ + private static int[] arabicAbjadiAlphabeticMap = { + // ones + 0x0623, // ALEF WITH HAMZA ABOVE + 0x0628, // BEH + 0x062C, // JEEM + 0x062F, // DAL + 0x0647, // HEH + 0x0648, // WAW + 0x0632, // ZAIN + 0x062D, // HAH + 0x0637, // TAH + // tens + 0x0649, // ALEF MAQSURA + 0x0643, // KAF + 0x0644, // LAM + 0x0645, // MEEM + 0x0646, // NOON + 0x0633, // SEEN + 0x0639, // AIN + 0x0641, // FEH + 0x0635, // SAD + // hundreds + 0x0642, // QAF + 0x0631, // REH + 0x0634, // SHEEN + 0x062A, // TEH + 0x062B, // THEH + 0x062E, // KHAH + 0x0630, // THAL + 0x0636, // DAD + 0x0638, // ZAH + // thousands + 0x063A, // GHAIN + }; + private static int[] arabicHijaiAlphabeticMap = { + 0x0623, // ALEF WITH HAMZA ABOVE + 0x0628, // BEH + 0x062A, // TEH + 0x062B, // THEH + 0x062C, // JEEM + 0x062D, // HAH + 0x062E, // KHAH + 0x062F, // DAL + 0x0630, // THAL + 0x0631, // REH + 0x0632, // ZAIN + 0x0633, // SEEN + 0x0634, // SHEEN + 0x0635, // SAD + 0x0636, // DAD + 0x0637, // TAH + 0x0638, // ZAH + 0x0639, // AIN + 0x063A, // GHAIN + 0x0641, // FEH + 0x0642, // QAF + 0x0643, // KAF + 0x0644, // LAM + 0x0645, // MEEM + 0x0646, // NOON + 0x0647, // HEH + 0x0648, // WAW + 0x0649, // ALEF MAQSURA + }; + private class ArabicNumeralsFormatter implements SpecialNumberFormatter { + @Override + public Integer[] format ( long number, int one, int letterValue, 
String features, String language, String country ) { + if ( one == 0x0627 ) { + int[] map; + if ( letterValue == LETTER_VALUE_TRADITIONAL ) { + map = arabicAbjadiAlphabeticMap; + } else if ( letterValue == LETTER_VALUE_ALPHABETIC ) { + map = arabicHijaiAlphabeticMap; + } else { + map = arabicAbjadiAlphabeticMap; + } + return formatNumberAsSequence ( number, one, map.length, map ); + } else if ( one == 0x0623 ) { + if ( ( number == 0 ) || ( number > 1999 ) ) { + return null; + } else { + return formatAsAbjadiNumber ( number, features, language, country ); + } + } else { + return null; + } + } + private Integer[] formatAsAbjadiNumber ( long number, String features, String language, String country ) { + List<Integer> sl = new ArrayList<Integer>(); + assert arabicAbjadiAlphabeticMap.length == 28; + assert arabicAbjadiAlphabeticMap[0] == 0x0623; // ALEF WITH HAMZA ABOVE + assert arabicAbjadiAlphabeticMap[27] == 0x063A; // GHAIN + assert number != 0; + assert number < 2000; + int[] map = arabicAbjadiAlphabeticMap; + int thousands = (int) ( ( number / 1000 ) % 10 ); + int hundreds = (int) ( ( number / 100 ) % 10 ); + int tens = (int) ( ( number / 10 ) % 10 ); + int ones = (int) ( ( number / 1 ) % 10 ); + if ( thousands > 0 ) { + assert thousands < 2; + sl.add ( map [ 27 + ( thousands - 1 ) ] ); + } + if ( hundreds > 0 ) { + assert thousands < 10; + sl.add ( map [ 18 + ( hundreds - 1 ) ] ); + } + if ( tens > 0 ) { + assert tens < 10; + sl.add ( map [ 9 + ( tens - 1 ) ] ); + } + if ( ones > 0 ) { + assert ones < 10; + sl.add ( map [ 0 + ( ones - 1 ) ] ); + } + return sl.toArray ( new Integer [ sl.size() ] ); + } + } + + /** + * Kana (Japanese) Numerals + */ + private static int[] hiraganaGojuonAlphabeticMap = { + 0x3042, // A + 0x3044, // I + 0x3046, // U + 0x3048, // E + 0x304A, // O + 0x304B, // KA + 0x304D, // KI + 0x304F, // KU + 0x3051, // KE + 0x3053, // KO + 0x3055, // SA + 0x3057, // SI + 0x3059, // SU + 0x305B, // SE + 0x305D, // SO + 0x305F, // TA + 0x3061, // TI 
+ 0x3064, // TU + 0x3066, // TE + 0x3068, // TO + 0x306A, // NA + 0x306B, // NI + 0x306C, // NU + 0x306D, // NE + 0x306E, // NO + 0x306F, // HA + 0x3072, // HI + 0x3075, // HU + 0x3078, // HE + 0x307B, // HO + 0x307E, // MA + 0x307F, // MI + 0x3080, // MU + 0x3081, // ME + 0x3082, // MO + 0x3084, // YA + 0x3086, // YU + 0x3088, // YO + 0x3089, // RA + 0x308A, // RI + 0x308B, // RU + 0x308C, // RE + 0x308D, // RO + 0x308F, // WA + 0x3090, // WI + 0x3091, // WE + 0x3092, // WO + 0x3093, // N + }; + private static int[] katakanaGojuonAlphabeticMap = { + 0x30A2, // A + 0x30A4, // I + 0x30A6, // U + 0x30A8, // E + 0x30AA, // O + 0x30AB, // KA + 0x30AD, // KI + 0x30AF, // KU + 0x30B1, // KE + 0x30B3, // KO + 0x30B5, // SA + 0x30B7, // SI + 0x30B9, // SU + 0x30BB, // SE + 0x30BD, // SO + 0x30BF, // TA + 0x30C1, // TI + 0x30C4, // TU + 0x30C6, // TE + 0x30C8, // TO + 0x30CA, // NA + 0x30CB, // NI + 0x30CC, // NU + 0x30CD, // NE + 0x30CE, // NO + 0x30CF, // HA + 0x30D2, // HI + 0x30D5, // HU + 0x30D8, // HE + 0x30DB, // HO + 0x30DE, // MA + 0x30DF, // MI + 0x30E0, // MU + 0x30E1, // ME + 0x30E2, // MO + 0x30E4, // YA + 0x30E6, // YU + 0x30E8, // YO + 0x30E9, // RA + 0x30EA, // RI + 0x30EB, // RU + 0x30EC, // RE + 0x30ED, // RO + 0x30EF, // WA + 0x30F0, // WI + 0x30F1, // WE + 0x30F2, // WO + 0x30F3, // N + }; + private class KanaNumeralsFormatter implements SpecialNumberFormatter { + @Override + public Integer[] format ( long number, int one, int letterValue, String features, String language, String country ) { + if ( ( one == 0x3042 ) && ( letterValue == LETTER_VALUE_ALPHABETIC ) ) { + return formatNumberAsSequence ( number, one, hiraganaGojuonAlphabeticMap.length, hiraganaGojuonAlphabeticMap ); + } else if ( ( one == 0x30A2 ) && ( letterValue == LETTER_VALUE_ALPHABETIC ) ) { + return formatNumberAsSequence ( number, one, katakanaGojuonAlphabeticMap.length, katakanaGojuonAlphabeticMap ); + } else { + return null; + } + } + } + + /** + * Thai Numerals + */ + private static 
int[] thaiAlphabeticMap = { + 0x0E01, + 0x0E02, + 0x0E03, + 0x0E04, + 0x0E05, + 0x0E06, + 0x0E07, + 0x0E08, + 0x0E09, + 0x0E0A, + 0x0E0B, + 0x0E0C, + 0x0E0D, + 0x0E0E, + 0x0E0F, + 0x0E10, + 0x0E11, + 0x0E12, + 0x0E13, + 0x0E14, + 0x0E15, + 0x0E16, + 0x0E17, + 0x0E18, + 0x0E19, + 0x0E1A, + 0x0E1B, + 0x0E1C, + 0x0E1D, + 0x0E1E, + 0x0E1F, + 0x0E20, + 0x0E21, + 0x0E22, + 0x0E23, + // 0x0E24, // RU - not used in modern sequence + 0x0E25, + // 0x0E26, // LU - not used in modern sequence + 0x0E27, + 0x0E28, + 0x0E29, + 0x0E2A, + 0x0E2B, + 0x0E2C, + 0x0E2D, + 0x0E2E, + }; + private class ThaiNumeralsFormatter implements SpecialNumberFormatter { + @Override + public Integer[] format ( long number, int one, int letterValue, String features, String language, String country ) { + if ( ( one == 0x0E01 ) && ( letterValue == LETTER_VALUE_ALPHABETIC ) ) { + return formatNumberAsSequence ( number, one, thaiAlphabeticMap.length, thaiAlphabeticMap ); + } else { + return null; + } + } + } + +} diff --git a/src/java/org/apache/fop/complexscripts/util/ScriptContextTester.java b/src/java/org/apache/fop/complexscripts/util/ScriptContextTester.java new file mode 100644 index 000000000..3f68b00e2 --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/util/ScriptContextTester.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Interface for providing script specific context testers.
 * An implementation maps a feature identifier to the glyph context tester
 * to be used for that feature.
 * @author Glenn Adams
 */
public interface ScriptContextTester {

    /**
     * Obtain a glyph context tester for the specified feature.
     * @param feature a feature identifier
     * @return a glyph context tester or null if none available for the specified feature
     */
    GlyphContextTester getTester ( String feature );

}
/**
 * UTF32 related utilities.
 * @author Glenn Adams
 */
public final class UTF32 {

    private UTF32() {
    }

    /**
     * Convert Java string (UTF-16) to a Unicode scalar array (UTF-32).
     * Note that if there are any non-BMP encoded characters present in the
     * input, then the number of entries in the output array will be less
     * than the number of elements in the input string, since each
     * well-formed surrogate pair produces a single output entry.
     * @param s input string
     * @param substitution value to substitute for ill-formed surrogate
     * @param errorOnSubstitution throw runtime exception (IllegalArgumentException) in
     * case this argument is true and a substitution would be attempted
     * @return output scalar array
     * @throws IllegalArgumentException if substitution required and errorOnSubstitution
     *   is not false
     */
    public static Integer[] toUTF32 ( String s, int substitution, boolean errorOnSubstitution )
        throws IllegalArgumentException {
        int n = s.length();
        if ( n == 0 ) {
            return new Integer[0];
        }
        Integer[] sa = new Integer [ n ];
        int k = 0;
        for ( int i = 0; i < n; i++ ) {
            int c = (int) s.charAt ( i );
            if ( ( c >= 0xD800 ) && ( c < 0xE000 ) ) {
                if ( c >= 0xDC00 ) {
                    // low surrogate without a preceding high surrogate
                    if ( errorOnSubstitution ) {
                        throw new IllegalArgumentException
                            ( "isolated low (trailing) surrogate" );
                    }
                    c = substitution;
                } else {
                    int s2 = ( ( i + 1 ) < n ) ? (int) s.charAt ( i + 1 ) : 0;
                    if ( ( s2 >= 0xDC00 ) && ( s2 < 0xE000 ) ) {
                        // combine the pair and skip the consumed low surrogate
                        c = ( ( c - 0xD800 ) << 10 ) + ( s2 - 0xDC00 ) + 65536;
                        i++;
                    } else {
                        if ( errorOnSubstitution ) {
                            throw new IllegalArgumentException
                                ( "isolated high (leading) surrogate" );
                        }
                        c = substitution;
                    }
                }
            }
            sa [ k++ ] = c;
        }
        if ( k == n ) {
            return sa;
        }
        // surrogate pairs were merged, so trim the unused tail
        Integer[] na = new Integer [ k ];
        System.arraycopy ( sa, 0, na, 0, k );
        return na;
    }

    /**
     * Convert a Unicode scalar array (UTF-32) to a Java string (UTF-16).
     * @param sa input scalar array
     * @return output (UTF-16) string
     * @throws IllegalArgumentException if an input scalar value is illegal,
     *   e.g., a surrogate or out of range (negative or above 0x10FFFF)
     */
    public static String fromUTF32 ( Integer[] sa ) throws IllegalArgumentException {
        StringBuilder sb = new StringBuilder();
        for ( int s : sa ) {
            if ( ( s < 0 ) || ( s > 0x10FFFF ) ) {
                throw new IllegalArgumentException
                    ( "illegal scalar value 0x" + String.format ( "%04X", s )
                      + "; out of range for UTF-16" );
            } else if ( s < 0x10000 ) {
                // whole BMP, including its last code point, is a single code unit
                if ( ( s < 0xD800 ) || ( s > 0xDFFF ) ) {
                    sb.append ( (char) s );
                } else {
                    throw new IllegalArgumentException
                        ( "illegal scalar value 0x" + String.format ( "%04X", s )
                          + "; cannot be UTF-16 surrogate" );
                }
            } else {
                int offset = s - 0x10000;
                sb.append ( (char) ( ( ( offset >> 10 ) & 0x3FF ) + 0xD800 ) );
                sb.append ( (char) ( ( offset & 0x3FF ) + 0xDC00 ) );
            }
        }
        return sb.toString();
    }

}