/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* $Id$ */

package org.apache.fop.util;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

// CSOFF: AvoidNestedBlocksCheck
// CSOFF: InnerAssignmentCheck
// CSOFF: WhitespaceAfterCheck
// CSOFF: SimplifyBooleanReturnCheck

/**
 * This class provides utilities to distinguish various kinds of Unicode
 * whitespace and to get character widths in a given FontState.
 */
public class CharUtilities {

    /**
     * Character code used to signal a character boundary in
     * inline content, such as an inline with borders and padding
     * or a nested block object. {@link #classOf(int)} reports this
     * code as character class {@link #EOT}.
     */
    public static final char CODE_EOT = 0;

    /**
     * Character class: Unicode white space. Returned by {@link #classOf(int)}
     * for characters accepted by {@link #isAnySpace(int)} that are not
     * already classified as XML whitespace or line feed.
     */
    public static final int UCWHITESPACE = 0;
    /**
     * Character class: Line feed
     */
    public static final int LINEFEED = 1;
    /**
     * Character class: Boundary between text runs
     * (the class of {@link #CODE_EOT})
     */
    public static final int EOT = 2;
    /**
     * Character class: non-whitespace (everything not covered by
     * the other classes)
     */
    public static final int NONWHITESPACE = 3;
    /**
     * Character class: XML whitespace (space, carriage return and tab;
     * line feed has its own class {@link #LINEFEED})
     */
    public static final int XMLWHITESPACE = 4;


    /** null char (U+0000) */
    public static final char NULL_CHAR = '\u0000';
    /** linefeed character */
    public static final char LINEFEED_CHAR = '\n';
    /** carriage return */
    public static final char CARRIAGE_RETURN = '\r';
    /** normal tab */
    public static final char TAB = '\t';
    /** normal space (U+0020) */
    public static final char SPACE = '\u0020';
    /** non-breaking space (U+00A0) */
    public static final char NBSPACE = '\u00A0';
    /** next line control character (U+0085) */
    public static final char NEXT_LINE = '\u0085';
    /** zero-width space (U+200B) */
    public static final char ZERO_WIDTH_SPACE = '\u200B';
    /** word joiner (U+2060) */
    public static final char WORD_JOINER = '\u2060';
    /** zero-width joiner (U+200D) */
    public static final char ZERO_WIDTH_JOINER = '\u200D';
    /** left-to-right mark (U+200E) */
    public static final char LRM = '\u200E';
    /**
     * right-to-left mark (U+200F). Note that U+202F is the narrow
     * no-break space, not a directional mark.
     */
    public static final char RLM = '\u200F';
    /** left-to-right embedding (U+202A) */
    public static final char LRE = '\u202A';
    /** right-to-left embedding (U+202B) */
    public static final char RLE = '\u202B';
    /** pop directional formatting (U+202C) */
    public static final char PDF = '\u202C';
    /** left-to-right override (U+202D) */
    public static final char LRO = '\u202D';
    /** right-to-left override (U+202E) */
    public static final char RLO = '\u202E';
    /** zero-width no-break space (= byte order mark) (U+FEFF) */
    public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
    /** soft hyphen (U+00AD) */
    public static final char SOFT_HYPHEN = '\u00AD';
    /** line-separator (U+2028) */
    public static final char LINE_SEPARATOR = '\u2028';
    /** paragraph-separator (U+2029) */
    public static final char PARAGRAPH_SEPARATOR = '\u2029';
    /** missing ideograph (U+25A1) */
    public static final char MISSING_IDEOGRAPH = '\u25A1';
    /** Ideographic space (U+3000) */
    public static final char IDEOGRAPHIC_SPACE = '\u3000';
    /** Object replacement character (U+FFFC) */
    public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC';
    /** Unicode value indicating that the character is "not a character" (U+FFFF). */
    public static final char NOT_A_CHARACTER = '\uFFFF';

    /**
     * A static (class) parameter indicating whether V2 indic shaping
     * rules apply or not. The field is final and fixed to
     * <code>true</code>.
     */
    private static final boolean useV2Indic = true; // CSOK: ConstantNameCheck

    /**
     * Utility class: the constructor always throws, which prevents
     * instantiation even through a subclass.
     * @throws UnsupportedOperationException always
     */
    protected CharUtilities() {
        throw new UnsupportedOperationException();
    }

    /**
     * Classify a character into one of the character classes
     * {@link #EOT}, {@link #LINEFEED}, {@link #XMLWHITESPACE},
     * {@link #UCWHITESPACE} or {@link #NONWHITESPACE}.
     * @param c character to inspect
     * @return the determined character class
     */
    public static int classOf ( int c ) {
        if (c == CODE_EOT) {
            return EOT;
        }
        if (c == LINEFEED_CHAR) {
            return LINEFEED;
        }
        // space, carriage return and tab form the XML whitespace class
        if (c == SPACE || c == CARRIAGE_RETURN || c == TAB) {
            return XMLWHITESPACE;
        }
        // any other space is Unicode whitespace; the rest is non-whitespace
        if (isAnySpace(c)) {
            return UCWHITESPACE;
        }
        return NONWHITESPACE;
    }


    /**
     * Determine whether the character is a space with normal
     * (breakable) behavior: either the plain space or one of the
     * fixed-width spaces accepted by {@link #isFixedWidthSpace(int)}.
     * @param c character to inspect
     * @return True if the character is a normal space
     */
    public static boolean isBreakableSpace ( int c ) {
        if (c == SPACE) {
            return true;
        }
        return isFixedWidthSpace(c);
    }

    /**
     * Determine whether the character is a zero-width space, i.e. the
     * zero-width space, the word joiner or the zero-width no-break space.
     * @param c the character to check
     * @return true if the character is a zero-width space
     */
    public static boolean isZeroWidthSpace ( int c ) {
        switch (c) {
            case ZERO_WIDTH_SPACE:          // 200Bh
            case WORD_JOINER:               // 2060h
            case ZERO_WIDTH_NOBREAK_SPACE:  // FEFFh (also used as BOM)
                return true;
            default:
                return false;
        }
    }

    /**
     * Determine whether the character is a (breakable) fixed-width space.
     * The accepted code points are:
     * <ul>
     * <li>U+2000 en quad</li>
     * <li>U+2001 em quad</li>
     * <li>U+2002 en space</li>
     * <li>U+2003 em space</li>
     * <li>U+2004 three-per-em space</li>
     * <li>U+2005 four-per-em space</li>
     * <li>U+2006 six-per-em space</li>
     * <li>U+2007 figure space</li>
     * <li>U+2008 punctuation space</li>
     * <li>U+2009 thin space</li>
     * <li>U+200A hair space</li>
     * <li>U+200B zero width space</li>
     * <li>U+3000 ideographic space</li>
     * </ul>
     * @param c the character to check
     * @return true if the character has a fixed-width
     */
    public static boolean isFixedWidthSpace ( int c ) {
        if (c == 0x3000) {
            return true;                    // ideographic space
        }
        return 0x2000 <= c && c <= 0x200B;  // en quad .. zero width space
    }

    /**
     * Determine whether the character is a nonbreaking space.
     * NOTE(review): U+3000 (ideographic space) is accepted here and
     * also by {@link #isFixedWidthSpace(int)}, so it counts as both
     * breakable and non-breakable -- confirm this is intended.
     * @param c character to check
     * @return True if the character is a nbsp
     */
    public static boolean isNonBreakableSpace ( int c ) {
        switch (c) {
            case NBSPACE:                   // no-break space
            case 0x202F:                    // narrow no-break space
            case 0x3000:                    // ideographic space
            case WORD_JOINER:               // word joiner
            case ZERO_WIDTH_NOBREAK_SPACE:  // zero width no-break space
                return true;
            default:
                return false;
        }
    }

    /**
     * Determine whether the character is an adjustable space. Only
     * the normal space (U+0020) and the no-break space qualify.
     * @param c character to check
     * @return True if the character is adjustable
     */
    public static boolean isAdjustableSpace ( int c ) {
        //TODO: are there other kinds of adjustable spaces?
        if (c == 0x0020) {      // normal space
            return true;
        }
        return c == NBSPACE;    // no-break space
    }

    /**
     * Determine whether the character is a space of any kind, i.e.
     * either a breakable or a non-breakable space.
     * @param c character to check
     * @return True if the character represents any kind of space
     */
    public static boolean isAnySpace ( int c ) {
        if (isBreakableSpace(c)) {
            return true;
        }
        return isNonBreakableSpace(c);
    }

    /**
     * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
     * The check is an approximation based on the general category: letters
     * (Lu, Ll, Lt, Lm, Lo) and letter numbers (Nl) are accepted; characters
     * that are alphabetic only through the Other_Alphabetic property are not.
     * The character is evaluated as a full code point, so supplementary
     * characters (above U+FFFF) are classified correctly.
     * @param c the character (a Unicode code point)
     * @return true if the character is "Alphabetic"
     */
    public static boolean isAlphabetic ( int c ) {
        //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
        //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        // Use the int overload: casting to char would truncate code points
        // above U+FFFF and classify an unrelated BMP character instead.
        int generalCategory = Character.getType(c);
        switch (generalCategory) {
            case Character.UPPERCASE_LETTER: //Lu
            case Character.LOWERCASE_LETTER: //Ll
            case Character.TITLECASE_LETTER: //Lt
            case Character.MODIFIER_LETTER: //Lm
            case Character.OTHER_LETTER: //Lo
            case Character.LETTER_NUMBER: //Nl
                return true;
            default:
                //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
                //Other_Alphabetic contains mostly more exotic characters
                return false;
        }
    }

    /**
     * Indicates whether the given character is an explicit break-character:
     * line feed, carriage return, next line, line separator or
     * paragraph separator.
     * @param c    the character to check
     * @return  true if the character represents an explicit break
     */
    public static boolean isExplicitBreak ( int c ) {
        switch (c) {
            case LINEFEED_CHAR:
            case CARRIAGE_RETURN:
            case NEXT_LINE:
            case LINE_SEPARATOR:
            case PARAGRAPH_SEPARATOR:
                return true;
            default:
                return false;
        }
    }


    //
    // The following script codes are based on ISO 15924. Codes less than 1000 are
    // official assignments from 15924; those equal to or greater than 1000 are FOP
    // implementation specific.
    // 
    // CSOFF: LineLengthCheck
    /** hebrew script constant */
    public static final int SCRIPT_HEBREW                               = 125;  // 'hebr'
    /** mongolian script constant */
    public static final int SCRIPT_MONGOLIAN                            = 145;  // 'mong'
    /** arabic script constant */
    public static final int SCRIPT_ARABIC                               = 160;  // 'arab'
    /** greek script constant */
    public static final int SCRIPT_GREEK                                = 200;  // 'grek'
    /** latin script constant */
    public static final int SCRIPT_LATIN                                = 215;  // 'latn'
    /** cyrillic script constant */
    public static final int SCRIPT_CYRILLIC                             = 220;  // 'cyrl'
    /** georgian script constant */
    public static final int SCRIPT_GEORGIAN                             = 240;  // 'geor'
    /** bopomofo script constant */
    public static final int SCRIPT_BOPOMOFO                             = 285;  // 'bopo'
    /** hangul script constant */
    public static final int SCRIPT_HANGUL                               = 286;  // 'hang'
    /** gurmukhi script constant */
    public static final int SCRIPT_GURMUKHI                             = 310;  // 'guru'
    /** gurmukhi 2 script constant */
    public static final int SCRIPT_GURMUKHI_2                           = 1310; // 'gur2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** devanagari script constant */
    public static final int SCRIPT_DEVANAGARI                           = 315;  // 'deva'
    /** devanagari 2 script constant */
    public static final int SCRIPT_DEVANAGARI_2                         = 1315; // 'dev2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** gujarati script constant */
    public static final int SCRIPT_GUJARATI                             = 320;  // 'gujr'
    /** gujarati 2 script constant */
    public static final int SCRIPT_GUJARATI_2                           = 1320; // 'gjr2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** bengali script constant */
    // NOTE(review): ISO 15924 appears to list 'Beng' as 325, not 326 -- confirm before changing,
    // since SCRIPT_BENGALI_2 is derived from this value.
    public static final int SCRIPT_BENGALI                              = 326;  // 'beng'
    /** bengali 2 script constant */
    public static final int SCRIPT_BENGALI_2                            = 1326; // 'bng2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** oriya script constant */
    public static final int SCRIPT_ORIYA                                = 327;  // 'orya'
    /** oriya 2 script constant */
    public static final int SCRIPT_ORIYA_2                              = 1327; // 'ory2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** tibetan script constant */
    public static final int SCRIPT_TIBETAN                              = 330;  // 'tibt'
    /** telugu script constant */
    public static final int SCRIPT_TELUGU                               = 340;  // 'telu'
    /** telugu 2 script constant */
    public static final int SCRIPT_TELUGU_2                             = 1340; // 'tel2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** kannada script constant */
    public static final int SCRIPT_KANNADA                              = 345;  // 'knda'
    /** kannada 2 script constant */
    public static final int SCRIPT_KANNADA_2                            = 1345; // 'knd2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** tamil script constant */
    public static final int SCRIPT_TAMIL                                = 346;  // 'taml'
    /** tamil 2 script constant */
    public static final int SCRIPT_TAMIL_2                              = 1346; // 'tml2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** malayalam script constant */
    public static final int SCRIPT_MALAYALAM                            = 347;  // 'mlym'
    /** malayalam 2 script constant */
    public static final int SCRIPT_MALAYALAM_2                          = 1347; // 'mlm2'       -- MSFT (pseudo) script tag for variant shaping semantics
    /** sinhalese script constant */
    public static final int SCRIPT_SINHALESE                            = 348;  // 'sinh'
    /** burmese script constant */
    public static final int SCRIPT_BURMESE                              = 350;  // 'mymr'
    /** thai script constant */
    public static final int SCRIPT_THAI                                 = 352;  // 'thai'
    /** khmer script constant */
    public static final int SCRIPT_KHMER                                = 355;  // 'khmr'
    /** lao script constant */
    public static final int SCRIPT_LAO                                  = 356;  // 'laoo'
    /** hiragana script constant */
    public static final int SCRIPT_HIRAGANA                             = 410;  // 'hira'
    /** ethiopic script constant */
    public static final int SCRIPT_ETHIOPIC                             = 430;  // 'ethi'
    /** han script constant */
    public static final int SCRIPT_HAN                                  = 500;  // 'hani'
    /** katakana script constant (ISO 15924 'Kana' is 411; 410 is hiragana) */
    public static final int SCRIPT_KATAKANA                             = 411;  // 'kana'
    /** math script constant */
    public static final int SCRIPT_MATH                                 = 995;  // 'zmth'
    /** symbol script constant */
    public static final int SCRIPT_SYMBOL                               = 996;  // 'zsym'
    /** undetermined script constant */
    public static final int SCRIPT_UNDETERMINED                         = 998;  // 'zyyy'
    /** uncoded script constant */
    public static final int SCRIPT_UNCODED                              = 999;  // 'zzzz'
    // CSON: LineLengthCheck

    /**
     * Determine if character c is punctuation. The set of recognized
     * ranges is not complete: only parts of the basic latin and latin
     * supplement ranges plus the general punctuation range
     * (U+2000 to U+206F) are covered.
     * @param c a character represented as a unicode scalar value
     * @return true if character is punctuation
     */
    public static boolean isPunctuation ( int c ) {
        if ( ( c >= 0x0021 ) && ( c <= 0x002F ) ) {             // basic latin punctuation
            return true;
        } else if ( ( c >= 0x003A ) && ( c <= 0x0040 ) ) {      // basic latin punctuation
            return true;
        } else if ( ( c >= 0x005F ) && ( c <= 0x0060 ) ) {      // basic latin punctuation
            return true;
        } else if ( c == 0x007E ) {                             // basic latin punctuation (tilde)
            return true;
        } else if ( ( c >= 0x00A1 ) && ( c <= 0x00BF ) ) {      // latin supplement punctuation
            return true;
        } else if ( c == 0x00D7 ) {                             // latin supplement punctuation (multiplication sign)
            return true;
        } else if ( c == 0x00F7 ) {                             // latin supplement punctuation (division sign)
            return true;
        } else if ( ( c >= 0x2000 ) && ( c <= 0x206F ) ) {      // general punctuation
            return true;
        } else {                                                // [TBD] - not complete
            return false;
        }
    }

    /**
     * Determine if character c is a digit. Only the basic latin
     * digits (U+0030 to U+0039) are recognized; the check is not complete.
     * @param c a character represented as a unicode scalar value
     * @return true if character is a digit
     */
    public static boolean isDigit ( int c ) {
        // [TBD] - not complete: basic latin digits only
        return 0x0030 <= c && c <= 0x0039;
    }

    /**
     * Determine if character c belongs to the hebrew script. Accepted are
     * the hebrew block (U+0590 to U+05FF) and the hebrew presentation forms
     * (U+FB1D to U+FB4F). The leading part of the alphabetic presentation
     * forms block (U+FB00 to U+FB1C) is deliberately excluded because it
     * holds latin and armenian ligatures, not hebrew characters.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to hebrew script
     */
    public static boolean isHebrew ( int c ) {
        if ( ( c >= 0x0590 ) && ( c <= 0x05FF ) ) {             // hebrew block
            return true;
        } else if ( ( c >= 0xFB1D ) && ( c <= 0xFB4F ) ) {      // hebrew presentation forms
            return true;
        } else {
            return false;
        }
    }

    /**
     * Determine if character c belongs to the mongolian script, i.e.
     * lies in the mongolian block (U+1800 to U+18AF).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to mongolian script
     */
    public static boolean isMongolian ( int c ) {
        return !( c < 0x1800 || c > 0x18AF );   // mongolian block
    }

    /**
     * Determine if character c belongs to the arabic script. Accepted are
     * the arabic block, the arabic supplement block and the arabic
     * presentation forms a and b blocks.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to arabic script
     */
    public static boolean isArabic ( int c ) {
        return ( c >= 0x0600 && c <= 0x06FF )       // arabic block
            || ( c >= 0x0750 && c <= 0x077F )       // arabic supplement block
            || ( c >= 0xFB50 && c <= 0xFDFF )       // arabic presentation forms a block
            || ( c >= 0xFE70 && c <= 0xFEFF );      // arabic presentation forms b block
    }

    /**
     * Determine if character c belongs to the greek script, i.e. lies in
     * the greek (and coptic) block or the greek extended block.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to greek script
     */
    public static boolean isGreek ( int c ) {
        final boolean inGreekAndCoptic = c >= 0x0370 && c <= 0x03FF;   // greek (and coptic) block
        final boolean inGreekExtended = c >= 0x1F00 && c <= 0x1FFF;    // greek extended block
        return inGreekAndCoptic || inGreekExtended;
    }

    /**
     * Determine if character c belongs to the latin script. Accepted are
     * the basic latin and latin supplement letters, the latin extended
     * a, b, c, d and additional ranges, and the latin ligatures.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to latin script
     */
    public static boolean isLatin ( int c ) {
        return ( c >= 0x0041 && c <= 0x005A )       // basic latin upper case
            || ( c >= 0x0061 && c <= 0x007A )       // basic latin lower case
            || ( c >= 0x00C0 && c <= 0x00D6 )       // latin supplement upper case
            || ( c >= 0x00D8 && c <= 0x00DF )       // latin supplement upper case
            || ( c >= 0x00E0 && c <= 0x00F6 )       // latin supplement lower case
            || ( c >= 0x00F8 && c <= 0x00FF )       // latin supplement lower case
            || ( c >= 0x0100 && c <= 0x017F )       // latin extended a
            || ( c >= 0x0180 && c <= 0x024F )       // latin extended b
            || ( c >= 0x1E00 && c <= 0x1EFF )       // latin extended additional
            || ( c >= 0x2C60 && c <= 0x2C7F )       // latin extended c
            || ( c >= 0xA720 && c <= 0xA7FF )       // latin extended d
            || ( c >= 0xFB00 && c <= 0xFB0F );      // latin ligatures
    }

    /**
     * Determine if character c belongs to the cyrillic script. Accepted are
     * the cyrillic block, the cyrillic supplement block and the cyrillic
     * extended-a and extended-b blocks.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to cyrillic script
     */
    public static boolean isCyrillic ( int c ) {
        return ( c >= 0x0400 && c <= 0x04FF )       // cyrillic block
            || ( c >= 0x0500 && c <= 0x052F )       // cyrillic supplement block
            || ( c >= 0x2DE0 && c <= 0x2DFF )       // cyrillic extended-a block
            || ( c >= 0xA640 && c <= 0xA69F );      // cyrillic extended-b block
    }

    /**
     * Determine if character c belongs to the georgian script, i.e. lies
     * in the georgian block or the georgian supplement block.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to georgian script
     */
    public static boolean isGeorgian ( int c ) {
        if ( c >= 0x10A0 && c <= 0x10FF ) {         // georgian block
            return true;
        }
        return c >= 0x2D00 && c <= 0x2D2F;          // georgian supplement block
    }

    /**
     * Determine if character c belongs to the hangul script. Accepted are
     * hangul jamo, hangul compatibility jamo, hangul jamo extended a and b,
     * and the hangul syllables.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to hangul script
     */
    public static boolean isHangul ( int c ) {
        return ( c >= 0x1100 && c <= 0x11FF )       // hangul jamo
            || ( c >= 0x3130 && c <= 0x318F )       // hangul compatibility jamo
            || ( c >= 0xA960 && c <= 0xA97F )       // hangul jamo extended a
            || ( c >= 0xAC00 && c <= 0xD7A3 )       // hangul syllables
            || ( c >= 0xD7B0 && c <= 0xD7FF );      // hangul jamo extended b
    }

    /**
     * Determine if character c belongs to the gurmukhi script, i.e.
     * lies in the gurmukhi block (U+0A00 to U+0A7F).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to gurmukhi script
     */
    public static boolean isGurmukhi ( int c ) {
        return 0x0A00 <= c && c <= 0x0A7F;      // gurmukhi block
    }

    /**
     * Determine if character c belongs to the devanagari script, i.e. lies
     * in the devanagari block or the devanagari extended block.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to devanagari script
     */
    public static boolean isDevanagari ( int c ) {
        final boolean inBase = c >= 0x0900 && c <= 0x097F;         // devanagari block
        final boolean inExtended = c >= 0xA8E0 && c <= 0xA8FF;     // devanagari extended block
        return inBase || inExtended;
    }

    /**
     * Determine if character c belongs to the gujarati script, i.e.
     * lies in the gujarati block (U+0A80 to U+0AFF).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to gujarati script
     */
    public static boolean isGujarati ( int c ) {
        if ( c < 0x0A80 ) {
            return false;
        }
        return c <= 0x0AFF;     // gujarati block
    }

    /**
     * Determine if character c belongs to the bengali script, i.e.
     * lies in the bengali block (U+0980 to U+09FF).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to bengali script
     */
    public static boolean isBengali ( int c ) {
        return !( c < 0x0980 || c > 0x09FF );   // bengali block
    }

    /**
     * Determine if character c belongs to the oriya script, i.e.
     * lies in the oriya block (U+0B00 to U+0B7F).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to oriya script
     */
    public static boolean isOriya ( int c ) {
        return 0x0B00 <= c && c <= 0x0B7F;      // oriya block
    }

    /**
     * Determine if character c belongs to the tibetan script, i.e.
     * lies in the tibetan block (U+0F00 to U+0FFF).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to tibetan script
     */
    public static boolean isTibetan ( int c ) {
        final boolean inTibetanBlock = c >= 0x0F00 && c <= 0x0FFF;
        return inTibetanBlock;
    }

    /**
     * Determine if character c belongs to the telugu script, i.e.
     * lies in the telugu block (U+0C00 to U+0C7F).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to telugu script
     */
    public static boolean isTelugu ( int c ) {
        if ( c > 0x0C7F ) {
            return false;
        }
        return c >= 0x0C00;     // telugu block
    }

    /**
     * Determine if character c belongs to the kannada script, i.e.
     * lies in the kannada block (U+0C80 to U+0CFF). The preceding
     * range U+0C00 to U+0C7F is the telugu block and is not accepted.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to kannada script
     */
    public static boolean isKannada ( int c ) {
        if ( ( c >= 0x0C80 ) && ( c <= 0x0CFF ) ) {             // kannada block
            return true;
        } else {
            return false;
        }
    }

    /**
     * Determine if character c belongs to the tamil script, i.e.
     * lies in the tamil block (U+0B80 to U+0BFF).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to tamil script
     */
    public static boolean isTamil ( int c ) {
        return !( c < 0x0B80 || c > 0x0BFF );   // tamil block
    }

    /**
     * Determine if character c belongs to the malayalam script, i.e.
     * lies in the malayalam block (U+0D00 to U+0D7F).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to malayalam script
     */
    public static boolean isMalayalam ( int c ) {
        return 0x0D00 <= c && c <= 0x0D7F;      // malayalam block
    }

    /**
     * Determine if character c belongs to the sinhalese script, i.e.
     * lies in the sinhala block (U+0D80 to U+0DFF).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to sinhalese script
     */
    public static boolean isSinhalese ( int c ) {
        if ( c < 0x0D80 ) {
            return false;
        }
        return c <= 0x0DFF;     // sinhala block
    }

    /**
     * Determine if character c belongs to the burmese script, i.e. lies in
     * the burmese (myanmar) block or the burmese (myanmar) extended block.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to burmese script
     */
    public static boolean isBurmese ( int c ) {
        return ( c >= 0x1000 && c <= 0x109F )       // burmese (myanmar) block
            || ( c >= 0xAA60 && c <= 0xAA7F );      // burmese (myanmar) extended block
    }

    /**
     * Determine if character c belongs to the thai script, i.e.
     * lies in the thai block (U+0E00 to U+0E7F).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to thai script
     */
    public static boolean isThai ( int c ) {
        final boolean inThaiBlock = c >= 0x0E00 && c <= 0x0E7F;
        return inThaiBlock;
    }

    /**
     * Determine if character c belongs to the khmer script, i.e. lies in
     * the khmer block or the khmer symbols block.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to khmer script
     */
    public static boolean isKhmer ( int c ) {
        if ( c >= 0x1780 && c <= 0x17FF ) {         // khmer block
            return true;
        }
        return c >= 0x19E0 && c <= 0x19FF;          // khmer symbols block
    }

    /**
     * Determine if character c belongs to the lao script, i.e.
     * lies in the lao block (U+0E80 to U+0EFF).
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to lao script
     */
    public static boolean isLao ( int c ) {
        return !( c < 0x0E80 || c > 0x0EFF );   // lao block
    }

    /**
     * Determine if character c belongs to the ethiopic (amharic) script.
     * Accepted are the ethiopic block and the ethiopic supplement,
     * extended and extended-a blocks.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to ethiopic (amharic) script
     */
    public static boolean isEthiopic ( int c ) {
        return ( c >= 0x1200 && c <= 0x137F )       // ethiopic block
            || ( c >= 0x1380 && c <= 0x139F )       // ethiopic supplement block
            || ( c >= 0x2D80 && c <= 0x2DDF )       // ethiopic extended block
            || ( c >= 0xAB00 && c <= 0xAB2F );      // ethiopic extended-a block
    }

    /**
     * Determine if character c belongs to the han (unified cjk) script.
     * Accepted are the cjk unified ideographs with their extensions a, b
     * and c, the cjk compatibility ideographs and their supplement.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to han (unified cjk) script
     */
    public static boolean isHan ( int c ) {
        return ( c >= 0x3400 && c <= 0x4DBF )       // cjk unified ideographs extension a
            || ( c >= 0x4E00 && c <= 0x9FFF )       // cjk unified ideographs
            || ( c >= 0xF900 && c <= 0xFAFF )       // cjk compatibility ideographs
            || ( c >= 0x20000 && c <= 0x2A6DF )     // cjk unified ideographs extension b
            || ( c >= 0x2A700 && c <= 0x2B73F )     // cjk unified ideographs extension c
            || ( c >= 0x2F800 && c <= 0x2FA1F );    // cjk compatibility ideographs supplement
    }

    /**
     * Determine if character c belongs to the bopomofo script.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to bopomofo script
     */
    public static boolean isBopomofo ( int c ) {
        // the bopomofo block covers U+3100 through U+312F inclusive
        return ( c >= 0x3100 ) && ( c <= 0x312F );
    }

    /**
     * Determine if character c belongs to the hiragana script.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to hiragana script
     */
    public static boolean isHiragana ( int c ) {
        // the hiragana block covers U+3040 through U+309F inclusive
        return ( c >= 0x3040 ) && ( c <= 0x309F );
    }

    /**
     * Determine if character c belongs to the katakana script.
     * @param c a character represented as a unicode scalar value
     * @return true if character belongs to katakana script
     */
    public static boolean isKatakana ( int c ) {
        boolean inMainRange     = ( c >= 0x30A0 ) && ( c <= 0x30FF );
        boolean inExtendedRange = ( c >= 0x31F0 ) && ( c <= 0x31FF );
        return inMainRange || inExtendedRange;
    }

    /**
     * Obtain ISO15924 numeric script code of character. If script is not or cannot be determined,
     * then the script code 998 ('zyyy') is returned.
     * @param c the character to obtain script
     * @return an ISO15924 script code
     */
    public static int scriptOf ( int c ) { // [TBD] - needs optimization!!!
        // spaces, punctuation and digits are tested first and never receive a specific script
        if ( isAnySpace ( c ) || isPunctuation ( c ) || isDigit ( c ) ) {
            return SCRIPT_UNDETERMINED;
        }
        // the first matching predicate below determines the result
        if ( isLatin ( c ) ) {
            return SCRIPT_LATIN;
        }
        if ( isCyrillic ( c ) ) {
            return SCRIPT_CYRILLIC;
        }
        if ( isGreek ( c ) ) {
            return SCRIPT_GREEK;
        }
        if ( isHan ( c ) ) {
            return SCRIPT_HAN;
        }
        if ( isBopomofo ( c ) ) {
            return SCRIPT_BOPOMOFO;
        }
        if ( isKatakana ( c ) ) {
            return SCRIPT_KATAKANA;
        }
        if ( isHiragana ( c ) ) {
            return SCRIPT_HIRAGANA;
        }
        if ( isHangul ( c ) ) {
            return SCRIPT_HANGUL;
        }
        if ( isArabic ( c ) ) {
            return SCRIPT_ARABIC;
        }
        if ( isHebrew ( c ) ) {
            return SCRIPT_HEBREW;
        }
        if ( isMongolian ( c ) ) {
            return SCRIPT_MONGOLIAN;
        }
        if ( isGeorgian ( c ) ) {
            return SCRIPT_GEORGIAN;
        }
        // scripts routed through useV2IndicRules may be reported with their V2 code
        if ( isGurmukhi ( c ) ) {
            return useV2IndicRules ( SCRIPT_GURMUKHI );
        }
        if ( isDevanagari ( c ) ) {
            return useV2IndicRules ( SCRIPT_DEVANAGARI );
        }
        if ( isGujarati ( c ) ) {
            return useV2IndicRules ( SCRIPT_GUJARATI );
        }
        if ( isBengali ( c ) ) {
            return useV2IndicRules ( SCRIPT_BENGALI );
        }
        if ( isOriya ( c ) ) {
            return useV2IndicRules ( SCRIPT_ORIYA );
        }
        if ( isTibetan ( c ) ) {
            return SCRIPT_TIBETAN;
        }
        if ( isTelugu ( c ) ) {
            return useV2IndicRules ( SCRIPT_TELUGU );
        }
        if ( isKannada ( c ) ) {
            return useV2IndicRules ( SCRIPT_KANNADA );
        }
        if ( isTamil ( c ) ) {
            return useV2IndicRules ( SCRIPT_TAMIL );
        }
        if ( isMalayalam ( c ) ) {
            return useV2IndicRules ( SCRIPT_MALAYALAM );
        }
        if ( isSinhalese ( c ) ) {
            return SCRIPT_SINHALESE;
        }
        if ( isBurmese ( c ) ) {
            return SCRIPT_BURMESE;
        }
        if ( isThai ( c ) ) {
            return SCRIPT_THAI;
        }
        if ( isKhmer ( c ) ) {
            return SCRIPT_KHMER;
        }
        if ( isLao ( c ) ) {
            return SCRIPT_LAO;
        }
        if ( isEthiopic ( c ) ) {
            return SCRIPT_ETHIOPIC;
        }
        // no predicate matched
        return SCRIPT_UNDETERMINED;
    }

    /**
     * Obtain the V2 indic script code corresponding to V1 indic script code SC if
     * and only if V2 indic rules apply; otherwise return SC.
     * @param sc a V1 indic script code
     * @return either SC or the V2 flavor of SC if V2 indic rules apply
     */
    public static int useV2IndicRules ( int sc ) {
        // leave the code untouched when V2 rules are off or the code is already >= 1000
        if ( !useV2Indic || ( sc >= 1000 ) ) {
            return sc;
        }
        // the V2 flavor of a code is obtained by adding 1000 to it
        return sc + 1000;
    }

    /**
     * Obtain the script codes of each character in a character sequence. If script
     * is not or cannot be determined for some character, then the script code 998
     * ('zyyy') is returned.
     * @param cs the character sequence
     * @return a (possibly empty) array of script codes
     */
    public static int[] scriptsOf ( CharSequence cs ) {
        // distinct script codes seen so far; a typed set avoids raw-type casts
        Set<Integer> scripts = new HashSet<Integer>();
        // Walk the sequence by unicode scalar value rather than by UTF-16 code unit, so
        // that a surrogate pair is classified as the supplementary character it encodes
        // (scriptOf takes a scalar value) instead of as two unrelated surrogate code units.
        // An unpaired surrogate is still passed through as its own code unit value.
        for ( int i = 0, n = cs.length(); i < n; ) {
            int c = Character.codePointAt ( cs, i );
            i += Character.charCount ( c );
            scripts.add ( Integer.valueOf ( scriptOf ( c ) ) );
        }
        // unbox into a primitive array and return it in ascending order
        int[] sa = new int [ scripts.size() ];
        int ns = 0;
        for ( Integer script : scripts ) {
            sa [ ns++ ] = script.intValue();
        }
        Arrays.sort ( sa );
        return sa;
    }

    /**
     * Determine the dominant script of a character sequence.
     * @param cs the character sequence
     * @return the dominant script or SCRIPT_UNDETERMINED
     */
    public static int dominantScript ( CharSequence cs ) {
        // number of occurrences of each script code in the sequence
        Map<Integer,Integer> counts = new HashMap<Integer,Integer>();
        // Iterate by unicode scalar value so that a surrogate pair is counted once, as the
        // supplementary character it encodes, rather than as two separate code units.
        for ( int i = 0, n = cs.length(); i < n; ) {
            int c = Character.codePointAt ( cs, i );
            i += Character.charCount ( c );
            Integer k = Integer.valueOf ( scriptOf ( c ) );
            Integer v = counts.get ( k );
            // the first occurrence counts as one, not zero
            counts.put ( k, Integer.valueOf ( ( v != null ) ? ( v.intValue() + 1 ) : 1 ) );
        }
        int sMax = -1;
        int cMax = -1;
        for ( Map.Entry<Integer,Integer> e : counts.entrySet() ) {
            int s = e.getKey().intValue();
            // undetermined and uncoded characters never decide the dominant script
            if ( ( s == SCRIPT_UNDETERMINED ) || ( s == SCRIPT_UNCODED ) ) {
                continue;
            }
            int count = e.getValue().intValue();
            // strict comparison: on a tie the script encountered first in map order is kept
            if ( count > cMax ) {
                cMax = count;
                sMax = s;
            }
        }
        // no qualifying script was seen
        if ( sMax < 0 ) {
            sMax = SCRIPT_UNDETERMINED;
        }
        return sMax;
    }

    /**
     * Determine if script tag denotes an 'Indic' script, where a
     * script is an 'Indic' script if it is intended to be processed by
     * the generic 'Indic' Script Processor.
     * @param script a script tag
     * @return true if script tag is a designated 'Indic' script
     */
    public static boolean isIndicScript ( String script ) {
        // resolve the tag to its internal code, then test membership in the designated set
        final int code = scriptCodeFromTag ( script );
        boolean indic;
        switch ( code ) {
        case SCRIPT_BENGALI:
        case SCRIPT_BENGALI_2:
        case SCRIPT_BURMESE:
        case SCRIPT_DEVANAGARI:
        case SCRIPT_DEVANAGARI_2:
        case SCRIPT_GUJARATI:
        case SCRIPT_GUJARATI_2:
        case SCRIPT_GURMUKHI:
        case SCRIPT_GURMUKHI_2:
        case SCRIPT_KANNADA:
        case SCRIPT_KANNADA_2:
        case SCRIPT_MALAYALAM:
        case SCRIPT_MALAYALAM_2:
        case SCRIPT_ORIYA:
        case SCRIPT_ORIYA_2:
        case SCRIPT_TAMIL:
        case SCRIPT_TAMIL_2:
        case SCRIPT_TELUGU:
        case SCRIPT_TELUGU_2:
            indic = true;
            break;
        default:
            // includes SCRIPT_UNDETERMINED, which is what an unknown tag resolves to
            indic = false;
            break;
        }
        return indic;
    }

    /**
     * Determine the script tag associated with an internal script code.
     * @param code the script code
     * @return a script tag
     */
    public static String scriptTagFromCode ( int code ) {
        Map<Integer,String> tags = getScriptTagsMap();
        // a missing map and an unmapped code both yield the empty string
        String tag = ( tags == null ) ? null : tags.get ( Integer.valueOf ( code ) );
        return ( tag == null ) ? "" : tag;
    }

    /**
     * Determine the internal script code associated with a script tag.
     * @param tag the script tag
     * @return a script code
     */
    public static int scriptCodeFromTag ( String tag ) {
        Map<String,Integer> codes = getScriptCodeMap();
        // a missing map and an unmapped tag both yield SCRIPT_UNDETERMINED
        Integer code = ( codes == null ) ? null : codes.get ( tag );
        return ( code == null ) ? SCRIPT_UNDETERMINED : code.intValue();
    }

    /**
     * Convert a single unicode scalar value to an XML numeric character
     * reference. If in the BMP, four digits are used, otherwise 6 digits are used.
     * @param c a unicode scalar value
     * @return a string representing a numeric character reference
     */
    public static String charToNCRef ( int c ) {
        // six hex digits for values above the BMP, four otherwise
        final int nDigits = ( c > 0xFFFF ) ? 6 : 4;
        final String hexDigits = "0123456789ABCDEF";
        char[] digits = new char [ nDigits ];
        int remaining = c;
        // fill from the least significant nibble backwards so no reversal is needed
        for ( int pos = nDigits - 1; pos >= 0; pos-- ) {
            digits [ pos ] = hexDigits.charAt ( remaining & 0xF );
            remaining >>= 4;
        }
        return "&#x" + new String ( digits ) + ";";
    }

    /**
     * Convert a string to a sequence of ASCII or XML numeric character references.
     * @param s a java string (encoded in UTF-16)
     * @return a string representing a sequence of numeric character reference or
     * ASCII characters
     */
    public static String toNCRefs ( String s ) {
        StringBuilder sb = new StringBuilder();
        // a null input produces the empty string
        if ( s != null ) {
            // Iterate by unicode scalar value: a surrogate pair must become ONE six digit
            // reference to the supplementary character, since references to the individual
            // surrogate code units are not legal XML character references.
            for ( int i = 0, n = s.length(); i < n; ) {
                int c = s.codePointAt ( i );
                i += Character.charCount ( c );
                if ( ( c >= 32 ) && ( c < 127 ) ) {
                    // printable ASCII is copied through, except for XML markup characters
                    if ( c == '<' ) {
                        sb.append ( "&lt;" );
                    } else if ( c == '>' ) {
                        sb.append ( "&gt;" );
                    } else if ( c == '&' ) {
                        sb.append ( "&amp;" );
                    } else {
                        sb.append ( (char) c );
                    }
                } else {
                    // everything else, including control characters, becomes a reference
                    sb.append ( charToNCRef ( c ) );
                }
            }
        }
        return sb.toString();
    }

    /**
     * Pad a string S on left out to width W using padding character PAD.
     * @param s string to pad
     * @param width width of field to add padding
     * @param pad character to use for padding
     * @return padded string
     */
    public static String padLeft ( String s, int width, char pad ) {
        StringBuilder padded = new StringBuilder();
        // emit one pad character per missing column; none if s already fills the width
        int column = s.length();
        while ( column < width ) {
            padded.append ( pad );
            column++;
        }
        return padded.append ( s ).toString();
    }

    /**
     * Format character for debugging output, which is prefixed with "0x", padded left with '0'
     * and either 4 or 6 hex characters in width according to whether it is in the BMP or not.
     * @param c character code
     * @return formatted character string
     */
    public static String format ( int c ) {
        // Only values in 0..0x10FFFF are characters. Negative values used to slip through
        // the upper-bound-only test and produced output such as "0x00-1".
        if ( !Character.isValidCodePoint ( c ) ) {
            return "!NOT A CHARACTER!";
        }
        // lower case hex, zero padded to 4 digits in the BMP and to 6 digits above it
        if ( c < Character.MIN_SUPPLEMENTARY_CODE_POINT ) {
            return String.format ( "0x%04x", Integer.valueOf ( c ) );
        } else {
            return String.format ( "0x%06x", Integer.valueOf ( c ) );
        }
    }

    // Lookup from internal script code to script tag. Starts out null and is assigned by
    // makeScriptMaps(), which getScriptTagsMap() invokes while this field is still null.
    private static Map<Integer,String> scriptTagsMap = null;
    // Reverse lookup from script tag to internal script code. Starts out null and is assigned
    // by makeScriptMaps(), which getScriptCodeMap() invokes while this field is still null.
    private static Map<String,Integer> scriptCodeMap = null;

    // Record one code/tag association in both directions: code -> tag in TM and
    // tag -> code in CM. The maps are typed so the puts are checked at compile time.
    private static void putScriptTag ( Map<Integer,String> tm, Map<String,Integer> cm,
                                       int code, String tag ) {
        // sanity checks, active only when assertions are enabled
        assert tag != null;
        assert tag.length() != 0;
        assert code >= 0;
        assert code <  2000;
        Integer boxedCode = Integer.valueOf ( code );
        tm.put ( boxedCode, tag );
        cm.put ( tag, boxedCode );
    }

    // Build the code -> tag and tag -> code tables and publish them in the static fields.
    private static void makeScriptMaps() {
        Map<Integer,String> tagsByCode = new HashMap<Integer,String>();
        Map<String,Integer> codesByTag = new HashMap<String,Integer>();
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_HEBREW, "hebr" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_MONGOLIAN, "mong" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_ARABIC, "arab" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_GREEK, "grek" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_LATIN, "latn" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_CYRILLIC, "cyrl" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_GEORGIAN, "geor" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_BOPOMOFO, "bopo" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_HANGUL, "hang" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_GURMUKHI, "guru" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_GURMUKHI_2, "gur2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_DEVANAGARI, "deva" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_DEVANAGARI_2, "dev2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_GUJARATI, "gujr" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_GUJARATI_2, "gjr2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_BENGALI, "beng" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_BENGALI_2, "bng2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_ORIYA, "orya" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_ORIYA_2, "ory2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_TIBETAN, "tibt" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_TELUGU, "telu" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_TELUGU_2, "tel2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_KANNADA, "knda" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_KANNADA_2, "knd2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_TAMIL, "taml" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_TAMIL_2, "tml2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_MALAYALAM, "mlym" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_MALAYALAM_2, "mlm2" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_SINHALESE, "sinh" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_BURMESE, "mymr" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_THAI, "thai" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_KHMER, "khmr" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_LAO, "laoo" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_HIRAGANA, "hira" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_ETHIOPIC, "ethi" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_HAN, "hani" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_KATAKANA, "kana" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_MATH, "zmth" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_SYMBOL, "zsym" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_UNDETERMINED, "zyyy" );
        putScriptTag ( tagsByCode, codesByTag, SCRIPT_UNCODED, "zzzz" );
        // publish only after both tables are completely filled
        scriptTagsMap = tagsByCode;
        scriptCodeMap = codesByTag;
    }

    // Return the code -> tag table, building both tables first if it is not yet set.
    private static Map<Integer,String> getScriptTagsMap() {
        Map<Integer,String> tags = scriptTagsMap;
        if ( tags == null ) {
            makeScriptMaps();
            tags = scriptTagsMap;
        }
        return tags;
    }

    // Return the tag -> code table, building both tables first if it is not yet set.
    private static Map<String,Integer> getScriptCodeMap() {
        Map<String,Integer> codes = scriptCodeMap;
        if ( codes == null ) {
            makeScriptMaps();
            codes = scriptCodeMap;
        }
        return codes;
    }

    /**
     * Mirror characters that are designated as having the bidi mirrored property.
     * @param s a string whose characters are to be mirrored
     * @return the resulting string
     */
    public static String mirror ( String s ) {
        // replace each UTF-16 code unit by its mirror image; all table entries are in the BMP
        char[] ca = s.toCharArray();
        for ( int i = 0, n = ca.length; i < n; i++ ) {
            ca [ i ] = (char) mirror ( ca [ i ] );
        }
        return new String ( ca );
    }

    /**
     * Characters that have a mirror image, in strictly ascending order as required by
     * the binary search in mirror(int). Entry i is mirrored to entry i of
     * mirroredCharactersMapping, so the two tables must always be edited together.
     * Every character that appears as a mirror image is also present here as a source,
     * which makes mirroring its own inverse. The pairs follow the Unicode
     * BidiMirroring.txt data file; this table is not claimed to be a complete copy of it.
     */
    private static final int[] mirroredCharacters = {
        0x0028, 0x0029, 0x003C, 0x003E, 0x005B, 0x005D, 0x007B, 0x007D,
        0x00AB, 0x00BB, 0x0F3A, 0x0F3B, 0x0F3C, 0x0F3D, 0x169B, 0x169C,
        0x2039, 0x203A, 0x2045, 0x2046, 0x207D, 0x207E, 0x208D, 0x208E,
        0x2208, 0x2209, 0x220A, 0x220B, 0x220C, 0x220D, 0x2215, 0x223C,
        0x223D, 0x2243, 0x2252, 0x2253, 0x2254, 0x2255, 0x2264, 0x2265,
        0x2266, 0x2267, 0x2268, 0x2269, 0x226A, 0x226B, 0x226E, 0x226F,
        0x2270, 0x2271, 0x2272, 0x2273, 0x2274, 0x2275, 0x2276, 0x2277,
        0x2278, 0x2279, 0x227A, 0x227B, 0x227C, 0x227D, 0x227E, 0x227F,
        0x2280, 0x2281, 0x2282, 0x2283, 0x2284, 0x2285, 0x2286, 0x2287,
        0x2288, 0x2289, 0x228A, 0x228B, 0x228F, 0x2290, 0x2291, 0x2292,
        0x2298, 0x22A2, 0x22A3, 0x22A6, 0x22A8, 0x22A9, 0x22AB, 0x22B0,
        0x22B1, 0x22B2, 0x22B3, 0x22B4, 0x22B5, 0x22B6, 0x22B7, 0x22C9,
        0x22CA, 0x22CB, 0x22CC, 0x22CD, 0x22D0, 0x22D1, 0x22D6, 0x22D7,
        0x22D8, 0x22D9, 0x22DA, 0x22DB, 0x22DC, 0x22DD, 0x22DE, 0x22DF,
        0x22E0, 0x22E1, 0x22E2, 0x22E3, 0x22E4, 0x22E5, 0x22E6, 0x22E7,
        0x22E8, 0x22E9, 0x22EA, 0x22EB, 0x22EC, 0x22ED, 0x22F0, 0x22F1,
        0x22F2, 0x22F3, 0x22F4, 0x22F6, 0x22F7, 0x22FA, 0x22FB, 0x22FC,
        0x22FD, 0x22FE, 0x2308, 0x2309, 0x230A, 0x230B, 0x2329, 0x232A,
        0x2768, 0x2769, 0x276A, 0x276B, 0x276C, 0x276D, 0x276E, 0x276F,
        0x2770, 0x2771, 0x2772, 0x2773, 0x2774, 0x2775, 0x27C3, 0x27C4,
        0x27C5, 0x27C6, 0x27C8, 0x27C9, 0x27D5, 0x27D6, 0x27DD, 0x27DE,
        0x27E2, 0x27E3, 0x27E4, 0x27E5, 0x27E6, 0x27E7, 0x27E8, 0x27E9,
        0x27EA, 0x27EB, 0x27EC, 0x27ED, 0x27EE, 0x27EF, 0x2983, 0x2984,
        0x2985, 0x2986, 0x2987, 0x2988, 0x2989, 0x298A, 0x298B, 0x298C,
        0x298D, 0x298E, 0x298F, 0x2990, 0x2991, 0x2992, 0x2993, 0x2994,
        0x2995, 0x2996, 0x2997, 0x2998, 0x29B8, 0x29C0, 0x29C1, 0x29C4,
        0x29C5, 0x29CF, 0x29D0, 0x29D1, 0x29D2, 0x29D4, 0x29D5, 0x29D8,
        0x29D9, 0x29DA, 0x29DB, 0x29F5, 0x29F8, 0x29F9, 0x29FC, 0x29FD,
        0x2A2B, 0x2A2C, 0x2A2D, 0x2A2E, 0x2A34, 0x2A35, 0x2A3C, 0x2A3D,
        0x2A64, 0x2A65, 0x2A79, 0x2A7A, 0x2A7D, 0x2A7E, 0x2A7F, 0x2A80,
        0x2A81, 0x2A82, 0x2A83, 0x2A84, 0x2A8B, 0x2A8C, 0x2A91, 0x2A92,
        0x2A93, 0x2A94, 0x2A95, 0x2A96, 0x2A97, 0x2A98, 0x2A99, 0x2A9A,
        0x2A9B, 0x2A9C, 0x2AA1, 0x2AA2, 0x2AA6, 0x2AA7, 0x2AA8, 0x2AA9,
        0x2AAA, 0x2AAB, 0x2AAC, 0x2AAD, 0x2AAF, 0x2AB0, 0x2AB3, 0x2AB4,
        0x2AC3, 0x2AC4, 0x2AC5, 0x2AC6, 0x2ACD, 0x2ACE, 0x2ACF, 0x2AD0,
        0x2AD1, 0x2AD2, 0x2AD3, 0x2AD4, 0x2AD5, 0x2AD6, 0x2ADE, 0x2AE3,
        0x2AE4, 0x2AE5, 0x2AEC, 0x2AED, 0x2AF7, 0x2AF8, 0x2AF9, 0x2AFA,
        0x2E02, 0x2E03, 0x2E04, 0x2E05, 0x2E09, 0x2E0A, 0x2E0C, 0x2E0D,
        0x2E1C, 0x2E1D, 0x2E20, 0x2E21, 0x2E22, 0x2E23, 0x2E24, 0x2E25,
        0x2E26, 0x2E27, 0x2E28, 0x2E29, 0x3008, 0x3009, 0x300A, 0x300B,
        0x300C, 0x300D, 0x300E, 0x300F, 0x3010, 0x3011, 0x3014, 0x3015,
        0x3016, 0x3017, 0x3018, 0x3019, 0x301A, 0x301B, 0xFE59, 0xFE5A,
        0xFE5B, 0xFE5C, 0xFE5D, 0xFE5E, 0xFE64, 0xFE65, 0xFF08, 0xFF09,
        0xFF1C, 0xFF1E, 0xFF3B, 0xFF3D, 0xFF5B, 0xFF5D, 0xFF5F, 0xFF60,
        0xFF62, 0xFF63
    };

    /**
     * Mirror images of the characters in mirroredCharacters: entry i here is the
     * mirror image of entry i there. Both tables must have the same length.
     */
    private static final int[] mirroredCharactersMapping = {
        0x0029, 0x0028, 0x003E, 0x003C, 0x005D, 0x005B, 0x007D, 0x007B,
        0x00BB, 0x00AB, 0x0F3B, 0x0F3A, 0x0F3D, 0x0F3C, 0x169C, 0x169B,
        0x203A, 0x2039, 0x2046, 0x2045, 0x207E, 0x207D, 0x208E, 0x208D,
        0x220B, 0x220C, 0x220D, 0x2208, 0x2209, 0x220A, 0x29F5, 0x223D,
        0x223C, 0x22CD, 0x2253, 0x2252, 0x2255, 0x2254, 0x2265, 0x2264,
        0x2267, 0x2266, 0x2269, 0x2268, 0x226B, 0x226A, 0x226F, 0x226E,
        0x2271, 0x2270, 0x2273, 0x2272, 0x2275, 0x2274, 0x2277, 0x2276,
        0x2279, 0x2278, 0x227B, 0x227A, 0x227D, 0x227C, 0x227F, 0x227E,
        0x2281, 0x2280, 0x2283, 0x2282, 0x2285, 0x2284, 0x2287, 0x2286,
        0x2289, 0x2288, 0x228B, 0x228A, 0x2290, 0x228F, 0x2292, 0x2291,
        0x29B8, 0x22A3, 0x22A2, 0x2ADE, 0x2AE4, 0x2AE3, 0x2AE5, 0x22B1,
        0x22B0, 0x22B3, 0x22B2, 0x22B5, 0x22B4, 0x22B7, 0x22B6, 0x22CA,
        0x22C9, 0x22CC, 0x22CB, 0x2243, 0x22D1, 0x22D0, 0x22D7, 0x22D6,
        0x22D9, 0x22D8, 0x22DB, 0x22DA, 0x22DD, 0x22DC, 0x22DF, 0x22DE,
        0x22E1, 0x22E0, 0x22E3, 0x22E2, 0x22E5, 0x22E4, 0x22E7, 0x22E6,
        0x22E9, 0x22E8, 0x22EB, 0x22EA, 0x22ED, 0x22EC, 0x22F1, 0x22F0,
        0x22FA, 0x22FB, 0x22FC, 0x22FD, 0x22FE, 0x22F2, 0x22F3, 0x22F4,
        0x22F6, 0x22F7, 0x2309, 0x2308, 0x230B, 0x230A, 0x232A, 0x2329,
        0x2769, 0x2768, 0x276B, 0x276A, 0x276D, 0x276C, 0x276F, 0x276E,
        0x2771, 0x2770, 0x2773, 0x2772, 0x2775, 0x2774, 0x27C4, 0x27C3,
        0x27C6, 0x27C5, 0x27C9, 0x27C8, 0x27D6, 0x27D5, 0x27DE, 0x27DD,
        0x27E3, 0x27E2, 0x27E5, 0x27E4, 0x27E7, 0x27E6, 0x27E9, 0x27E8,
        0x27EB, 0x27EA, 0x27ED, 0x27EC, 0x27EF, 0x27EE, 0x2984, 0x2983,
        0x2986, 0x2985, 0x2988, 0x2987, 0x298A, 0x2989, 0x298C, 0x298B,
        0x2990, 0x298F, 0x298E, 0x298D, 0x2992, 0x2991, 0x2994, 0x2993,
        0x2996, 0x2995, 0x2998, 0x2997, 0x2298, 0x29C1, 0x29C0, 0x29C5,
        0x29C4, 0x29D0, 0x29CF, 0x29D2, 0x29D1, 0x29D5, 0x29D4, 0x29D9,
        0x29D8, 0x29DB, 0x29DA, 0x2215, 0x29F9, 0x29F8, 0x29FD, 0x29FC,
        0x2A2C, 0x2A2B, 0x2A2E, 0x2A2D, 0x2A35, 0x2A34, 0x2A3D, 0x2A3C,
        0x2A65, 0x2A64, 0x2A7A, 0x2A79, 0x2A7E, 0x2A7D, 0x2A80, 0x2A7F,
        0x2A82, 0x2A81, 0x2A84, 0x2A83, 0x2A8C, 0x2A8B, 0x2A92, 0x2A91,
        0x2A94, 0x2A93, 0x2A96, 0x2A95, 0x2A98, 0x2A97, 0x2A9A, 0x2A99,
        0x2A9C, 0x2A9B, 0x2AA2, 0x2AA1, 0x2AA7, 0x2AA6, 0x2AA9, 0x2AA8,
        0x2AAB, 0x2AAA, 0x2AAD, 0x2AAC, 0x2AB0, 0x2AAF, 0x2AB4, 0x2AB3,
        0x2AC4, 0x2AC3, 0x2AC6, 0x2AC5, 0x2ACE, 0x2ACD, 0x2AD0, 0x2ACF,
        0x2AD2, 0x2AD1, 0x2AD4, 0x2AD3, 0x2AD6, 0x2AD5, 0x22A6, 0x22A9,
        0x22A8, 0x22AB, 0x2AED, 0x2AEC, 0x2AF8, 0x2AF7, 0x2AFA, 0x2AF9,
        0x2E03, 0x2E02, 0x2E05, 0x2E04, 0x2E0A, 0x2E09, 0x2E0D, 0x2E0C,
        0x2E1D, 0x2E1C, 0x2E21, 0x2E20, 0x2E23, 0x2E22, 0x2E25, 0x2E24,
        0x2E27, 0x2E26, 0x2E29, 0x2E28, 0x3009, 0x3008, 0x300B, 0x300A,
        0x300D, 0x300C, 0x300F, 0x300E, 0x3011, 0x3010, 0x3015, 0x3014,
        0x3017, 0x3016, 0x3019, 0x3018, 0x301B, 0x301A, 0xFE5A, 0xFE59,
        0xFE5C, 0xFE5B, 0xFE5E, 0xFE5D, 0xFE65, 0xFE64, 0xFF09, 0xFF08,
        0xFF1E, 0xFF1C, 0xFF3D, 0xFF3B, 0xFF5D, 0xFF5B, 0xFF60, 0xFF5F,
        0xFF63, 0xFF62
    };

    /**
     * Obtain the mirror image of a single character.
     * @param c a character code
     * @return the mirror image of c, or c itself if it has no entry in the table
     */
    private static int mirror ( int c ) {
        int i = Arrays.binarySearch ( mirroredCharacters, c );
        // a negative index means c is not a mirrored character
        return ( i < 0 ) ? c : mirroredCharactersMapping [ i ];
    }

    /**
     * Determine if two character sequences contain the same characters.
     * @param cs1 first character sequence
     * @param cs2 second character sequence
     * @return true if both sequences have same length and same character sequence
     */
    public static boolean isSameSequence ( CharSequence cs1, CharSequence cs2 ) {
        assert cs1 != null;
        assert cs2 != null;
        final int length = cs1.length();
        // sequences of different length can never match
        if ( length != cs2.length() ) {
            return false;
        }
        // compare position by position, stopping at the first difference
        int index = 0;
        while ( index < length ) {
            if ( cs1.charAt ( index ) != cs2.charAt ( index ) ) {
                return false;
            }
            index++;
        }
        return true;
    }

    /**
     * Convert Java string (UTF-16) to a Unicode scalar array (UTF-32).
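     * Note that if there are any non-BMP encoded characters present in the
     * input, then the number of entries in the output array will be less
     * than the number of elements in the input string. Any ill-formed
     * (isolated) surrogate is replaced by the substitution value unless
     * errorOnSubstitution is true.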
     * @param s input string
     * @param substitution value to substitute for ill-formed surrogate
     * @param errorOnSubstitution throw runtime exception (IllegalArgumentException) in
     * case this argument is true and a substitution would be attempted
     * @return output scalar array
     * @throws IllegalArgumentException if substitution required and errorOnSubstitution
     *   is not false
     */
    public static Integer[] toUTF32 ( String s, int substitution, boolean errorOnSubstitution )
        throws IllegalArgumentException {
        final int length = s.length();
        if ( length == 0 ) {
            return new Integer[0];
        }
        // worst case is one scalar per code unit; trimmed below if pairs were combined
        Integer[] scalars = new Integer [ length ];
        int count = 0;
        int index = 0;
        while ( index < length ) {
            int scalar = (int) s.charAt ( index++ );
            boolean isSurrogate = ( scalar >= 0xD800 ) && ( scalar < 0xE000 );
            if ( isSurrogate ) {
                if ( scalar >= 0xDC00 ) {
                    // a low surrogate that was not consumed as the second half of a pair
                    if ( errorOnSubstitution ) {
                        throw new IllegalArgumentException
                            ( "isolated low (trailing) surrogate" );
                    }
                    scalar = substitution;
                } else {
                    // a high surrogate must be followed immediately by a low surrogate
                    int next = ( index < length ) ? (int) s.charAt ( index ) : 0;
                    if ( ( next >= 0xDC00 ) && ( next < 0xE000 ) ) {
                        scalar = ( ( scalar - 0xD800 ) << 10 ) + ( next - 0xDC00 ) + 65536;
                        index++;    // the low surrogate has been consumed as well
                    } else {
                        if ( errorOnSubstitution ) {
                            throw new IllegalArgumentException
                                ( "isolated high (leading) surrogate" );
                        }
                        scalar = substitution;
                    }
                }
            }
            scalars [ count++ ] = scalar;
        }
        if ( count == length ) {
            return scalars;
        }
        // at least one pair was combined, so return an exactly sized copy
        Integer[] trimmed = new Integer [ count ];
        System.arraycopy ( scalars, 0, trimmed, 0, count );
        return trimmed;
    }

    /**
     * Convert a Unicode scalar array (UTF-32) to a Java string (UTF-16).
     * @param sa input scalar array
     * @return output (UTF-16) string
     * @throws IllegalArgumentException if an input scalar value is illegal,
     *   e.g., a surrogate or out of range
     */
    public static String fromUTF32 ( Integer[] sa ) throws IllegalArgumentException {
        StringBuilder sb = new StringBuilder ( sa.length );
        for ( int s : sa ) {
            // Reject anything outside 0..0x10FFFF. Negative values used to be silently
            // truncated to a char instead of being reported.
            if ( ( s < 0 ) || ( s > 0x10FFFF ) ) {
                throw new IllegalArgumentException
                    ( "illegal scalar value 0x" + String.format ( "%04X", Integer.valueOf ( s ) )
                      + "; out of range for UTF-16"  );
            }
            // surrogate code points are not scalar values and cannot be encoded
            if ( ( s >= 0xD800 ) && ( s <= 0xDFFF ) ) {
                throw new IllegalArgumentException
                    ( "illegal scalar value 0x" + String.format ( "%04X", Integer.valueOf ( s ) )
                      + "; cannot be UTF-16 surrogate" );
            }
            // One code unit for the whole BMP up to and including U+FFFF, a surrogate pair
            // above it. The former test 's < 65535' sent U+FFFF down the surrogate pair
            // path, where it was encoded as U+10FFFF.
            sb.appendCodePoint ( s );
        }
        return sb.toString();
    }
}