123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490 |
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /* $Id$ */
-
- package org.apache.fop.util;
-
- import java.util.Iterator;
- import java.util.NoSuchElementException;
-
/**
 * This class provides utilities to classify Unicode characters (whitespace,
 * explicit line breaks, alphabetic characters, surrogates), to render them as
 * XML numeric character references or debug strings, and to iterate over the
 * code points of a {@link CharSequence}.
 */
public class CharUtilities {

    /**
     * Character code used to signal a character boundary in
     * inline content, such as an inline with borders and padding
     * or a nested block object.
     */
    public static final char CODE_EOT = 0;

    /**
     * Character class: Unicode white space
     */
    public static final int UCWHITESPACE = 0;
    /**
     * Character class: Line feed
     */
    public static final int LINEFEED = 1;
    /**
     * Character class: Boundary between text runs
     */
    public static final int EOT = 2;
    /**
     * Character class: non-whitespace
     */
    public static final int NONWHITESPACE = 3;
    /**
     * Character class: XML whitespace
     */
    public static final int XMLWHITESPACE = 4;


    /** null char */
    public static final char NULL_CHAR = '\u0000';
    /** linefeed character */
    public static final char LINEFEED_CHAR = '\n';
    /** carriage return */
    public static final char CARRIAGE_RETURN = '\r';
    /** normal tab */
    public static final char TAB = '\t';
    /** normal space */
    public static final char SPACE = '\u0020';
    /** non-breaking space */
    public static final char NBSPACE = '\u00A0';
    /** next line control character */
    public static final char NEXT_LINE = '\u0085';
    /** zero-width space */
    public static final char ZERO_WIDTH_SPACE = '\u200B';
    /** word joiner */
    public static final char WORD_JOINER = '\u2060';
    /** zero-width joiner */
    public static final char ZERO_WIDTH_JOINER = '\u200D';
    /** left-to-right mark */
    public static final char LRM = '\u200E';
    /** right-to-left mark (U+200F; U+202F would be the narrow no-break space) */
    public static final char RLM = '\u200F';
    /** left-to-right embedding */
    public static final char LRE = '\u202A';
    /** right-to-left embedding */
    public static final char RLE = '\u202B';
    /** pop directional formatting */
    public static final char PDF = '\u202C';
    /** left-to-right override */
    public static final char LRO = '\u202D';
    /** right-to-left override */
    public static final char RLO = '\u202E';
    /** zero-width no-break space (= byte order mark) */
    public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
    /** soft hyphen */
    public static final char SOFT_HYPHEN = '\u00AD';
    /** line-separator */
    public static final char LINE_SEPARATOR = '\u2028';
    /** paragraph-separator */
    public static final char PARAGRAPH_SEPARATOR = '\u2029';
    /** missing ideograph */
    public static final char MISSING_IDEOGRAPH = '\u25A1';
    /** Ideographic space */
    public static final char IDEOGRAPHIC_SPACE = '\u3000';
    /** Object replacement character */
    public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC';
    /** Unicode value indicating that the character is "not a character". */
    public static final char NOT_A_CHARACTER = '\uFFFF';

    /**
     * Utility class: the constructor always throws, which prevents
     * instantiation even through a subclass.
     */
    protected CharUtilities() {
        throw new UnsupportedOperationException();
    }

    /**
     * Return the appropriate character class constant for the type
     * of the passed character.
     * @param c character to inspect
     * @return one of {@link #EOT}, {@link #LINEFEED}, {@link #XMLWHITESPACE},
     *         {@link #UCWHITESPACE} or {@link #NONWHITESPACE}
     */
    public static int classOf(int c) {
        switch (c) {
        case CODE_EOT:
            return EOT;
        case LINEFEED_CHAR:
            return LINEFEED;
        case SPACE:
        case CARRIAGE_RETURN:
        case TAB:
            return XMLWHITESPACE;
        default:
            return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE;
        }
    }


    /**
     * Helper method to determine if the character is a
     * space with normal behavior. Normal behavior means that
     * it's not non-breaking.
     * @param c character to inspect
     * @return True if the character is a normal space
     */
    public static boolean isBreakableSpace(int c) {
        return (c == SPACE || isFixedWidthSpace(c));
    }

    /**
     * Method to determine if the character is a zero-width space.
     * @param c the character to check
     * @return true if the character is a zero-width space
     */
    public static boolean isZeroWidthSpace(int c) {
        return c == ZERO_WIDTH_SPACE           // U+200B
            || c == WORD_JOINER                // U+2060
            || c == ZERO_WIDTH_NOBREAK_SPACE;  // U+FEFF (also used as BOM)
    }

    /**
     * Method to determine if the character is a (breakable) fixed-width space.
     * @param c the character to check
     * @return true if the character has a fixed-width
     */
    public static boolean isFixedWidthSpace(int c) {
        // Covered range U+2000..U+200B:
        //   U+2000 en quad             U+2001 em quad
        //   U+2002 en space            U+2003 em space
        //   U+2004 three-per-em space  U+2005 four-per-em space
        //   U+2006 six-per-em space    U+2007 figure space
        //   U+2008 punctuation space   U+2009 thin space
        //   U+200A hair space          U+200B zero width space
        // plus U+3000 ideographic space.
        return (c >= '\u2000' && c <= '\u200B')
            || c == IDEOGRAPHIC_SPACE;
    }

    /**
     * Method to determine if the character is a nonbreaking
     * space.
     * @param c character to check
     * @return True if the character is a nbsp
     */
    public static boolean isNonBreakableSpace(int c) {
        return
            (c == NBSPACE                       // no-break space
            || c == '\u202F'                    // narrow no-break space
            || c == IDEOGRAPHIC_SPACE           // ideographic space
            || c == WORD_JOINER                 // word joiner
            || c == ZERO_WIDTH_NOBREAK_SPACE);  // zero width no-break space
    }

    /**
     * Method to determine if the character is an adjustable
     * space.
     * @param c character to check
     * @return True if the character is adjustable
     */
    public static boolean isAdjustableSpace(int c) {
        //TODO: are there other kinds of adjustable spaces?
        return
            (c == SPACE         // normal space
            || c == NBSPACE);   // no-break space
    }

    /**
     * Determines if the character represents any kind of space.
     * @param c character to check
     * @return True if the character represents any kind of space
     */
    public static boolean isAnySpace(int c) {
        return (isBreakableSpace(c) || isNonBreakableSpace(c));
    }

    /**
     * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
     * Only the general categories Lu, Ll, Lt, Lm, Lo and Nl are considered; the
     * Other_Alphabetic property is not consulted.
     * @param c the character (Unicode code point, may be outside the BMP)
     * @return true if the character is "Alphabetic"
     */
    public static boolean isAlphabetic(int c) {
        //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
        //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        // Use the int overload: casting to char would truncate supplementary
        // code points and classify an unrelated BMP character instead.
        int generalCategory = Character.getType(c);
        switch (generalCategory) {
        case Character.UPPERCASE_LETTER: //Lu
        case Character.LOWERCASE_LETTER: //Ll
        case Character.TITLECASE_LETTER: //Lt
        case Character.MODIFIER_LETTER:  //Lm
        case Character.OTHER_LETTER:     //Lo
        case Character.LETTER_NUMBER:    //Nl
            return true;
        default:
            //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
            //Other_Alphabetic contains mostly more exotic characters
            return false;
        }
    }

    /**
     * Indicates whether the given character is an explicit break-character
     * @param c the character to check
     * @return true if the character represents an explicit break
     */
    public static boolean isExplicitBreak(int c) {
        return (c == LINEFEED_CHAR
            || c == CARRIAGE_RETURN
            || c == NEXT_LINE
            || c == LINE_SEPARATOR
            || c == PARAGRAPH_SEPARATOR);
    }

    /**
     * Convert a single unicode scalar value to an XML numeric character
     * reference. If in the BMP, four digits are used, otherwise 6 digits are used.
     * @param c a unicode scalar value
     * @return a string representing a numeric character reference
     */
    public static String charToNCRef(int c) {
        StringBuilder sb = new StringBuilder();
        // Digits are produced least-significant first and reversed at the end.
        for (int i = 0, nDigits = (c > 0xFFFF) ? 6 : 4; i < nDigits; i++, c >>= 4) {
            int d = c & 0xF;
            char hd;
            if (d < 10) {
                hd = (char) ('0' + d);
            } else {
                hd = (char) ('A' + (d - 10));
            }
            sb.append(hd);
        }
        return "&#x" + sb.reverse() + ";";
    }

    /**
     * Convert a string to a sequence of ASCII or XML numeric character references.
     * Printable ASCII characters are copied, except that the XML markup characters
     * '&lt;', '&gt;' and '&amp;' are replaced by their predefined entities. Every
     * other code point is written as a numeric character reference; a well-formed
     * surrogate pair yields a single reference to the supplementary code point.
     * @param s a java string (encoded in UTF-16), may be null
     * @return a string representing a sequence of numeric character reference or
     * ASCII characters; empty if {@code s} is null
     */
    public static String toNCRefs(String s) {
        StringBuilder sb = new StringBuilder();
        if (s != null) {
            int i = 0;
            while (i < s.length()) {
                // Step by code point so that surrogate pairs are not split.
                int c = s.codePointAt(i);
                i += Character.charCount(c);
                if ((c >= 32) && (c < 127)) {
                    if (c == '<') {
                        sb.append("&lt;");
                    } else if (c == '>') {
                        sb.append("&gt;");
                    } else if (c == '&') {
                        sb.append("&amp;");
                    } else {
                        sb.append((char) c);
                    }
                } else {
                    sb.append(charToNCRef(c));
                }
            }
        }
        return sb.toString();
    }

    /**
     * Pad a string S on left out to width W using padding character PAD.
     * A string that is already at least {@code width} long is returned unchanged.
     * @param s string to pad
     * @param width width of field to add padding
     * @param pad character to use for padding
     * @return padded string
     */
    public static String padLeft(String s, int width, char pad) {
        StringBuilder sb = new StringBuilder();
        for (int i = s.length(); i < width; i++) {
            sb.append(pad);
        }
        sb.append(s);
        return sb.toString();
    }

    /**
     * Format character for debugging output, which is prefixed with "0x", padded left with '0'
     * and either 4 or 6 hex characters in width according to whether it is in the BMP or not.
     * Values outside the Unicode code space (negative or above U+10FFFF) are reported as
     * "!NOT A CHARACTER!".
     * @param c character code
     * @return formatted character string
     */
    public static String format(int c) {
        if (Character.isValidCodePoint(c)) {
            return "0x" + padLeft(Integer.toString(c, 16), (c <= 0xFFFF) ? 4 : 6, '0');
        } else {
            return "!NOT A CHARACTER!";
        }
    }

    /**
     * Determine if two character sequences contain the same characters.
     * @param cs1 first character sequence (must not be null)
     * @param cs2 second character sequence (must not be null)
     * @return true if both sequences have same length and same character sequence
     */
    public static boolean isSameSequence(CharSequence cs1, CharSequence cs2) {
        assert cs1 != null;
        assert cs2 != null;
        if (cs1.length() != cs2.length()) {
            return false;
        } else {
            for (int i = 0, n = cs1.length(); i < n; i++) {
                if (cs1.charAt(i) != cs2.charAt(i)) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * Determine whether the specified character (Unicode code point) is in the Basic
     * Multilingual Plane (BMP). Such code points can be represented using a single {@code char}.
     *
     * @see Character#isBmpCodePoint(int) from Java 1.7
     * @param codePoint the character (Unicode code point) to be tested
     * @return {@code true} if the specified code point is between {@code Character#MIN_VALUE} and
     *         {@code Character#MAX_VALUE} inclusive; {@code false} otherwise
     */
    public static boolean isBmpCodePoint(int codePoint) {
        return codePoint >>> 16 == 0;
    }

    /**
     * Returns 1 if codePoint not in the BMP. This function is particularly useful in for
     * loops over strings where, in presence of surrogate pairs, you need to skip one loop.
     *
     * @param codePoint the character (Unicode code point) to be tested
     * @return 0 if {@link #isBmpCodePoint(int)} holds for codePoint, 1 otherwise
     */
    public static int incrementIfNonBMP(int codePoint) {
        return isBmpCodePoint(codePoint) ? 0 : 1;
    }

    /**
     * Determine if the given character is part of a surrogate pair.
     *
     * @param ch character to be checked
     * @return true if ch is a high surrogate or a low surrogate
     */
    public static boolean isSurrogatePair(char ch) {
        return Character.isHighSurrogate(ch) || Character.isLowSurrogate(ch);
    }

    /**
     * Tells whether there is a surrogate pair starting from the given index in the {@link CharSequence}. If the
     * character at index is a high surrogate then the character at index+1 is checked to be a low surrogate. If a
     * malformed surrogate pair is encountered then an {@link IllegalArgumentException} is thrown.
     * <pre>
     * high surrogate [0xD800 - 0xDBFF]
     * low surrogate  [0xDC00 - 0xDFFF]
     * </pre>
     *
     * @param chars CharSequence to check
     * @param index index in the CharSequence where to start the check
     * @throws IllegalArgumentException if there is a wrong usage of surrogate pairs (isolated high or
     *         low surrogate, including a high surrogate in the last position of the sequence)
     * @return true if there is a well-formed surrogate pair at index
     */
    public static boolean containsSurrogatePairAt(CharSequence chars, int index) {
        char ch = chars.charAt(index);

        if (Character.isHighSurrogate(ch)) {
            // ">=": index + 1 == length means there is no following char at all.
            if ((index + 1) >= chars.length()) {
                throw new IllegalArgumentException(
                    "ill-formed UTF-16 sequence, contains isolated high surrogate at end of sequence");
            }

            if (Character.isLowSurrogate(chars.charAt(index + 1))) {
                return true;
            }

            throw new IllegalArgumentException(
                "ill-formed UTF-16 sequence, contains isolated high surrogate at index " + index);

        } else if (Character.isLowSurrogate(ch)) {
            throw new IllegalArgumentException(
                "ill-formed UTF-16 sequence, contains isolated low surrogate at index " + index);
        }

        return false;
    }

    /**
     * Creates an iterable over the code points of a {@link CharSequence}.
     *
     * @see #codepointsIter(CharSequence, int, int)
     * @param s {@link CharSequence} to iterate
     * @return code point iterable for the given {@link CharSequence}.
     */
    public static Iterable<Integer> codepointsIter(final CharSequence s) {
        return codepointsIter(s, 0, s.length());
    }

    /**
     * Creates an iterable over the code points of a sub-CharSequence. Only chars in
     * {@code [beginIndex, endIndex)} are read: a high surrogate at {@code endIndex - 1}
     * is returned as-is and never combined with a char outside the range. Unpaired
     * surrogates are returned as their own value.
     *
     * @see <a href="http://bugs.java.com/bugdatabase/view_bug.do?bug_id=5003547">Bug JDK-5003547</a>
     * @param s {@link CharSequence} to iterate
     * @param beginIndex lower range (inclusive)
     * @param endIndex upper range (exclusive)
     * @throws StringIndexOutOfBoundsException if beginIndex is negative, endIndex is larger than
     *         the length of s, or beginIndex is larger than endIndex
     * @return code point iterable for the given sub-CharSequence.
     */
    public static Iterable<Integer> codepointsIter(final CharSequence s, final int beginIndex, final int endIndex) {
        if (beginIndex < 0) {
            throw new StringIndexOutOfBoundsException(beginIndex);
        }
        if (endIndex > s.length()) {
            throw new StringIndexOutOfBoundsException(endIndex);
        }
        int subLen = endIndex - beginIndex;
        if (subLen < 0) {
            throw new StringIndexOutOfBoundsException(subLen);
        }

        return new Iterable<Integer>() {
            @Override
            public Iterator<Integer> iterator() {
                return new Iterator<Integer>() {
                    private int nextIndex = beginIndex;

                    @Override
                    public boolean hasNext() {
                        return nextIndex < endIndex;
                    }

                    @Override
                    public Integer next() {
                        if (!hasNext()) {
                            // Findbugs wants this: IT_NO_SUCH_ELEMENT
                            throw new NoSuchElementException();
                        }
                        char first = s.charAt(nextIndex++);
                        // Combine into a supplementary code point only when the low
                        // surrogate is still inside the requested range.
                        if (Character.isHighSurrogate(first) && nextIndex < endIndex) {
                            char second = s.charAt(nextIndex);
                            if (Character.isLowSurrogate(second)) {
                                nextIndex++;
                                return Character.toCodePoint(first, second);
                            }
                        }
                        return (int) first;
                    }

                    @Override
                    public void remove() {
                        throw new UnsupportedOperationException();
                    }
                };
            }
        };
    }
}
|