You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

CharUtilities.java 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /* $Id$ */
  18. package org.apache.fop.util;
  19. import java.util.Iterator;
  20. import java.util.NoSuchElementException;
  /**
   * Static utilities for working with Unicode characters: classifying the
   * various kinds of white space and explicit breaks, testing for alphabetic
   * characters and surrogates, iterating over code points, and formatting
   * characters as XML numeric character references or debugging strings.
   */
  public class CharUtilities {

      /**
       * Character code used to signal a character boundary in
       * inline content, such as an inline with borders and padding
       * or a nested block object.
       */
      public static final char CODE_EOT = 0;

      /** Character class: Unicode white space */
      public static final int UCWHITESPACE = 0;
      /** Character class: Line feed */
      public static final int LINEFEED = 1;
      /** Character class: Boundary between text runs */
      public static final int EOT = 2;
      /** Character class: non-whitespace */
      public static final int NONWHITESPACE = 3;
      /** Character class: XML whitespace */
      public static final int XMLWHITESPACE = 4;

      /** null char */
      public static final char NULL_CHAR = '\u0000';
      /** linefeed character */
      public static final char LINEFEED_CHAR = '\n';
      /** carriage return */
      public static final char CARRIAGE_RETURN = '\r';
      /** normal tab */
      public static final char TAB = '\t';
      /** normal space */
      public static final char SPACE = '\u0020';
      /** non-breaking space */
      public static final char NBSPACE = '\u00A0';
      /** next line control character */
      public static final char NEXT_LINE = '\u0085';
      /** zero-width space */
      public static final char ZERO_WIDTH_SPACE = '\u200B';
      /** word joiner */
      public static final char WORD_JOINER = '\u2060';
      /** zero-width joiner */
      public static final char ZERO_WIDTH_JOINER = '\u200D';
      /** left-to-right mark */
      public static final char LRM = '\u200E';
      /** right-to-left mark (U+200F; U+202F is the narrow no-break space) */
      public static final char RLM = '\u200F';
      /** left-to-right embedding */
      public static final char LRE = '\u202A';
      /** right-to-left embedding */
      public static final char RLE = '\u202B';
      /** pop directional formatting */
      public static final char PDF = '\u202C';
      /** left-to-right override */
      public static final char LRO = '\u202D';
      /** right-to-left override */
      public static final char RLO = '\u202E';
      /** zero-width no-break space (= byte order mark) */
      public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
      /** soft hyphen */
      public static final char SOFT_HYPHEN = '\u00AD';
      /** line-separator */
      public static final char LINE_SEPARATOR = '\u2028';
      /** paragraph-separator */
      public static final char PARAGRAPH_SEPARATOR = '\u2029';
      /** missing ideograph */
      public static final char MISSING_IDEOGRAPH = '\u25A1';
      /** Ideographic space */
      public static final char IDEOGRAPHIC_SPACE = '\u3000';
      /** Object replacement character */
      public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC';
      /** Unicode value indicating that the character is "not a character". */
      public static final char NOT_A_CHARACTER = '\uFFFF';

      /** Upper-case hexadecimal digits, indexed by nibble value. */
      private static final String HEX_DIGITS = "0123456789ABCDEF";

      /**
       * Utility class: Constructor prevents instantiating when subclassed.
       */
      protected CharUtilities() {
          throw new UnsupportedOperationException();
      }

      /**
       * Return the appropriate CharClass constant for the type
       * of the passed character.
       * @param c character to inspect
       * @return one of {@link #EOT}, {@link #LINEFEED}, {@link #XMLWHITESPACE},
       *         {@link #UCWHITESPACE} or {@link #NONWHITESPACE}
       */
      public static int classOf(int c) {
          switch (c) {
          case CODE_EOT:
              return EOT;
          case LINEFEED_CHAR:
              return LINEFEED;
          case SPACE:
          case CARRIAGE_RETURN:
          case TAB:
              return XMLWHITESPACE;
          default:
              return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE;
          }
      }

      /**
       * Helper method to determine if the character is a
       * space with normal behavior. Normal behavior means that
       * it's not non-breaking.
       * @param c character to inspect
       * @return True if the character is a normal space
       */
      public static boolean isBreakableSpace(int c) {
          return (c == SPACE || isFixedWidthSpace(c));
      }

      /**
       * Method to determine if the character is a zero-width space.
       * @param c the character to check
       * @return true if the character is a zero-width space
       */
      public static boolean isZeroWidthSpace(int c) {
          return c == ZERO_WIDTH_SPACE            // 200Bh
              || c == WORD_JOINER                 // 2060h
              || c == ZERO_WIDTH_NOBREAK_SPACE;   // FEFFh (also used as BOM)
      }

      /**
       * Method to determine if the character is a (breakable) fixed-width space.
       * The accepted characters are U+2000 (en quad) through U+200B (zero width
       * space) -- that is en quad, em quad, en space, em space, three-per-em,
       * four-per-em and six-per-em space, figure space, punctuation space, thin
       * space, hair space and zero width space -- plus U+3000 (ideographic space).
       * @param c the character to check
       * @return true if the character has a fixed-width
       */
      public static boolean isFixedWidthSpace(int c) {
          return (c >= '\u2000' && c <= '\u200B')
              || c == '\u3000';
      }

      /**
       * Method to determine if the character is a nonbreaking
       * space.
       * @param c character to check
       * @return True if the character is a nbsp
       */
      public static boolean isNonBreakableSpace(int c) {
          return
              (c == NBSPACE                        // no-break space
              || c == '\u202F'                     // narrow no-break space
              || c == '\u3000'                     // ideographic space
              || c == WORD_JOINER                  // word joiner
              || c == ZERO_WIDTH_NOBREAK_SPACE);   // zero width no-break space
      }

      /**
       * Method to determine if the character is an adjustable
       * space.
       * @param c character to check
       * @return True if the character is adjustable
       */
      public static boolean isAdjustableSpace(int c) {
          //TODO: are there other kinds of adjustable spaces?
          return
              (c == '\u0020'        // normal space
              || c == NBSPACE);     // no-break space
      }

      /**
       * Determines if the character represents any kind of space.
       * @param c character to check
       * @return True if the character represents any kind of space
       */
      public static boolean isAnySpace(int c) {
          return (isBreakableSpace(c) || isNonBreakableSpace(c));
      }

      /**
       * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
       * The check is based on the general category only (Lu, Ll, Lt, Lm, Lo, Nl);
       * characters that are alphabetic solely through Other_Alphabetic are not recognized.
       * @param c the character (Unicode code point, may be outside the BMP)
       * @return true if the character is "Alphabetic"
       */
      public static boolean isAlphabetic(int c) {
          //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
          //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
          // Use the int overload: casting to char would truncate supplementary
          // code points and classify an unrelated BMP character instead.
          int generalCategory = Character.getType(c);
          switch (generalCategory) {
          case Character.UPPERCASE_LETTER: //Lu
          case Character.LOWERCASE_LETTER: //Ll
          case Character.TITLECASE_LETTER: //Lt
          case Character.MODIFIER_LETTER:  //Lm
          case Character.OTHER_LETTER:     //Lo
          case Character.LETTER_NUMBER:    //Nl
              return true;
          default:
              //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
              //Other_Alphabetic contains mostly more exotic characters
              return false;
          }
      }

      /**
       * Indicates whether the given character is an explicit break-character
       * @param c the character to check
       * @return true if the character represents an explicit break
       */
      public static boolean isExplicitBreak(int c) {
          return (c == LINEFEED_CHAR
              || c == CARRIAGE_RETURN
              || c == NEXT_LINE
              || c == LINE_SEPARATOR
              || c == PARAGRAPH_SEPARATOR);
      }

      /**
       * Convert a single unicode scalar value to an XML numeric character
       * reference. If in the BMP, four digits are used, otherwise 6 digits are used.
       * Digits are upper-case hexadecimal, e.g. {@code &#x00E9;}.
       * @param c a unicode scalar value
       * @return a string representing a numeric character reference
       */
      public static String charToNCRef(int c) {
          int nDigits = (c > 0xFFFF) ? 6 : 4;
          char[] digits = new char[nDigits];
          int rest = c;
          // fill from the least significant nibble backwards
          for (int i = nDigits - 1; i >= 0; i--, rest >>= 4) {
              digits[i] = HEX_DIGITS.charAt(rest & 0xF);
          }
          return "&#x" + new String(digits) + ";";
      }

      /**
       * Convert a string to a sequence of ASCII or XML numeric character references.
       * Printable ASCII characters (32 to 126) are copied, except that
       * {@code <}, {@code >} and {@code &} are replaced by their predefined entities.
       * Every other character is written as a numeric character reference; a
       * surrogate pair is written as one reference to the supplementary code point.
       * @param s a java string (encoded in UTF-16), may be null
       * @return a string representing a sequence of numeric character reference or
       * ASCII characters; the empty string if {@code s} is null
       */
      public static String toNCRefs(String s) {
          if (s == null) {
              return "";
          }
          StringBuilder sb = new StringBuilder(s.length());
          int i = 0;
          while (i < s.length()) {
              // iterate by code point so that surrogate pairs stay together
              int cp = s.codePointAt(i);
              i += Character.charCount(cp);
              if ((cp >= 32) && (cp < 127)) {
                  if (cp == '<') {
                      sb.append("&lt;");
                  } else if (cp == '>') {
                      sb.append("&gt;");
                  } else if (cp == '&') {
                      sb.append("&amp;");
                  } else {
                      sb.append((char) cp);
                  }
              } else {
                  sb.append(charToNCRef(cp));
              }
          }
          return sb.toString();
      }

      /**
       * Pad a string S on left out to width W using padding character PAD.
       * A string that is already at least {@code width} long is returned unchanged.
       * @param s string to pad
       * @param width width of field to add padding
       * @param pad character to use for padding
       * @return padded string
       */
      public static String padLeft(String s, int width, char pad) {
          StringBuilder sb = new StringBuilder(Math.max(width, s.length()));
          for (int i = s.length(); i < width; i++) {
              sb.append(pad);
          }
          sb.append(s);
          return sb.toString();
      }

      /**
       * Format character for debugging output, which it is prefixed with "0x", padded left with '0'
       * and either 4 or 6 (lower-case) hex characters in width according to whether it is in the
       * BMP or not.
       * @param c character code
       * @return formatted character string, or {@code "!NOT A CHARACTER!"} if {@code c} is
       * negative or greater than U+10FFFF
       */
      public static String format(int c) {
          // isValidCodePoint also rejects negative values, which would otherwise
          // be rendered with an embedded minus sign
          if (Character.isValidCodePoint(c)) {
              return "0x" + padLeft(Integer.toString(c, 16), isBmpCodePoint(c) ? 4 : 6, '0');
          } else {
              return "!NOT A CHARACTER!";
          }
      }

      /**
       * Determine if two character sequences contain the same characters.
       * @param cs1 first character sequence (must not be null)
       * @param cs2 second character sequence (must not be null)
       * @return true if both sequences have same length and same character sequence
       */
      public static boolean isSameSequence(CharSequence cs1, CharSequence cs2) {
          assert cs1 != null;
          assert cs2 != null;
          int n = cs1.length();
          if (n != cs2.length()) {
              return false;
          }
          for (int i = 0; i < n; i++) {
              if (cs1.charAt(i) != cs2.charAt(i)) {
                  return false;
              }
          }
          return true;
      }

      /**
       * Determine whether the specified character (Unicode code point) is in the Basic
       * Multilingual Plane (BMP). Such code points can be represented using a single {@code char}.
       *
       * @see Character#isBmpCodePoint(int) from Java 1.7
       * @param codePoint the character (Unicode code point) to be tested
       * @return {@code true} if the specified code point is between {@link Character#MIN_VALUE} and
       * {@link Character#MAX_VALUE} inclusive; {@code false} otherwise
       */
      public static boolean isBmpCodePoint(int codePoint) {
          return codePoint >>> 16 == 0;
      }

      /**
       * Returns 1 if codePoint not in the BMP. This function is particularly useful in for
       * loops over strings where, in presence of surrogate pairs, you need to skip one loop.
       *
       * @param codePoint the character (Unicode code point) to be tested
       * @return 1 if codePoint is outside the BMP, 0 otherwise
       */
      public static int incrementIfNonBMP(int codePoint) {
          return isBmpCodePoint(codePoint) ? 0 : 1;
      }

      /**
       * Determine if the given character is part of a surrogate pair.
       *
       * @param ch character to be checked
       * @return true if ch is a high surrogate or a low surrogate
       */
      public static boolean isSurrogatePair(char ch) {
          return Character.isHighSurrogate(ch) || Character.isLowSurrogate(ch);
      }

      /**
       * Tells whether there is a surrogate pair starting from the given index in the {@link CharSequence}. If the
       * character at index is a high surrogate then the character at index+1 is checked to be a low surrogate. If a
       * malformed surrogate pair is encountered then an {@link IllegalArgumentException} is thrown.
       * <pre>
       * high surrogate [0xD800 - 0xDBFF]
       * low surrogate  [0xDC00 - 0xDFFF]
       * </pre>
       *
       * @param chars CharSequence to check
       * @param index index in the CharSequence where to start the check
       * @throws IllegalArgumentException if there is an isolated high or low surrogate at index
       * @return true if there is a well-formed surrogate pair at index
       */
      public static boolean containsSurrogatePairAt(CharSequence chars, int index) {
          char ch = chars.charAt(index);
          if (Character.isHighSurrogate(ch)) {
              // '>=': index + 1 == length means the high surrogate is the last char
              if ((index + 1) >= chars.length()) {
                  throw new IllegalArgumentException(
                          "ill-formed UTF-16 sequence, contains isolated high surrogate at end of sequence");
              }
              if (Character.isLowSurrogate(chars.charAt(index + 1))) {
                  return true;
              }
              throw new IllegalArgumentException(
                      "ill-formed UTF-16 sequence, contains isolated high surrogate at index " + index);
          } else if (Character.isLowSurrogate(ch)) {
              throw new IllegalArgumentException(
                      "ill-formed UTF-16 sequence, contains isolated low surrogate at index " + index);
          }
          return false;
      }

      /**
       * Creates an iterable over the code points of a {@link CharSequence}.
       *
       * @see #codepointsIter(CharSequence, int, int)
       * @param s {@link CharSequence} to iterate
       * @return code point iterable for the given {@link CharSequence}.
       */
      public static Iterable<Integer> codepointsIter(final CharSequence s) {
          return codepointsIter(s, 0, s.length());
      }

      /**
       * Creates an iterable over the code points of a sub-CharSequence. Each call to
       * {@code iterator()} starts a fresh traversal at {@code beginIndex}; the iterators
       * do not support {@code remove()}.
       * <p>
       * Note: code points are read with {@link Character#codePointAt(CharSequence, int)},
       * which is not limited to {@code endIndex}; a high surrogate at {@code endIndex - 1}
       * that is followed by a low surrogate is therefore returned as the combined code point.
       *
       * @see <a href="http://bugs.java.com/bugdatabase/view_bug.do?bug_id=5003547">Bug JDK-5003547</a>
       * @param s {@link CharSequence} to iterate
       * @param beginIndex lower range (inclusive)
       * @param endIndex upper range (exclusive)
       * @throws StringIndexOutOfBoundsException if {@code beginIndex} is negative,
       * {@code endIndex} is larger than the length of {@code s}, or
       * {@code beginIndex} is larger than {@code endIndex}
       * @return code point iterable for the given sub-CharSequence.
       */
      public static Iterable<Integer> codepointsIter(final CharSequence s, final int beginIndex, final int endIndex) {
          if (beginIndex < 0) {
              throw new StringIndexOutOfBoundsException(beginIndex);
          }
          if (endIndex > s.length()) {
              throw new StringIndexOutOfBoundsException(endIndex);
          }
          int subLen = endIndex - beginIndex;
          if (subLen < 0) {
              throw new StringIndexOutOfBoundsException(subLen);
          }
          return new Iterable<Integer>() {
              public Iterator<Integer> iterator() {
                  return new Iterator<Integer>() {
                      /** index of the first char of the next code point to return */
                      private int nextIndex = beginIndex;

                      public boolean hasNext() {
                          return nextIndex < endIndex;
                      }

                      public Integer next() {
                          if (!hasNext()) {
                              // Findbugs wants this: IT_NO_SUCH_ELEMENT
                              throw new NoSuchElementException();
                          }
                          int result = Character.codePointAt(s, nextIndex);
                          // advance by 1 or 2 chars depending on the code point
                          nextIndex += Character.charCount(result);
                          return result;
                      }

                      public void remove() {
                          throw new UnsupportedOperationException();
                      }
                  };
              }
          };
      }
  }