You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

UTF32.java 4.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /* $Id$ */
  18. package org.apache.fop.complexscripts.util;
  19. import org.apache.fop.util.CharUtilities;
  20. /**
  21. * <p>UTF32 related utilities.</p>
  22. *
  23. * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
  24. */
  25. public final class UTF32 {
  26. private UTF32() {
  27. }
  28. /**
  29. * Convert Java string (UTF-16) to a Unicode scalar array (UTF-32).
  30. * Note that if there are any non-BMP encoded characters present in the
  31. * input, then the number of entries in the output array will be less
  32. * than the number of elements in the input string. Any
  33. * @param s input string
  34. * @param substitution value to substitute for ill-formed surrogate
  35. * @param errorOnSubstitution throw runtime exception (IllegalArgumentException) in
  36. * case this argument is true and a substitution would be attempted
  37. * @return output scalar array
  38. * @throws IllegalArgumentException if substitution required and errorOnSubstitution
  39. * is not false
  40. */
  41. public static Integer[] toUTF32(String s, int substitution, boolean errorOnSubstitution)
  42. throws IllegalArgumentException {
  43. int n;
  44. if ((n = s.length()) == 0) {
  45. return new Integer[0];
  46. } else {
  47. Integer[] sa = new Integer [ n ];
  48. int k = 0;
  49. for (int i = 0; i < n; i++) {
  50. int c = (int) s.charAt(i);
  51. if ((c >= 0xD800) && (c < 0xE000)) {
  52. int s1 = c;
  53. int s2 = ((i + 1) < n) ? (int) s.charAt(i + 1) : 0;
  54. if (s1 < 0xDC00) {
  55. if ((s2 >= 0xDC00) && (s2 < 0xE000)) {
  56. c = ((s1 - 0xD800) << 10) + (s2 - 0xDC00) + 65536;
  57. i++;
  58. } else {
  59. if (errorOnSubstitution) {
  60. throw new IllegalArgumentException(
  61. "isolated high (leading) surrogate");
  62. } else {
  63. c = substitution;
  64. }
  65. }
  66. } else {
  67. if (errorOnSubstitution) {
  68. throw new IllegalArgumentException(
  69. "isolated low (trailing) surrogate");
  70. } else {
  71. c = substitution;
  72. }
  73. }
  74. }
  75. sa[k++] = c;
  76. }
  77. if (k == n) {
  78. return sa;
  79. } else {
  80. Integer[] na = new Integer [ k ];
  81. System.arraycopy(sa, 0, na, 0, k);
  82. return na;
  83. }
  84. }
  85. }
  86. /**
  87. * Convert a Unicode scalar array (UTF-32) a Java string (UTF-16).
  88. * @param sa input scalar array
  89. * @return output (UTF-16) string
  90. * @throws IllegalArgumentException if an input scalar value is illegal,
  91. * e.g., a surrogate or out of range
  92. */
  93. public static String fromUTF32(Integer[] sa) throws IllegalArgumentException {
  94. StringBuffer sb = new StringBuffer();
  95. for (int s : sa) {
  96. if (s < 65535) {
  97. if ((s < 0xD800) || (s > 0xDFFF)) {
  98. sb.append((char) s);
  99. } else {
  100. String ncr = CharUtilities.charToNCRef(s);
  101. throw new IllegalArgumentException(
  102. "illegal scalar value 0x" + ncr.substring(2, ncr.length() - 1)
  103. + "; cannot be UTF-16 surrogate");
  104. }
  105. } else if (s < 1114112) {
  106. int s1 = (((s - 65536) >> 10) & 0x3FF) + 0xD800;
  107. int s2 = (((s - 65536) >> 0) & 0x3FF) + 0xDC00;
  108. sb.append((char) s1);
  109. sb.append((char) s2);
  110. } else {
  111. String ncr = CharUtilities.charToNCRef(s);
  112. throw new IllegalArgumentException(
  113. "illegal scalar value 0x" + ncr.substring(2, ncr.length() - 1)
  114. + "; out of range for UTF-16");
  115. }
  116. }
  117. return sb.toString();
  118. }
  119. }