You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PDFToUnicodeCMap.java 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. /* $Id$ */
  18. package org.apache.fop.pdf;
  19. import java.io.IOException;
  20. import java.io.Writer;
  21. /**
  22. * Class representing ToUnicode CMaps.
  23. * Here are some documentation resources:
  24. * <ul>
  25. * <li>PDF Reference, Second Edition, Section 5.6.4, for general information
  26. * about CMaps in PDF Files.</li>
  27. * <li>PDF Reference, Second Edition, Section 5.9, for specific information
  28. * about ToUnicodeCMaps in PDF Files.</li>
  29. * <li>
  30. * <a href="http://partners.adobe.com/asn/developer/pdfs/tn/5411.ToUnicode.pdf">
  31. * Adobe Technical Note #5411, "ToUnicode Mapping File Tutorial"</a>.
  32. * </ul>
  33. */
  34. public class PDFToUnicodeCMap extends PDFCMap {
  35. /**
  36. * The array of Unicode characters ordered by character code
  37. * (maps from character code to Unicode code point).
  38. */
  39. protected char[] unicodeCharMap;
  40. private boolean singleByte;
  41. /**
  42. * Constructor.
  43. *
  44. * @param unicodeCharMap An array of Unicode characters ordered by character code
  45. * (maps from character code to Unicode code point)
  46. * @param name One of the registered names found in Table 5.14 in PDF
  47. * Reference, Second Edition.
  48. * @param sysInfo The attributes of the character collection of the CIDFont.
  49. * @param singleByte true for single-byte, false for double-byte
  50. */
  51. public PDFToUnicodeCMap(char[] unicodeCharMap, String name, PDFCIDSystemInfo sysInfo,
  52. boolean singleByte) {
  53. super(name, sysInfo);
  54. if (singleByte && unicodeCharMap.length > 256) {
  55. throw new IllegalArgumentException("unicodeCharMap may not contain more than"
  56. + " 256 characters for single-byte encodings");
  57. }
  58. this.unicodeCharMap = unicodeCharMap;
  59. this.singleByte = singleByte;
  60. }
  61. /** {@inheritDoc} */
  62. protected CMapBuilder createCMapBuilder(Writer writer) {
  63. return new ToUnicodeCMapBuilder(writer);
  64. }
  65. class ToUnicodeCMapBuilder extends CMapBuilder {
  66. public ToUnicodeCMapBuilder(Writer writer) {
  67. super(writer, null);
  68. }
  69. /**
  70. * Writes the CMap to a Writer.
  71. * @throws IOException if an I/O error occurs
  72. */
  73. public void writeCMap() throws IOException {
  74. writeCIDInit();
  75. writeCIDSystemInfo("Adobe", "UCS", 0);
  76. writeName("Adobe-Identity-UCS");
  77. writeType("2");
  78. writeCodeSpaceRange(singleByte);
  79. writeBFEntries();
  80. writeWrapUp();
  81. }
  82. /**
  83. * Writes the character mappings for this font.
  84. */
  85. protected void writeBFEntries() throws IOException {
  86. if (unicodeCharMap != null) {
  87. writeBFCharEntries(unicodeCharMap);
  88. writeBFRangeEntries(unicodeCharMap);
  89. }
  90. }
  91. /**
  92. * Writes the entries for single characters of a base font (only characters which cannot be
  93. * expressed as part of a character range).
  94. * @param charArray all the characters to map
  95. * @throws IOException
  96. */
  97. protected void writeBFCharEntries(char[] charArray) throws IOException {
  98. int totalEntries = 0;
  99. for (int i = 0; i < charArray.length; i++) {
  100. if (!partOfRange(charArray, i)) {
  101. totalEntries++;
  102. }
  103. }
  104. if (totalEntries < 1) {
  105. return;
  106. }
  107. int remainingEntries = totalEntries;
  108. int charIndex = 0;
  109. do {
  110. /* Limited to 100 entries in each section */
  111. int entriesThisSection = Math.min(remainingEntries, 100);
  112. writer.write(entriesThisSection + " beginbfchar\n");
  113. for (int i = 0; i < entriesThisSection; i++) {
  114. /* Go to the next char not in a range */
  115. while (partOfRange(charArray, charIndex)) {
  116. charIndex++;
  117. }
  118. writer.write("<" + padCharIndex(charIndex) + "> ");
  119. if (Character.codePointAt(charArray, charIndex) > 0xFFFF) {
  120. // Handle UTF-16 surrogate pairs
  121. String pairs = Integer.toHexString(charArray[charIndex])
  122. + Integer.toHexString(charArray[++charIndex]);
  123. writer.write("<" + pairs + ">\n");
  124. i++;
  125. } else {
  126. writer.write("<" + padHexString(Integer.toHexString(charArray[charIndex]), 4)
  127. + ">\n");
  128. }
  129. charIndex++;
  130. }
  131. remainingEntries -= entriesThisSection;
  132. writer.write("endbfchar\n");
  133. } while (remainingEntries > 0);
  134. }
  135. private String padCharIndex(int charIndex) {
  136. return padHexString(Integer.toHexString(charIndex), (singleByte ? 2 : 4));
  137. }
  138. /**
  139. * Writes the entries for character ranges for a base font.
  140. * @param charArray all the characters to map
  141. * @throws IOException
  142. */
  143. protected void writeBFRangeEntries(char[] charArray) throws IOException {
  144. int totalEntries = 0;
  145. for (int i = 0; i < charArray.length; i++) {
  146. if (startOfRange(charArray, i)) {
  147. totalEntries++;
  148. }
  149. }
  150. if (totalEntries < 1) {
  151. return;
  152. }
  153. int remainingEntries = totalEntries;
  154. int charIndex = 0;
  155. do {
  156. /* Limited to 100 entries in each section */
  157. int entriesThisSection = Math.min(remainingEntries, 100);
  158. writer.write(entriesThisSection + " beginbfrange\n");
  159. for (int i = 0; i < entriesThisSection; i++) {
  160. /* Go to the next start of a range */
  161. while (!startOfRange(charArray, charIndex)) {
  162. charIndex++;
  163. }
  164. writer.write("<" + padCharIndex(charIndex) + "> ");
  165. writer.write("<"
  166. + padCharIndex(endOfRange(charArray, charIndex))
  167. + "> ");
  168. writer.write("<" + padHexString(Integer.toHexString(charArray[charIndex]), 4)
  169. + ">\n");
  170. charIndex++;
  171. }
  172. remainingEntries -= entriesThisSection;
  173. writer.write("endbfrange\n");
  174. } while (remainingEntries > 0);
  175. }
  176. /**
  177. * Find the end of the current range.
  178. * @param charArray The array which is being tested.
  179. * @param startOfRange The index to the array element that is the start of
  180. * the range.
  181. * @return The index to the element that is the end of the range.
  182. */
  183. private int endOfRange(char[] charArray, int startOfRange) {
  184. int i = startOfRange;
  185. while (i < charArray.length - 1 && sameRangeEntryAsNext(charArray, i)) {
  186. i++;
  187. }
  188. return i;
  189. }
  190. /**
  191. * Determine whether this array element should be part of a bfchar entry or
  192. * a bfrange entry.
  193. * @param charArray The array to be tested.
  194. * @param arrayIndex The index to the array element to be tested.
  195. * @return True if this array element should be included in a range.
  196. */
  197. private boolean partOfRange(char[] charArray, int arrayIndex) {
  198. if (charArray.length < 2) {
  199. return false;
  200. }
  201. if (arrayIndex == 0) {
  202. return sameRangeEntryAsNext(charArray, 0);
  203. }
  204. if (arrayIndex == charArray.length - 1) {
  205. return sameRangeEntryAsNext(charArray, arrayIndex - 1);
  206. }
  207. if (sameRangeEntryAsNext(charArray, arrayIndex - 1)) {
  208. return true;
  209. }
  210. if (sameRangeEntryAsNext(charArray, arrayIndex)) {
  211. return true;
  212. }
  213. return false;
  214. }
  215. /**
  216. * Determine whether two bytes can be written in the same bfrange entry.
  217. * @param charArray The array to be tested.
  218. * @param firstItem The first of the two items in the array to be tested.
  219. * The second item is firstItem + 1.
  220. * @return True if both 1) the next item in the array is sequential with
  221. * this one, and 2) the first byte of the character in the first position
  222. * is equal to the first byte of the character in the second position.
  223. */
  224. private boolean sameRangeEntryAsNext(char[] charArray, int firstItem) {
  225. if (charArray[firstItem] + 1 != charArray[firstItem + 1]) {
  226. return false;
  227. }
  228. if (firstItem / 256 != (firstItem + 1) / 256) {
  229. return false;
  230. }
  231. return true;
  232. }
  233. /**
  234. * Determine whether this array element should be the start of a bfrange
  235. * entry.
  236. * @param charArray The array to be tested.
  237. * @param arrayIndex The index to the array element to be tested.
  238. * @return True if this array element is the beginning of a range.
  239. */
  240. private boolean startOfRange(char[] charArray, int arrayIndex) {
  241. // Can't be the start of a range if not part of a range.
  242. if (!partOfRange(charArray, arrayIndex)) {
  243. return false;
  244. }
  245. // If first element in the array, must be start of a range
  246. if (arrayIndex == 0) {
  247. return true;
  248. }
  249. // If last element in the array, cannot be start of a range
  250. if (arrayIndex == charArray.length - 1) {
  251. return false;
  252. }
  253. /*
  254. * If part of same range as the previous element is, cannot be start
  255. * of range.
  256. */
  257. if (sameRangeEntryAsNext(charArray, arrayIndex - 1)) {
  258. return false;
  259. }
  260. // Otherwise, this is start of a range.
  261. return true;
  262. }
  263. /**
  264. * Prepends the input string with a sufficient number of "0" characters to
  265. * get the returned string to be numChars length.
  266. * @param input The input string.
  267. * @param numChars The minimum characters in the output string.
  268. * @return The padded string.
  269. */
  270. private String padHexString(String input, int numChars) {
  271. int length = input.length();
  272. if (length >= numChars) {
  273. return input;
  274. }
  275. StringBuffer returnString = new StringBuffer();
  276. for (int i = 1; i <= numChars - length; i++) {
  277. returnString.append("0");
  278. }
  279. returnString.append(input);
  280. return returnString.toString();
  281. }
  282. }
  283. }