You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

CodePageUtil.java 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.util;
  16. import java.io.UnsupportedEncodingException;
  17. import java.nio.charset.Charset;
  18. import java.util.Collections;
  19. import java.util.Set;
  /**
   * Utilities for working with Microsoft CodePages.
   *
   * <p>Provides constants for the numeric codepage identifiers, along with
   * helpers that translate those identifiers into Java character set names
   * and use them to convert between Strings and bytes.</p>
   */
  public class CodePageUtil
  {
      /** Codepage 037, a special case (mapped to "cp037"). */
      public static final int CP_037 = 37;
      /** Codepage for SJIS. */
      public static final int CP_SJIS = 932;
      /** Codepage for GBK, aka MS936. */
      public static final int CP_GBK = 936;
      /** Codepage for MS949. */
      public static final int CP_MS949 = 949;
      /** Codepage for UTF-16 (mapped to the little-endian form, "UTF-16LE"). */
      public static final int CP_UTF16 = 1200;
      /** Codepage for UTF-16 big-endian. */
      public static final int CP_UTF16_BE = 1201;
      /** Codepage for Windows 1250. */
      public static final int CP_WINDOWS_1250 = 1250;
      /** Codepage for Windows 1251. */
      public static final int CP_WINDOWS_1251 = 1251;
      /** Codepage for Windows 1252. */
      public static final int CP_WINDOWS_1252 = 1252;
      /** Alternative identifier that is mapped to the same encoding as {@link #CP_WINDOWS_1252}. */
      public static final int CP_WINDOWS_1252_BIFF23 = 32769;
      /** Codepage for Windows 1253. */
      public static final int CP_WINDOWS_1253 = 1253;
      /** Codepage for Windows 1254. */
      public static final int CP_WINDOWS_1254 = 1254;
      /** Codepage for Windows 1255. */
      public static final int CP_WINDOWS_1255 = 1255;
      /** Codepage for Windows 1256. */
      public static final int CP_WINDOWS_1256 = 1256;
      /** Codepage for Windows 1257. */
      public static final int CP_WINDOWS_1257 = 1257;
      /** Codepage for Windows 1258. */
      public static final int CP_WINDOWS_1258 = 1258;
      /** Codepage for Johab. */
      public static final int CP_JOHAB = 1361;
      /** Codepage for Macintosh Roman (Java: MacRoman). */
      public static final int CP_MAC_ROMAN = 10000;
      /** Alternative identifier that is mapped to the same encoding as {@link #CP_MAC_ROMAN}. */
      public static final int CP_MAC_ROMAN_BIFF23 = 32768;
      /** Codepage for Macintosh Japan (Java: unknown - use SJIS, cp942 or cp943). */
      public static final int CP_MAC_JAPAN = 10001;
      /** Codepage for Macintosh Chinese Traditional (Java: unknown - use Big5, MS950, or cp937). */
      public static final int CP_MAC_CHINESE_TRADITIONAL = 10002;
      /** Codepage for Macintosh Korean (Java: unknown - use EUC_KR or cp949). */
      public static final int CP_MAC_KOREAN = 10003;
      /** Codepage for Macintosh Arabic (Java: MacArabic). */
      public static final int CP_MAC_ARABIC = 10004;
      /** Codepage for Macintosh Hebrew (Java: MacHebrew). */
      public static final int CP_MAC_HEBREW = 10005;
      /** Codepage for Macintosh Greek (Java: MacGreek). */
      public static final int CP_MAC_GREEK = 10006;
      /** Codepage for Macintosh Cyrillic (Java: MacCyrillic). */
      public static final int CP_MAC_CYRILLIC = 10007;
      /**
       * Codepage for Macintosh Chinese Simplified (Java: unknown - use
       * EUC_CN, ISO2022_CN_GB, MS936 or cp935).
       */
      public static final int CP_MAC_CHINESE_SIMPLE = 10008;
      /** Codepage for Macintosh Romanian (Java: MacRomania). */
      public static final int CP_MAC_ROMANIA = 10010;
      /** Codepage for Macintosh Ukrainian (Java: MacUkraine). */
      public static final int CP_MAC_UKRAINE = 10017;
      /** Codepage for Macintosh Thai (Java: MacThai). */
      public static final int CP_MAC_THAI = 10021;
      /** Codepage for Macintosh Central Europe (Latin-2) (Java: MacCentralEurope). */
      public static final int CP_MAC_CENTRAL_EUROPE = 10029;
      /** Codepage for Macintosh Iceland (Java: MacIceland). */
      public static final int CP_MAC_ICELAND = 10079;
      /** Codepage for Macintosh Turkish (Java: MacTurkish). */
      public static final int CP_MAC_TURKISH = 10081;
      /** Codepage for Macintosh Croatian (Java: MacCroatian). */
      public static final int CP_MAC_CROATIAN = 10082;
      /**
       * Codepage for US-ASCII. The constant name is misspelled; it is kept
       * unchanged so that existing callers continue to compile. New code can
       * use {@link #CP_US_ASCII}.
       */
      public static final int CP_US_ACSII = 20127;
      /** Codepage for US-ASCII; correctly spelled alias of {@link #CP_US_ACSII}. */
      public static final int CP_US_ASCII = CP_US_ACSII;
      /** Codepage for KOI8-R. */
      public static final int CP_KOI8_R = 20866;
      /** Codepage for ISO-8859-1. */
      public static final int CP_ISO_8859_1 = 28591;
      /** Codepage for ISO-8859-2. */
      public static final int CP_ISO_8859_2 = 28592;
      /** Codepage for ISO-8859-3. */
      public static final int CP_ISO_8859_3 = 28593;
      /** Codepage for ISO-8859-4. */
      public static final int CP_ISO_8859_4 = 28594;
      /** Codepage for ISO-8859-5. */
      public static final int CP_ISO_8859_5 = 28595;
      /** Codepage for ISO-8859-6. */
      public static final int CP_ISO_8859_6 = 28596;
      /** Codepage for ISO-8859-7. */
      public static final int CP_ISO_8859_7 = 28597;
      /** Codepage for ISO-8859-8. */
      public static final int CP_ISO_8859_8 = 28598;
      /** Codepage for ISO-8859-9. */
      public static final int CP_ISO_8859_9 = 28599;
      /** Codepage for ISO-2022-JP. */
      public static final int CP_ISO_2022_JP1 = 50220;
      /** Another codepage for ISO-2022-JP. */
      public static final int CP_ISO_2022_JP2 = 50221;
      /** Yet another codepage for ISO-2022-JP. */
      public static final int CP_ISO_2022_JP3 = 50222;
      /** Codepage for ISO-2022-KR. */
      public static final int CP_ISO_2022_KR = 50225;
      /** Codepage for EUC-JP. */
      public static final int CP_EUC_JP = 51932;
      /** Codepage for EUC-KR. */
      public static final int CP_EUC_KR = 51949;
      /** Codepage for GB2312. */
      public static final int CP_GB2312 = 52936;
      /** Codepage for GB18030. */
      public static final int CP_GB18030 = 54936;
      /**
       * Another codepage that this class maps to US-ASCII.
       * NOTE(review): Microsoft's code page identifier list names 65000 as
       * UTF-7; confirm that treating it as US-ASCII is intended.
       */
      public static final int CP_US_ASCII2 = 65000;
      /** Codepage for UTF-8. */
      public static final int CP_UTF8 = 65001;
      /** Codepage for Unicode; same value as {@link #CP_UTF16}. */
      public static final int CP_UNICODE = CP_UTF16;

      /**
       * Converts a string into bytes, using the character encoding that
       * corresponds to the supplied codepage number.
       *
       * @param string The string to convert
       * @param codepage The codepage number
       *
       * @return The bytes of the string in the codepage's encoding
       *
       * @throws UnsupportedEncodingException if the codepage is not a positive
       * number, or if the resulting encoding name is not supported by the
       * running JVM.
       */
      public static byte[] getBytesInCodePage(final String string, final int codepage)
      throws UnsupportedEncodingException
      {
          return string.getBytes(codepageToEncoding(codepage));
      }

      /**
       * Converts all of the given bytes into a String, using the character
       * encoding that corresponds to the supplied codepage number.
       *
       * @param string The bytes of the string to convert
       * @param codepage The codepage number
       *
       * @return The decoded String
       *
       * @throws UnsupportedEncodingException if the codepage is not a positive
       * number, or if the resulting encoding name is not supported by the
       * running JVM.
       */
      public static String getStringFromCodePage(final byte[] string, final int codepage)
      throws UnsupportedEncodingException
      {
          return getStringFromCodePage(string, 0, string.length, codepage);
      }

      /**
       * Converts a range of the given bytes into a String, using the character
       * encoding that corresponds to the supplied codepage number.
       *
       * @param string The array holding the bytes of the string to convert
       * @param offset The index of the first byte to decode
       * @param length The number of bytes to decode
       * @param codepage The codepage number
       *
       * @return The decoded String
       *
       * @throws UnsupportedEncodingException if the codepage is not a positive
       * number, or if the resulting encoding name is not supported by the
       * running JVM.
       */
      public static String getStringFromCodePage(final byte[] string, final int offset,
              final int length, final int codepage) throws UnsupportedEncodingException
      {
          return new String(string, offset, length, codepageToEncoding(codepage));
      }

      /**
       * Turns a codepage number into the equivalent character encoding's
       * name (in Java NIO canonical naming format).
       *
       * @param codepage The codepage number
       *
       * @return The character encoding's name. If the codepage number is 65001,
       * the encoding name is "UTF-8". Codepages known to this class are mapped
       * to their Java NIO names, e.g. 1252 becomes "windows-1252". Any other
       * positive number is mapped to "cp" followed by the number, e.g. 437
       * becomes "cp437".
       *
       * @throws UnsupportedEncodingException if the specified codepage is
       * zero or negative.
       */
      public static String codepageToEncoding(final int codepage)
      throws UnsupportedEncodingException
      {
          return codepageToEncoding(codepage, false);
      }

      /**
       * Turns a codepage number into the equivalent character encoding's
       * name, in either Java NIO or Java Lang canonical naming.
       *
       * <p>The naming format only makes a difference for the Windows-125x and
       * ISO-8859-x codepages; every other codepage yields the same name for
       * both formats.</p>
       *
       * @param codepage The codepage number
       * @param javaLangFormat Should Java Lang or Java NIO naming be used?
       *
       * @return The character encoding's name, in either Java Lang format
       * (eg Cp1251, ISO8859_5) or Java NIO format (eg windows-1252, ISO-8859-9).
       * Codepages without an explicit mapping yield "cp" followed by the number.
       *
       * @see <a href="http://docs.oracle.com/javase/6/docs/technotes/guides/intl/encoding.doc.html">Supported Encodings</a>
       *
       * @throws UnsupportedEncodingException if the specified codepage is
       * zero or negative.
       */
      public static String codepageToEncoding(final int codepage, boolean javaLangFormat)
      throws UnsupportedEncodingException
      {
          if (codepage <= 0)
          {
              throw new UnsupportedEncodingException("Codepage number may not be " + codepage);
          }

          switch (codepage)
          {
              case CP_UTF16:
                  return "UTF-16LE";
              case CP_UTF16_BE:
                  return "UTF-16BE";
              case CP_UTF8:
                  return "UTF-8";
              case CP_037:
                  return "cp037";
              case CP_GBK:
                  return "GBK";
              case CP_MS949:
                  return "ms949";

              case CP_WINDOWS_1250:
                  return javaLangFormat ? "Cp1250" : "windows-1250";
              case CP_WINDOWS_1251:
                  return javaLangFormat ? "Cp1251" : "windows-1251";
              case CP_WINDOWS_1252:
              case CP_WINDOWS_1252_BIFF23:
                  return javaLangFormat ? "Cp1252" : "windows-1252";
              case CP_WINDOWS_1253:
                  return javaLangFormat ? "Cp1253" : "windows-1253";
              case CP_WINDOWS_1254:
                  return javaLangFormat ? "Cp1254" : "windows-1254";
              case CP_WINDOWS_1255:
                  return javaLangFormat ? "Cp1255" : "windows-1255";
              case CP_WINDOWS_1256:
                  // Previously returned "Cp1255" here (copy/paste slip), which
                  // decoded Arabic text with the Hebrew codepage.
                  return javaLangFormat ? "Cp1256" : "windows-1256";
              case CP_WINDOWS_1257:
                  return javaLangFormat ? "Cp1257" : "windows-1257";
              case CP_WINDOWS_1258:
                  return javaLangFormat ? "Cp1258" : "windows-1258";

              case CP_JOHAB:
                  return "johab";

              case CP_MAC_ROMAN:
              case CP_MAC_ROMAN_BIFF23:
                  return "MacRoman";
              case CP_MAC_JAPAN:
                  return "SJIS";
              case CP_MAC_CHINESE_TRADITIONAL:
                  return "Big5";
              case CP_MAC_KOREAN:
                  return "EUC-KR";
              case CP_MAC_ARABIC:
                  return "MacArabic";
              case CP_MAC_HEBREW:
                  return "MacHebrew";
              case CP_MAC_GREEK:
                  return "MacGreek";
              case CP_MAC_CYRILLIC:
                  return "MacCyrillic";
              case CP_MAC_CHINESE_SIMPLE:
                  return "EUC_CN";
              case CP_MAC_ROMANIA:
                  return "MacRomania";
              case CP_MAC_UKRAINE:
                  return "MacUkraine";
              case CP_MAC_THAI:
                  return "MacThai";
              case CP_MAC_CENTRAL_EUROPE:
                  return "MacCentralEurope";
              case CP_MAC_ICELAND:
                  return "MacIceland";
              case CP_MAC_TURKISH:
                  return "MacTurkish";
              case CP_MAC_CROATIAN:
                  return "MacCroatian";

              case CP_US_ACSII:
              case CP_US_ASCII2:
                  return "US-ASCII";
              case CP_KOI8_R:
                  return "KOI8-R";

              case CP_ISO_8859_1:
                  return javaLangFormat ? "ISO8859_1" : "ISO-8859-1";
              case CP_ISO_8859_2:
                  return javaLangFormat ? "ISO8859_2" : "ISO-8859-2";
              case CP_ISO_8859_3:
                  return javaLangFormat ? "ISO8859_3" : "ISO-8859-3";
              case CP_ISO_8859_4:
                  return javaLangFormat ? "ISO8859_4" : "ISO-8859-4";
              case CP_ISO_8859_5:
                  return javaLangFormat ? "ISO8859_5" : "ISO-8859-5";
              case CP_ISO_8859_6:
                  return javaLangFormat ? "ISO8859_6" : "ISO-8859-6";
              case CP_ISO_8859_7:
                  return javaLangFormat ? "ISO8859_7" : "ISO-8859-7";
              case CP_ISO_8859_8:
                  return javaLangFormat ? "ISO8859_8" : "ISO-8859-8";
              case CP_ISO_8859_9:
                  return javaLangFormat ? "ISO8859_9" : "ISO-8859-9";

              case CP_ISO_2022_JP1:
              case CP_ISO_2022_JP2:
              case CP_ISO_2022_JP3:
                  return "ISO-2022-JP";
              case CP_ISO_2022_KR:
                  return "ISO-2022-KR";
              case CP_EUC_JP:
                  return "EUC-JP";
              case CP_EUC_KR:
                  return "EUC-KR";
              case CP_GB2312:
                  return "GB2312";
              case CP_GB18030:
                  return "GB18030";
              case CP_SJIS:
                  return "SJIS";

              default:
                  // No explicit mapping: fall back to the generic "cp<number>" name.
                  return "cp" + codepage;
          }
      }
  }