/* ==================================================================== Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==================================================================== */ package org.apache.poi.util; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.Collections; import java.util.Set; /** * Utilities for working with Microsoft CodePages. * *

Provides constants for understanding numeric codepages, * along with utilities to translate these into Java Character Sets.

*/
public class CodePageUtil {

    /**
     * Singleton set holding {@link StringUtil#BIG5}.
     * NOTE(review): declared as a raw {@code Set} here; the element type of
     * {@code StringUtil.BIG5} is not visible from this file, so the raw type is kept.
     */
    public static final Set DOUBLE_BYTE_CHARSETS = Collections.singleton(StringUtil.BIG5);

    /** Codepage 037, a special case */
    public static final int CP_037 = 37;

    /** Codepage for SJIS */
    public static final int CP_SJIS = 932;

    /** Codepage for GBK, aka MS936 */
    public static final int CP_GBK = 936;

    /** Codepage for MS949 */
    public static final int CP_MS949 = 949;

    /** Codepage for UTF-16 */
    public static final int CP_UTF16 = 1200;

    /** Codepage for UTF-16 big-endian */
    public static final int CP_UTF16_BE = 1201;

    /** Codepage for Windows 1250 */
    public static final int CP_WINDOWS_1250 = 1250;

    /** Codepage for Windows 1251 */
    public static final int CP_WINDOWS_1251 = 1251;

    /** Codepage for Windows 1252 */
    public static final int CP_WINDOWS_1252 = 1252;

    /**
     * Alternate codepage number that {@link #codepageToEncoding(int, boolean)}
     * maps to the same encoding as {@link #CP_WINDOWS_1252}.
     */
    public static final int CP_WINDOWS_1252_BIFF23 = 32769;

    /** Codepage for Windows 1253 */
    public static final int CP_WINDOWS_1253 = 1253;

    /** Codepage for Windows 1254 */
    public static final int CP_WINDOWS_1254 = 1254;

    /** Codepage for Windows 1255 */
    public static final int CP_WINDOWS_1255 = 1255;

    /** Codepage for Windows 1256 */
    public static final int CP_WINDOWS_1256 = 1256;

    /** Codepage for Windows 1257 */
    public static final int CP_WINDOWS_1257 = 1257;

    /** Codepage for Windows 1258 */
    public static final int CP_WINDOWS_1258 = 1258;

    /** Codepage for Johab */
    public static final int CP_JOHAB = 1361;

    /** Codepage for Macintosh Roman (Java: MacRoman) */
    public static final int CP_MAC_ROMAN = 10000;

    /**
     * Alternate codepage number that {@link #codepageToEncoding(int, boolean)}
     * maps to the same encoding as {@link #CP_MAC_ROMAN}.
     */
    public static final int CP_MAC_ROMAN_BIFF23 = 32768;

    /** Codepage for Macintosh Japan (Java: unknown - use SJIS, cp942 or cp943) */
    public static final int CP_MAC_JAPAN = 10001;

    /** Codepage for Macintosh Chinese Traditional (Java: unknown - use Big5, MS950, or cp937) */
    public static final int CP_MAC_CHINESE_TRADITIONAL = 10002;

    /** Codepage for Macintosh Korean (Java: unknown - use EUC_KR or cp949) */
    public static final int CP_MAC_KOREAN = 10003;

    /** Codepage for Macintosh Arabic (Java: MacArabic) */
    public static final int CP_MAC_ARABIC = 10004;

    /** Codepage for Macintosh Hebrew (Java: MacHebrew) */
    public static final int CP_MAC_HEBREW = 10005;

    /** Codepage for Macintosh Greek (Java: MacGreek) */
    public static final int CP_MAC_GREEK = 10006;

    /** Codepage for Macintosh Cyrillic (Java: MacCyrillic) */
    public static final int CP_MAC_CYRILLIC = 10007;

    /**
     * Codepage for Macintosh Chinese Simplified (Java: unknown - use
     * EUC_CN, ISO2022_CN_GB, MS936 or cp935)
     */
    public static final int CP_MAC_CHINESE_SIMPLE = 10008;

    /** Codepage for Macintosh Romanian (Java: MacRomania) */
    public static final int CP_MAC_ROMANIA = 10010;

    /** Codepage for Macintosh Ukrainian (Java: MacUkraine) */
    public static final int CP_MAC_UKRAINE = 10017;

    /** Codepage for Macintosh Thai (Java: MacThai) */
    public static final int CP_MAC_THAI = 10021;

    /** Codepage for Macintosh Central Europe (Latin-2) (Java: MacCentralEurope) */
    public static final int CP_MAC_CENTRAL_EUROPE = 10029;

    /** Codepage for Macintosh Iceland (Java: MacIceland) */
    public static final int CP_MAC_ICELAND = 10079;

    /** Codepage for Macintosh Turkish (Java: MacTurkish) */
    public static final int CP_MAC_TURKISH = 10081;

    /** Codepage for Macintosh Croatian (Java: MacCroatian) */
    public static final int CP_MAC_CROATIAN = 10082;

    /**
     * Codepage for US-ASCII.
     * The misspelled name ("ACSII") is part of the public API and is kept as is.
     */
    public static final int CP_US_ACSII = 20127;

    /** Codepage for KOI8-R */
    public static final int CP_KOI8_R = 20866;

    /** Codepage for ISO-8859-1 */
    public static final int CP_ISO_8859_1 = 28591;

    /** Codepage for ISO-8859-2 */
    public static final int CP_ISO_8859_2 = 28592;

    /** Codepage for ISO-8859-3 */
    public static final int CP_ISO_8859_3 = 28593;

    /** Codepage for ISO-8859-4 */
    public static final int CP_ISO_8859_4 = 28594;

    /** Codepage for ISO-8859-5 */
    public static final int CP_ISO_8859_5 = 28595;

    /** Codepage for ISO-8859-6 */
    public static final int CP_ISO_8859_6 = 28596;

    /** Codepage for ISO-8859-7 */
    public static final int CP_ISO_8859_7 = 28597;

    /** Codepage for ISO-8859-8 */
    public static final int CP_ISO_8859_8 = 28598;

    /** Codepage for ISO-8859-9 */
    public static final int CP_ISO_8859_9 = 28599;

    /** Codepage for ISO-2022-JP */
    public static final int CP_ISO_2022_JP1 = 50220;

    /** Another codepage for ISO-2022-JP */
    public static final int CP_ISO_2022_JP2 = 50221;

    /** Yet another codepage for ISO-2022-JP */
    public static final int CP_ISO_2022_JP3 = 50222;

    /** Codepage for ISO-2022-KR */
    public static final int CP_ISO_2022_KR = 50225;

    /** Codepage for EUC-JP */
    public static final int CP_EUC_JP = 51932;

    /** Codepage for EUC-KR */
    public static final int CP_EUC_KR = 51949;

    /** Codepage for GB2312 */
    public static final int CP_GB2312 = 52936;

    /** Codepage for GB18030 */
    public static final int CP_GB18030 = 54936;

    /** Another codepage for US-ASCII */
    public static final int CP_US_ASCII2 = 65000;

    /** Codepage for UTF-8 */
    public static final int CP_UTF8 = 65001;

    /** Codepage for Unicode */
    public static final int CP_UNICODE = CP_UTF16;

    /**
     * Converts a string into bytes, in the equivalent character encoding
     * to the supplied codepage number.
     *
     * @param string The string to convert
     * @param codepage The codepage number
     * @return The encoded bytes
     * @throws UnsupportedEncodingException if the codepage is not positive, or
     *         the encoding name it maps to is not supported by the running JVM
     */
    public static byte[] getBytesInCodePage(final String string, final int codepage)
            throws UnsupportedEncodingException {
        String encoding = codepageToEncoding(codepage);
        return string.getBytes(encoding);
    }

    /**
     * Converts the bytes into a String, based on the equivalent character encoding
     * to the supplied codepage number.
     *
     * @param string The bytes of the string to convert
     * @param codepage The codepage number
     * @return The decoded String
     * @throws UnsupportedEncodingException if the codepage is not positive, or
     *         the encoding name it maps to is not supported by the running JVM
     */
    public static String getStringFromCodePage(final byte[] string, final int codepage)
            throws UnsupportedEncodingException {
        return getStringFromCodePage(string, 0, string.length, codepage);
    }

    /**
     * Converts a range of the bytes into a String, based on the equivalent
     * character encoding to the supplied codepage number.
     *
     * @param string The bytes of the string to convert
     * @param offset Index of the first byte to decode
     * @param length Number of bytes to decode
     * @param codepage The codepage number
     * @return The decoded String
     * @throws UnsupportedEncodingException if the codepage is not positive, or
     *         the encoding name it maps to is not supported by the running JVM
     */
    public static String getStringFromCodePage(final byte[] string, final int offset,
            final int length, final int codepage) throws UnsupportedEncodingException {
        String encoding = codepageToEncoding(codepage);
        return new String(string, offset, length, encoding);
    }

    /**
     * Turns a codepage number into the equivalent character encoding's
     * name (in Java NIO canonical naming format).
     *
     * @param codepage The codepage number
     *
     * @return The character encoding's name. If the codepage number is 65001,
     * the encoding name is "UTF-8". Other known codepages are mapped to
     * their Java NIO names, e.g. 1252 is mapped to "windows-1252". Positive
     * numbers without an explicit mapping yield "cp" followed by the number.
     *
     * @throws UnsupportedEncodingException if the specified codepage is
     * less than or equal to zero.
     */
    public static String codepageToEncoding(final int codepage)
            throws UnsupportedEncodingException {
        return codepageToEncoding(codepage, false);
    }

    /**
     * Turns a codepage number into the equivalent character encoding's
     * name, in either Java NIO or Java Lang canonical naming.
     *
     * @param codepage The codepage number
     * @param javaLangFormat Should Java Lang or Java NIO naming be used?
     *
     * @return The character encoding's name, in either Java Lang format
     * (eg Cp1251, ISO8859_5) or Java NIO format (eg windows-1252, ISO-8859-9).
     * Positive numbers without an explicit mapping yield "cp" followed by
     * the number.
     *
     * @see <a href="https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html">Supported Encodings</a>
     *
     * @throws UnsupportedEncodingException if the specified codepage is
     * less than or equal to zero.
     */
    public static String codepageToEncoding(final int codepage, boolean javaLangFormat)
            throws UnsupportedEncodingException {
        if (codepage <= 0) {
            throw new UnsupportedEncodingException("Codepage number may not be " + codepage);
        }

        switch (codepage) {
            case CP_UTF16:
                return "UTF-16LE";
            case CP_UTF16_BE:
                return "UTF-16BE";
            case CP_UTF8:
                return "UTF-8";
            case CP_037:
                return "cp037";
            case CP_GBK:
                return "GBK";
            case CP_MS949:
                return "ms949";
            case CP_WINDOWS_1250:
                return javaLangFormat ? "Cp1250" : "windows-1250";
            case CP_WINDOWS_1251:
                return javaLangFormat ? "Cp1251" : "windows-1251";
            case CP_WINDOWS_1252:
            case CP_WINDOWS_1252_BIFF23:
                return javaLangFormat ? "Cp1252" : "windows-1252";
            case CP_WINDOWS_1253:
                return javaLangFormat ? "Cp1253" : "windows-1253";
            case CP_WINDOWS_1254:
                return javaLangFormat ? "Cp1254" : "windows-1254";
            case CP_WINDOWS_1255:
                return javaLangFormat ? "Cp1255" : "windows-1255";
            case CP_WINDOWS_1256:
                // Previously returned "Cp1255" here, which selected the wrong charset.
                return javaLangFormat ? "Cp1256" : "windows-1256";
            case CP_WINDOWS_1257:
                return javaLangFormat ? "Cp1257" : "windows-1257";
            case CP_WINDOWS_1258:
                return javaLangFormat ? "Cp1258" : "windows-1258";
            case CP_JOHAB:
                return "johab";
            case CP_MAC_ROMAN:
            case CP_MAC_ROMAN_BIFF23:
                return "MacRoman";
            case CP_MAC_JAPAN:
                return "SJIS";
            case CP_MAC_CHINESE_TRADITIONAL:
                return "Big5";
            case CP_MAC_KOREAN:
                return "EUC-KR";
            case CP_MAC_ARABIC:
                return "MacArabic";
            case CP_MAC_HEBREW:
                return "MacHebrew";
            case CP_MAC_GREEK:
                return "MacGreek";
            case CP_MAC_CYRILLIC:
                return "MacCyrillic";
            case CP_MAC_CHINESE_SIMPLE:
                return "EUC_CN";
            case CP_MAC_ROMANIA:
                return "MacRomania";
            case CP_MAC_UKRAINE:
                return "MacUkraine";
            case CP_MAC_THAI:
                return "MacThai";
            case CP_MAC_CENTRAL_EUROPE:
                return "MacCentralEurope";
            case CP_MAC_ICELAND:
                return "MacIceland";
            case CP_MAC_TURKISH:
                return "MacTurkish";
            case CP_MAC_CROATIAN:
                return "MacCroatian";
            case CP_US_ACSII:
            case CP_US_ASCII2:
                return "US-ASCII";
            case CP_KOI8_R:
                return "KOI8-R";
            case CP_ISO_8859_1:
                return javaLangFormat ? "ISO8859_1" : "ISO-8859-1";
            case CP_ISO_8859_2:
                return javaLangFormat ? "ISO8859_2" : "ISO-8859-2";
            case CP_ISO_8859_3:
                return javaLangFormat ? "ISO8859_3" : "ISO-8859-3";
            case CP_ISO_8859_4:
                return javaLangFormat ? "ISO8859_4" : "ISO-8859-4";
            case CP_ISO_8859_5:
                return javaLangFormat ? "ISO8859_5" : "ISO-8859-5";
            case CP_ISO_8859_6:
                return javaLangFormat ? "ISO8859_6" : "ISO-8859-6";
            case CP_ISO_8859_7:
                return javaLangFormat ? "ISO8859_7" : "ISO-8859-7";
            case CP_ISO_8859_8:
                return javaLangFormat ? "ISO8859_8" : "ISO-8859-8";
            case CP_ISO_8859_9:
                return javaLangFormat ? "ISO8859_9" : "ISO-8859-9";
            case CP_ISO_2022_JP1:
            case CP_ISO_2022_JP2:
            case CP_ISO_2022_JP3:
                return "ISO-2022-JP";
            case CP_ISO_2022_KR:
                return "ISO-2022-KR";
            case CP_EUC_JP:
                return "EUC-JP";
            case CP_EUC_KR:
                return "EUC-KR";
            case CP_GB2312:
                return "GB2312";
            case CP_GB18030:
                return "GB18030";
            case CP_SJIS:
                return "SJIS";
            default:
                // No explicit mapping: fall back to the generic "cp<number>" name.
                return "cp" + codepage;
        }
    }

    /**
     * This tries to convert a LE byte array in cp950
     * (Microsoft's dialect of Big5) to a String.
     * We know MS zero-padded ascii, and we drop those.
     * There may be areas for improvement in this.
     *
     * @param data the bytes handed to the {@code LittleEndianCP950Reader}
     * @param offset the offset handed to the reader (presumably the index of the
     *        first byte to decode - confirm against the reader)
     * @param lengthInBytes the length, in bytes, handed to the reader
     * @return Decoded String
     */
    public static String cp950ToString(byte[] data, int offset, int lengthInBytes) {
        StringBuilder sb = new StringBuilder();
        LittleEndianCP950Reader reader = new LittleEndianCP950Reader(data, offset, lengthInBytes);
        try {
            // read() yields one character per call and -1 once the input is exhausted.
            int c = reader.read();
            while (c != -1) {
                sb.append((char) c);
                c = reader.read();
            }
        } finally {
            // Close the reader even if decoding fails part-way through.
            reader.close();
        }
        return sb.toString();
    }
}