From: Nick Burch Date: Wed, 26 Jun 2013 18:18:21 +0000 (+0000) Subject: Bring the numeric CodePage support from HPSF (constants and converters) out to a... X-Git-Tag: REL_3_10_BETA2~56 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=182b941b2cb5cf4411cd1c577f332fa75a16b9f3;p=poi.git Bring the numeric CodePage support from HPSF (constants and converters) out to a new Util class, so that HSMF can later use it git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1497032 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/java/org/apache/poi/util/CodePageUtil.java b/src/java/org/apache/poi/util/CodePageUtil.java new file mode 100644 index 0000000000..510a89bbe8 --- /dev/null +++ b/src/java/org/apache/poi/util/CodePageUtil.java @@ -0,0 +1,363 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.util; + +import java.io.UnsupportedEncodingException; + +/** + * Utilities for working with Microsoft CodePages. + * + *

Provides constants for understanding numeric codepages, + * along with utilities to translate these into Java Character Sets.

+ */ +public class CodePageUtil +{ + /**

Codepage 037, a special case

*/ + public static final int CP_037 = 37; + + /**

Codepage for SJIS

*/ + public static final int CP_SJIS = 932; + + /**

Codepage for GBK, aka MS936

*/ + public static final int CP_GBK = 936; + + /**

Codepage for MS949

*/ + public static final int CP_MS949 = 949; + + /**

Codepage for UTF-16

*/ + public static final int CP_UTF16 = 1200; + + /**

Codepage for UTF-16 big-endian

*/ + public static final int CP_UTF16_BE = 1201; + + /**

Codepage for Windows 1250

*/ + public static final int CP_WINDOWS_1250 = 1250; + + /**

Codepage for Windows 1251

*/ + public static final int CP_WINDOWS_1251 = 1251; + + /**

Codepage for Windows 1252

*/ + public static final int CP_WINDOWS_1252 = 1252; + + /**

Codepage for Windows 1253

*/ + public static final int CP_WINDOWS_1253 = 1253; + + /**

Codepage for Windows 1254

*/ + public static final int CP_WINDOWS_1254 = 1254; + + /**

Codepage for Windows 1255

*/ + public static final int CP_WINDOWS_1255 = 1255; + + /**

Codepage for Windows 1256

*/ + public static final int CP_WINDOWS_1256 = 1256; + + /**

Codepage for Windows 1257

*/ + public static final int CP_WINDOWS_1257 = 1257; + + /**

Codepage for Windows 1258

*/ + public static final int CP_WINDOWS_1258 = 1258; + + /**

Codepage for Johab

*/ + public static final int CP_JOHAB = 1361; + + /**

Codepage for Macintosh Roman (Java: MacRoman)

*/ + public static final int CP_MAC_ROMAN = 10000; + + /**

Codepage for Macintosh Japan (Java: unknown - use SJIS, cp942 or + * cp943)

*/ + public static final int CP_MAC_JAPAN = 10001; + + /**

Codepage for Macintosh Chinese Traditional (Java: unknown - use Big5, + * MS950, or cp937)

*/ + public static final int CP_MAC_CHINESE_TRADITIONAL = 10002; + + /**

Codepage for Macintosh Korean (Java: unknown - use EUC_KR or + * cp949)

*/ + public static final int CP_MAC_KOREAN = 10003; + + /**

Codepage for Macintosh Arabic (Java: MacArabic)

*/ + public static final int CP_MAC_ARABIC = 10004; + + /**

Codepage for Macintosh Hebrew (Java: MacHebrew)

*/ + public static final int CP_MAC_HEBREW = 10005; + + /**

Codepage for Macintosh Greek (Java: MacGreek)

*/ + public static final int CP_MAC_GREEK = 10006; + + /**

Codepage for Macintosh Cyrillic (Java: MacCyrillic)

*/ + public static final int CP_MAC_CYRILLIC = 10007; + + /**

Codepage for Macintosh Chinese Simplified (Java: unknown - use + * EUC_CN, ISO2022_CN_GB, MS936 or cp935)

*/ + public static final int CP_MAC_CHINESE_SIMPLE = 10008; + + /**

Codepage for Macintosh Romanian (Java: MacRomania)

*/ + public static final int CP_MAC_ROMANIA = 10010; + + /**

Codepage for Macintosh Ukrainian (Java: MacUkraine)

*/ + public static final int CP_MAC_UKRAINE = 10017; + + /**

Codepage for Macintosh Thai (Java: MacThai)

*/ + public static final int CP_MAC_THAI = 10021; + + /**

Codepage for Macintosh Central Europe (Latin-2) + * (Java: MacCentralEurope)

*/ + public static final int CP_MAC_CENTRAL_EUROPE = 10029; + + /**

Codepage for Macintosh Iceland (Java: MacIceland)

*/ + public static final int CP_MAC_ICELAND = 10079; + + /**

Codepage for Macintosh Turkish (Java: MacTurkish)

*/ + public static final int CP_MAC_TURKISH = 10081; + + /**

Codepage for Macintosh Croatian (Java: MacCroatian)

*/ + public static final int CP_MAC_CROATIAN = 10082; + + /**

Codepage for US-ASCII

*/ + public static final int CP_US_ACSII = 20127; + + /**

Codepage for KOI8-R

*/ + public static final int CP_KOI8_R = 20866; + + /**

Codepage for ISO-8859-1

*/ + public static final int CP_ISO_8859_1 = 28591; + + /**

Codepage for ISO-8859-2

*/ + public static final int CP_ISO_8859_2 = 28592; + + /**

Codepage for ISO-8859-3

*/ + public static final int CP_ISO_8859_3 = 28593; + + /**

Codepage for ISO-8859-4

*/ + public static final int CP_ISO_8859_4 = 28594; + + /**

Codepage for ISO-8859-5

*/ + public static final int CP_ISO_8859_5 = 28595; + + /**

Codepage for ISO-8859-6

*/ + public static final int CP_ISO_8859_6 = 28596; + + /**

Codepage for ISO-8859-7

*/ + public static final int CP_ISO_8859_7 = 28597; + + /**

Codepage for ISO-8859-8

*/ + public static final int CP_ISO_8859_8 = 28598; + + /**

Codepage for ISO-8859-9

*/ + public static final int CP_ISO_8859_9 = 28599; + + /**

Codepage for ISO-2022-JP

*/ + public static final int CP_ISO_2022_JP1 = 50220; + + /**

Another codepage for ISO-2022-JP

*/ + public static final int CP_ISO_2022_JP2 = 50221; + + /**

Yet another codepage for ISO-2022-JP

*/ + public static final int CP_ISO_2022_JP3 = 50222; + + /**

Codepage for ISO-2022-KR

*/ + public static final int CP_ISO_2022_KR = 50225; + + /**

Codepage for EUC-JP

*/ + public static final int CP_EUC_JP = 51932; + + /**

Codepage for EUC-KR

*/ + public static final int CP_EUC_KR = 51949; + + /**

Codepage for GB2312

*/ + public static final int CP_GB2312 = 52936; + + /**

Codepage for GB18030

*/ + public static final int CP_GB18030 = 54936; + + /**

Another codepage for US-ASCII

*/ + public static final int CP_US_ASCII2 = 65000; + + /**

Codepage for UTF-8

*/ + public static final int CP_UTF8 = 65001; + + /**

Codepage for Unicode

*/ + public static final int CP_UNICODE = CP_UTF16; + + /** + * Converts a string into bytes, in the equivalent character encoding + * to the supplied codepage number. + * @param string The string to convert + * @param codepage The codepage number + */ + public static byte[] getBytesInCodePage(final String string, final int codepage) + throws UnsupportedEncodingException + { + String encoding = codepageToEncoding(codepage); + return string.getBytes(encoding); + } + + /** + * Converts the bytes into a String, based on the equivalent character encoding + * to the supplied codepage number. + * @param string The byte of the string to convert + * @param codepage The codepage number + */ + public static String getStringFromCodePage(final byte[] string, final int codepage) + throws UnsupportedEncodingException + { + return getStringFromCodePage(string, 0, string.length, codepage); + } + + /** + * Converts the bytes into a String, based on the equivalent character encoding + * to the supplied codepage number. + * @param string The byte of the string to convert + * @param codepage The codepage number + */ + public static String getStringFromCodePage(final byte[] string, final int offset, + final int length, final int codepage) throws UnsupportedEncodingException + { + String encoding = codepageToEncoding(codepage); + return new String(string, offset, length, encoding); + } + + /** + *

Turns a codepage number into the equivalent character encoding's + * name.

+ * + * @param codepage The codepage number + * + * @return The character encoding's name. If the codepage number is 65001, + * the encoding name is "UTF-8". All other positive numbers are mapped to + * "cp" followed by the number, e.g. if the codepage number is 1252 the + * returned character encoding name will be "cp1252". + * + * @exception UnsupportedEncodingException if the specified codepage is + * less than zero. + */ + public static String codepageToEncoding(final int codepage) + throws UnsupportedEncodingException + { + if (codepage <= 0) + throw new UnsupportedEncodingException("Codepage number may not be " + codepage); + + switch (codepage) { + case CP_UTF16: + return "UTF-16"; + case CP_UTF16_BE: + return "UTF-16BE"; + case CP_UTF8: + return "UTF-8"; + case CP_037: + return "cp037"; + case CP_GBK: + return "GBK"; + case CP_MS949: + return "ms949"; + case CP_WINDOWS_1250: + return "windows-1250"; + case CP_WINDOWS_1251: + return "windows-1251"; + case CP_WINDOWS_1252: + return "windows-1252"; + case CP_WINDOWS_1253: + return "windows-1253"; + case CP_WINDOWS_1254: + return "windows-1254"; + case CP_WINDOWS_1255: + return "windows-1255"; + case CP_WINDOWS_1256: + return "windows-1256"; + case CP_WINDOWS_1257: + return "windows-1257"; + case CP_WINDOWS_1258: + return "windows-1258"; + case CP_JOHAB: + return "johab"; + case CP_MAC_ROMAN: + return "MacRoman"; + case CP_MAC_JAPAN: + return "SJIS"; + case CP_MAC_CHINESE_TRADITIONAL: + return "Big5"; + case CP_MAC_KOREAN: + return "EUC-KR"; + case CP_MAC_ARABIC: + return "MacArabic"; + case CP_MAC_HEBREW: + return "MacHebrew"; + case CP_MAC_GREEK: + return "MacGreek"; + case CP_MAC_CYRILLIC: + return "MacCyrillic"; + case CP_MAC_CHINESE_SIMPLE: + return "EUC_CN"; + case CP_MAC_ROMANIA: + return "MacRomania"; + case CP_MAC_UKRAINE: + return "MacUkraine"; + case CP_MAC_THAI: + return "MacThai"; + case CP_MAC_CENTRAL_EUROPE: + return "MacCentralEurope"; + case CP_MAC_ICELAND: + return "MacIceland"; + case CP_MAC_TURKISH: + return "MacTurkish"; + case CP_MAC_CROATIAN: + return "MacCroatian"; + case CP_US_ACSII: + case CP_US_ASCII2: + return "US-ASCII"; + case CP_KOI8_R: + return "KOI8-R"; + case CP_ISO_8859_1: + return "ISO-8859-1"; + case CP_ISO_8859_2: + return "ISO-8859-2"; + case CP_ISO_8859_3: + return "ISO-8859-3"; + case CP_ISO_8859_4: + return "ISO-8859-4"; + case CP_ISO_8859_5: + return "ISO-8859-5"; + case CP_ISO_8859_6: + return "ISO-8859-6"; + case CP_ISO_8859_7: + return "ISO-8859-7"; + case CP_ISO_8859_8: + return "ISO-8859-8"; + case CP_ISO_8859_9: + return "ISO-8859-9"; + case CP_ISO_2022_JP1: + case CP_ISO_2022_JP2: + case CP_ISO_2022_JP3: + return "ISO-2022-JP"; + case CP_ISO_2022_KR: + return "ISO-2022-KR"; + case CP_EUC_JP: + return "EUC-JP"; + case CP_EUC_KR: + return "EUC-KR"; + case CP_GB2312: + return "GB2312"; + case CP_GB18030: + return "GB18030"; + case CP_SJIS: + return "SJIS"; + default: + return "cp" + codepage; + } + } +}