diff options
author | Nick Burch <nick@apache.org> | 2013-06-26 18:18:21 +0000 |
---|---|---|
committer | Nick Burch <nick@apache.org> | 2013-06-26 18:18:21 +0000 |
commit | 182b941b2cb5cf4411cd1c577f332fa75a16b9f3 (patch) | |
tree | 8991d48a63c88800d4fbc00347625888fa8659ff /src | |
parent | 14e2a5be28311023a50db331c65ea55f237c72f6 (diff) | |
download | poi-182b941b2cb5cf4411cd1c577f332fa75a16b9f3.tar.gz poi-182b941b2cb5cf4411cd1c577f332fa75a16b9f3.zip |
Bring the numeric CodePage support from HPSF (constants and converters) out to a new Util class, so that HSMF can later use it
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1497032 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src')
-rw-r--r-- | src/java/org/apache/poi/util/CodePageUtil.java | 363 |
1 files changed, 363 insertions, 0 deletions
/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

import java.io.UnsupportedEncodingException;

/**
 * Utilities for working with Microsoft CodePages.
 *
 * <p>Provides constants for understanding numeric codepages,
 *  along with utilities to translate these into Java Character Sets.</p>
 */
public class CodePageUtil
{
    /** <p>Codepage 037, a special case</p> */
    public static final int CP_037 = 37;

    /** <p>Codepage for SJIS</p> */
    public static final int CP_SJIS = 932;

    /** <p>Codepage for GBK, aka MS936</p> */
    public static final int CP_GBK = 936;

    /** <p>Codepage for MS949</p> */
    public static final int CP_MS949 = 949;

    /** <p>Codepage for UTF-16</p> */
    public static final int CP_UTF16 = 1200;

    /** <p>Codepage for UTF-16 big-endian</p> */
    public static final int CP_UTF16_BE = 1201;

    /** <p>Codepage for Windows 1250</p> */
    public static final int CP_WINDOWS_1250 = 1250;

    /** <p>Codepage for Windows 1251</p> */
    public static final int CP_WINDOWS_1251 = 1251;

    /** <p>Codepage for Windows 1252</p> */
    public static final int CP_WINDOWS_1252 = 1252;

    /** <p>Codepage for Windows 1253</p> */
    public static final int CP_WINDOWS_1253 = 1253;

    /** <p>Codepage for Windows 1254</p> */
    public static final int CP_WINDOWS_1254 = 1254;

    /** <p>Codepage for Windows 1255</p> */
    public static final int CP_WINDOWS_1255 = 1255;

    /** <p>Codepage for Windows 1256</p> */
    public static final int CP_WINDOWS_1256 = 1256;

    /** <p>Codepage for Windows 1257</p> */
    public static final int CP_WINDOWS_1257 = 1257;

    /** <p>Codepage for Windows 1258</p> */
    public static final int CP_WINDOWS_1258 = 1258;

    /** <p>Codepage for Johab</p> */
    public static final int CP_JOHAB = 1361;

    /** <p>Codepage for Macintosh Roman (Java: MacRoman)</p> */
    public static final int CP_MAC_ROMAN = 10000;

    /** <p>Codepage for Macintosh Japan (Java: unknown - use SJIS, cp942 or
     * cp943)</p> */
    public static final int CP_MAC_JAPAN = 10001;

    /** <p>Codepage for Macintosh Chinese Traditional (Java: unknown - use Big5,
     * MS950, or cp937)</p> */
    public static final int CP_MAC_CHINESE_TRADITIONAL = 10002;

    /** <p>Codepage for Macintosh Korean (Java: unknown - use EUC_KR or
     * cp949)</p> */
    public static final int CP_MAC_KOREAN = 10003;

    /** <p>Codepage for Macintosh Arabic (Java: MacArabic)</p> */
    public static final int CP_MAC_ARABIC = 10004;

    /** <p>Codepage for Macintosh Hebrew (Java: MacHebrew)</p> */
    public static final int CP_MAC_HEBREW = 10005;

    /** <p>Codepage for Macintosh Greek (Java: MacGreek)</p> */
    public static final int CP_MAC_GREEK = 10006;

    /** <p>Codepage for Macintosh Cyrillic (Java: MacCyrillic)</p> */
    public static final int CP_MAC_CYRILLIC = 10007;

    /** <p>Codepage for Macintosh Chinese Simplified (Java: unknown - use
     * EUC_CN, ISO2022_CN_GB, MS936 or cp935)</p> */
    public static final int CP_MAC_CHINESE_SIMPLE = 10008;

    /** <p>Codepage for Macintosh Romanian (Java: MacRomania)</p> */
    public static final int CP_MAC_ROMANIA = 10010;

    /** <p>Codepage for Macintosh Ukrainian (Java: MacUkraine)</p> */
    public static final int CP_MAC_UKRAINE = 10017;

    /** <p>Codepage for Macintosh Thai (Java: MacThai)</p> */
    public static final int CP_MAC_THAI = 10021;

    /** <p>Codepage for Macintosh Central Europe (Latin-2)
     * (Java: MacCentralEurope)</p> */
    public static final int CP_MAC_CENTRAL_EUROPE = 10029;

    /** <p>Codepage for Macintosh Iceland (Java: MacIceland)</p> */
    public static final int CP_MAC_ICELAND = 10079;

    /** <p>Codepage for Macintosh Turkish (Java: MacTurkish)</p> */
    public static final int CP_MAC_TURKISH = 10081;

    /** <p>Codepage for Macintosh Croatian (Java: MacCroatian)</p> */
    public static final int CP_MAC_CROATIAN = 10082;

    /** <p>Codepage for US-ASCII</p> */
    public static final int CP_US_ASCII = 20127;

    /**
     * <p>Misspelled alias of {@link #CP_US_ASCII}, retained so that existing
     * callers keep compiling. It has the same value.</p>
     *
     * @deprecated use {@link #CP_US_ASCII} instead
     */
    @Deprecated
    public static final int CP_US_ACSII = CP_US_ASCII;

    /** <p>Codepage for KOI8-R</p> */
    public static final int CP_KOI8_R = 20866;

    /** <p>Codepage for ISO-8859-1</p> */
    public static final int CP_ISO_8859_1 = 28591;

    /** <p>Codepage for ISO-8859-2</p> */
    public static final int CP_ISO_8859_2 = 28592;

    /** <p>Codepage for ISO-8859-3</p> */
    public static final int CP_ISO_8859_3 = 28593;

    /** <p>Codepage for ISO-8859-4</p> */
    public static final int CP_ISO_8859_4 = 28594;

    /** <p>Codepage for ISO-8859-5</p> */
    public static final int CP_ISO_8859_5 = 28595;

    /** <p>Codepage for ISO-8859-6</p> */
    public static final int CP_ISO_8859_6 = 28596;

    /** <p>Codepage for ISO-8859-7</p> */
    public static final int CP_ISO_8859_7 = 28597;

    /** <p>Codepage for ISO-8859-8</p> */
    public static final int CP_ISO_8859_8 = 28598;

    /** <p>Codepage for ISO-8859-9</p> */
    public static final int CP_ISO_8859_9 = 28599;

    /** <p>Codepage for ISO-2022-JP</p> */
    public static final int CP_ISO_2022_JP1 = 50220;

    /** <p>Another codepage for ISO-2022-JP</p> */
    public static final int CP_ISO_2022_JP2 = 50221;

    /** <p>Yet another codepage for ISO-2022-JP</p> */
    public static final int CP_ISO_2022_JP3 = 50222;

    /** <p>Codepage for ISO-2022-KR</p> */
    public static final int CP_ISO_2022_KR = 50225;

    /** <p>Codepage for EUC-JP</p> */
    public static final int CP_EUC_JP = 51932;

    /** <p>Codepage for EUC-KR</p> */
    public static final int CP_EUC_KR = 51949;

    /** <p>Codepage for GB2312</p> */
    public static final int CP_GB2312 = 52936;

    /** <p>Codepage for GB18030</p> */
    public static final int CP_GB18030 = 54936;

    /**
     * <p>Another codepage for US-ASCII</p>
     *
     * <p>NOTE(review): Microsoft's code page identifier list names 65000 as
     * UTF-7; it is treated as US-ASCII here - confirm that is intended.</p>
     */
    public static final int CP_US_ASCII2 = 65000;

    /** <p>Codepage for UTF-8</p> */
    public static final int CP_UTF8 = 65001;

    /** <p>Codepage for Unicode</p> */
    public static final int CP_UNICODE = CP_UTF16;

    /**
     * Converts a string into bytes, in the equivalent character encoding
     *  to the supplied codepage number.
     *
     * @param string The string to convert
     * @param codepage The codepage number
     *
     * @return The bytes of the string in the encoding that
     *  {@link #codepageToEncoding(int)} gives for the codepage
     *
     * @throws UnsupportedEncodingException if the codepage number is zero or
     *  negative, or the resulting encoding name is not supported by the
     *  running JVM
     */
    public static byte[] getBytesInCodePage(final String string, final int codepage)
            throws UnsupportedEncodingException
    {
        final String encoding = codepageToEncoding(codepage);
        return string.getBytes(encoding);
    }

    /**
     * Converts the bytes into a String, based on the equivalent character encoding
     *  to the supplied codepage number. The whole array is decoded.
     *
     * @param string The bytes of the string to convert
     * @param codepage The codepage number
     *
     * @return The decoded string
     *
     * @throws UnsupportedEncodingException if the codepage number is zero or
     *  negative, or the resulting encoding name is not supported by the
     *  running JVM
     */
    public static String getStringFromCodePage(final byte[] string, final int codepage)
            throws UnsupportedEncodingException
    {
        return getStringFromCodePage(string, 0, string.length, codepage);
    }

    /**
     * Converts a range of the bytes into a String, based on the equivalent
     *  character encoding to the supplied codepage number.
     *
     * @param string The bytes of the string to convert
     * @param offset The index of the first byte to decode
     * @param length The number of bytes to decode
     * @param codepage The codepage number
     *
     * @return The decoded string
     *
     * @throws UnsupportedEncodingException if the codepage number is zero or
     *  negative, or the resulting encoding name is not supported by the
     *  running JVM
     */
    public static String getStringFromCodePage(final byte[] string, final int offset,
            final int length, final int codepage) throws UnsupportedEncodingException
    {
        final String encoding = codepageToEncoding(codepage);
        return new String(string, offset, length, encoding);
    }

    /**
     * <p>Turns a codepage number into the equivalent character encoding's
     * name.</p>
     *
     * @param codepage The codepage number
     *
     * @return The character encoding's name. Codepages that have a constant
     * in this class are mapped to a specific encoding name, e.g. 65001 gives
     * "UTF-8" and 1252 gives "windows-1252". Every other positive number is
     * mapped to "cp" followed by the number, e.g. 437 gives "cp437"; such a
     * name is not checked against the encodings the JVM actually supports.
     *
     * @exception UnsupportedEncodingException if the specified codepage is
     * zero or negative.
     */
    public static String codepageToEncoding(final int codepage)
            throws UnsupportedEncodingException
    {
        if (codepage <= 0) {
            throw new UnsupportedEncodingException("Codepage number may not be " + codepage);
        }

        switch (codepage) {
            case CP_UTF16:
                // NOTE(review): Java's "UTF-16" charset decodes as big-endian
                // when no byte-order mark is present and writes a BOM when
                // encoding, while Windows code page 1200 is little-endian
                // UTF-16. Behaviour is left unchanged here - confirm that
                // callers account for this.
                return "UTF-16";
            case CP_UTF16_BE:
                return "UTF-16BE";
            case CP_UTF8:
                return "UTF-8";
            case CP_037:
                return "cp037";
            case CP_GBK:
                return "GBK";
            case CP_MS949:
                return "ms949";
            case CP_WINDOWS_1250:
                return "windows-1250";
            case CP_WINDOWS_1251:
                return "windows-1251";
            case CP_WINDOWS_1252:
                return "windows-1252";
            case CP_WINDOWS_1253:
                return "windows-1253";
            case CP_WINDOWS_1254:
                return "windows-1254";
            case CP_WINDOWS_1255:
                return "windows-1255";
            case CP_WINDOWS_1256:
                return "windows-1256";
            case CP_WINDOWS_1257:
                return "windows-1257";
            case CP_WINDOWS_1258:
                return "windows-1258";
            case CP_JOHAB:
                return "johab";
            case CP_MAC_ROMAN:
                return "MacRoman";
            case CP_MAC_JAPAN:
                return "SJIS";
            case CP_MAC_CHINESE_TRADITIONAL:
                return "Big5";
            case CP_MAC_KOREAN:
                return "EUC-KR";
            case CP_MAC_ARABIC:
                return "MacArabic";
            case CP_MAC_HEBREW:
                return "MacHebrew";
            case CP_MAC_GREEK:
                return "MacGreek";
            case CP_MAC_CYRILLIC:
                return "MacCyrillic";
            case CP_MAC_CHINESE_SIMPLE:
                return "EUC_CN";
            case CP_MAC_ROMANIA:
                return "MacRomania";
            case CP_MAC_UKRAINE:
                return "MacUkraine";
            case CP_MAC_THAI:
                return "MacThai";
            case CP_MAC_CENTRAL_EUROPE:
                return "MacCentralEurope";
            case CP_MAC_ICELAND:
                return "MacIceland";
            case CP_MAC_TURKISH:
                return "MacTurkish";
            case CP_MAC_CROATIAN:
                return "MacCroatian";
            // The deprecated CP_US_ACSII alias has the same value, so it is
            // covered by this label as well.
            case CP_US_ASCII:
            case CP_US_ASCII2:
                return "US-ASCII";
            case CP_KOI8_R:
                return "KOI8-R";
            case CP_ISO_8859_1:
                return "ISO-8859-1";
            case CP_ISO_8859_2:
                return "ISO-8859-2";
            case CP_ISO_8859_3:
                return "ISO-8859-3";
            case CP_ISO_8859_4:
                return "ISO-8859-4";
            case CP_ISO_8859_5:
                return "ISO-8859-5";
            case CP_ISO_8859_6:
                return "ISO-8859-6";
            case CP_ISO_8859_7:
                return "ISO-8859-7";
            case CP_ISO_8859_8:
                return "ISO-8859-8";
            case CP_ISO_8859_9:
                return "ISO-8859-9";
            case CP_ISO_2022_JP1:
            case CP_ISO_2022_JP2:
            case CP_ISO_2022_JP3:
                return "ISO-2022-JP";
            case CP_ISO_2022_KR:
                return "ISO-2022-KR";
            case CP_EUC_JP:
                return "EUC-JP";
            case CP_EUC_KR:
                return "EUC-KR";
            case CP_GB2312:
                return "GB2312";
            case CP_GB18030:
                return "GB18030";
            case CP_SJIS:
                return "SJIS";
            default:
                return "cp" + codepage;
        }
    }
}