123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.util;
-
- import java.io.ByteArrayOutputStream;
- import java.io.IOException;
- import java.nio.charset.Charset;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.Map;
-
- /**
- * Collection of string handling utilities
- */
- @Internal
- public class StringUtil {
-
- private static final POILogger logger = POILogFactory
- .getLogger(StringUtil.class);
- protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
- public static final Charset UTF16LE = Charset.forName("UTF-16LE");
- public static final Charset UTF8 = Charset.forName("UTF-8");
- public static final Charset WIN_1252 = Charset.forName("cp1252");
- public static final Charset BIG5 = Charset.forName("Big5");
-
- private static Map<Integer,Integer> msCodepointToUnicode;
-
- private StringUtil() {
- // no instances of this class
- }
-
- /**
- * Given a byte array of 16-bit unicode characters in Little Endian
- * format (most important byte last), return a Java String representation
- * of it.
- *
- * { 0x16, 0x00 } -0x16
- *
- * @param string the byte array to be converted
- * @param offset the initial offset into the
- * byte array. it is assumed that string[ offset ] and string[ offset +
- * 1 ] contain the first 16-bit unicode character
- * @param len the length of the final string
- * @return the converted string, never <code>null</code>.
- * @exception ArrayIndexOutOfBoundsException if offset is out of bounds for
- * the byte array (i.e., is negative or is greater than or equal to
- * string.length)
- * @exception IllegalArgumentException if len is too large (i.e.,
- * there is not enough data in string to create a String of that
- * length)
- */
- public static String getFromUnicodeLE(
- final byte[] string,
- final int offset,
- final int len)
- throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
- if ((offset < 0) || (offset >= string.length)) {
- throw new ArrayIndexOutOfBoundsException("Illegal offset " + offset + " (String data is of length " + string.length + ")");
- }
- if ((len < 0) || (((string.length - offset) / 2) < len)) {
- throw new IllegalArgumentException("Illegal length " + len);
- }
-
- return new String(string, offset, len * 2, UTF16LE);
- }
-
- /**
- * Given a byte array of 16-bit unicode characters in little endian
- * format (most important byte last), return a Java String representation
- * of it.
- *
- * { 0x16, 0x00 } -0x16
- *
- * @param string the byte array to be converted
- * @return the converted string, never <code>null</code>
- */
- public static String getFromUnicodeLE(byte[] string) {
- if(string.length == 0) { return ""; }
- return getFromUnicodeLE(string, 0, string.length / 2);
- }
-
- /**
- * Convert String to 16-bit unicode characters in little endian format
- *
- * @param string the string
- * @return the byte array of 16-bit unicode characters
- */
- public static byte[] getToUnicodeLE(String string) {
- return string.getBytes(UTF16LE);
- }
-
- /**
- * Read 8 bit data (in ISO-8859-1 codepage) into a (unicode) Java
- * String and return.
- * (In Excel terms, read compressed 8 bit unicode as a string)
- *
- * @param string byte array to read
- * @param offset offset to read byte array
- * @param len length to read byte array
- * @return String generated String instance by reading byte array
- */
- public static String getFromCompressedUnicode(
- final byte[] string,
- final int offset,
- final int len) {
- int len_to_use = Math.min(len, string.length - offset);
- return new String(string, offset, len_to_use, ISO_8859_1);
- }
-
- public static String readCompressedUnicode(LittleEndianInput in, int nChars) {
- byte[] buf = new byte[nChars];
- in.readFully(buf);
- return new String(buf, ISO_8859_1);
- }
-
- /**
- * InputStream <tt>in</tt> is expected to contain:
- * <ol>
- * <li>ushort nChars</li>
- * <li>byte is16BitFlag</li>
- * <li>byte[]/char[] characterData</li>
- * </ol>
- * For this encoding, the is16BitFlag is always present even if nChars==0.
- *
- * This structure is also known as a XLUnicodeString.
- */
- public static String readUnicodeString(LittleEndianInput in) {
-
- int nChars = in.readUShort();
- byte flag = in.readByte();
- if ((flag & 0x01) == 0) {
- return readCompressedUnicode(in, nChars);
- }
- return readUnicodeLE(in, nChars);
- }
- /**
- * InputStream <tt>in</tt> is expected to contain:
- * <ol>
- * <li>byte is16BitFlag</li>
- * <li>byte[]/char[] characterData</li>
- * </ol>
- * For this encoding, the is16BitFlag is always present even if nChars==0.
- * <br/>
- * This method should be used when the nChars field is <em>not</em> stored
- * as a ushort immediately before the is16BitFlag. Otherwise, {@link
- * #readUnicodeString(LittleEndianInput)} can be used.
- */
- public static String readUnicodeString(LittleEndianInput in, int nChars) {
- byte is16Bit = in.readByte();
- if ((is16Bit & 0x01) == 0) {
- return readCompressedUnicode(in, nChars);
- }
- return readUnicodeLE(in, nChars);
- }
- /**
- * OutputStream <tt>out</tt> will get:
- * <ol>
- * <li>ushort nChars</li>
- * <li>byte is16BitFlag</li>
- * <li>byte[]/char[] characterData</li>
- * </ol>
- * For this encoding, the is16BitFlag is always present even if nChars==0.
- */
- public static void writeUnicodeString(LittleEndianOutput out, String value) {
- int nChars = value.length();
- out.writeShort(nChars);
- boolean is16Bit = hasMultibyte(value);
- out.writeByte(is16Bit ? 0x01 : 0x00);
- if (is16Bit) {
- putUnicodeLE(value, out);
- } else {
- putCompressedUnicode(value, out);
- }
- }
- /**
- * OutputStream <tt>out</tt> will get:
- * <ol>
- * <li>byte is16BitFlag</li>
- * <li>byte[]/char[] characterData</li>
- * </ol>
- * For this encoding, the is16BitFlag is always present even if nChars==0.
- * <br/>
- * This method should be used when the nChars field is <em>not</em> stored
- * as a ushort immediately before the is16BitFlag. Otherwise, {@link
- * #writeUnicodeString(LittleEndianOutput, String)} can be used.
- */
- public static void writeUnicodeStringFlagAndData(LittleEndianOutput out, String value) {
- boolean is16Bit = hasMultibyte(value);
- out.writeByte(is16Bit ? 0x01 : 0x00);
- if (is16Bit) {
- putUnicodeLE(value, out);
- } else {
- putCompressedUnicode(value, out);
- }
- }
-
- /**
- * @return the number of bytes that would be written by {@link #writeUnicodeString(LittleEndianOutput, String)}
- */
- public static int getEncodedSize(String value) {
- int result = 2 + 1;
- result += value.length() * (StringUtil.hasMultibyte(value) ? 2 : 1);
- return result;
- }
-
- /**
- * Takes a unicode (java) string, and returns it as 8 bit data (in ISO-8859-1
- * codepage).
- * (In Excel terms, write compressed 8 bit unicode)
- *
- * @param input the String containing the data to be written
- * @param output the byte array to which the data is to be written
- * @param offset an offset into the byte arrat at which the data is start
- * when written
- */
- public static void putCompressedUnicode(String input, byte[] output, int offset) {
- byte[] bytes = input.getBytes(ISO_8859_1);
- System.arraycopy(bytes, 0, output, offset, bytes.length);
- }
-
- public static void putCompressedUnicode(String input, LittleEndianOutput out) {
- byte[] bytes = input.getBytes(ISO_8859_1);
- out.write(bytes);
- }
-
- /**
- * Takes a unicode string, and returns it as little endian (most
- * important byte last) bytes in the supplied byte array.
- * (In Excel terms, write uncompressed unicode)
- *
- * @param input the String containing the unicode data to be written
- * @param output the byte array to hold the uncompressed unicode, should be twice the length of the String
- * @param offset the offset to start writing into the byte array
- */
- public static void putUnicodeLE(String input, byte[] output, int offset) {
- byte[] bytes = input.getBytes(UTF16LE);
- System.arraycopy(bytes, 0, output, offset, bytes.length);
- }
- public static void putUnicodeLE(String input, LittleEndianOutput out) {
- byte[] bytes = input.getBytes(UTF16LE);
- out.write(bytes);
- }
-
- public static String readUnicodeLE(LittleEndianInput in, int nChars) {
- byte[] bytes = new byte[nChars*2];
- in.readFully(bytes);
- return new String(bytes, UTF16LE);
- }
-
- /**
- * @return the encoding we want to use, currently hardcoded to ISO-8859-1
- */
- public static String getPreferredEncoding() {
- return ISO_8859_1.name();
- }
-
- /**
- * check the parameter has multibyte character
- *
- * @param value string to check
- * @return boolean result true:string has at least one multibyte character
- */
- public static boolean hasMultibyte(String value) {
- if (value == null)
- return false;
- for (char c : value.toCharArray()) {
- if (c > 0xFF) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * Checks to see if a given String needs to be represented as Unicode
- *
- * @param value The string to look at.
- * @return true if string needs Unicode to be represented.
- */
- public static boolean isUnicodeString(final String value) {
- return !value.equals(new String(value.getBytes(ISO_8859_1), ISO_8859_1));
- }
-
- /**
- * Tests if the string starts with the specified prefix, ignoring case consideration.
- */
- public static boolean startsWithIgnoreCase(String haystack, String prefix) {
- return haystack.regionMatches(true, 0, prefix, 0, prefix.length());
- }
-
- /**
- * Tests if the string ends with the specified suffix, ignoring case consideration.
- */
- public static boolean endsWithIgnoreCase(String haystack, String suffix) {
- int length = suffix.length();
- int start = haystack.length() - length;
- return haystack.regionMatches(true, start, suffix, 0, length);
- }
-
- /**
- * An Iterator over an array of Strings.
- */
- public static class StringsIterator implements Iterator<String> {
- private String[] strings = {};
- private int position = 0;
- public StringsIterator(String[] strings) {
- if (strings != null) {
- this.strings = strings.clone();
- }
- }
-
- public boolean hasNext() {
- return position < strings.length;
- }
- public String next() {
- int ourPos = position++;
- if(ourPos >= strings.length) {
- throw new ArrayIndexOutOfBoundsException(ourPos);
- }
- return strings[ourPos];
- }
- public void remove() {}
- }
-
-
- /**
- * Some strings may contain encoded characters of the unicode private use area.
- * Currently the characters of the symbol fonts are mapped to the corresponding
- * characters in the normal unicode range.
- *
- * @param string the original string
- * @return the string with mapped characters
- *
- * @see <a href="http://www.alanwood.net/unicode/private_use_area.html#symbol">Private Use Area (symbol)</a>
- * @see <a href="http://www.alanwood.net/demos/symbol.html">Symbol font - Unicode alternatives for Greek and special characters in HTML</a>
- */
- public static String mapMsCodepointString(String string) {
- if (string == null || "".equals(string)) return string;
- initMsCodepointMap();
-
- StringBuilder sb = new StringBuilder();
- final int length = string.length();
- for (int offset = 0; offset < length; ) {
- Integer msCodepoint = string.codePointAt(offset);
- Integer uniCodepoint = msCodepointToUnicode.get(msCodepoint);
- sb.appendCodePoint(uniCodepoint == null ? msCodepoint : uniCodepoint);
- offset += Character.charCount(msCodepoint);
- }
-
- return sb.toString();
- }
-
- public static synchronized void mapMsCodepoint(int msCodepoint, int unicodeCodepoint) {
- initMsCodepointMap();
- msCodepointToUnicode.put(msCodepoint, unicodeCodepoint);
- }
-
- private static synchronized void initMsCodepointMap() {
- if (msCodepointToUnicode != null) return;
- msCodepointToUnicode = new HashMap<Integer,Integer>();
- int i=0xF020;
- for (int ch : symbolMap_f020) {
- msCodepointToUnicode.put(i++, ch);
- }
- i = 0xf0a0;
- for (int ch : symbolMap_f0a0) {
- msCodepointToUnicode.put(i++, ch);
- }
- }
-
- private static final int symbolMap_f020[] = {
- ' ', // 0xf020 space
- '!', // 0xf021 exclam
- 8704, // 0xf022 universal
- '#', // 0xf023 numbersign
- 8707, // 0xf024 existential
- '%', // 0xf025 percent
- '&', // 0xf026 ampersand
- 8717, // 0xf027 suchthat
- '(', // 0xf028 parenleft
- ')', // 0xf029 parentright
- 8727, // 0xf02a asteriskmath
- '+', // 0xf02b plus
- ',', // 0xf02c comma
- 8722, // 0xf02d minus sign (long -)
- '.', // 0xf02e period
- '/', // 0xf02f slash
- '0', // 0xf030 0
- '1', // 0xf031 1
- '2', // 0xf032 2
- '3', // 0xf033 3
- '4', // 0xf034 4
- '5', // 0xf035 5
- '6', // 0xf036 6
- '7', // 0xf037 7
- '8', // 0xf038 8
- '9', // 0xf039 9
- ':', // 0xf03a colon
- ';', // 0xf03b semicolon
- '<', // 0xf03c less
- '=', // 0xf03d equal
- '>', // 0xf03e greater
- '?', // 0xf03f question
- 8773, // 0xf040 congruent
- 913, // 0xf041 alpha (upper)
- 914, // 0xf042 beta (upper)
- 935, // 0xf043 chi (upper)
- 916, // 0xf044 delta (upper)
- 917, // 0xf045 epsilon (upper)
- 934, // 0xf046 phi (upper)
- 915, // 0xf047 gamma (upper)
- 919, // 0xf048 eta (upper)
- 921, // 0xf049 iota (upper)
- 977, // 0xf04a theta1 (lower)
- 922, // 0xf04b kappa (upper)
- 923, // 0xf04c lambda (upper)
- 924, // 0xf04d mu (upper)
- 925, // 0xf04e nu (upper)
- 927, // 0xf04f omicron (upper)
- 928, // 0xf050 pi (upper)
- 920, // 0xf051 theta (upper)
- 929, // 0xf052 rho (upper)
- 931, // 0xf053 sigma (upper)
- 932, // 0xf054 tau (upper)
- 933, // 0xf055 upsilon (upper)
- 962, // 0xf056 simga1 (lower)
- 937, // 0xf057 omega (upper)
- 926, // 0xf058 xi (upper)
- 936, // 0xf059 psi (upper)
- 918, // 0xf05a zeta (upper)
- '[', // 0xf05b bracketleft
- 8765, // 0xf05c therefore
- ']', // 0xf05d bracketright
- 8869, // 0xf05e perpendicular
- '_', // 0xf05f underscore
- ' ', // 0xf060 radicalex (doesn't exist in unicode)
- 945, // 0xf061 alpha (lower)
- 946, // 0xf062 beta (lower)
- 967, // 0xf063 chi (lower)
- 948, // 0xf064 delta (lower)
- 949, // 0xf065 epsilon (lower)
- 966, // 0xf066 phi (lower)
- 947, // 0xf067 gamma (lower)
- 951, // 0xf068 eta (lower)
- 953, // 0xf069 iota (lower)
- 981, // 0xf06a phi1 (lower)
- 954, // 0xf06b kappa (lower)
- 955, // 0xf06c lambda (lower)
- 956, // 0xf06d mu (lower)
- 957, // 0xf06e nu (lower)
- 959, // 0xf06f omnicron (lower)
- 960, // 0xf070 pi (lower)
- 952, // 0xf071 theta (lower)
- 961, // 0xf072 rho (lower)
- 963, // 0xf073 sigma (lower)
- 964, // 0xf074 tau (lower)
- 965, // 0xf075 upsilon (lower)
- 982, // 0xf076 piv (lower)
- 969, // 0xf077 omega (lower)
- 958, // 0xf078 xi (lower)
- 968, // 0xf079 psi (lower)
- 950, // 0xf07a zeta (lower)
- '{', // 0xf07b braceleft
- '|', // 0xf07c bar
- '}', // 0xf07d braceright
- 8764, // 0xf07e similar '~'
- ' ', // 0xf07f not defined
- };
-
- private static final int symbolMap_f0a0[] = {
- 8364, // 0xf0a0 not defined / euro symbol
- 978, // 0xf0a1 upsilon1 (upper)
- 8242, // 0xf0a2 minute
- 8804, // 0xf0a3 lessequal
- 8260, // 0xf0a4 fraction
- 8734, // 0xf0a5 infinity
- 402, // 0xf0a6 florin
- 9827, // 0xf0a7 club
- 9830, // 0xf0a8 diamond
- 9829, // 0xf0a9 heart
- 9824, // 0xf0aa spade
- 8596, // 0xf0ab arrowboth
- 8591, // 0xf0ac arrowleft
- 8593, // 0xf0ad arrowup
- 8594, // 0xf0ae arrowright
- 8595, // 0xf0af arrowdown
- 176, // 0xf0b0 degree
- 177, // 0xf0b1 plusminus
- 8243, // 0xf0b2 second
- 8805, // 0xf0b3 greaterequal
- 215, // 0xf0b4 multiply
- 181, // 0xf0b5 proportional
- 8706, // 0xf0b6 partialdiff
- 8729, // 0xf0b7 bullet
- 247, // 0xf0b8 divide
- 8800, // 0xf0b9 notequal
- 8801, // 0xf0ba equivalence
- 8776, // 0xf0bb approxequal
- 8230, // 0xf0bc ellipsis
- 9168, // 0xf0bd arrowvertex
- 9135, // 0xf0be arrowhorizex
- 8629, // 0xf0bf carriagereturn
- 8501, // 0xf0c0 aleph
- 8475, // 0xf0c1 Ifraktur
- 8476, // 0xf0c2 Rfraktur
- 8472, // 0xf0c3 weierstrass
- 8855, // 0xf0c4 circlemultiply
- 8853, // 0xf0c5 circleplus
- 8709, // 0xf0c6 emptyset
- 8745, // 0xf0c7 intersection
- 8746, // 0xf0c8 union
- 8835, // 0xf0c9 propersuperset
- 8839, // 0xf0ca reflexsuperset
- 8836, // 0xf0cb notsubset
- 8834, // 0xf0cc propersubset
- 8838, // 0xf0cd reflexsubset
- 8712, // 0xf0ce element
- 8713, // 0xf0cf notelement
- 8736, // 0xf0d0 angle
- 8711, // 0xf0d1 gradient
- 174, // 0xf0d2 registerserif
- 169, // 0xf0d3 copyrightserif
- 8482, // 0xf0d4 trademarkserif
- 8719, // 0xf0d5 product
- 8730, // 0xf0d6 radical
- 8901, // 0xf0d7 dotmath
- 172, // 0xf0d8 logicalnot
- 8743, // 0xf0d9 logicaland
- 8744, // 0xf0da logicalor
- 8660, // 0xf0db arrowdblboth
- 8656, // 0xf0dc arrowdblleft
- 8657, // 0xf0dd arrowdblup
- 8658, // 0xf0de arrowdblright
- 8659, // 0xf0df arrowdbldown
- 9674, // 0xf0e0 lozenge
- 9001, // 0xf0e1 angleleft
- 174, // 0xf0e2 registersans
- 169, // 0xf0e3 copyrightsans
- 8482, // 0xf0e4 trademarksans
- 8721, // 0xf0e5 summation
- 9115, // 0xf0e6 parenlefttp
- 9116, // 0xf0e7 parenleftex
- 9117, // 0xf0e8 parenleftbt
- 9121, // 0xf0e9 bracketlefttp
- 9122, // 0xf0ea bracketleftex
- 9123, // 0xf0eb bracketleftbt
- 9127, // 0xf0ec bracelefttp
- 9128, // 0xf0ed braceleftmid
- 9129, // 0xf0ee braceleftbt
- 9130, // 0xf0ef braceex
- ' ', // 0xf0f0 not defined
- 9002, // 0xf0f1 angleright
- 8747, // 0xf0f2 integral
- 8992, // 0xf0f3 integraltp
- 9134, // 0xf0f4 integralex
- 8993, // 0xf0f5 integralbt
- 9118, // 0xf0f6 parenrighttp
- 9119, // 0xf0f7 parenrightex
- 9120, // 0xf0f8 parenrightbt
- 9124, // 0xf0f9 bracketrighttp
- 9125, // 0xf0fa bracketrightex
- 9126, // 0xf0fb bracketrightbt
- 9131, // 0xf0fc bracerighttp
- 9132, // 0xf0fd bracerightmid
- 9133, // 0xf0fe bracerightbt
- ' ', // 0xf0ff not defined
- };
-
- /**
- * This tries to convert a LE byte array in Big5 to a String.
- * We know MS zero-padded ascii, and we drop those.
- * However, there may be areas for improvement in this.
- *
- * @param data
- * @param offset
- * @param lengthInBytes
- * @return
- */
- public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) {
- ByteArrayOutputStream os = new ByteArrayOutputStream();
- try {
- IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os);
- } catch (IOException e) {
- logger.log(POILogger.WARN,
- "IOException while copying a byte array stream to a byte array stream?!");
- }
- return new String(os.toByteArray(), BIG5);
- }
-
- // Could be replaced with org.apache.commons.lang3.StringUtils#join
- @Internal
- public static String join(Object[] array, String separator) {
- if (array == null || array.length == 0) return "";
- StringBuilder sb = new StringBuilder();
- sb.append(array[0]);
- for (int i=1; i<array.length; i++) {
- sb.append(separator).append(array[i]);
- }
- return sb.toString();
- }
-
- @Internal
- public static String join(Object[] array) {
- if (array == null) return "";
- StringBuilder sb = new StringBuilder();
- for (Object o : array) {
- sb.append(o);
- }
- return sb.toString();
- }
-
- @Internal
- public static String join(String separator, Object... array) {
- return join(array, separator);
- }
-
- /**
- * Count number of occurrences of needle in haystack
- * Has same signature as org.apache.commons.lang3.StringUtils#countMatches
- *
- * @param haystack the CharSequence to check, may be null
- * @param needle the character to count the quantity of
- * @return the number of occurrences, 0 if the CharSequence is null
- */
- public static int countMatches(CharSequence haystack, char needle) {
- if (haystack == null) return 0;
- int count = 0;
- final int length = haystack.length();
- for (int i=0; i<length; i++) {
- if (haystack.charAt(i) == needle) {
- count++;
- }
- }
- return count;
- }
- }
|