From: PJ Fanning Date: Sat, 19 Dec 2020 19:45:09 +0000 (+0000) Subject: [github-198] Remove jdk.charset module dependency for spreadsheets generation. Thanks... X-Git-Tag: before_junit5_update~3 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=d20fa4430554760967f193cafc3e1a07fb28ced0;p=poi.git [github-198] Remove jdk.charset module dependency for spreadsheets generation. Thanks to Robert Marcano. This closes #198 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1884631 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/java/org/apache/poi/util/CodePageUtil.java b/src/java/org/apache/poi/util/CodePageUtil.java index da8f8a9842..2c1480253d 100644 --- a/src/java/org/apache/poi/util/CodePageUtil.java +++ b/src/java/org/apache/poi/util/CodePageUtil.java @@ -31,8 +31,6 @@ import java.util.Set; public class CodePageUtil { - public static final Set DOUBLE_BYTE_CHARSETS = Collections.singleton(StringUtil.BIG5); - /**

Codepage 037, a special case

*/ public static final int CP_037 = 37; @@ -446,27 +444,4 @@ public class CodePageUtil return "cp" + codepage; } } - - /** - * This tries to convert a LE byte array in cp950 - * (Microsoft's dialect of Big5) to a String. - * We know MS zero-padded ascii, and we drop those. - * There may be areas for improvement in this. - * - * @param data - * @param offset - * @param lengthInBytes - * @return Decoded String - */ - public static String cp950ToString(byte[] data, int offset, int lengthInBytes) { - StringBuilder sb = new StringBuilder(); - LittleEndianCP950Reader reader = new LittleEndianCP950Reader(data, offset, lengthInBytes); - int c = reader.read(); - while (c != -1) { - sb.append((char)c); - c = reader.read(); - } - reader.close(); - return sb.toString(); - } } diff --git a/src/java/org/apache/poi/util/LittleEndianCP950Reader.java b/src/java/org/apache/poi/util/LittleEndianCP950Reader.java deleted file mode 100644 index 61808afcaa..0000000000 --- a/src/java/org/apache/poi/util/LittleEndianCP950Reader.java +++ /dev/null @@ -1,479 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ - -package org.apache.poi.util; -import java.io.IOException; -import java.io.Reader; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.CharsetDecoder; - -/** - * Stream that converts CP950 (MSOffice's dialect of Big5), with - * zero-byte padding for ASCII and in LittleEndianOrder. - */ -@Internal -public class LittleEndianCP950Reader extends Reader { - - private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class); - - - private static final char UNMAPPABLE = '?'; - private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2); - private final CharBuffer charBuffer = CharBuffer.allocate(2); - private final CharsetDecoder decoder = StringUtil.BIG5.newDecoder(); - - //https://en.wikipedia.org/wiki/Code_page_950 - //see private use area - private final static char range1Low = '\u8140'; - private final static char range1High = '\u8DFE'; - private final static char range2Low = '\u8E40'; - private final static char range2High = '\uA0FE'; - private final static char range3Low = '\uC6A1'; - private final static char range3High = '\uC8FE'; - private final static char range4Low = '\uFA40'; - private final static char range4High = '\uFEFE'; - - private final byte[] data; - private final int startOffset; - private final int length; - private int offset; - private int trailing; - private int leading; - int cnt; - //the char that is logically trailing in Big5 encoding - //however in LittleEndian order, this is the first encountered. - public LittleEndianCP950Reader(byte[] data) { - this(data, 0, data.length); - } - - public LittleEndianCP950Reader(byte[] data, int offset, int length) { - this.data = data; - this.startOffset = offset; - this.offset = startOffset; - this.length = length; - } - - @Override - public int read() { - if (offset + 1 > data.length || offset - startOffset > length) { - return -1; - } - trailing = data[offset++] & 0xff; - leading = data[offset++] & 0xff; - decoder.reset(); - if (leading < 0x81) { - //return trailing alone - //there may be some subtleties here - return trailing; - } else if (leading == 0xf9) { - return handleF9(trailing); - } else { - int ch = (leading << 8) + trailing; - if (ch >= range1Low && ch <= range1High) { - return handleRange1(leading, trailing); - } else if (ch >= range2Low && ch <= range2High) { - return handleRange2(leading, trailing); - } else if (ch >= range3Low && ch <= range3High) { - return handleRange3(leading, trailing); - } else if (ch >= range4Low && ch <= range4High) { - return handleRange4(leading, trailing); - } - - charBuffer.clear(); - doubleByteBuffer.clear(); - doubleByteBuffer.put((byte) leading); - doubleByteBuffer.put((byte) trailing); - doubleByteBuffer.flip(); - decoder.decode(doubleByteBuffer, charBuffer, true); - charBuffer.flip(); - - if (charBuffer.length() == 0) { - LOGGER.log(POILogger.WARN, "couldn't create char for: " - + Integer.toString((leading & 0xff), 16) - + " " + Integer.toString((trailing & 0xff), 16)); - return UNMAPPABLE; - } else { - return Character.codePointAt(charBuffer, 0); - } - } - - - } - - - @Override - public int read(char[] cbuf, int off, int len) throws IOException { - //there may be some efficiencies, but this should do for now. - - for (int i = off; i < off + len; i++) { - int c = read(); - if (c == -1) { - return i - off; - } - cbuf[i] = (char) c; - } - return len; - } - - @Override - public void close() { - } - - private int handleRange1(int leading, int trailing) { - return (0xeeb8 + (157 * (leading - 0x81))) + - ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); - } - - private int handleRange2(int leading, int trailing) { - return (0xe311 + (157 * (leading - 0x8e))) + - ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); - } - - private int handleRange3(int leading, int trailing) { - return (0xf672 + (157 * (leading - 0xc6))) + - ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); - } - - private int handleRange4(int leading, int trailing) { - return (0xe000 + (157 * (leading - 0xfa))) + - ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); - } - - private int handleF9(int trailing) { - switch (trailing) { - case 0x40: - return 0x7e98; - case 0x41: - return 0x7e9b; - case 0x42: - return 0x7e99; - case 0x43: - return 0x81e0; - case 0x44: - return 0x81e1; - case 0x45: - return 0x8646; - case 0x46: - return 0x8647; - case 0x47: - return 0x8648; - case 0x48: - return 0x8979; - case 0x49: - return 0x897a; - case 0x4a: - return 0x897c; - case 0x4b: - return 0x897b; - case 0x4c: - return 0x89ff; - case 0x4d: - return 0x8b98; - case 0x4e: - return 0x8b99; - case 0x4f: - return 0x8ea5; - case 0x50: - return 0x8ea4; - case 0x51: - return 0x8ea3; - case 0x52: - return 0x946e; - case 0x53: - return 0x946d; - case 0x54: - return 0x946f; - case 0x55: - return 0x9471; - case 0x56: - return 0x9473; - case 0x57: - return 0x9749; - case 0x58: - return 0x9872; - case 0x59: - return 0x995f; - case 0x5a: - return 0x9c68; - case 0x5b: - return 0x9c6e; - case 0x5c: - return 0x9c6d; - case 0x5d: - return 0x9e0b; - case 0x5e: - return 0x9e0d; - case 0x5f: - return 0x9e10; - case 0x60: - return 0x9e0f; - case 0x61: - return 0x9e12; - case 0x62: - return 0x9e11; - case 0x63: - return 0x9ea1; - case 0x64: - return 0x9ef5; - case 0x65: - return 0x9f09; - case 0x66: - return 0x9f47; - case 0x67: - return 0x9f78; - case 0x68: - return 0x9f7b; - case 0x69: - return 0x9f7a; - case 0x6a: - return 0x9f79; - case 0x6b: - return 0x571e; - case 0x6c: - return 0x7066; - case 0x6d: - return 0x7c6f; - case 0x6e: - return 0x883c; - case 0x6f: - return 0x8db2; - case 0x70: - return 0x8ea6; - case 0x71: - return 0x91c3; - case 0x72: - return 0x9474; - case 0x73: - return 0x9478; - case 0x74: - return 0x9476; - case 0x75: - return 0x9475; - case 0x76: - return 0x9a60; - case 0x77: - return 0x9c74; - case 0x78: - return 0x9c73; - case 0x79: - return 0x9c71; - case 0x7a: - return 0x9c75; - case 0x7b: - return 0x9e14; - case 0x7c: - return 0x9e13; - case 0x7d: - return 0x9ef6; - case 0x7e: - return 0x9f0a; - case 0xa1: - return 0x9fa4; - case 0xa2: - return 0x7068; - case 0xa3: - return 0x7065; - case 0xa4: - return 0x7cf7; - case 0xa5: - return 0x866a; - case 0xa6: - return 0x883e; - case 0xa7: - return 0x883d; - case 0xa8: - return 0x883f; - case 0xa9: - return 0x8b9e; - case 0xaa: - return 0x8c9c; - case 0xab: - return 0x8ea9; - case 0xac: - return 0x8ec9; - case 0xad: - return 0x974b; - case 0xae: - return 0x9873; - case 0xaf: - return 0x9874; - case 0xb0: - return 0x98cc; - case 0xb1: - return 0x9961; - case 0xb2: - return 0x99ab; - case 0xb3: - return 0x9a64; - case 0xb4: - return 0x9a66; - case 0xb5: - return 0x9a67; - case 0xb6: - return 0x9b24; - case 0xb7: - return 0x9e15; - case 0xb8: - return 0x9e17; - case 0xb9: - return 0x9f48; - case 0xba: - return 0x6207; - case 0xbb: - return 0x6b1e; - case 0xbc: - return 0x7227; - case 0xbd: - return 0x864c; - case 0xbe: - return 0x8ea8; - case 0xbf: - return 0x9482; - case 0xc0: - return 0x9480; - case 0xc1: - return 0x9481; - case 0xc2: - return 0x9a69; - case 0xc3: - return 0x9a68; - case 0xc4: - return 0x9b2e; - case 0xc5: - return 0x9e19; - case 0xc6: - return 0x7229; - case 0xc7: - return 0x864b; - case 0xc8: - return 0x8b9f; - case 0xc9: - return 0x9483; - case 0xca: - return 0x9c79; - case 0xcb: - return 0x9eb7; - case 0xcc: - return 0x7675; - case 0xcd: - return 0x9a6b; - case 0xce: - return 0x9c7a; - case 0xcf: - return 0x9e1d; - case 0xd0: - return 0x7069; - case 0xd1: - return 0x706a; - case 0xd2: - return 0x9ea4; - case 0xd3: - return 0x9f7e; - case 0xd4: - return 0x9f49; - case 0xd5: - return 0x9f98; - case 0xd6: - return 0x7881; - case 0xd7: - return 0x92b9; - case 0xd8: - return 0x88cf; - case 0xd9: - return 0x58bb; - case 0xda: - return 0x6052; - case 0xdb: - return 0x7ca7; - case 0xdc: - return 0x5afa; - case 0xdd: - return 0x2554; - case 0xde: - return 0x2566; - case 0xdf: - return 0x2557; - case 0xe0: - return 0x2560; - case 0xe1: - return 0x256c; - case 0xe2: - return 0x2563; - case 0xe3: - return 0x255a; - case 0xe4: - return 0x2569; - case 0xe5: - return 0x255d; - case 0xe6: - return 0x2552; - case 0xe7: - return 0x2564; - case 0xe8: - return 0x2555; - case 0xe9: - return 0x255e; - case 0xea: - return 0x256a; - case 0xeb: - return 0x2561; - case 0xec: - return 0x2558; - case 0xed: - return 0x2567; - case 0xee: - return 0x255b; - case 0xef: - return 0x2553; - case 0xf0: - return 0x2565; - case 0xf1: - return 0x2556; - case 0xf2: - return 0x255f; - case 0xf3: - return 0x256b; - case 0xf4: - return 0x2562; - case 0xf5: - return 0x2559; - case 0xf6: - return 0x2568; - case 0xf7: - return 0x255c; - case 0xf8: - return 0x2551; - case 0xf9: - return 0x2550; - case 0xfa: - return 0x256d; - case 0xfb: - return 0x256e; - case 0xfc: - return 0x2570; - case 0xfd: - return 0x256f; - case 0xfe: - return 0x2593; - default: - LOGGER.log(POILogger.WARN, "couldn't create char for: f9" - + " " + Integer.toString((trailing & 0xff), 16)); - return UNMAPPABLE; - } - } -} diff --git a/src/java/org/apache/poi/util/StringUtil.java b/src/java/org/apache/poi/util/StringUtil.java index a0778a3efa..d281c63386 100644 --- a/src/java/org/apache/poi/util/StringUtil.java +++ b/src/java/org/apache/poi/util/StringUtil.java @@ -34,7 +34,6 @@ public final class StringUtil { public static final Charset UTF16LE = StandardCharsets.UTF_16LE; public static final Charset UTF8 = StandardCharsets.UTF_8; public static final Charset WIN_1252 = Charset.forName("cp1252"); - public static final Charset BIG5 = Charset.forName("Big5"); private StringUtil() { // no instances of this class diff --git a/src/multimodule/scratchpad/test9/module-info.class b/src/multimodule/scratchpad/test9/module-info.class index cfda7bb495..e74671e4fb 100644 Binary files a/src/multimodule/scratchpad/test9/module-info.class and b/src/multimodule/scratchpad/test9/module-info.class differ diff --git a/src/multimodule/scratchpad/test9/module-info.java b/src/multimodule/scratchpad/test9/module-info.java index 9d406fd36e..2aaef57a0a 100644 --- a/src/multimodule/scratchpad/test9/module-info.java +++ b/src/multimodule/scratchpad/test9/module-info.java @@ -82,6 +82,7 @@ module org.apache.poi.scratchpad { exports org.apache.poi.hemf.hemfplus.extractor to junit; exports org.apache.poi.hslf to junit; exports org.apache.poi.hwmf to junit; + exports org.apache.poi.hwpf.util to junit; opens org.apache.poi.hwpf.model to org.mockito; opens org.apache.poi.hwpf.model.types to org.mockito; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 238e0406f8..a8fb4078a7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -37,7 +37,7 @@ import org.apache.poi.hwpf.model.TextPieceTable; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.IOUtils; import org.apache.poi.util.LittleEndian; import org.apache.poi.util.NotImplemented; @@ -176,7 +176,7 @@ public class HWPFOldDocument extends HWPFDocumentCore { _fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin(), MAX_RECORD_LENGTH); int numChars = textData.length; - if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { + if (DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { numChars /= 2; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java index e3cb94c868..e6a0887f32 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java @@ -20,7 +20,7 @@ import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.IOUtils; import org.apache.poi.util.Internal; @@ -73,7 +73,7 @@ public class OldTextPieceTable extends TextPieceTable { boolean unicode = pieces[x].isUnicode(); int multiple = 1; if (unicode || - (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) { + (charset != null && DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) { multiple = 2; } @@ -106,7 +106,7 @@ public class OldTextPieceTable extends TextPieceTable { @Override protected int getEncodingMultiplier(TextPiece textPiece) { Charset charset = textPiece.getPieceDescriptor().getCharset(); - if (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) { + if (charset != null && DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) { return 2; } return 1; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index 5c9fcf70d9..0c606cbf03 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -20,7 +20,7 @@ package org.apache.poi.hwpf.model; import java.nio.charset.Charset; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.Internal; import org.apache.poi.util.StringUtil; @@ -77,8 +77,8 @@ public class TextPiece extends PropertyNode { * Create the StringBuilder from the text and unicode flag */ private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { - if (StringUtil.BIG5.equals(pd.getCharset())) { - return new StringBuilder(CodePageUtil.cp950ToString(text, 0, text.length)); + if (DoubleByteUtil.BIG5.equals(pd.getCharset())) { + return new StringBuilder(DoubleByteUtil.cp950ToString(text, 0, text.length)); } String str = new String(text, 0, text.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset()); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java b/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java new file mode 100644 index 0000000000..5d55711ed9 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java @@ -0,0 +1,59 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.util; + +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.Set; + +/** + * Utilities for working with double byte CodePages. + * + *

Provides constants for understanding numeric codepages, + * along with utilities to translate these into Java Character Sets.

+ */ +public class DoubleByteUtil +{ + + public static final Charset BIG5 = Charset.forName("Big5"); + + public static final Set DOUBLE_BYTE_CHARSETS = Collections.singleton(BIG5); + + /** + * This tries to convert a LE byte array in cp950 + * (Microsoft's dialect of Big5) to a String. + * We know MS zero-padded ascii, and we drop those. + * There may be areas for improvement in this. + * + * @param data + * @param offset + * @param lengthInBytes + * @return Decoded String + */ + public static String cp950ToString(byte[] data, int offset, int lengthInBytes) { + StringBuilder sb = new StringBuilder(); + LittleEndianCP950Reader reader = new LittleEndianCP950Reader(data, offset, lengthInBytes); + int c = reader.read(); + while (c != -1) { + sb.append((char)c); + c = reader.read(); + } + reader.close(); + return sb.toString(); + } +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java b/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java new file mode 100644 index 0000000000..195629bb04 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java @@ -0,0 +1,483 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.util; + +import java.io.IOException; +import java.io.Reader; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; + +import org.apache.poi.util.Internal; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +/** + * Stream that converts CP950 (MSOffice's dialect of Big5), with + * zero-byte padding for ASCII and in LittleEndianOrder. + */ +@Internal +public class LittleEndianCP950Reader extends Reader { + + private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class); + + private static final char UNMAPPABLE = '?'; + private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2); + private final CharBuffer charBuffer = CharBuffer.allocate(2); + private final CharsetDecoder decoder = DoubleByteUtil.BIG5.newDecoder(); + + //https://en.wikipedia.org/wiki/Code_page_950 + //see private use area + private final static char range1Low = '\u8140'; + private final static char range1High = '\u8DFE'; + private final static char range2Low = '\u8E40'; + private final static char range2High = '\uA0FE'; + private final static char range3Low = '\uC6A1'; + private final static char range3High = '\uC8FE'; + private final static char range4Low = '\uFA40'; + private final static char range4High = '\uFEFE'; + + private final byte[] data; + private final int startOffset; + private final int length; + private int offset; + private int trailing; + private int leading; + int cnt; + //the char that is logically trailing in Big5 encoding + //however in LittleEndian order, this is the first encountered. + public LittleEndianCP950Reader(byte[] data) { + this(data, 0, data.length); + } + + public LittleEndianCP950Reader(byte[] data, int offset, int length) { + this.data = data; + this.startOffset = offset; + this.offset = startOffset; + this.length = length; + } + + @Override + public int read() { + if (offset + 1 > data.length || offset - startOffset > length) { + return -1; + } + trailing = data[offset++] & 0xff; + leading = data[offset++] & 0xff; + decoder.reset(); + if (leading < 0x81) { + //return trailing alone + //there may be some subtleties here + return trailing; + } else if (leading == 0xf9) { + return handleF9(trailing); + } else { + int ch = (leading << 8) + trailing; + if (ch >= range1Low && ch <= range1High) { + return handleRange1(leading, trailing); + } else if (ch >= range2Low && ch <= range2High) { + return handleRange2(leading, trailing); + } else if (ch >= range3Low && ch <= range3High) { + return handleRange3(leading, trailing); + } else if (ch >= range4Low && ch <= range4High) { + return handleRange4(leading, trailing); + } + + charBuffer.clear(); + doubleByteBuffer.clear(); + doubleByteBuffer.put((byte) leading); + doubleByteBuffer.put((byte) trailing); + doubleByteBuffer.flip(); + decoder.decode(doubleByteBuffer, charBuffer, true); + charBuffer.flip(); + + if (charBuffer.length() == 0) { + LOGGER.log(POILogger.WARN, "couldn't create char for: " + + Integer.toString((leading & 0xff), 16) + + " " + Integer.toString((trailing & 0xff), 16)); + return UNMAPPABLE; + } else { + return Character.codePointAt(charBuffer, 0); + } + } + + + } + + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + //there may be some efficiencies, but this should do for now. + + for (int i = off; i < off + len; i++) { + int c = read(); + if (c == -1) { + return i - off; + } + cbuf[i] = (char) c; + } + return len; + } + + @Override + public void close() { + } + + private int handleRange1(int leading, int trailing) { + return (0xeeb8 + (157 * (leading - 0x81))) + + ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); + } + + private int handleRange2(int leading, int trailing) { + return (0xe311 + (157 * (leading - 0x8e))) + + ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); + } + + private int handleRange3(int leading, int trailing) { + return (0xf672 + (157 * (leading - 0xc6))) + + ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); + } + + private int handleRange4(int leading, int trailing) { + return (0xe000 + (157 * (leading - 0xfa))) + + ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); + } + + private int handleF9(int trailing) { + switch (trailing) { + case 0x40: + return 0x7e98; + case 0x41: + return 0x7e9b; + case 0x42: + return 0x7e99; + case 0x43: + return 0x81e0; + case 0x44: + return 0x81e1; + case 0x45: + return 0x8646; + case 0x46: + return 0x8647; + case 0x47: + return 0x8648; + case 0x48: + return 0x8979; + case 0x49: + return 0x897a; + case 0x4a: + return 0x897c; + case 0x4b: + return 0x897b; + case 0x4c: + return 0x89ff; + case 0x4d: + return 0x8b98; + case 0x4e: + return 0x8b99; + case 0x4f: + return 0x8ea5; + case 0x50: + return 0x8ea4; + case 0x51: + return 0x8ea3; + case 0x52: + return 0x946e; + case 0x53: + return 0x946d; + case 0x54: + return 0x946f; + case 0x55: + return 0x9471; + case 0x56: + return 0x9473; + case 0x57: + return 0x9749; + case 0x58: + return 0x9872; + case 0x59: + return 0x995f; + case 0x5a: + return 0x9c68; + case 0x5b: + return 0x9c6e; + case 0x5c: + return 0x9c6d; + case 0x5d: + return 0x9e0b; + case 0x5e: + return 0x9e0d; + case 0x5f: + return 0x9e10; + case 0x60: + return 0x9e0f; + case 0x61: + return 0x9e12; + case 0x62: + return 0x9e11; + case 0x63: + return 0x9ea1; + case 0x64: + return 0x9ef5; + case 0x65: + return 0x9f09; + case 0x66: + return 0x9f47; + case 0x67: + return 0x9f78; + case 0x68: + return 0x9f7b; + case 0x69: + return 0x9f7a; + case 0x6a: + return 0x9f79; + case 0x6b: + return 0x571e; + case 0x6c: + return 0x7066; + case 0x6d: + return 0x7c6f; + case 0x6e: + return 0x883c; + case 0x6f: + return 0x8db2; + case 0x70: + return 0x8ea6; + case 0x71: + return 0x91c3; + case 0x72: + return 0x9474; + case 0x73: + return 0x9478; + case 0x74: + return 0x9476; + case 0x75: + return 0x9475; + case 0x76: + return 0x9a60; + case 0x77: + return 0x9c74; + case 0x78: + return 0x9c73; + case 0x79: + return 0x9c71; + case 0x7a: + return 0x9c75; + case 0x7b: + return 0x9e14; + case 0x7c: + return 0x9e13; + case 0x7d: + return 0x9ef6; + case 0x7e: + return 0x9f0a; + case 0xa1: + return 0x9fa4; + case 0xa2: + return 0x7068; + case 0xa3: + return 0x7065; + case 0xa4: + return 0x7cf7; + case 0xa5: + return 0x866a; + case 0xa6: + return 0x883e; + case 0xa7: + return 0x883d; + case 0xa8: + return 0x883f; + case 0xa9: + return 0x8b9e; + case 0xaa: + return 0x8c9c; + case 0xab: + return 0x8ea9; + case 0xac: + return 0x8ec9; + case 0xad: + return 0x974b; + case 0xae: + return 0x9873; + case 0xaf: + return 0x9874; + case 0xb0: + return 0x98cc; + case 0xb1: + return 0x9961; + case 0xb2: + return 0x99ab; + case 0xb3: + return 0x9a64; + case 0xb4: + return 0x9a66; + case 0xb5: + return 0x9a67; + case 0xb6: + return 0x9b24; + case 0xb7: + return 0x9e15; + case 0xb8: + return 0x9e17; + case 0xb9: + return 0x9f48; + case 0xba: + return 0x6207; + case 0xbb: + return 0x6b1e; + case 0xbc: + return 0x7227; + case 0xbd: + return 0x864c; + case 0xbe: + return 0x8ea8; + case 0xbf: + return 0x9482; + case 0xc0: + return 0x9480; + case 0xc1: + return 0x9481; + case 0xc2: + return 0x9a69; + case 0xc3: + return 0x9a68; + case 0xc4: + return 0x9b2e; + case 0xc5: + return 0x9e19; + case 0xc6: + return 0x7229; + case 0xc7: + return 0x864b; + case 0xc8: + return 0x8b9f; + case 0xc9: + return 0x9483; + case 0xca: + return 0x9c79; + case 0xcb: + return 0x9eb7; + case 0xcc: + return 0x7675; + case 0xcd: + return 0x9a6b; + case 0xce: + return 0x9c7a; + case 0xcf: + return 0x9e1d; + case 0xd0: + return 0x7069; + case 0xd1: + return 0x706a; + case 0xd2: + return 0x9ea4; + case 0xd3: + return 0x9f7e; + case 0xd4: + return 0x9f49; + case 0xd5: + return 0x9f98; + case 0xd6: + return 0x7881; + case 0xd7: + return 0x92b9; + case 0xd8: + return 0x88cf; + case 0xd9: + return 0x58bb; + case 0xda: + return 0x6052; + case 0xdb: + return 0x7ca7; + case 0xdc: + return 0x5afa; + case 0xdd: + return 0x2554; + case 0xde: + return 0x2566; + case 0xdf: + return 0x2557; + case 0xe0: + return 0x2560; + case 0xe1: + return 0x256c; + case 0xe2: + return 0x2563; + case 0xe3: + return 0x255a; + case 0xe4: + return 0x2569; + case 0xe5: + return 0x255d; + case 0xe6: + return 0x2552; + case 0xe7: + return 0x2564; + case 0xe8: + return 0x2555; + case 0xe9: + return 0x255e; + case 0xea: + return 0x256a; + case 0xeb: + return 0x2561; + case 0xec: + return 0x2558; + case 0xed: + return 0x2567; + case 0xee: + return 0x255b; + case 0xef: + return 0x2553; + case 0xf0: + return 0x2565; + case 0xf1: + return 0x2556; + case 0xf2: + return 0x255f; + case 0xf3: + return 0x256b; + case 0xf4: + return 0x2562; + case 0xf5: + return 0x2559; + case 0xf6: + return 0x2568; + case 0xf7: + return 0x255c; + case 0xf8: + return 0x2551; + case 0xf9: + return 0x2550; + case 0xfa: + return 0x256d; + case 0xfb: + return 0x256e; + case 0xfc: + return 0x2570; + case 0xfd: + return 0x256f; + case 0xfe: + return 0x2593; + default: + LOGGER.log(POILogger.WARN, "couldn't create char for: f9" + + " " + Integer.toString((trailing & 0xff), 16)); + return UNMAPPABLE; + } + } +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java b/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java new file mode 100644 index 0000000000..b6c7fd201a --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java @@ -0,0 +1,68 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.util; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.io.Reader; + +import org.junit.Test; + +public class TestLittleEndianCP950Reader { + + @Test + public void testPersonalUseMappings() throws Exception { + //ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit950.txt + byte[] data = new byte[2]; + data[1] = (byte) 0xfe; + data[0] = (byte) 0xd3; + assertCharEquals('\uE2E5', data); + + data[1] = (byte) 0x90; + data[0] = (byte) 0xb6; + assertCharEquals('\uE49F', data); + + //actually found in document + //but this disagrees with file above + data[1] = (byte) 0x8E; + data[0] = (byte) 0xA8; + assertCharEquals('\uE357', data); + + data[1] = (byte) 0x8E; + data[0] = (byte) 0xE6; + assertCharEquals('\uE395', data); + + /* + //TODO: figure out why this isn't working + data[0] = (byte)0xF9; + data[1] = (byte)0xD8; + assertCharEquals('\u88CF', data); + */ + + } + + + private void assertCharEquals(char expected, byte[] data) throws IOException { + Reader reader = new LittleEndianCP950Reader(data); + int c = reader.read(); + assertEquals((int) expected, c); + int eof = reader.read(); + assertEquals("should be end of stream", -1, eof); + } +} diff --git a/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java b/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java deleted file mode 100644 index ef648e4f00..0000000000 --- a/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java +++ /dev/null @@ -1,69 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ - -package org.apache.poi.util; - - -import static org.junit.Assert.assertEquals; - -import java.io.IOException; -import java.io.Reader; - -import org.junit.Test; - -public class TestLittleEndianCP950Reader { - - @Test - public void testPersonalUseMappings() throws Exception { - //ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit950.txt - byte[] data = new byte[2]; - data[1] = (byte) 0xfe; - data[0] = (byte) 0xd3; - assertCharEquals('\uE2E5', data); - - data[1] = (byte) 0x90; - data[0] = (byte) 0xb6; - assertCharEquals('\uE49F', data); - - //actually found in document - //but this disagrees with file above - data[1] = (byte) 0x8E; - data[0] = (byte) 0xA8; - assertCharEquals('\uE357', data); - - data[1] = (byte) 0x8E; - data[0] = (byte) 0xE6; - assertCharEquals('\uE395', data); - - /* - //TODO: figure out why this isn't working - data[0] = (byte)0xF9; - data[1] = (byte)0xD8; - assertCharEquals('\u88CF', data); - */ - - } - - - private void assertCharEquals(char expected, byte[] data) throws IOException { - Reader reader = new LittleEndianCP950Reader(data); - int c = reader.read(); - assertEquals((int) expected, c); - int eof = reader.read(); - assertEquals("should be end of stream", -1, eof); - } -}