From: Tim Allison Date: Wed, 5 Apr 2017 01:45:55 +0000 (+0000) Subject: bug 50955 and bug 60953 improve Big5 reader; ensure one character X-Git-Tag: REL_3_16_FINAL~14 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=55ed734108be87f0487343d86a616e87a5ee20ca;p=poi.git bug 50955 and bug 60953 improve Big5 reader; ensure one character per byte pair git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790172 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/java/org/apache/poi/util/CodePageUtil.java b/src/java/org/apache/poi/util/CodePageUtil.java index 5be1c5077e..5877a38cfb 100644 --- a/src/java/org/apache/poi/util/CodePageUtil.java +++ b/src/java/org/apache/poi/util/CodePageUtil.java @@ -31,10 +31,12 @@ import java.util.Set; public class CodePageUtil { - public static final Set VARIABLE_BYTE_CHARSETS = new HashSet(); + public static final Set DOUBLE_BYTE_CHARSETS + = new HashSet(); + static { + DOUBLE_BYTE_CHARSETS.add(StringUtil.BIG5); //others? - VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5); } /**

Codepage 037, a special case

*/
@@ -450,4 +452,25 @@ public class CodePageUtil
                 return "cp" + codepage;
         }
     }
+
+    /**
+     * Converts a little-endian cp950 (Microsoft's dialect of Big5)
+     * byte range to a String by draining a
+     * {@link LittleEndianCP950Reader} that is created over that range.
+     * We know MS zero-padded ascii, and we drop those.
+     * There may be areas for improvement in this.
+     *
+     * @param data bytes that contain the encoded text
+     * @param offset offset of the range in {@code data}, handed to the reader
+     * @param lengthInBytes length of the range in bytes, handed to the reader
+     * @return every char the reader produced, in reading order
+     */
+    public static String cp950ToString(byte[] data, int offset, int lengthInBytes) {
+        LittleEndianCP950Reader cp950 = new LittleEndianCP950Reader(data, offset, lengthInBytes);
+        StringBuilder text = new StringBuilder();
+        for (int ch = cp950.read(); ch != -1; ch = cp950.read()) {
+            text.append((char) ch);
+        }
+        return text.toString();
+    }
 }
diff --git a/src/java/org/apache/poi/util/LittleEndianBig5Stream.java b/src/java/org/apache/poi/util/LittleEndianBig5Stream.java deleted file mode 100644 index f68b1cdb9e..0000000000 --- a/src/java/org/apache/poi/util/LittleEndianBig5Stream.java +++ /dev/null @@ -1,107 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-==================================================================== */ - -package org.apache.poi.util; - -import java.io.ByteArrayInputStream; - -/** - * Stream that converts MSOffice's way of storing Big5, with - * zero-byte padding for ASCII and in LittleEndianOrder. - */ -@Internal -public class LittleEndianBig5Stream extends ByteArrayInputStream { - private static final int EOF = -1; - private static final int INVALID_PAIR = -2; - private static final int EMPTY_TRAILING = -3; - - //the char that is logically trailing in Big5 encoding - //however in LittleEndian order, this is the first encountered. - int trailing = EMPTY_TRAILING; - public LittleEndianBig5Stream(byte[] buf) { - super(buf); - } - - public LittleEndianBig5Stream(byte[] buf, int offset, int length) { - super(buf, offset, length); - } - - @Override - public int read() { - - if (trailing != EMPTY_TRAILING) { - int tmp = trailing; - trailing = EMPTY_TRAILING; - return tmp; - } - int leading = readNext(); - while (leading == INVALID_PAIR) { - leading = readNext(); - } - - if (leading == EOF) { - return EOF; - } - return leading; - } - - //returns leading, sets trailing appropriately - //returns -1 if it hits the end of the stream - //returns -2 for an invalid big5 code pair - private final int readNext() { - trailing = super.read(); - if (trailing == -1) { - return EOF; - } - int leading = super.read(); - if (leading == EOF) { - return EOF; - } - int lead = leading&0xff; - if (lead > 0x80) { - return leading; - } else if (lead == 0) { - int ret = trailing; - trailing = EMPTY_TRAILING; - return ret; - } else { - int ret = trailing; - trailing = EMPTY_TRAILING; - return ret; - //return INVALID_PAIR; - } - - } - - @Override - public int read(byte[] buff, int off, int len) { - int bytesRead = 0; - for (int i = off; i < off+len; i++) { - int b = read(); - if (b == -1) { - if (bytesRead == 0) { - return -1; - } else { - return bytesRead; - } - } - bytesRead++; - buff[i] = (byte)b; - } - return bytesRead; 
- } -}
diff --git a/src/java/org/apache/poi/util/LittleEndianCP950Reader.java b/src/java/org/apache/poi/util/LittleEndianCP950Reader.java
new file mode 100644
index 0000000000..1a680031bf
--- /dev/null
+++ b/src/java/org/apache/poi/util/LittleEndianCP950Reader.java
@@ -0,0 +1,384 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.util;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+
+/**
+ * Reader that converts CP950 (MSOffice's dialect of Big5), stored with
+ * zero-byte padding for ASCII and in LittleEndianOrder, to characters.
+ *
+ * Every call to {@link #read()} consumes one byte pair and returns one
+ * character. The first byte of a pair is the logically trailing byte,
+ * the second byte is the lead byte.
+ */
+@Internal
+public class LittleEndianCP950Reader extends Reader {
+
+    private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class);
+
+    /** Returned for a byte pair that could not be turned into a character. */
+    private static final char UNMAPPABLE = '?';
+
+    /** Pairs whose lead byte is below this value yield their first byte as is. */
+    private static final int MIN_LEAD_BYTE = 0x81;
+
+    /** Lead byte of the row that {@link #handleF9(int)} maps entry by entry. */
+    private static final int LEAD_BYTE_F9 = 0xf9;
+
+    /** Number of code points the private use formula advances per lead byte. */
+    private static final int CHARS_PER_LEAD_BYTE = 157;
+
+    //https://en.wikipedia.org/wiki/Code_page_950
+    //see private use area
+    private static final int RANGE_1_LOW = 0x8140;
+    private static final int RANGE_1_HIGH = 0x8DFE;
+    private static final int RANGE_2_LOW = 0x8E40;
+    private static final int RANGE_2_HIGH = 0xA0FE;
+    private static final int RANGE_3_LOW = 0xC6A1;
+    private static final int RANGE_3_HIGH = 0xC8FE;
+    private static final int RANGE_4_LOW = 0xFA40;
+    private static final int RANGE_4_HIGH = 0xFEFE;
+
+    private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2);
+    private final CharBuffer charBuffer = CharBuffer.allocate(2);
+    private final CharsetDecoder decoder = StringUtil.BIG5.newDecoder();
+
+    private final byte[] data;
+    /** Exclusive end of the readable window, never beyond the end of {@code data}. */
+    private final int limit;
+    /** Index of the next byte to read. */
+    private int offset;
+
+    /**
+     * Creates a reader over the whole array.
+     *
+     * @param data little-endian CP950 byte pairs
+     */
+    public LittleEndianCP950Reader(byte[] data) {
+        this(data, 0, data.length);
+    }
+
+    /**
+     * Creates a reader over {@code length} bytes of {@code data},
+     * starting at {@code offset}. A window that runs past the end of
+     * the array is cut off at the end of the array.
+     *
+     * @param data little-endian CP950 byte pairs
+     * @param offset index of the first byte to read
+     * @param length number of bytes that may be read
+     */
+    public LittleEndianCP950Reader(byte[] data, int offset, int length) {
+        this.data = data;
+        this.offset = offset;
+        //long arithmetic, so that offset + length cannot overflow
+        long windowEnd = Math.min((long) offset + (long) length, (long) data.length);
+        this.limit = (int) Math.max(windowEnd, 0L);
+    }
+
+    /**
+     * Reads the next byte pair and converts it to one character.
+     * A single byte left over at the end of the window is ignored.
+     *
+     * @return the character, or -1 if fewer than two bytes are left
+     */
+    @Override
+    public int read() {
+        //both bytes of the pair have to lie inside the window
+        if (offset > limit - 2) {
+            return -1;
+        }
+        //the char that is logically trailing in Big5 encoding
+        //however in LittleEndian order, this is the first encountered.
+        final int trailing = data[offset++] & 0xff;
+        final int leading = data[offset++] & 0xff;
+
+        if (leading < MIN_LEAD_BYTE) {
+            //return trailing alone
+            //there may be some subtleties here
+            return trailing;
+        }
+        if (leading == LEAD_BYTE_F9) {
+            return handleF9(trailing);
+        }
+
+        final int ch = (leading << 8) + trailing;
+        if (ch >= RANGE_1_LOW && ch <= RANGE_1_HIGH) {
+            return toPrivateUse(0xeeb8, 0x81, leading, trailing);
+        }
+        if (ch >= RANGE_2_LOW && ch <= RANGE_2_HIGH) {
+            return toPrivateUse(0xe311, 0x8e, leading, trailing);
+        }
+        if (ch >= RANGE_3_LOW && ch <= RANGE_3_HIGH) {
+            return toPrivateUse(0xf672, 0xc6, leading, trailing);
+        }
+        if (ch >= RANGE_4_LOW && ch <= RANGE_4_HIGH) {
+            return toPrivateUse(0xe000, 0xfa, leading, trailing);
+        }
+        return decodeBig5(leading, trailing);
+    }
+
+    /**
+     * Reads up to {@code len} characters into {@code cbuf}.
+     *
+     * @param cbuf destination buffer
+     * @param off index in {@code cbuf} of the first character to store
+     * @param len maximum number of characters to read
+     * @return the number of characters stored, or -1 if {@code len} is
+     *         positive and nothing could be read because the end was reached
+     * @throws IndexOutOfBoundsException if {@code off} or {@code len} is negative
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        if (off < 0 || len < 0) {
+            throw new IndexOutOfBoundsException("off=" + off + ", len=" + len);
+        }
+        //there may be some efficiencies, but this should do for now.
+        int count = 0;
+        while (count < len) {
+            int c = read();
+            if (c == -1) {
+                break;
+            }
+            cbuf[off + count] = (char) c;
+            count++;
+        }
+        //java.io.Reader reports the end of the stream as -1, not as 0
+        return (count == 0 && len > 0) ? -1 : count;
+    }
+
+    /**
+     * Does nothing, there is nothing to release.
+     */
+    @Override
+    public void close() throws IOException {
+        //nothing to do
+    }
+
+    /**
+     * Maps a pair from one of the private use ranges: {@code base} is the
+     * result for trailing byte 0x40 of lead byte {@code firstLead}, every
+     * further lead byte adds {@link #CHARS_PER_LEAD_BYTE}. The trailing
+     * byte is not validated.
+     */
+    private static int toPrivateUse(int base, int firstLead, int leading, int trailing) {
+        int column = (trailing < 0x80) ? trailing - 0x40 : trailing - 0x62;
+        return (base + (CHARS_PER_LEAD_BYTE * (leading - firstLead))) + column;
+    }
+
+    /**
+     * Runs a pair that needs no special handling through the Big5 decoder.
+     *
+     * @return the decoded code point, or {@link #UNMAPPABLE} if the decoder
+     *         produced no output
+     */
+    private int decodeBig5(int leading, int trailing) {
+        decoder.reset();
+        charBuffer.clear();
+        doubleByteBuffer.clear();
+        doubleByteBuffer.put((byte) leading);
+        doubleByteBuffer.put((byte) trailing);
+        doubleByteBuffer.flip();
+        decoder.decode(doubleByteBuffer, charBuffer, true);
+        charBuffer.flip();
+
+        if (charBuffer.length() == 0) {
+            LOGGER.log(POILogger.WARN, "couldn't create char for: "
+                    + Integer.toString((leading & 0xff), 16)
+                    + " " + Integer.toString((trailing & 0xff), 16));
+            return UNMAPPABLE;
+        }
+        return Character.codePointAt(charBuffer, 0);
+    }
+
+    /**
+     * Maps the trailing byte of a pair whose lead byte is 0xf9.
+     *
+     * @return the mapped character, or {@link #UNMAPPABLE} for a trailing
+     *         byte without an entry
+     */
+    private int handleF9(int trailing) {
+        switch (trailing) {
+            case 0x40: return 0x7e98;
+            case 0x41: return 0x7e9b;
+            case 0x42: return 0x7e99;
+            case 0x43: return 0x81e0;
+            case 0x44: return 0x81e1;
+            case 0x45: return 0x8646;
+            case 0x46: return 0x8647;
+            case 0x47: return 0x8648;
+            case 0x48: return 0x8979;
+            case 0x49: return 0x897a;
+            case 0x4a: return 0x897c;
+            case 0x4b: return 0x897b;
+            case 0x4c: return 0x89ff;
+            case 0x4d: return 0x8b98;
+            case 0x4e: return 0x8b99;
+            case 0x4f: return 0x8ea5;
+            case 0x50: return 0x8ea4;
+            case 0x51: return 0x8ea3;
+            case 0x52: return 0x946e;
+            case 0x53: return 0x946d;
+            case 0x54: return 0x946f;
+            case 0x55: return 0x9471;
+            case 0x56: return 0x9473;
+            case 0x57: return 0x9749;
+            case 0x58: return 0x9872;
+            case 0x59: return 0x995f;
+            case 0x5a: return 0x9c68;
+            case 0x5b: return 0x9c6e;
+            case 0x5c: return 0x9c6d;
+            case 0x5d: return 0x9e0b;
+            case 0x5e: return 0x9e0d;
+            case 0x5f: return 0x9e10;
+            case 0x60: return 0x9e0f;
+            case 0x61: return 0x9e12;
+            case 0x62: return 0x9e11;
+            case 0x63: return 0x9ea1;
+            case 0x64: return 0x9ef5;
+            case 0x65: return 0x9f09;
+            case 0x66: return 0x9f47;
+            case 0x67: return 0x9f78;
+            case 0x68: return 0x9f7b;
+            case 0x69: return 0x9f7a;
+            case 0x6a: return 0x9f79;
+            case 0x6b: return 0x571e;
+            case 0x6c: return 0x7066;
+            case 0x6d: return 0x7c6f;
+            case 0x6e: return 0x883c;
+            case 0x6f: return 0x8db2;
+            case 0x70: return 0x8ea6;
+            case 0x71: return 0x91c3;
+            case 0x72: return 0x9474;
+            case 0x73: return 0x9478;
+            case 0x74: return 0x9476;
+            case 0x75: return 0x9475;
+            case 0x76: return 0x9a60;
+            case 0x77: return 0x9c74;
+            case 0x78: return 0x9c73;
+            case 0x79: return 0x9c71;
+            case 0x7a: return 0x9c75;
+            case 0x7b: return 0x9e14;
+            case 0x7c: return 0x9e13;
+            case 0x7d: return 0x9ef6;
+            case 0x7e: return 0x9f0a;
+            case 0xa1: return 0x9fa4;
+            case 0xa2: return 0x7068;
+            case 0xa3: return 0x7065;
+            case 0xa4: return 0x7cf7;
+            case 0xa5: return 0x866a;
+            case 0xa6: return 0x883e;
+            case 0xa7: return 0x883d;
+            case 0xa8: return 0x883f;
+            case 0xa9: return 0x8b9e;
+            case 0xaa: return 0x8c9c;
+            case 0xab: return 0x8ea9;
+            case 0xac: return 0x8ec9;
+            case 0xad: return 0x974b;
+            case 0xae: return 0x9873;
+            case 0xaf: return 0x9874;
+            case 0xb0: return 0x98cc;
+            case 0xb1: return 0x9961;
+            case 0xb2: return 0x99ab;
+            case 0xb3: return 0x9a64;
+            case 0xb4: return 0x9a66;
+            case 0xb5: return 0x9a67;
+            case 0xb6: return 0x9b24;
+            case 0xb7: return 0x9e15;
+            case 0xb8: return 0x9e17;
+            case 0xb9: return 0x9f48;
+            case 0xba: return 0x6207;
+            case 0xbb: return 0x6b1e;
+            case 0xbc: return 0x7227;
+            case 0xbd: return 0x864c;
+            case 0xbe: return 0x8ea8;
+            case 0xbf: return 0x9482;
+            case 0xc0: return 0x9480;
+            case 0xc1: return 0x9481;
+            case 0xc2: return 0x9a69;
+            case 0xc3: return 0x9a68;
+            case 0xc4: return 0x9b2e;
+            case 0xc5: return 0x9e19;
+            case 0xc6: return 0x7229;
+            case 0xc7: return 0x864b;
+            case 0xc8: return 0x8b9f;
+            case 0xc9: return 0x9483;
+            case 0xca: return 0x9c79;
+            case 0xcb: return 0x9eb7;
+            case 0xcc: return 0x7675;
+            case 0xcd: return 0x9a6b;
+            case 0xce: return 0x9c7a;
+            case 0xcf: return 0x9e1d;
+            case 0xd0: return 0x7069;
+            case 0xd1: return 0x706a;
+            case 0xd2: return 0x9ea4;
+            case 0xd3: return 0x9f7e;
+            case 0xd4: return 0x9f49;
+            case 0xd5: return 0x9f98;
+            case 0xd6: return 0x7881;
+            case 0xd7: return 0x92b9;
+            case 0xd8: return 0x88cf;
+            case 0xd9: return 0x58bb;
+            case 0xda: return 0x6052;
+            case 0xdb: return 0x7ca7;
+            case 0xdc: return 0x5afa;
+            case 0xdd: return 0x2554;
+            case 0xde: return 0x2566;
+            case 0xdf: return 0x2557;
+            case 0xe0: return 0x2560;
+            case 0xe1: return 0x256c;
+            case 0xe2: return 0x2563;
+            case 0xe3: return 0x255a;
+            case 0xe4: return 0x2569;
+            case 0xe5: return 0x255d;
+            case 0xe6: return 0x2552;
+            case 0xe7: return 0x2564;
+            case 0xe8: return 0x2555;
+            case 0xe9: return 0x255e;
+            case 0xea: return 0x256a;
+            case 0xeb: return 0x2561;
+            case 0xec: return 0x2558;
+            case 0xed: return 0x2567;
+            case 0xee: return 0x255b;
+            case 0xef: return 0x2553;
+            case 0xf0: return 0x2565;
+            case 0xf1: return 0x2556;
+            case 0xf2: return 0x255f;
+            case 0xf3: return 0x256b;
+            case 0xf4: return 0x2562;
+            case 0xf5: return 0x2559;
+            case 0xf6: return 0x2568;
+            case 0xf7: return 0x255c;
+            case 0xf8: return 0x2551;
+            case 0xf9: return 0x2550;
+            case 0xfa: return 0x256d;
+            case 0xfb: return 0x256e;
+            case 0xfc: return 0x2570;
+            case 0xfd: return 0x256f;
+            case 0xfe: return 0x2593;
+            default:
+                LOGGER.log(POILogger.WARN, "couldn't create char for: f9"
+                        + " " + Integer.toString((trailing & 0xff), 16));
+                return UNMAPPABLE;
+        }
+    }
+}
diff --git a/src/java/org/apache/poi/util/StringUtil.java b/src/java/org/apache/poi/util/StringUtil.java index 5d09dff56d..15b87ffdf8 100644 --- 
a/src/java/org/apache/poi/util/StringUtil.java +++ b/src/java/org/apache/poi/util/StringUtil.java @@ -17,8 +17,6 @@ package org.apache.poi.util; -import java.io.ByteArrayOutputStream; -import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Iterator; @@ -581,26 +579,6 @@ public class StringUtil { ' ', // 0xf0ff not defined }; - /** - * This tries to convert a LE byte array in Big5 to a String. - * We know MS zero-padded ascii, and we drop those. - * However, there may be areas for improvement in this. - * - * @param data - * @param offset - * @param lengthInBytes - * @return - */ - public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) { - ByteArrayOutputStream os = new ByteArrayOutputStream(); - try { - IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os); - } catch (IOException e) { - logger.log(POILogger.WARN, - "IOException while copying a byte array stream to a byte array stream?!"); - } - return new String(os.toByteArray(), BIG5); - } // Could be replaced with org.apache.commons.lang3.StringUtils#join @Internal diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 505789e2c3..b0b374a4e3 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -108,7 +108,7 @@ public class HWPFOldDocument extends HWPFDocumentCore { System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); int numChars = textData.length; - if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) { + if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { numChars /= 2; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java index 87cccfa765..4e6f71983c 100644 --- 
a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java @@ -18,7 +18,6 @@ package org.apache.poi.hwpf.model; -import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.Internal; import org.apache.poi.util.NotImplemented; @@ -43,17 +42,6 @@ public class OldTextPiece extends TextPiece { this.rawBytes = text; } - @Override - protected void validateLengths(int start, int end, int length, PieceDescriptor pd) { - //things are still wonky with Big5 char/byte length mapping - //sometimes working w/ Java 8 but not w/ Java 7! - //for now, if we're dealing w/ Big5 don't bother checking - if (pd.getCharset() != null && - CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(pd.getCharset())) { - return; - } - super.validateLengths(start, end, length, pd); - } /** * @return nothing, ever. Always throws an UnsupportedOperationException * @throws UnsupportedOperationException diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java index 3fd34ade09..f141cddbab 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java @@ -76,7 +76,7 @@ public class OldTextPieceTable extends TextPieceTable { boolean unicode = pieces[x].isUnicode(); int multiple = 1; if (unicode || - (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) { + (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) { multiple = 2; } @@ -111,7 +111,7 @@ public class OldTextPieceTable extends TextPieceTable { @Override protected int getEncodingMultiplier(TextPiece textPiece) { Charset charset = textPiece.getPieceDescriptor().getCharset(); - if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) { + if (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) { return 2; } return 
1; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index b383cbcfb2..e137727fda 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -20,6 +20,7 @@ package org.apache.poi.hwpf.model; import java.nio.charset.Charset; +import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.Internal; import org.apache.poi.util.StringUtil; @@ -60,25 +61,21 @@ public class TextPiece extends PropertyNode { // Validate int textLength = ((CharSequence) _buf).length(); - validateLengths(start, end, textLength, pd); + if (end - start != textLength) { + throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!"); + } if (end < start) { throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end); } } - protected void validateLengths(int start, int end, int textLength, PieceDescriptor pd) { - if (end - start != textLength) { - throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!"); - } - } /** * Create the StringBuilder from the text and unicode flag */ private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { byte[] textBuffer = text; if (StringUtil.BIG5.equals(pd.getCharset())) { - String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString(); - return new StringBuilder(txt); + return new StringBuilder(CodePageUtil.cp950ToString(text, 0, text.length)); } String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? 
StringUtil.UTF16LE : pd.getCharset()); diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java index 4747577f1d..06cfcb44a0 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java @@ -49,7 +49,6 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.util.IOUtils; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; -import org.junit.Ignore; import org.junit.Test; /** @@ -729,7 +728,6 @@ public class TestBugs{ * Bug 51944 - PAPFormattedDiskPage.getPAPX - IndexOutOfBounds */ @Test - @Ignore("Test now passes in Java 1.7 and 1.8, but not 1.6") public void testBug51944() throws Exception { HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java index bf355959a6..925b8d0566 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java @@ -247,8 +247,8 @@ public final class TestHWPFOldDocument extends HWPFTestCase { */ assertContains(txt, "\n9-55 xxxxx block5"); //TODO: figure out why these two aren't passing -// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly -// assertContains(txt, "We are able to");//not sure if we can get this easily? + //assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly + //assertContains(txt, "We are able to");//not sure if we can get this easily? 
} }
diff --git a/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java b/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java
new file mode 100644
index 0000000000..96106dc751
--- /dev/null
+++ b/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java
@@ -0,0 +1,89 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements. See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License. You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.util;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link LittleEndianCP950Reader}. The byte arrays are little
+ * endian: index 0 holds the trailing byte, index 1 holds the lead byte.
+ */
+public class TestLittleEndianCP950Reader {
+
+    @Test
+    public void testPersonalUseMappings() throws Exception {
+        //ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit950.txt
+        byte[] data = new byte[2];
+        data[1] = (byte) 0xfe;
+        data[0] = (byte) 0xd3;
+        assertCharEquals('\uE2E5', data);
+
+        data[1] = (byte) 0x90;
+        data[0] = (byte) 0xb6;
+        assertCharEquals('\uE49F', data);
+
+        //actually found in document
+        //but this disagrees with file above
+        data[1] = (byte) 0x8E;
+        data[0] = (byte) 0xA8;
+        assertCharEquals('\uE357', data);
+
+        data[1] = (byte) 0x8E;
+        data[0] = (byte) 0xE6;
+        assertCharEquals('\uE395', data);
+    }
+
+    @Test
+    public void testLeadByteF9() throws Exception {
+        //the lead byte 0xF9 has to go into the second position
+        byte[] data = new byte[2];
+        data[1] = (byte) 0xF9;
+        data[0] = (byte) 0xD8;
+        assertCharEquals('\u88CF', data);
+    }
+
+    @Test
+    public void testZeroPaddedAscii() throws Exception {
+        byte[] data = new byte[2];
+        data[1] = (byte) 0x00;
+        data[0] = (byte) 'A';
+        assertCharEquals('A', data);
+    }
+
+    @Test
+    public void one() {
+        byte b = (byte) 0xfe;
+        byte c = (byte) 0xd3;
+
+        int i = ((b & 0xff) << 8) + (c & 0xff);
+        assertEquals(0xfed3, i);
+    }
+
+    private void assertCharEquals(char expected, byte[] data) throws IOException {
+        Reader reader = new LittleEndianCP950Reader(data);
+        int c = reader.read();
+        assertEquals((int) expected, c);
+        int eof = reader.read();
+        assertEquals("should be end of stream", -1, eof);
+    }
+}