]> source.dussan.org Git - poi.git/commitdiff
bug 50955 and bug 60953 improve Big5 reader; ensure one character
authorTim Allison <tallison@apache.org>
Wed, 5 Apr 2017 01:45:55 +0000 (01:45 +0000)
committerTim Allison <tallison@apache.org>
Wed, 5 Apr 2017 01:45:55 +0000 (01:45 +0000)
per byte pair

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790172 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/util/CodePageUtil.java
src/java/org/apache/poi/util/LittleEndianBig5Stream.java [deleted file]
src/java/org/apache/poi/util/LittleEndianCP950Reader.java [new file with mode: 0644]
src/java/org/apache/poi/util/StringUtil.java
src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java
src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java
src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java [new file with mode: 0644]

index 5be1c5077e7470f74e0462a1e57678c1783b6819..5877a38cfb6d2fddd9bb96620cab7cd085512e3e 100644 (file)
@@ -31,10 +31,12 @@ import java.util.Set;
 public class CodePageUtil
 {
 
-    public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>();
+    public static final Set<Charset> DOUBLE_BYTE_CHARSETS
+            = new HashSet<Charset>();
+
     static {
+        DOUBLE_BYTE_CHARSETS.add(StringUtil.BIG5);
         //others?
-        VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
     }
 
     /** <p>Codepage 037, a special case</p> */
@@ -450,4 +452,26 @@ public class CodePageUtil
                 return "cp" + codepage;
         }
     }
+
+    /**
+     * Converts a little-endian byte range in cp950 (Microsoft's dialect
+     * of Big5) to a String by draining a {@link LittleEndianCP950Reader}.
+     * MS zero-pads ascii to two bytes; the reader returns only the
+     * character byte for those pairs, so the padding is dropped.
+     * There may be areas for improvement in this.
+     *
+     * @param data byte array containing the encoded text
+     * @param offset index in <code>data</code> at which reading starts
+     * @param lengthInBytes length, in bytes, of the range handed to the reader
+     * @return the characters produced by the reader, in order
+     */
+    public static String cp950ToString(byte[] data, int offset, int lengthInBytes) {
+        LittleEndianCP950Reader cp950 = new LittleEndianCP950Reader(data, offset, lengthInBytes);
+        StringBuilder decoded = new StringBuilder();
+        //the reader signals the end of its data with -1
+        for (int ch = cp950.read(); ch != -1; ch = cp950.read()) {
+            decoded.append((char) ch);
+        }
+        return decoded.toString();
+    }
 }
diff --git a/src/java/org/apache/poi/util/LittleEndianBig5Stream.java b/src/java/org/apache/poi/util/LittleEndianBig5Stream.java
deleted file mode 100644 (file)
index f68b1cd..0000000
+++ /dev/null
@@ -1,107 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-
-package org.apache.poi.util;
-
-import java.io.ByteArrayInputStream;
-
-/**
- * Stream that converts MSOffice's way of storing Big5, with
- * zero-byte padding for ASCII and in LittleEndianOrder.
- */
-@Internal
-public class LittleEndianBig5Stream extends ByteArrayInputStream {
-    private static final int EOF = -1;
-    private static final int INVALID_PAIR = -2;
-    private static final int EMPTY_TRAILING = -3;
-
-    //the char that is logically trailing in Big5 encoding
-    //however in LittleEndian order, this is the first encountered.
-    int trailing = EMPTY_TRAILING;
-    public LittleEndianBig5Stream(byte[] buf) {
-        super(buf);
-    }
-
-    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
-        super(buf, offset, length);
-    }
-
-    @Override
-    public int read() {
-
-        if (trailing != EMPTY_TRAILING) {
-            int tmp = trailing;
-            trailing = EMPTY_TRAILING;
-            return tmp;
-        }
-        int leading = readNext();
-        while (leading == INVALID_PAIR) {
-            leading = readNext();
-        }
-
-        if (leading == EOF) {
-            return EOF;
-        }
-        return leading;
-    }
-
-    //returns leading, sets trailing appropriately
-    //returns -1 if it hits the end of the stream
-    //returns -2 for an invalid big5 code pair
-    private final int readNext() {
-        trailing = super.read();
-        if (trailing == -1) {
-            return EOF;
-        }
-        int leading = super.read();
-        if (leading == EOF) {
-            return EOF;
-        }
-        int lead = leading&0xff;
-        if (lead > 0x80) {
-            return leading;
-        } else if (lead == 0) {
-            int ret = trailing;
-            trailing = EMPTY_TRAILING;
-            return ret;
-        } else {
-            int ret = trailing;
-            trailing = EMPTY_TRAILING;
-            return ret;
-            //return INVALID_PAIR;
-        }
-
-    }
-
-    @Override
-    public int read(byte[] buff, int off, int len) {
-        int bytesRead = 0;
-        for (int i = off; i < off+len; i++) {
-            int b = read();
-            if (b == -1) {
-                if (bytesRead == 0) {
-                    return -1;
-                } else {
-                    return bytesRead;
-                }
-            }
-            bytesRead++;
-            buff[i] = (byte)b;
-        }
-        return bytesRead;
-    }
-}
diff --git a/src/java/org/apache/poi/util/LittleEndianCP950Reader.java b/src/java/org/apache/poi/util/LittleEndianCP950Reader.java
new file mode 100644 (file)
index 0000000..1a68003
--- /dev/null
@@ -0,0 +1,480 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.util;
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+
+/**
+ * Reader that decodes CP950 (MSOffice's dialect of Big5) stored in
+ * little-endian byte order, with ASCII zero-padded to two bytes.
+ */
+@Internal
+public class LittleEndianCP950Reader extends Reader {
+
+    private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class);
+
+
+    private static final char UNMAPPABLE = (char) '?'; //returned when a byte pair cannot be mapped
+    private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2); //reused decoder input: leading byte, then trailing byte
+    private final CharBuffer charBuffer = CharBuffer.allocate(2); //reused decoder output
+    private final CharsetDecoder decoder = StringUtil.BIG5.newDecoder();
+
+    //https://en.wikipedia.org/wiki/Code_page_950
+    //see private use area; each Low/High pair bounds one range, compared against (leading << 8) + trailing
+    private final static char range1Low = '\u8140';
+    private final static char range1High = '\u8DFE';
+    private final static char range2Low = '\u8E40';
+    private final static char range2High = '\uA0FE';
+    private final static char range3Low = '\uC6A1';
+    private final static char range3High = '\uC8FE';
+    private final static char range4Low = '\uFA40';
+    private final static char range4High = '\uFEFE';
+
+    private final byte[] data;
+    private final int startOffset; //index in data at which reading begins
+    private final int length; //length of the readable range, as given to the constructor
+    private int offset; //index of the next byte to consume
+    private int trailing;
+    private int leading;
+    int cnt = 0; //NOTE(review): not referenced anywhere in this class -- confirm whether it can be removed
+    //"trailing" is the byte that is logically trailing in Big5 encoding;
+    //however in LittleEndian order, this is the first encountered.
+    public LittleEndianCP950Reader(byte[] data) { //reads the whole array
+        this(data, 0, data.length);
+    }
+
+    public LittleEndianCP950Reader(byte[] data, int offset, int length) { //reads length bytes starting at offset
+        this.data = data; //kept by reference, not copied
+        this.startOffset = offset;
+        this.offset = startOffset;
+        this.length = length;
+    }
+
+    /**
+     * Decodes the next little-endian byte pair into one character.
+     * The first byte of the pair is the trailing byte and the second the
+     * leading byte. A leading byte below 0x81 (this includes the zero
+     * padding used for ascii) yields the trailing byte unchanged; leading
+     * byte 0xf9 and the four private use ranges are mapped by this class;
+     * every other pair is handed to the Big5 decoder.
+     *
+     * @return the decoded character, <code>'?'</code> if the decoder produced
+     *         nothing for the pair, or -1 when no complete pair is left
+     */
+    @Override
+    public int read() {
+        //every character consumes two bytes: stop unless a complete pair remains
+        //inside both the backing array and the range given to the constructor.
+        //This avoids running off the end of odd-length data and reading a pair
+        //beyond the requested length.
+        if (offset + 2 > data.length || offset - startOffset + 2 > length) {
+            return -1;
+        }
+        trailing = data[offset++] & 0xff;
+        leading = data[offset++] & 0xff;
+        decoder.reset();
+        if (leading < 0x81) {
+            //return trailing alone
+            //there may be some subtleties here
+            return trailing;
+        } else if (leading == 0xf9) {
+            return handleF9(trailing);
+        } else {
+            int ch = (leading << 8) + trailing;
+            if (ch >= range1Low && ch <= range1High) {
+                return handleRange1(leading, trailing);
+            } else if (ch >= range2Low && ch <= range2High) {
+                return handleRange2(leading, trailing);
+            } else if (ch >= range3Low && ch <= range3High) {
+                return handleRange3(leading, trailing);
+            } else if (ch >= range4Low && ch <= range4High) {
+                return handleRange4(leading, trailing);
+            }
+
+            //the decoder expects the leading byte first, so undo the little-endian order
+            charBuffer.clear();
+            doubleByteBuffer.clear();
+            doubleByteBuffer.put((byte) leading);
+            doubleByteBuffer.put((byte) trailing);
+            doubleByteBuffer.flip();
+            decoder.decode(doubleByteBuffer, charBuffer, true);
+            charBuffer.flip();
+
+            if (charBuffer.length() == 0) {
+                LOGGER.log(POILogger.WARN, "couldn't create char for: "
+                        + Integer.toString((leading & 0xff), 16)
+                        + " " + Integer.toString((trailing & 0xff), 16));
+                return UNMAPPABLE;
+            } else {
+                return Character.codePointAt(charBuffer, 0);
+            }
+        }
+
+
+    }
+
+
+    /**
+     * Reads up to <code>len</code> characters into <code>cbuf</code>,
+     * starting at index <code>off</code>, one {@link #read()} call per character.
+     *
+     * @param cbuf destination buffer
+     * @param off index in <code>cbuf</code> of the first character to store
+     * @param len maximum number of characters to read
+     * @return the number of characters stored, or -1 if the end of the data
+     *         was reached before any character could be read
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        //there may be some efficiencies, but this should do for now.
+
+        for (int i = off; i < off + len; i++) {
+            int c = read();
+            if (c == -1) {
+                //the Reader contract requires -1 (not 0) at end of stream;
+                //callers looping until -1 would otherwise never terminate
+                return (i == off) ? -1 : i - off;
+            }
+            cbuf[i] = (char) c;
+        }
+        return len;
+    }
+
+    @Override
+    public void close() throws IOException {
+        //intentionally empty: this method performs no cleanup
+    }
+
+    //maps a pair whose leading byte starts at 0x81 (range 1) onto the block beginning at 0xeeb8
+    private int handleRange1(int leading, int trailing) {
+        return privateUseCodePoint(0xeeb8, 0x81, leading, trailing);
+    }
+
+    //maps a pair whose leading byte starts at 0x8e (range 2) onto the block beginning at 0xe311
+    private int handleRange2(int leading, int trailing) {
+        return privateUseCodePoint(0xe311, 0x8e, leading, trailing);
+    }
+
+    //maps a pair whose leading byte starts at 0xc6 (range 3) using base 0xf672
+    private int handleRange3(int leading, int trailing) {
+        return privateUseCodePoint(0xf672, 0xc6, leading, trailing);
+    }
+
+    //maps a pair whose leading byte starts at 0xfa (range 4) onto the block beginning at 0xe000
+    private int handleRange4(int leading, int trailing) {
+        return privateUseCodePoint(0xe000, 0xfa, leading, trailing);
+    }
+
+    //shared formula: each leading byte advances the result by 157; within a row,
+    //trailing 0x40 is cell 0 and trailing bytes from 0x80 up are shifted by 0x62
+    //(so 0xa1 becomes cell 63)
+    private int privateUseCodePoint(int base, int firstLeading, int leading, int trailing) {
+        int rowStart = base + (157 * (leading - firstLeading));
+        int cell;
+        if (trailing < 0x80) {
+            cell = trailing - 0x40;
+        } else {
+            cell = trailing - 0x62;
+        }
+        return rowStart + cell;
+    }
+
+    //characters for leading byte 0xf9 and trailing bytes 0x40..0x7e, indexed by (trailing - 0x40)
+    private static final int[] F9_TRAILING_40_TO_7E = {
+            0x7e98, 0x7e9b, 0x7e99, 0x81e0, 0x81e1, 0x8646, 0x8647, 0x8648, //0x40-0x47
+            0x8979, 0x897a, 0x897c, 0x897b, 0x89ff, 0x8b98, 0x8b99, 0x8ea5, //0x48-0x4f
+            0x8ea4, 0x8ea3, 0x946e, 0x946d, 0x946f, 0x9471, 0x9473, 0x9749, //0x50-0x57
+            0x9872, 0x995f, 0x9c68, 0x9c6e, 0x9c6d, 0x9e0b, 0x9e0d, 0x9e10, //0x58-0x5f
+            0x9e0f, 0x9e12, 0x9e11, 0x9ea1, 0x9ef5, 0x9f09, 0x9f47, 0x9f78, //0x60-0x67
+            0x9f7b, 0x9f7a, 0x9f79, 0x571e, 0x7066, 0x7c6f, 0x883c, 0x8db2, //0x68-0x6f
+            0x8ea6, 0x91c3, 0x9474, 0x9478, 0x9476, 0x9475, 0x9a60, 0x9c74, //0x70-0x77
+            0x9c73, 0x9c71, 0x9c75, 0x9e14, 0x9e13, 0x9ef6, 0x9f0a          //0x78-0x7e
+    };
+
+    //characters for leading byte 0xf9 and trailing bytes 0xa1..0xfe, indexed by (trailing - 0xa1)
+    private static final int[] F9_TRAILING_A1_TO_FE = {
+            0x9fa4, 0x7068, 0x7065, 0x7cf7, 0x866a, 0x883e, 0x883d,         //0xa1-0xa7
+            0x883f, 0x8b9e, 0x8c9c, 0x8ea9, 0x8ec9, 0x974b, 0x9873, 0x9874, //0xa8-0xaf
+            0x98cc, 0x9961, 0x99ab, 0x9a64, 0x9a66, 0x9a67, 0x9b24, 0x9e15, //0xb0-0xb7
+            0x9e17, 0x9f48, 0x6207, 0x6b1e, 0x7227, 0x864c, 0x8ea8, 0x9482, //0xb8-0xbf
+            0x9480, 0x9481, 0x9a69, 0x9a68, 0x9b2e, 0x9e19, 0x7229, 0x864b, //0xc0-0xc7
+            0x8b9f, 0x9483, 0x9c79, 0x9eb7, 0x7675, 0x9a6b, 0x9c7a, 0x9e1d, //0xc8-0xcf
+            0x7069, 0x706a, 0x9ea4, 0x9f7e, 0x9f49, 0x9f98, 0x7881, 0x92b9, //0xd0-0xd7
+            0x88cf, 0x58bb, 0x6052, 0x7ca7, 0x5afa, 0x2554, 0x2566, 0x2557, //0xd8-0xdf
+            0x2560, 0x256c, 0x2563, 0x255a, 0x2569, 0x255d, 0x2552, 0x2564, //0xe0-0xe7
+            0x2555, 0x255e, 0x256a, 0x2561, 0x2558, 0x2567, 0x255b, 0x2553, //0xe8-0xef
+            0x2565, 0x2556, 0x255f, 0x256b, 0x2562, 0x2559, 0x2568, 0x255c, //0xf0-0xf7
+            0x2551, 0x2550, 0x256d, 0x256e, 0x2570, 0x256f, 0x2593          //0xf8-0xfe
+    };
+
+    //looks up the character for a pair whose leading byte is 0xf9;
+    //trailing bytes outside 0x40..0x7e and 0xa1..0xfe are logged and reported as UNMAPPABLE
+    private int handleF9(int trailing) {
+        if (trailing >= 0x40 && trailing <= 0x7e) {
+            return F9_TRAILING_40_TO_7E[trailing - 0x40];
+        }
+        if (trailing >= 0xa1 && trailing <= 0xfe) {
+            return F9_TRAILING_A1_TO_FE[trailing - 0xa1];
+        }
+        LOGGER.log(POILogger.WARN, "couldn't create char for: f9"
+                + " " + Integer.toString((trailing & 0xff), 16));
+        return UNMAPPABLE;
+    }
+}
index 5d09dff56d31d9e2f2864138a1222d8fd6bdea77..15b87ffdf86f22bef1205447087ce00611251407 100644 (file)
@@ -17,8 +17,6 @@
 
 package org.apache.poi.util;
 
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -581,26 +579,6 @@ public class StringUtil {
        ' ', // 0xf0ff not defined
    };
 
-    /**
-     * This tries to convert a LE byte array in Big5 to a String.
-     * We know MS zero-padded ascii, and we drop those.
-     * However, there may be areas for improvement in this.
-     *
-     * @param data
-     * @param offset
-     * @param lengthInBytes
-     * @return
-     */
-   public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) {
-       ByteArrayOutputStream os = new ByteArrayOutputStream();
-       try {
-           IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os);
-       } catch (IOException e) {
-           logger.log(POILogger.WARN,
-                   "IOException while copying a byte array stream to a byte array stream?!");
-       }
-       return new String(os.toByteArray(), BIG5);
-   }
 
    // Could be replaced with org.apache.commons.lang3.StringUtils#join
    @Internal
index 505789e2c301c6ad0c13b2be4b981923fddc4995..b0b374a4e3849e933128c58949766dd1c7d3f3af 100644 (file)
@@ -108,7 +108,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
             System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
 
             int numChars = textData.length;
-            if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
+            if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
                 numChars /= 2;
             }
 
index 87cccfa7656c27353ca18621a4c60ce210c07361..4e6f71983cf0dc2499a53c9737ad2e987e897193 100644 (file)
@@ -18,7 +18,6 @@
 package org.apache.poi.hwpf.model;
 
 
-import org.apache.poi.util.CodePageUtil;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.NotImplemented;
 
@@ -43,17 +42,6 @@ public class OldTextPiece extends TextPiece {
         this.rawBytes = text;
     }
 
-    @Override
-    protected void validateLengths(int start, int end, int length, PieceDescriptor pd) {
-        //things are still wonky with Big5 char/byte length mapping
-        //sometimes working w/ Java 8 but not w/ Java 7!
-        //for now, if we're dealing w/ Big5 don't bother checking
-        if (pd.getCharset() != null &&
-                CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(pd.getCharset())) {
-            return;
-        }
-        super.validateLengths(start, end, length, pd);
-    }
     /**
      * @return nothing, ever. Always throws an UnsupportedOperationException
      * @throws UnsupportedOperationException
index 3fd34ade095b807539783f6f640aadeef950796c..f141cddbabd00c00046ddd94abdf772ea5985a51 100644 (file)
@@ -76,7 +76,7 @@ public class OldTextPieceTable extends TextPieceTable {
             boolean unicode = pieces[x].isUnicode();
             int multiple = 1;
             if (unicode ||
-                    (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) {
+                    (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) {
                 multiple = 2;
             }
 
@@ -111,7 +111,7 @@ public class OldTextPieceTable extends TextPieceTable {
     @Override
     protected int getEncodingMultiplier(TextPiece textPiece) {
         Charset charset = textPiece.getPieceDescriptor().getCharset();
-        if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) {
+        if (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) {
             return 2;
         }
         return 1;
index b383cbcfb2349bca4ea205a87a7a06fdc3fa9330..e137727fdad235285525972b3fa55336a291c0f3 100644 (file)
@@ -20,6 +20,7 @@ package org.apache.poi.hwpf.model;
 
 import java.nio.charset.Charset;
 
+import org.apache.poi.util.CodePageUtil;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.StringUtil;
 
@@ -60,25 +61,21 @@ public class TextPiece extends PropertyNode<TextPiece> {
 
         // Validate
         int textLength = ((CharSequence) _buf).length();
-        validateLengths(start, end, textLength, pd);
+        if (end - start != textLength) {
+            throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
+        }
         if (end < start) {
             throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
         }
     }
 
-    protected void validateLengths(int start, int end, int textLength, PieceDescriptor pd) {
-        if (end - start != textLength) {
-            throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
-        }
-    }
     /**
      * Create the StringBuilder from the text and unicode flag
      */
     private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
         byte[] textBuffer = text;
         if (StringUtil.BIG5.equals(pd.getCharset())) {
-            String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString();
-            return new StringBuilder(txt);
+            return new StringBuilder(CodePageUtil.cp950ToString(text, 0, text.length));
         }
 
         String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset());
index 4747577f1d8fa98bbf00840add8fe1a9e5ea00c1..06cfcb44a0fe7fd0fa29552ef27ac512b862dca8 100644 (file)
@@ -49,7 +49,6 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.IOUtils;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
-import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -729,7 +728,6 @@ public class TestBugs{
      * Bug 51944 - PAPFormattedDiskPage.getPAPX - IndexOutOfBounds
      */
     @Test
-    @Ignore("Test now passes in Java 1.7 and 1.8, but not 1.6")
     public void testBug51944() throws Exception
     {
         HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
index bf355959a6a3ccc3a960d110edbc2b3fadd61c4d..925b8d0566eacedf18678e211e557e5f0eb0e2ad 100644 (file)
@@ -247,8 +247,8 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
         */
         assertContains(txt, "\n9-55 xxxxx block5");
         //TODO: figure out why these two aren't passing
-//        assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
-//        assertContains(txt, "We are able to");//not sure if we can get this easily?
+        //assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
+        //assertContains(txt, "We are able to");//not sure if we can get this easily?
     }
 
 }
diff --git a/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java b/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java
new file mode 100644 (file)
index 0000000..96106dc
--- /dev/null
@@ -0,0 +1,77 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.util;
+
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.junit.Test;
+
+public class TestLittleEndianCP950Reader {
+
+    /**
+     * Checks private use area mappings and one mapping for leading byte 0xf9.
+     * Each array holds one little-endian pair: data[0] is the trailing byte
+     * and data[1] is the leading byte.
+     */
+    @Test
+    public void testPersonalUseMappings() throws Exception {
+        //ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit950.txt
+        byte[] data = new byte[2];
+        data[1] = (byte) 0xfe;
+        data[0] = (byte) 0xd3;
+        assertCharEquals('\uE2E5', data);
+
+        data[1] = (byte) 0x90;
+        data[0] = (byte) 0xb6;
+        assertCharEquals('\uE49F', data);
+
+        //actually found in document
+        //but this disagrees with file above
+        data[1] = (byte) 0x8E;
+        data[0] = (byte) 0xA8;
+        assertCharEquals('\uE357', data);
+
+        data[1] = (byte) 0x8E;
+        data[0] = (byte) 0xE6;
+        assertCharEquals('\uE395', data);
+
+        //leading byte 0xf9 belongs in data[1] like the leading bytes above;
+        //this check was previously disabled because its two bytes were swapped
+        data[1] = (byte) 0xF9;
+        data[0] = (byte) 0xD8;
+        assertCharEquals('\u88CF', data);
+
+    }
+
+    /**
+     * Verifies the arithmetic used to combine a leading and a trailing byte
+     * into one 16 bit value: the sign extension of each byte must be masked off.
+     */
+    @Test
+    public void one() {
+        byte b = (byte) 0xfe;
+        byte c = (byte) 0xd3;
+
+        int i = ((b & 0xff) << 8) + (c & 0xff);
+        //assert instead of printing, so the test can actually fail
+        assertEquals(0xfed3, i);
+    }
+
+    //asserts that the two bytes in data decode to exactly one character, expected,
+    //and that the reader then reports the end of its data
+    private void assertCharEquals(char expected, byte[] data) throws IOException {
+        Reader cp950 = new LittleEndianCP950Reader(data);
+        assertEquals((int) expected, cp950.read());
+        assertEquals("should be end of stream", -1, cp950.read());
+    }
+}