aboutsummaryrefslogtreecommitdiffstats
path: root/src/scratchpad
diff options
context:
space:
mode:
Diffstat (limited to 'src/scratchpad')
-rw-r--r-- src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java | 4
-rw-r--r-- src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java | 6
-rw-r--r-- src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java | 6
-rw-r--r-- src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java | 59
-rw-r--r-- src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java | 483
-rw-r--r-- src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java | 68
6 files changed, 618 insertions, 8 deletions
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
index 238e0406f8..a8fb4078a7 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@@ -37,7 +37,7 @@ import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.CodePageUtil;
+import org.apache.poi.hwpf.util.DoubleByteUtil;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.NotImplemented;
@@ -176,7 +176,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin(), MAX_RECORD_LENGTH);
int numChars = textData.length;
- if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
+ if (DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
numChars /= 2;
}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
index e3cb94c868..e6a0887f32 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java
@@ -20,7 +20,7 @@ import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
-import org.apache.poi.util.CodePageUtil;
+import org.apache.poi.hwpf.util.DoubleByteUtil;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.Internal;
@@ -73,7 +73,7 @@ public class OldTextPieceTable extends TextPieceTable {
boolean unicode = pieces[x].isUnicode();
int multiple = 1;
if (unicode ||
- (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) {
+ (charset != null && DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) {
multiple = 2;
}
@@ -106,7 +106,7 @@ public class OldTextPieceTable extends TextPieceTable {
@Override
protected int getEncodingMultiplier(TextPiece textPiece) {
Charset charset = textPiece.getPieceDescriptor().getCharset();
- if (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) {
+ if (charset != null && DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) {
return 2;
}
return 1;
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
index 5c9fcf70d9..0c606cbf03 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java
@@ -20,7 +20,7 @@ package org.apache.poi.hwpf.model;
import java.nio.charset.Charset;
-import org.apache.poi.util.CodePageUtil;
+import org.apache.poi.hwpf.util.DoubleByteUtil;
import org.apache.poi.util.Internal;
import org.apache.poi.util.StringUtil;
@@ -77,8 +77,8 @@ public class TextPiece extends PropertyNode<TextPiece> {
* Create the StringBuilder from the text and unicode flag
*/
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
- if (StringUtil.BIG5.equals(pd.getCharset())) {
- return new StringBuilder(CodePageUtil.cp950ToString(text, 0, text.length));
+ if (DoubleByteUtil.BIG5.equals(pd.getCharset())) {
+ return new StringBuilder(DoubleByteUtil.cp950ToString(text, 0, text.length));
}
String str = new String(text, 0, text.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset());
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java b/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java
new file mode 100644
index 0000000000..5d55711ed9
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java
@@ -0,0 +1,59 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.util;
+
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Utilities for working with double byte CodePages.
+ *
+ * <p>Provides the {@link #BIG5} charset, the set of charsets treated as
+ * double byte, and a helper that decodes little-endian cp950 bytes.</p>
+ */
+public class DoubleByteUtil
+{
+
+ public static final Charset BIG5 = Charset.forName("Big5");
+
+ public static final Set<Charset> DOUBLE_BYTE_CHARSETS = Collections.singleton(BIG5);
+
+ /**
+ * This tries to convert a LE byte array in cp950
+ * (Microsoft's dialect of Big5) to a String.
+ * We know MS zero-padded ascii, and we drop those.
+ * There may be areas for improvement in this.
+ *
+ * @param data bytes to decode; passed to a {@link LittleEndianCP950Reader}
+ * @param offset index in {@code data} at which the reader starts
+ * @param lengthInBytes length, in bytes, handed to the reader
+ * @return Decoded String
+ */
+ public static String cp950ToString(byte[] data, int offset, int lengthInBytes) {
+ StringBuilder sb = new StringBuilder();
+ LittleEndianCP950Reader reader = new LittleEndianCP950Reader(data, offset, lengthInBytes);
+ int c = reader.read();
+ while (c != -1) {
+ sb.append((char)c);
+ c = reader.read();
+ }
+ reader.close();
+ return sb.toString();
+ }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java b/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java
new file mode 100644
index 0000000000..195629bb04
--- /dev/null
+++ b/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java
@@ -0,0 +1,483 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.util;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+
+import org.apache.poi.util.Internal;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+/**
+ * Stream that converts CP950 (MSOffice's dialect of Big5), with
+ * zero-byte padding for ASCII and in LittleEndianOrder.
+ */
+@Internal
+public class LittleEndianCP950Reader extends Reader {
+
+ private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class);
+
+ private static final char UNMAPPABLE = '?';
+ private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2);
+ private final CharBuffer charBuffer = CharBuffer.allocate(2);
+ private final CharsetDecoder decoder = DoubleByteUtil.BIG5.newDecoder();
+
+ //https://en.wikipedia.org/wiki/Code_page_950
+ //see private use area
+ private final static char range1Low = '\u8140';
+ private final static char range1High = '\u8DFE';
+ private final static char range2Low = '\u8E40';
+ private final static char range2High = '\uA0FE';
+ private final static char range3Low = '\uC6A1';
+ private final static char range3High = '\uC8FE';
+ private final static char range4Low = '\uFA40';
+ private final static char range4High = '\uFEFE';
+
+ private final byte[] data;
+ private final int startOffset;
+ private final int length;
+ private int offset;
+ private int trailing;
+ private int leading;
+ int cnt;
+ //note: "trailing" holds the byte that is logically trailing in Big5 encoding;
+ //however, in LittleEndian order it is the first byte encountered.
+ public LittleEndianCP950Reader(byte[] data) {
+ this(data, 0, data.length); //covers the whole array: offset 0, length data.length
+ }
+
+ public LittleEndianCP950Reader(byte[] data, int offset, int length) {
+ this.data = data; //kept by reference, not copied
+ this.startOffset = offset;
+ this.offset = startOffset; //read position; read() advances it two bytes at a time
+ this.length = length; //read() compares this against the bytes consumed since startOffset
+ }
+
+ /** Decodes the next (trailing, leading) byte pair into one char; returns -1 once no full pair is left. */
+ @Override
+ public int read() {
+ //each unit is two bytes wide: stop unless both bytes lie inside the array and
+ //inside the length window, so a dangling odd byte can never be over-read
+ if (data.length - offset < 2 || length - (offset - startOffset) < 2) {
+ return -1;
+ }
+ trailing = data[offset++] & 0xff;
+ leading = data[offset++] & 0xff;
+ decoder.reset();
+ if (leading < 0x81) {
+ //return trailing alone
+ //there may be some subtleties here
+ return trailing;
+ } else if (leading == 0xf9) {
+ return handleF9(trailing);
+ } else {
+ int ch = (leading << 8) + trailing;
+ if (ch >= range1Low && ch <= range1High) {
+ return handleRange1(leading, trailing);
+ } else if (ch >= range2Low && ch <= range2High) {
+ return handleRange2(leading, trailing);
+ } else if (ch >= range3Low && ch <= range3High) {
+ return handleRange3(leading, trailing);
+ } else if (ch >= range4Low && ch <= range4High) {
+ return handleRange4(leading, trailing);
+ }
+ //not in one of the four ranges above: fall back to the JDK Big5 decoder
+ charBuffer.clear();
+ doubleByteBuffer.clear();
+ doubleByteBuffer.put((byte) leading);
+ doubleByteBuffer.put((byte) trailing);
+ doubleByteBuffer.flip();
+ decoder.decode(doubleByteBuffer, charBuffer, true);
+ charBuffer.flip();
+ if (charBuffer.length() == 0) {
+ LOGGER.log(POILogger.WARN, "couldn't create char for: "
+ + Integer.toString((leading & 0xff), 16)
+ + " " + Integer.toString((trailing & 0xff), 16));
+ return UNMAPPABLE;
+ } else {
+ return Character.codePointAt(charBuffer, 0);
+ }
+ }
+ }
+
+
+ /** Fills cbuf[off..off+len) one char at a time via {@link #read()}; returns the count, or -1 at end of stream. */
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ //there may be some efficiencies, but this should do for now.
+ for (int i = off; i < off + len; i++) {
+ int c = read();
+ if (c == -1) {
+ return (i == off) ? -1 : i - off; //Reader contract: -1, not 0, when nothing could be read
+ }
+ cbuf[i] = (char) c;
+ }
+ return len;
+ }
+
+ @Override
+ public void close() { //no-op: empty body, nothing is released
+ }
+
+ private int handleRange1(int leading, int trailing) { //read() calls this when the pair lies in range1Low..range1High
+ return (0xeeb8 + (157 * (leading - 0x81))) + //base 0xeeb8 for lead byte 0x81, 157 values per further lead byte
+ ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); //trail bytes below 0x80 count from 0x40, the others from 0x62
+ }
+
+ private int handleRange2(int leading, int trailing) { //read() calls this when the pair lies in range2Low..range2High
+ return (0xe311 + (157 * (leading - 0x8e))) + //base 0xe311 for lead byte 0x8e, 157 values per further lead byte
+ ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); //trail bytes below 0x80 count from 0x40, the others from 0x62
+ }
+
+ private int handleRange3(int leading, int trailing) { //read() calls this when the pair lies in range3Low..range3High
+ return (0xf672 + (157 * (leading - 0xc6))) + //base 0xf672 for lead byte 0xc6, 157 values per further lead byte
+ ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); //trail bytes below 0x80 count from 0x40, the others from 0x62
+ }
+
+ private int handleRange4(int leading, int trailing) { //read() calls this when the pair lies in range4Low..range4High
+ return (0xe000 + (157 * (leading - 0xfa))) + //base 0xe000 for lead byte 0xfa, 157 values per further lead byte
+ ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); //trail bytes below 0x80 count from 0x40, the others from 0x62
+ }
+
+ private int handleF9(int trailing) { //read() calls this when the lead byte is 0xf9; the argument is the pair's other byte
+ switch (trailing) {
+ case 0x40:
+ return 0x7e98;
+ case 0x41:
+ return 0x7e9b;
+ case 0x42:
+ return 0x7e99;
+ case 0x43:
+ return 0x81e0;
+ case 0x44:
+ return 0x81e1;
+ case 0x45:
+ return 0x8646;
+ case 0x46:
+ return 0x8647;
+ case 0x47:
+ return 0x8648;
+ case 0x48:
+ return 0x8979;
+ case 0x49:
+ return 0x897a;
+ case 0x4a:
+ return 0x897c;
+ case 0x4b:
+ return 0x897b;
+ case 0x4c:
+ return 0x89ff;
+ case 0x4d:
+ return 0x8b98;
+ case 0x4e:
+ return 0x8b99;
+ case 0x4f:
+ return 0x8ea5;
+ case 0x50:
+ return 0x8ea4;
+ case 0x51:
+ return 0x8ea3;
+ case 0x52:
+ return 0x946e;
+ case 0x53:
+ return 0x946d;
+ case 0x54:
+ return 0x946f;
+ case 0x55:
+ return 0x9471;
+ case 0x56:
+ return 0x9473;
+ case 0x57:
+ return 0x9749;
+ case 0x58:
+ return 0x9872;
+ case 0x59:
+ return 0x995f;
+ case 0x5a:
+ return 0x9c68;
+ case 0x5b:
+ return 0x9c6e;
+ case 0x5c:
+ return 0x9c6d;
+ case 0x5d:
+ return 0x9e0b;
+ case 0x5e:
+ return 0x9e0d;
+ case 0x5f:
+ return 0x9e10;
+ case 0x60:
+ return 0x9e0f;
+ case 0x61:
+ return 0x9e12;
+ case 0x62:
+ return 0x9e11;
+ case 0x63:
+ return 0x9ea1;
+ case 0x64:
+ return 0x9ef5;
+ case 0x65:
+ return 0x9f09;
+ case 0x66:
+ return 0x9f47;
+ case 0x67:
+ return 0x9f78;
+ case 0x68:
+ return 0x9f7b;
+ case 0x69:
+ return 0x9f7a;
+ case 0x6a:
+ return 0x9f79;
+ case 0x6b:
+ return 0x571e;
+ case 0x6c:
+ return 0x7066;
+ case 0x6d:
+ return 0x7c6f;
+ case 0x6e:
+ return 0x883c;
+ case 0x6f:
+ return 0x8db2;
+ case 0x70:
+ return 0x8ea6;
+ case 0x71:
+ return 0x91c3;
+ case 0x72:
+ return 0x9474;
+ case 0x73:
+ return 0x9478;
+ case 0x74:
+ return 0x9476;
+ case 0x75:
+ return 0x9475;
+ case 0x76:
+ return 0x9a60;
+ case 0x77:
+ return 0x9c74;
+ case 0x78:
+ return 0x9c73;
+ case 0x79:
+ return 0x9c71;
+ case 0x7a:
+ return 0x9c75;
+ case 0x7b:
+ return 0x9e14;
+ case 0x7c:
+ return 0x9e13;
+ case 0x7d:
+ return 0x9ef6;
+ case 0x7e:
+ return 0x9f0a;
+ case 0xa1:
+ return 0x9fa4;
+ case 0xa2:
+ return 0x7068;
+ case 0xa3:
+ return 0x7065;
+ case 0xa4:
+ return 0x7cf7;
+ case 0xa5:
+ return 0x866a;
+ case 0xa6:
+ return 0x883e;
+ case 0xa7:
+ return 0x883d;
+ case 0xa8:
+ return 0x883f;
+ case 0xa9:
+ return 0x8b9e;
+ case 0xaa:
+ return 0x8c9c;
+ case 0xab:
+ return 0x8ea9;
+ case 0xac:
+ return 0x8ec9;
+ case 0xad:
+ return 0x974b;
+ case 0xae:
+ return 0x9873;
+ case 0xaf:
+ return 0x9874;
+ case 0xb0:
+ return 0x98cc;
+ case 0xb1:
+ return 0x9961;
+ case 0xb2:
+ return 0x99ab;
+ case 0xb3:
+ return 0x9a64;
+ case 0xb4:
+ return 0x9a66;
+ case 0xb5:
+ return 0x9a67;
+ case 0xb6:
+ return 0x9b24;
+ case 0xb7:
+ return 0x9e15;
+ case 0xb8:
+ return 0x9e17;
+ case 0xb9:
+ return 0x9f48;
+ case 0xba:
+ return 0x6207;
+ case 0xbb:
+ return 0x6b1e;
+ case 0xbc:
+ return 0x7227;
+ case 0xbd:
+ return 0x864c;
+ case 0xbe:
+ return 0x8ea8;
+ case 0xbf:
+ return 0x9482;
+ case 0xc0:
+ return 0x9480;
+ case 0xc1:
+ return 0x9481;
+ case 0xc2:
+ return 0x9a69;
+ case 0xc3:
+ return 0x9a68;
+ case 0xc4:
+ return 0x9b2e;
+ case 0xc5:
+ return 0x9e19;
+ case 0xc6:
+ return 0x7229;
+ case 0xc7:
+ return 0x864b;
+ case 0xc8:
+ return 0x8b9f;
+ case 0xc9:
+ return 0x9483;
+ case 0xca:
+ return 0x9c79;
+ case 0xcb:
+ return 0x9eb7;
+ case 0xcc:
+ return 0x7675;
+ case 0xcd:
+ return 0x9a6b;
+ case 0xce:
+ return 0x9c7a;
+ case 0xcf:
+ return 0x9e1d;
+ case 0xd0:
+ return 0x7069;
+ case 0xd1:
+ return 0x706a;
+ case 0xd2:
+ return 0x9ea4;
+ case 0xd3:
+ return 0x9f7e;
+ case 0xd4:
+ return 0x9f49;
+ case 0xd5:
+ return 0x9f98;
+ case 0xd6:
+ return 0x7881;
+ case 0xd7:
+ return 0x92b9;
+ case 0xd8:
+ return 0x88cf;
+ case 0xd9:
+ return 0x58bb;
+ case 0xda:
+ return 0x6052;
+ case 0xdb:
+ return 0x7ca7;
+ case 0xdc:
+ return 0x5afa;
+ case 0xdd:
+ return 0x2554;
+ case 0xde:
+ return 0x2566;
+ case 0xdf:
+ return 0x2557;
+ case 0xe0:
+ return 0x2560;
+ case 0xe1:
+ return 0x256c;
+ case 0xe2:
+ return 0x2563;
+ case 0xe3:
+ return 0x255a;
+ case 0xe4:
+ return 0x2569;
+ case 0xe5:
+ return 0x255d;
+ case 0xe6:
+ return 0x2552;
+ case 0xe7:
+ return 0x2564;
+ case 0xe8:
+ return 0x2555;
+ case 0xe9:
+ return 0x255e;
+ case 0xea:
+ return 0x256a;
+ case 0xeb:
+ return 0x2561;
+ case 0xec:
+ return 0x2558;
+ case 0xed:
+ return 0x2567;
+ case 0xee:
+ return 0x255b;
+ case 0xef:
+ return 0x2553;
+ case 0xf0:
+ return 0x2565;
+ case 0xf1:
+ return 0x2556;
+ case 0xf2:
+ return 0x255f;
+ case 0xf3:
+ return 0x256b;
+ case 0xf4:
+ return 0x2562;
+ case 0xf5:
+ return 0x2559;
+ case 0xf6:
+ return 0x2568;
+ case 0xf7:
+ return 0x255c;
+ case 0xf8:
+ return 0x2551;
+ case 0xf9:
+ return 0x2550;
+ case 0xfa:
+ return 0x256d;
+ case 0xfb:
+ return 0x256e;
+ case 0xfc:
+ return 0x2570;
+ case 0xfd:
+ return 0x256f;
+ case 0xfe:
+ return 0x2593;
+ default: //no case matches this byte: log a warning and return UNMAPPABLE
+ LOGGER.log(POILogger.WARN, "couldn't create char for: f9"
+ + " " + Integer.toString((trailing & 0xff), 16));
+ return UNMAPPABLE;
+ }
+ }
+}
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java b/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java
new file mode 100644
index 0000000000..b6c7fd201a
--- /dev/null
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java
@@ -0,0 +1,68 @@
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hwpf.util;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.junit.Test;
+
+public class TestLittleEndianCP950Reader {
+
+ @Test
+ public void testPersonalUseMappings() throws Exception {
+ //ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit950.txt
+ byte[] data = new byte[2];
+ data[1] = (byte) 0xfe;
+ data[0] = (byte) 0xd3;
+ assertCharEquals('\uE2E5', data);
+
+ data[1] = (byte) 0x90;
+ data[0] = (byte) 0xb6;
+ assertCharEquals('\uE49F', data);
+
+ //actually found in document
+ //but this disagrees with file above
+ data[1] = (byte) 0x8E;
+ data[0] = (byte) 0xA8;
+ assertCharEquals('\uE357', data);
+
+ data[1] = (byte) 0x8E;
+ data[0] = (byte) 0xE6;
+ assertCharEquals('\uE395', data);
+
+ //0xF9 lead bytes go through the reader's explicit lookup table.
+ //The input is little endian, so the lead byte belongs in data[1];
+ //with the two bytes swapped the pair never reaches that table.
+ data[1] = (byte) 0xF9;
+ data[0] = (byte) 0xD8;
+ assertCharEquals('\u88CF', data);
+
+ }
+
+ //reads one char from data, compares it, and expects the stream to end right after
+ private void assertCharEquals(char expected, byte[] data) throws IOException {
+ Reader reader = new LittleEndianCP950Reader(data);
+ int c = reader.read();
+ assertEquals((int) expected, c);
+ int eof = reader.read();
+ assertEquals("should be end of stream", -1, eof);
+ }
+}