diff options
Diffstat (limited to 'src/scratchpad')
6 files changed, 618 insertions, 8 deletions
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 238e0406f8..a8fb4078a7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -37,7 +37,7 @@ import org.apache.poi.hwpf.model.TextPieceTable; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.IOUtils; import org.apache.poi.util.LittleEndian; import org.apache.poi.util.NotImplemented; @@ -176,7 +176,7 @@ public class HWPFOldDocument extends HWPFDocumentCore { _fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin(), MAX_RECORD_LENGTH); int numChars = textData.length; - if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { + if (DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { numChars /= 2; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java index e3cb94c868..e6a0887f32 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java @@ -20,7 +20,7 @@ import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.IOUtils; import org.apache.poi.util.Internal; @@ -73,7 +73,7 @@ public class OldTextPieceTable extends TextPieceTable { boolean unicode = pieces[x].isUnicode(); int multiple = 1; if (unicode || - (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) { + (charset != null && DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) { multiple = 2; } @@ -106,7 +106,7 @@ public class OldTextPieceTable extends TextPieceTable { @Override protected int getEncodingMultiplier(TextPiece textPiece) { Charset charset = textPiece.getPieceDescriptor().getCharset(); - if (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) { + if (charset != null && DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) { return 2; } return 1; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index 5c9fcf70d9..0c606cbf03 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -20,7 +20,7 @@ package org.apache.poi.hwpf.model; import java.nio.charset.Charset; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.Internal; import org.apache.poi.util.StringUtil; @@ -77,8 +77,8 @@ public class TextPiece extends PropertyNode<TextPiece> { * Create the StringBuilder from the text and unicode flag */ private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { - if (StringUtil.BIG5.equals(pd.getCharset())) { - return new StringBuilder(CodePageUtil.cp950ToString(text, 0, text.length)); + if (DoubleByteUtil.BIG5.equals(pd.getCharset())) { + return new StringBuilder(DoubleByteUtil.cp950ToString(text, 0, text.length)); } String str = new String(text, 0, text.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset()); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java b/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java new file mode 100644 index 0000000000..5d55711ed9 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java @@ -0,0 +1,59 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.util; + +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.Set; + +/** + * Utilities for working with double byte CodePages. + * + * <p>Provides constants for understanding numeric codepages, + * along with utilities to translate these into Java Character Sets.</p> + */ +public class DoubleByteUtil +{ + + public static final Charset BIG5 = Charset.forName("Big5"); + + public static final Set<Charset> DOUBLE_BYTE_CHARSETS = Collections.singleton(BIG5); + + /** + * This tries to convert a LE byte array in cp950 + * (Microsoft's dialect of Big5) to a String. + * We know MS zero-padded ascii, and we drop those. + * There may be areas for improvement in this. + * + * @param data + * @param offset + * @param lengthInBytes + * @return Decoded String + */ + public static String cp950ToString(byte[] data, int offset, int lengthInBytes) { + StringBuilder sb = new StringBuilder(); + LittleEndianCP950Reader reader = new LittleEndianCP950Reader(data, offset, lengthInBytes); + int c = reader.read(); + while (c != -1) { + sb.append((char)c); + c = reader.read(); + } + reader.close(); + return sb.toString(); + } +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java b/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java new file mode 100644 index 0000000000..195629bb04 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java @@ -0,0 +1,483 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.util; + +import java.io.IOException; +import java.io.Reader; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; + +import org.apache.poi.util.Internal; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +/** + * Stream that converts CP950 (MSOffice's dialect of Big5), with + * zero-byte padding for ASCII and in LittleEndianOrder. + */ +@Internal +public class LittleEndianCP950Reader extends Reader { + + private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class); + + private static final char UNMAPPABLE = '?'; + private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2); + private final CharBuffer charBuffer = CharBuffer.allocate(2); + private final CharsetDecoder decoder = DoubleByteUtil.BIG5.newDecoder(); + + //https://en.wikipedia.org/wiki/Code_page_950 + //see private use area + private final static char range1Low = '\u8140'; + private final static char range1High = '\u8DFE'; + private final static char range2Low = '\u8E40'; + private final static char range2High = '\uA0FE'; + private final static char range3Low = '\uC6A1'; + private final static char range3High = '\uC8FE'; + private final static char range4Low = '\uFA40'; + private final static char range4High = '\uFEFE'; + + private final byte[] data; + private final int startOffset; + private final int length; + private int offset; + private int trailing; + private int leading; + int cnt; + //the char that is logically trailing in Big5 encoding + //however in LittleEndian order, this is the first encountered. + public LittleEndianCP950Reader(byte[] data) { + this(data, 0, data.length); + } + + public LittleEndianCP950Reader(byte[] data, int offset, int length) { + this.data = data; + this.startOffset = offset; + this.offset = startOffset; + this.length = length; + } + + @Override + public int read() { + if (offset + 1 > data.length || offset - startOffset > length) { + return -1; + } + trailing = data[offset++] & 0xff; + leading = data[offset++] & 0xff; + decoder.reset(); + if (leading < 0x81) { + //return trailing alone + //there may be some subtleties here + return trailing; + } else if (leading == 0xf9) { + return handleF9(trailing); + } else { + int ch = (leading << 8) + trailing; + if (ch >= range1Low && ch <= range1High) { + return handleRange1(leading, trailing); + } else if (ch >= range2Low && ch <= range2High) { + return handleRange2(leading, trailing); + } else if (ch >= range3Low && ch <= range3High) { + return handleRange3(leading, trailing); + } else if (ch >= range4Low && ch <= range4High) { + return handleRange4(leading, trailing); + } + + charBuffer.clear(); + doubleByteBuffer.clear(); + doubleByteBuffer.put((byte) leading); + doubleByteBuffer.put((byte) trailing); + doubleByteBuffer.flip(); + decoder.decode(doubleByteBuffer, charBuffer, true); + charBuffer.flip(); + + if (charBuffer.length() == 0) { + LOGGER.log(POILogger.WARN, "couldn't create char for: " + + Integer.toString((leading & 0xff), 16) + + " " + Integer.toString((trailing & 0xff), 16)); + return UNMAPPABLE; + } else { + return Character.codePointAt(charBuffer, 0); + } + } + + + } + + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + //there may be some efficiencies, but this should do for now. + + for (int i = off; i < off + len; i++) { + int c = read(); + if (c == -1) { + return i - off; + } + cbuf[i] = (char) c; + } + return len; + } + + @Override + public void close() { + } + + private int handleRange1(int leading, int trailing) { + return (0xeeb8 + (157 * (leading - 0x81))) + + ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); + } + + private int handleRange2(int leading, int trailing) { + return (0xe311 + (157 * (leading - 0x8e))) + + ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); + } + + private int handleRange3(int leading, int trailing) { + return (0xf672 + (157 * (leading - 0xc6))) + + ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); + } + + private int handleRange4(int leading, int trailing) { + return (0xe000 + (157 * (leading - 0xfa))) + + ((trailing < 0x80) ? trailing - 0x40 : trailing - 0x62); + } + + private int handleF9(int trailing) { + switch (trailing) { + case 0x40: + return 0x7e98; + case 0x41: + return 0x7e9b; + case 0x42: + return 0x7e99; + case 0x43: + return 0x81e0; + case 0x44: + return 0x81e1; + case 0x45: + return 0x8646; + case 0x46: + return 0x8647; + case 0x47: + return 0x8648; + case 0x48: + return 0x8979; + case 0x49: + return 0x897a; + case 0x4a: + return 0x897c; + case 0x4b: + return 0x897b; + case 0x4c: + return 0x89ff; + case 0x4d: + return 0x8b98; + case 0x4e: + return 0x8b99; + case 0x4f: + return 0x8ea5; + case 0x50: + return 0x8ea4; + case 0x51: + return 0x8ea3; + case 0x52: + return 0x946e; + case 0x53: + return 0x946d; + case 0x54: + return 0x946f; + case 0x55: + return 0x9471; + case 0x56: + return 0x9473; + case 0x57: + return 0x9749; + case 0x58: + return 0x9872; + case 0x59: + return 0x995f; + case 0x5a: + return 0x9c68; + case 0x5b: + return 0x9c6e; + case 0x5c: + return 0x9c6d; + case 0x5d: + return 0x9e0b; + case 0x5e: + return 0x9e0d; + case 0x5f: + return 0x9e10; + case 0x60: + return 0x9e0f; + case 0x61: + return 0x9e12; + case 0x62: + return 0x9e11; + case 0x63: + return 0x9ea1; + case 0x64: + return 0x9ef5; + case 0x65: + return 0x9f09; + case 0x66: + return 0x9f47; + case 0x67: + return 0x9f78; + case 0x68: + return 0x9f7b; + case 0x69: + return 0x9f7a; + case 0x6a: + return 0x9f79; + case 0x6b: + return 0x571e; + case 0x6c: + return 0x7066; + case 0x6d: + return 0x7c6f; + case 0x6e: + return 0x883c; + case 0x6f: + return 0x8db2; + case 0x70: + return 0x8ea6; + case 0x71: + return 0x91c3; + case 0x72: + return 0x9474; + case 0x73: + return 0x9478; + case 0x74: + return 0x9476; + case 0x75: + return 0x9475; + case 0x76: + return 0x9a60; + case 0x77: + return 0x9c74; + case 0x78: + return 0x9c73; + case 0x79: + return 0x9c71; + case 0x7a: + return 0x9c75; + case 0x7b: + return 0x9e14; + case 0x7c: + return 0x9e13; + case 0x7d: + return 0x9ef6; + case 0x7e: + return 0x9f0a; + case 0xa1: + return 0x9fa4; + case 0xa2: + return 0x7068; + case 0xa3: + return 0x7065; + case 0xa4: + return 0x7cf7; + case 0xa5: + return 0x866a; + case 0xa6: + return 0x883e; + case 0xa7: + return 0x883d; + case 0xa8: + return 0x883f; + case 0xa9: + return 0x8b9e; + case 0xaa: + return 0x8c9c; + case 0xab: + return 0x8ea9; + case 0xac: + return 0x8ec9; + case 0xad: + return 0x974b; + case 0xae: + return 0x9873; + case 0xaf: + return 0x9874; + case 0xb0: + return 0x98cc; + case 0xb1: + return 0x9961; + case 0xb2: + return 0x99ab; + case 0xb3: + return 0x9a64; + case 0xb4: + return 0x9a66; + case 0xb5: + return 0x9a67; + case 0xb6: + return 0x9b24; + case 0xb7: + return 0x9e15; + case 0xb8: + return 0x9e17; + case 0xb9: + return 0x9f48; + case 0xba: + return 0x6207; + case 0xbb: + return 0x6b1e; + case 0xbc: + return 0x7227; + case 0xbd: + return 0x864c; + case 0xbe: + return 0x8ea8; + case 0xbf: + return 0x9482; + case 0xc0: + return 0x9480; + case 0xc1: + return 0x9481; + case 0xc2: + return 0x9a69; + case 0xc3: + return 0x9a68; + case 0xc4: + return 0x9b2e; + case 0xc5: + return 0x9e19; + case 0xc6: + return 0x7229; + case 0xc7: + return 0x864b; + case 0xc8: + return 0x8b9f; + case 0xc9: + return 0x9483; + case 0xca: + return 0x9c79; + case 0xcb: + return 0x9eb7; + case 0xcc: + return 0x7675; + case 0xcd: + return 0x9a6b; + case 0xce: + return 0x9c7a; + case 0xcf: + return 0x9e1d; + case 0xd0: + return 0x7069; + case 0xd1: + return 0x706a; + case 0xd2: + return 0x9ea4; + case 0xd3: + return 0x9f7e; + case 0xd4: + return 0x9f49; + case 0xd5: + return 0x9f98; + case 0xd6: + return 0x7881; + case 0xd7: + return 0x92b9; + case 0xd8: + return 0x88cf; + case 0xd9: + return 0x58bb; + case 0xda: + return 0x6052; + case 0xdb: + return 0x7ca7; + case 0xdc: + return 0x5afa; + case 0xdd: + return 0x2554; + case 0xde: + return 0x2566; + case 0xdf: + return 0x2557; + case 0xe0: + return 0x2560; + case 0xe1: + return 0x256c; + case 0xe2: + return 0x2563; + case 0xe3: + return 0x255a; + case 0xe4: + return 0x2569; + case 0xe5: + return 0x255d; + case 0xe6: + return 0x2552; + case 0xe7: + return 0x2564; + case 0xe8: + return 0x2555; + case 0xe9: + return 0x255e; + case 0xea: + return 0x256a; + case 0xeb: + return 0x2561; + case 0xec: + return 0x2558; + case 0xed: + return 0x2567; + case 0xee: + return 0x255b; + case 0xef: + return 0x2553; + case 0xf0: + return 0x2565; + case 0xf1: + return 0x2556; + case 0xf2: + return 0x255f; + case 0xf3: + return 0x256b; + case 0xf4: + return 0x2562; + case 0xf5: + return 0x2559; + case 0xf6: + return 0x2568; + case 0xf7: + return 0x255c; + case 0xf8: + return 0x2551; + case 0xf9: + return 0x2550; + case 0xfa: + return 0x256d; + case 0xfb: + return 0x256e; + case 0xfc: + return 0x2570; + case 0xfd: + return 0x256f; + case 0xfe: + return 0x2593; + default: + LOGGER.log(POILogger.WARN, "couldn't create char for: f9" + + " " + Integer.toString((trailing & 0xff), 16)); + return UNMAPPABLE; + } + } +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java b/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java new file mode 100644 index 0000000000..b6c7fd201a --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java @@ -0,0 +1,68 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.util; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.io.Reader; + +import org.junit.Test; + +public class TestLittleEndianCP950Reader { + + @Test + public void testPersonalUseMappings() throws Exception { + //ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit950.txt + byte[] data = new byte[2]; + data[1] = (byte) 0xfe; + data[0] = (byte) 0xd3; + assertCharEquals('\uE2E5', data); + + data[1] = (byte) 0x90; + data[0] = (byte) 0xb6; + assertCharEquals('\uE49F', data); + + //actually found in document + //but this disagrees with file above + data[1] = (byte) 0x8E; + data[0] = (byte) 0xA8; + assertCharEquals('\uE357', data); + + data[1] = (byte) 0x8E; + data[0] = (byte) 0xE6; + assertCharEquals('\uE395', data); + + /* + //TODO: figure out why this isn't working + data[0] = (byte)0xF9; + data[1] = (byte)0xD8; + assertCharEquals('\u88CF', data); + */ + + } + + + private void assertCharEquals(char expected, byte[] data) throws IOException { + Reader reader = new LittleEndianCP950Reader(data); + int c = reader.read(); + assertEquals((int) expected, c); + int eof = reader.read(); + assertEquals("should be end of stream", -1, eof); + } +} |