diff options
-rw-r--r-- | src/java/org/apache/poi/util/CodePageUtil.java | 25 | ||||
-rw-r--r-- | src/java/org/apache/poi/util/StringUtil.java | 1 | ||||
-rw-r--r-- | src/multimodule/scratchpad/test9/module-info.class | bin | 2652 -> 2690 bytes | |||
-rw-r--r-- | src/multimodule/scratchpad/test9/module-info.java | 1 | ||||
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java | 4 | ||||
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java | 6 | ||||
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java | 6 | ||||
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java | 59 | ||||
-rw-r--r-- | src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java (renamed from src/java/org/apache/poi/util/LittleEndianCP950Reader.java) | 10 | ||||
-rw-r--r-- | src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java (renamed from src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java) | 3 |
10 files changed, 76 insertions, 39 deletions
diff --git a/src/java/org/apache/poi/util/CodePageUtil.java b/src/java/org/apache/poi/util/CodePageUtil.java index da8f8a9842..2c1480253d 100644 --- a/src/java/org/apache/poi/util/CodePageUtil.java +++ b/src/java/org/apache/poi/util/CodePageUtil.java @@ -31,8 +31,6 @@ import java.util.Set; public class CodePageUtil { - public static final Set<Charset> DOUBLE_BYTE_CHARSETS = Collections.singleton(StringUtil.BIG5); - /** <p>Codepage 037, a special case</p> */ public static final int CP_037 = 37; @@ -446,27 +444,4 @@ public class CodePageUtil return "cp" + codepage; } } - - /** - * This tries to convert a LE byte array in cp950 - * (Microsoft's dialect of Big5) to a String. - * We know MS zero-padded ascii, and we drop those. - * There may be areas for improvement in this. - * - * @param data - * @param offset - * @param lengthInBytes - * @return Decoded String - */ - public static String cp950ToString(byte[] data, int offset, int lengthInBytes) { - StringBuilder sb = new StringBuilder(); - LittleEndianCP950Reader reader = new LittleEndianCP950Reader(data, offset, lengthInBytes); - int c = reader.read(); - while (c != -1) { - sb.append((char)c); - c = reader.read(); - } - reader.close(); - return sb.toString(); - } } diff --git a/src/java/org/apache/poi/util/StringUtil.java b/src/java/org/apache/poi/util/StringUtil.java index a0778a3efa..d281c63386 100644 --- a/src/java/org/apache/poi/util/StringUtil.java +++ b/src/java/org/apache/poi/util/StringUtil.java @@ -34,7 +34,6 @@ public final class StringUtil { public static final Charset UTF16LE = StandardCharsets.UTF_16LE; public static final Charset UTF8 = StandardCharsets.UTF_8; public static final Charset WIN_1252 = Charset.forName("cp1252"); - public static final Charset BIG5 = Charset.forName("Big5"); private StringUtil() { // no instances of this class diff --git a/src/multimodule/scratchpad/test9/module-info.class b/src/multimodule/scratchpad/test9/module-info.class Binary files differindex cfda7bb495..e74671e4fb 100644 --- a/src/multimodule/scratchpad/test9/module-info.class +++ b/src/multimodule/scratchpad/test9/module-info.class diff --git a/src/multimodule/scratchpad/test9/module-info.java b/src/multimodule/scratchpad/test9/module-info.java index 9d406fd36e..2aaef57a0a 100644 --- a/src/multimodule/scratchpad/test9/module-info.java +++ b/src/multimodule/scratchpad/test9/module-info.java @@ -82,6 +82,7 @@ module org.apache.poi.scratchpad { exports org.apache.poi.hemf.hemfplus.extractor to junit; exports org.apache.poi.hslf to junit; exports org.apache.poi.hwmf to junit; + exports org.apache.poi.hwpf.util to junit; opens org.apache.poi.hwpf.model to org.mockito; opens org.apache.poi.hwpf.model.types to org.mockito; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 238e0406f8..a8fb4078a7 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -37,7 +37,7 @@ import org.apache.poi.hwpf.model.TextPieceTable; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.IOUtils; import org.apache.poi.util.LittleEndian; import org.apache.poi.util.NotImplemented; @@ -176,7 +176,7 @@ public class HWPFOldDocument extends HWPFDocumentCore { _fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin(), MAX_RECORD_LENGTH); int numChars = textData.length; - if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { + if (DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) { numChars /= 2; } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java index e3cb94c868..e6a0887f32 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java @@ -20,7 +20,7 @@ import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.IOUtils; import org.apache.poi.util.Internal; @@ -73,7 +73,7 @@ public class OldTextPieceTable extends TextPieceTable { boolean unicode = pieces[x].isUnicode(); int multiple = 1; if (unicode || - (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) { + (charset != null && DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) { multiple = 2; } @@ -106,7 +106,7 @@ public class OldTextPieceTable extends TextPieceTable { @Override protected int getEncodingMultiplier(TextPiece textPiece) { Charset charset = textPiece.getPieceDescriptor().getCharset(); - if (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) { + if (charset != null && DoubleByteUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) { return 2; } return 1; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index 5c9fcf70d9..0c606cbf03 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -20,7 +20,7 @@ package org.apache.poi.hwpf.model; import java.nio.charset.Charset; -import org.apache.poi.util.CodePageUtil; +import org.apache.poi.hwpf.util.DoubleByteUtil; import org.apache.poi.util.Internal; import org.apache.poi.util.StringUtil; @@ -77,8 +77,8 @@ public class TextPiece extends PropertyNode<TextPiece> { * Create the StringBuilder from the text and unicode flag */ private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { - if (StringUtil.BIG5.equals(pd.getCharset())) { - return new StringBuilder(CodePageUtil.cp950ToString(text, 0, text.length)); + if (DoubleByteUtil.BIG5.equals(pd.getCharset())) { + return new StringBuilder(DoubleByteUtil.cp950ToString(text, 0, text.length)); } String str = new String(text, 0, text.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset()); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java b/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java new file mode 100644 index 0000000000..5d55711ed9 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/util/DoubleByteUtil.java @@ -0,0 +1,59 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.util; + +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.Set; + +/** + * Utilities for working with double byte CodePages. + * + * <p>Provides constants for understanding numeric codepages, + * along with utilities to translate these into Java Character Sets.</p> + */ +public class DoubleByteUtil +{ + + public static final Charset BIG5 = Charset.forName("Big5"); + + public static final Set<Charset> DOUBLE_BYTE_CHARSETS = Collections.singleton(BIG5); + + /** + * This tries to convert a LE byte array in cp950 + * (Microsoft's dialect of Big5) to a String. + * We know MS zero-padded ascii, and we drop those. + * There may be areas for improvement in this. + * + * @param data + * @param offset + * @param lengthInBytes + * @return Decoded String + */ + public static String cp950ToString(byte[] data, int offset, int lengthInBytes) { + StringBuilder sb = new StringBuilder(); + LittleEndianCP950Reader reader = new LittleEndianCP950Reader(data, offset, lengthInBytes); + int c = reader.read(); + while (c != -1) { + sb.append((char)c); + c = reader.read(); + } + reader.close(); + return sb.toString(); + } +} diff --git a/src/java/org/apache/poi/util/LittleEndianCP950Reader.java b/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java index 61808afcaa..195629bb04 100644 --- a/src/java/org/apache/poi/util/LittleEndianCP950Reader.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/util/LittleEndianCP950Reader.java @@ -15,13 +15,18 @@ limitations under the License. ==================================================================== */ -package org.apache.poi.util; +package org.apache.poi.hwpf.util; + import java.io.IOException; import java.io.Reader; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharsetDecoder; +import org.apache.poi.util.Internal; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + /** * Stream that converts CP950 (MSOffice's dialect of Big5), with * zero-byte padding for ASCII and in LittleEndianOrder. @@ -31,11 +36,10 @@ public class LittleEndianCP950Reader extends Reader { private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class); - private static final char UNMAPPABLE = '?'; private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2); private final CharBuffer charBuffer = CharBuffer.allocate(2); - private final CharsetDecoder decoder = StringUtil.BIG5.newDecoder(); + private final CharsetDecoder decoder = DoubleByteUtil.BIG5.newDecoder(); //https://en.wikipedia.org/wiki/Code_page_950 //see private use area diff --git a/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java b/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java index ef648e4f00..b6c7fd201a 100644 --- a/src/testcases/org/apache/poi/util/TestLittleEndianCP950Reader.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/util/TestLittleEndianCP950Reader.java @@ -15,8 +15,7 @@ limitations under the License. ==================================================================== */ -package org.apache.poi.util; - +package org.apache.poi.hwpf.util; import static org.junit.Assert.assertEquals; |