aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndreas Beeker <kiwiwings@apache.org>2014-12-29 19:43:35 +0000
committerAndreas Beeker <kiwiwings@apache.org>2014-12-29 19:43:35 +0000
commit74ddc1440edc77fe8d5aae05df16d2770110e94e (patch)
tree159be88ddde99755adef8b016c0998034e96df21
parentf27ced46fb6f2294c18f7a51ad4ae982a4902a46 (diff)
downloadpoi-74ddc1440edc77fe8d5aae05df16d2770110e94e.tar.gz
poi-74ddc1440edc77fe8d5aae05df16d2770110e94e.zip
Bug 49541 - Mapping of symbol characters to unicode equivalent
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1648415 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r--src/java/org/apache/poi/util/StringUtil.java247
-rw-r--r--src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java16
-rw-r--r--test-data/slideshow/49541_symbol_map.pptbin0 -> 313856 bytes
3 files changed, 263 insertions, 0 deletions
diff --git a/src/java/org/apache/poi/util/StringUtil.java b/src/java/org/apache/poi/util/StringUtil.java
index 3a323f67a5..99880f500e 100644
--- a/src/java/org/apache/poi/util/StringUtil.java
+++ b/src/java/org/apache/poi/util/StringUtil.java
@@ -20,7 +20,9 @@ package org.apache.poi.util;
import java.nio.charset.Charset;
import java.text.FieldPosition;
import java.text.NumberFormat;
+import java.util.HashMap;
import java.util.Iterator;
+import java.util.Map;
import org.apache.poi.hssf.record.RecordInputStream;
/**
@@ -37,6 +39,7 @@ import org.apache.poi.hssf.record.RecordInputStream;
public class StringUtil {
private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
private static final Charset UTF16LE = Charset.forName("UTF-16LE");
+ // Lookup table from symbol-font code points (private use area, 0xF020 and up) to Unicode code points.
+ // Created lazily by initMsCodepointMap(); it stays null until that method first runs.
+ private static Map<Integer,Integer> msCodepointToUnicode;
private StringUtil() {
// no instances of this class
@@ -396,4 +399,248 @@ public class StringUtil {
}
public void remove() {}
}
+
+
+ /**
+  * Replaces symbol-font characters stored in the unicode private use area
+  * with the corresponding characters of the normal unicode range.
+  * Code points without a registered mapping are copied to the result unchanged.
+  *
+  * @param string the original string, may be {@code null}
+  * @return the string with mapped characters; a {@code null} or empty input
+  *         is returned as is
+  *
+  * @see <a href="http://www.alanwood.net/unicode/private_use_area.html#symbol">Private Use Area (symbol)</a>
+  * @see <a href="http://www.alanwood.net/demos/symbol.html">Symbol font - Unicode alternatives for Greek and special characters in HTML</a>
+  */
+ public static String mapMsCodepointString(String string) {
+     if (string == null || "".equals(string)) return string;
+     initMsCodepointMap();
+
+     final int length = string.length();
+     StringBuilder sb = new StringBuilder(length);
+     // mapMsCodepoint() modifies the shared HashMap while holding the class lock
+     // (static synchronized), so the lookups have to hold the same lock.
+     synchronized (StringUtil.class) {
+         for (int offset = 0; offset < length; ) {
+             // iterate by code point, not by char, so surrogate pairs stay intact
+             int msCodepoint = string.codePointAt(offset);
+             Integer uniCodepoint = msCodepointToUnicode.get(msCodepoint);
+             sb.appendCodePoint(uniCodepoint == null ? msCodepoint : uniCodepoint);
+             offset += Character.charCount(msCodepoint);
+         }
+     }
+
+     return sb.toString();
+ }
+
+ /**
+  * Registers a mapping from a symbol-font code point to a unicode code point,
+  * replacing any mapping already stored for that source code point.
+  *
+  * @param msCodepoint the code point as found in the document
+  * @param unicodeCodepoint the code point it should be replaced with
+  */
+ public static synchronized void mapMsCodepoint(int msCodepoint, int unicodeCodepoint) {
+     // make sure the default symbol mappings exist before adding to them
+     initMsCodepointMap();
+     final Integer source = Integer.valueOf(msCodepoint);
+     final Integer target = Integer.valueOf(unicodeCodepoint);
+     msCodepointToUnicode.put(source, target);
+ }
+
+ /**
+  * Creates the code point lookup table on first use and fills it with the
+  * two symbol tables: symbolMap_f020 for the code points starting at 0xF020
+  * and symbolMap_f0a0 for those starting at 0xF0A0. Later calls return
+  * immediately.
+  */
+ private static synchronized void initMsCodepointMap() {
+     if (msCodepointToUnicode != null) {
+         // already set up by an earlier call
+         return;
+     }
+     msCodepointToUnicode = new HashMap<Integer,Integer>();
+     // the array index is the offset from the first code point of each range
+     for (int idx = 0; idx < symbolMap_f020.length; idx++) {
+         msCodepointToUnicode.put(0xF020 + idx, symbolMap_f020[idx]);
+     }
+     for (int idx = 0; idx < symbolMap_f0a0.length; idx++) {
+         msCodepointToUnicode.put(0xF0A0 + idx, symbolMap_f0a0[idx]);
+     }
+ }
+
+ /**
+  * Unicode replacements for the symbol-font code points 0xF020 to 0xF07F.
+  * The entry at index n belongs to code point 0xF020 + n, so the order and
+  * the number of entries must not change.
+  */
+ private static final int symbolMap_f020[] = {
+     ' ', // 0xf020 space
+     '!', // 0xf021 exclam
+     8704, // 0xf022 universal
+     '#', // 0xf023 numbersign
+     8707, // 0xf024 existential
+     '%', // 0xf025 percent
+     '&', // 0xf026 ampersand
+     8717, // 0xf027 suchthat
+     '(', // 0xf028 parenleft
+     ')', // 0xf029 parenright
+     8727, // 0xf02a asteriskmath
+     '+', // 0xf02b plus
+     ',', // 0xf02c comma
+     8722, // 0xf02d minus sign (long -)
+     '.', // 0xf02e period
+     '/', // 0xf02f slash
+     '0', // 0xf030 0
+     '1', // 0xf031 1
+     '2', // 0xf032 2
+     '3', // 0xf033 3
+     '4', // 0xf034 4
+     '5', // 0xf035 5
+     '6', // 0xf036 6
+     '7', // 0xf037 7
+     '8', // 0xf038 8
+     '9', // 0xf039 9
+     ':', // 0xf03a colon
+     ';', // 0xf03b semicolon
+     '<', // 0xf03c less
+     '=', // 0xf03d equal
+     '>', // 0xf03e greater
+     '?', // 0xf03f question
+     8773, // 0xf040 congruent
+     913, // 0xf041 alpha (upper)
+     914, // 0xf042 beta (upper)
+     935, // 0xf043 chi (upper)
+     916, // 0xf044 delta (upper)
+     917, // 0xf045 epsilon (upper)
+     934, // 0xf046 phi (upper)
+     915, // 0xf047 gamma (upper)
+     919, // 0xf048 eta (upper)
+     921, // 0xf049 iota (upper)
+     977, // 0xf04a theta1 (lower)
+     922, // 0xf04b kappa (upper)
+     923, // 0xf04c lambda (upper)
+     924, // 0xf04d mu (upper)
+     925, // 0xf04e nu (upper)
+     927, // 0xf04f omicron (upper)
+     928, // 0xf050 pi (upper)
+     920, // 0xf051 theta (upper)
+     929, // 0xf052 rho (upper)
+     931, // 0xf053 sigma (upper)
+     932, // 0xf054 tau (upper)
+     933, // 0xf055 upsilon (upper)
+     962, // 0xf056 sigma1 (lower)
+     937, // 0xf057 omega (upper)
+     926, // 0xf058 xi (upper)
+     936, // 0xf059 psi (upper)
+     918, // 0xf05a zeta (upper)
+     '[', // 0xf05b bracketleft
+     8756, // 0xf05c therefore (U+2234)
+     ']', // 0xf05d bracketright
+     8869, // 0xf05e perpendicular
+     '_', // 0xf05f underscore
+     ' ', // 0xf060 radicalex (doesn't exist in unicode)
+     945, // 0xf061 alpha (lower)
+     946, // 0xf062 beta (lower)
+     967, // 0xf063 chi (lower)
+     948, // 0xf064 delta (lower)
+     949, // 0xf065 epsilon (lower)
+     966, // 0xf066 phi (lower)
+     947, // 0xf067 gamma (lower)
+     951, // 0xf068 eta (lower)
+     953, // 0xf069 iota (lower)
+     981, // 0xf06a phi1 (lower)
+     954, // 0xf06b kappa (lower)
+     955, // 0xf06c lambda (lower)
+     956, // 0xf06d mu (lower)
+     957, // 0xf06e nu (lower)
+     959, // 0xf06f omicron (lower)
+     960, // 0xf070 pi (lower)
+     952, // 0xf071 theta (lower)
+     961, // 0xf072 rho (lower)
+     963, // 0xf073 sigma (lower)
+     964, // 0xf074 tau (lower)
+     965, // 0xf075 upsilon (lower)
+     982, // 0xf076 piv (lower)
+     969, // 0xf077 omega (lower)
+     958, // 0xf078 xi (lower)
+     968, // 0xf079 psi (lower)
+     950, // 0xf07a zeta (lower)
+     '{', // 0xf07b braceleft
+     '|', // 0xf07c bar
+     '}', // 0xf07d braceright
+     8764, // 0xf07e similar '~'
+     ' ', // 0xf07f not defined
+ };
+
+ /**
+  * Unicode replacements for the symbol-font code points 0xF0A0 to 0xF0FF.
+  * The entry at index n belongs to code point 0xF0A0 + n, so the order and
+  * the number of entries must not change.
+  */
+ private static final int symbolMap_f0a0[] = {
+     8364, // 0xf0a0 not defined / euro symbol
+     978, // 0xf0a1 upsilon1 (upper)
+     8242, // 0xf0a2 minute
+     8804, // 0xf0a3 lessequal
+     8260, // 0xf0a4 fraction
+     8734, // 0xf0a5 infinity
+     402, // 0xf0a6 florin
+     9827, // 0xf0a7 club
+     9830, // 0xf0a8 diamond
+     9829, // 0xf0a9 heart
+     9824, // 0xf0aa spade
+     8596, // 0xf0ab arrowboth
+     8592, // 0xf0ac arrowleft (U+2190)
+     8593, // 0xf0ad arrowup
+     8594, // 0xf0ae arrowright
+     8595, // 0xf0af arrowdown
+     176, // 0xf0b0 degree
+     177, // 0xf0b1 plusminus
+     8243, // 0xf0b2 second
+     8805, // 0xf0b3 greaterequal
+     215, // 0xf0b4 multiply
+     8733, // 0xf0b5 proportional (U+221D, not the micro sign U+00B5)
+     8706, // 0xf0b6 partialdiff
+     8729, // 0xf0b7 bullet
+     247, // 0xf0b8 divide
+     8800, // 0xf0b9 notequal
+     8801, // 0xf0ba equivalence
+     8776, // 0xf0bb approxequal
+     8230, // 0xf0bc ellipsis
+     9168, // 0xf0bd arrowvertex
+     9135, // 0xf0be arrowhorizex
+     8629, // 0xf0bf carriagereturn
+     8501, // 0xf0c0 aleph
+     8465, // 0xf0c1 Ifraktur (U+2111)
+     8476, // 0xf0c2 Rfraktur
+     8472, // 0xf0c3 weierstrass
+     8855, // 0xf0c4 circlemultiply
+     8853, // 0xf0c5 circleplus
+     8709, // 0xf0c6 emptyset
+     8745, // 0xf0c7 intersection
+     8746, // 0xf0c8 union
+     8835, // 0xf0c9 propersuperset
+     8839, // 0xf0ca reflexsuperset
+     8836, // 0xf0cb notsubset
+     8834, // 0xf0cc propersubset
+     8838, // 0xf0cd reflexsubset
+     8712, // 0xf0ce element
+     8713, // 0xf0cf notelement
+     8736, // 0xf0d0 angle
+     8711, // 0xf0d1 gradient
+     174, // 0xf0d2 registerserif
+     169, // 0xf0d3 copyrightserif
+     8482, // 0xf0d4 trademarkserif
+     8719, // 0xf0d5 product
+     8730, // 0xf0d6 radical
+     8901, // 0xf0d7 dotmath
+     172, // 0xf0d8 logicalnot
+     8743, // 0xf0d9 logicaland
+     8744, // 0xf0da logicalor
+     8660, // 0xf0db arrowdblboth
+     8656, // 0xf0dc arrowdblleft
+     8657, // 0xf0dd arrowdblup
+     8658, // 0xf0de arrowdblright
+     8659, // 0xf0df arrowdbldown
+     9674, // 0xf0e0 lozenge
+     9001, // 0xf0e1 angleleft
+     174, // 0xf0e2 registersans
+     169, // 0xf0e3 copyrightsans
+     8482, // 0xf0e4 trademarksans
+     8721, // 0xf0e5 summation
+     9115, // 0xf0e6 parenlefttp
+     9116, // 0xf0e7 parenleftex
+     9117, // 0xf0e8 parenleftbt
+     9121, // 0xf0e9 bracketlefttp
+     9122, // 0xf0ea bracketleftex
+     9123, // 0xf0eb bracketleftbt
+     9127, // 0xf0ec bracelefttp
+     9128, // 0xf0ed braceleftmid
+     9129, // 0xf0ee braceleftbt
+     9130, // 0xf0ef braceex
+     ' ', // 0xf0f0 not defined
+     9002, // 0xf0f1 angleright
+     8747, // 0xf0f2 integral
+     8992, // 0xf0f3 integraltp
+     9134, // 0xf0f4 integralex
+     8993, // 0xf0f5 integralbt
+     9118, // 0xf0f6 parenrighttp
+     9119, // 0xf0f7 parenrightex
+     9120, // 0xf0f8 parenrightbt
+     9124, // 0xf0f9 bracketrighttp
+     9125, // 0xf0fa bracketrightex
+     9126, // 0xf0fb bracketrightbt
+     9131, // 0xf0fc bracerighttp
+     9132, // 0xf0fd bracerightmid
+     9133, // 0xf0fe bracerightbt
+     ' ', // 0xf0ff not defined
+ };
}
diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java
index 4f5d7ae10b..6b26c28d7d 100644
--- a/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java
@@ -58,6 +58,7 @@ import org.apache.poi.hslf.record.Record;
import org.apache.poi.hslf.record.SlideListWithText;
import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet;
import org.apache.poi.hslf.record.TextHeaderAtom;
+import org.apache.poi.util.StringUtil;
import org.junit.Test;
/**
@@ -579,4 +580,19 @@ public final class TestBugs {
inputStream.close();
}
}
+
+ /**
+  * Bug 49541: the text of the first shape inside the first group on the
+  * first slide is run through StringUtil.mapMsCodepointString() and must
+  * come out starting with the greater-or-equal sign (U+2265).
+  */
+ @Test
+ public void bug49541() throws Exception {
+     InputStream pptStream = new FileInputStream(_slTests.getFile("49541_symbol_map.ppt"));
+     try {
+         SlideShow ppt = new SlideShow(pptStream);
+         Slide firstSlide = ppt.getSlides()[0];
+         ShapeGroup group = (ShapeGroup) firstSlide.getShapes()[0];
+         TextBox box = (TextBox) group.getShapes()[0];
+         assertEquals("\u226575 years", StringUtil.mapMsCodepointString(box.getText()));
+     } finally {
+         pptStream.close();
+     }
+ }
}
diff --git a/test-data/slideshow/49541_symbol_map.ppt b/test-data/slideshow/49541_symbol_map.ppt
new file mode 100644
index 0000000000..781049290e
--- /dev/null
+++ b/test-data/slideshow/49541_symbol_map.ppt
Binary files differ