source.dussan.org Git - poi.git/commitdiff
Bug 49541 - Mapping of symbol characters to unicode equivalent
author: Andreas Beeker <kiwiwings@apache.org>
Mon, 29 Dec 2014 19:43:35 +0000 (19:43 +0000)
committer: Andreas Beeker <kiwiwings@apache.org>
Mon, 29 Dec 2014 19:43:35 +0000 (19:43 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1648415 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/poi/util/StringUtil.java
src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java
test-data/slideshow/49541_symbol_map.ppt [new file with mode: 0644]

index 3a323f67a529aedd13ba3cd56e7b3a0c68ec283c..99880f500e3b7e98348fad385e5093955a42e16a 100644 (file)
@@ -20,7 +20,9 @@ package org.apache.poi.util;
 import java.nio.charset.Charset;
 import java.text.FieldPosition;
 import java.text.NumberFormat;
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 
 import org.apache.poi.hssf.record.RecordInputStream;
 /**
@@ -37,6 +39,7 @@ import org.apache.poi.hssf.record.RecordInputStream;
 public class StringUtil {
        private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
        private static final Charset UTF16LE = Charset.forName("UTF-16LE");
+    private static Map<Integer,Integer> msCodepointToUnicode;
 
        private StringUtil() {
                // no instances of this class
@@ -396,4 +399,248 @@ public class StringUtil {
       }
       public void remove() {}
    }
+
+
+   /**
+    * Replaces code points that have an entry in the MS-codepoint map by their
+    * mapped unicode code point.
+    * Some strings may contain encoded characters of the unicode private use area.
+    * Currently the characters of the symbol fonts (0xF020-0xF07F and
+    * 0xF0A0-0xF0FF) are mapped to the corresponding characters in the normal
+    * unicode range; further mappings can be registered via
+    * {@link #mapMsCodepoint(int, int)}. Code points without a mapping are
+    * copied unchanged.
+    *
+    * @param string the original string, may be {@code null} or empty
+    * @return the string with mapped characters; {@code null} or the empty
+    *         string are returned as-is
+    *
+    * @see <a href="http://www.alanwood.net/unicode/private_use_area.html#symbol">Private Use Area (symbol)</a>
+    * @see <a href="http://www.alanwood.net/demos/symbol.html">Symbol font - Unicode alternatives for Greek and special characters in HTML</a>
+    */
+   public static String mapMsCodepointString(String string) {
+       if (string == null || "".equals(string)) return string;
+       initMsCodepointMap();
+
+       final int length = string.length();
+       // the result has at most as many chars as the input for the built-in
+       // (BMP to BMP) mappings, so the input length is a good initial capacity
+       StringBuilder sb = new StringBuilder(length);
+       // mapMsCodepoint() mutates the shared HashMap while holding the class
+       // monitor (it is a static synchronized method); take the same monitor
+       // here so a lookup never runs concurrently with a put
+       synchronized (StringUtil.class) {
+           for (int offset = 0; offset < length; ) {
+               // iterate by code point, not by char, so surrogate pairs stay intact
+               final int msCodepoint = string.codePointAt(offset);
+               final Integer uniCodepoint = msCodepointToUnicode.get(msCodepoint);
+               sb.appendCodePoint(uniCodepoint == null ? msCodepoint : uniCodepoint);
+               offset += Character.charCount(msCodepoint);
+           }
+       }
+
+       return sb.toString();
+   }
+   
+   /**
+    * Adds a single entry to the MS-codepoint map, replacing any entry that
+    * already exists for the given source code point. The map is initialized
+    * with the built-in symbol tables first if that has not happened yet.
+    *
+    * @param msCodepoint the code point to be replaced
+    * @param unicodeCodepoint the code point to replace it with
+    */
+   public static synchronized void mapMsCodepoint(int msCodepoint, int unicodeCodepoint) {
+       initMsCodepointMap();
+       final Integer key = Integer.valueOf(msCodepoint);
+       final Integer value = Integer.valueOf(unicodeCodepoint);
+       msCodepointToUnicode.put(key, value);
+   }
+   
+   /**
+    * Creates the MS-codepoint map on first use and fills it from the two
+    * symbol tables: entry {@code n} of {@code symbolMap_f020} is registered
+    * for code point {@code 0xF020 + n}, entry {@code n} of
+    * {@code symbolMap_f0a0} for code point {@code 0xF0A0 + n}.
+    * Does nothing if the map already exists.
+    */
+   private static synchronized void initMsCodepointMap() {
+       if (msCodepointToUnicode != null) return;
+       msCodepointToUnicode = new HashMap<Integer,Integer>();
+       for (int idx = 0; idx < symbolMap_f020.length; idx++) {
+           msCodepointToUnicode.put(0xF020 + idx, symbolMap_f020[idx]);
+       }
+       for (int idx = 0; idx < symbolMap_f0a0.length; idx++) {
+           msCodepointToUnicode.put(0xF0A0 + idx, symbolMap_f0a0[idx]);
+       }
+   }
+   
+   /**
+    * Unicode replacements for the code points 0xF020 to 0xF07F; the entry at
+    * index {@code n} is the replacement for code point {@code 0xF020 + n}.
+    * Values are decimal unicode code points or plain ASCII char literals.
+    */
+   private static final int[] symbolMap_f020 = {
+       ' ', // 0xf020 space
+       '!', // 0xf021 exclam
+       8704, // 0xf022 universal
+       '#', // 0xf023 numbersign
+       8707, // 0xf024 existential
+       '%', // 0xf025 percent
+       '&', // 0xf026 ampersand
+       8717, // 0xf027 suchthat
+       '(', // 0xf028 parenleft
+       ')', // 0xf029 parenright
+       8727, // 0xf02a asteriskmath
+       '+', // 0xf02b plus
+       ',', // 0xf02c comma
+       8722, // 0xf02d minus sign (long -)
+       '.', // 0xf02e period
+       '/', // 0xf02f slash
+       '0', // 0xf030 0
+       '1', // 0xf031 1
+       '2', // 0xf032 2
+       '3', // 0xf033 3
+       '4', // 0xf034 4
+       '5', // 0xf035 5
+       '6', // 0xf036 6
+       '7', // 0xf037 7
+       '8', // 0xf038 8
+       '9', // 0xf039 9
+       ':', // 0xf03a colon
+       ';', // 0xf03b semicolon
+       '<', // 0xf03c less
+       '=', // 0xf03d equal
+       '>', // 0xf03e greater
+       '?', // 0xf03f question
+       8773, // 0xf040 congruent
+       913, // 0xf041 alpha (upper)
+       914, // 0xf042 beta (upper)
+       935, // 0xf043 chi (upper)
+       916, // 0xf044 delta (upper)
+       917, // 0xf045 epsilon (upper)
+       934, // 0xf046 phi (upper)
+       915, // 0xf047 gamma (upper)
+       919, // 0xf048 eta (upper)
+       921, // 0xf049 iota (upper)
+       977, // 0xf04a theta1 (lower)
+       922, // 0xf04b kappa (upper)
+       923, // 0xf04c lambda (upper)
+       924, // 0xf04d mu (upper)
+       925, // 0xf04e nu (upper)
+       927, // 0xf04f omicron (upper)
+       928, // 0xf050 pi (upper)
+       920, // 0xf051 theta (upper)
+       929, // 0xf052 rho (upper)
+       931, // 0xf053 sigma (upper)
+       932, // 0xf054 tau (upper)
+       933, // 0xf055 upsilon (upper)
+       962, // 0xf056 sigma1 (lower)
+       937, // 0xf057 omega (upper)
+       926, // 0xf058 xi (upper)
+       936, // 0xf059 psi (upper)
+       918, // 0xf05a zeta (upper)
+       '[', // 0xf05b bracketleft
+       8756, // 0xf05c therefore (U+2234; 8765 would be U+223D, reversed tilde)
+       ']', // 0xf05d bracketright
+       8869, // 0xf05e perpendicular
+       '_', // 0xf05f underscore
+       ' ', // 0xf060 radicalex (doesn't exist in unicode)
+       945, // 0xf061 alpha (lower)
+       946, // 0xf062 beta (lower)
+       967, // 0xf063 chi (lower)
+       948, // 0xf064 delta (lower)
+       949, // 0xf065 epsilon (lower)
+       966, // 0xf066 phi (lower)
+       947, // 0xf067 gamma (lower)
+       951, // 0xf068 eta (lower)
+       953, // 0xf069 iota (lower)
+       981, // 0xf06a phi1 (lower)
+       954, // 0xf06b kappa (lower)
+       955, // 0xf06c lambda (lower)
+       956, // 0xf06d mu (lower)
+       957, // 0xf06e nu (lower)
+       959, // 0xf06f omicron (lower)
+       960, // 0xf070 pi (lower)
+       952, // 0xf071 theta (lower)
+       961, // 0xf072 rho (lower)
+       963, // 0xf073 sigma (lower)
+       964, // 0xf074 tau (lower)
+       965, // 0xf075 upsilon (lower)
+       982, // 0xf076 piv (lower)
+       969, // 0xf077 omega (lower)
+       958, // 0xf078 xi (lower)
+       968, // 0xf079 psi (lower)
+       950, // 0xf07a zeta (lower)
+       '{', // 0xf07b braceleft
+       '|', // 0xf07c bar
+       '}', // 0xf07d braceright
+       8764, // 0xf07e similar '~'
+       ' ', // 0xf07f not defined
+   };
+
+   /**
+    * Unicode replacements for the code points 0xF0A0 to 0xF0FF; the entry at
+    * index {@code n} is the replacement for code point {@code 0xF0A0 + n}.
+    * Values are decimal unicode code points or plain ASCII char literals.
+    */
+   private static final int[] symbolMap_f0a0 = {
+       8364, // 0xf0a0 not defined / euro symbol
+       978, // 0xf0a1 upsilon1 (upper)
+       8242, // 0xf0a2 minute
+       8804, // 0xf0a3 lessequal
+       8260, // 0xf0a4 fraction
+       8734, // 0xf0a5 infinity
+       402, // 0xf0a6 florin
+       9827, // 0xf0a7 club
+       9830, // 0xf0a8 diamond
+       9829, // 0xf0a9 heart
+       9824, // 0xf0aa spade
+       8596, // 0xf0ab arrowboth
+       8592, // 0xf0ac arrowleft (U+2190; 8591 would be U+218F, which is no arrow)
+       8593, // 0xf0ad arrowup
+       8594, // 0xf0ae arrowright
+       8595, // 0xf0af arrowdown
+       176, // 0xf0b0 degree
+       177, // 0xf0b1 plusminus
+       8243, // 0xf0b2 second
+       8805, // 0xf0b3 greaterequal
+       215, // 0xf0b4 multiply
+       8733, // 0xf0b5 proportional (U+221D; 181 would be the micro sign)
+       8706, // 0xf0b6 partialdiff
+       8729, // 0xf0b7 bullet
+       247, // 0xf0b8 divide
+       8800, // 0xf0b9 notequal
+       8801, // 0xf0ba equivalence
+       8776, // 0xf0bb approxequal
+       8230, // 0xf0bc ellipsis
+       9168, // 0xf0bd arrowvertex
+       9135, // 0xf0be arrowhorizex
+       8629, // 0xf0bf carriagereturn
+       8501, // 0xf0c0 aleph
+       8465, // 0xf0c1 Ifraktur (U+2111; 8475 would be U+211B, script capital R)
+       8476, // 0xf0c2 Rfraktur
+       8472, // 0xf0c3 weierstrass
+       8855, // 0xf0c4 circlemultiply
+       8853, // 0xf0c5 circleplus
+       8709, // 0xf0c6 emptyset
+       8745, // 0xf0c7 intersection
+       8746, // 0xf0c8 union
+       8835, // 0xf0c9 propersuperset
+       8839, // 0xf0ca reflexsuperset
+       8836, // 0xf0cb notsubset
+       8834, // 0xf0cc propersubset
+       8838, // 0xf0cd reflexsubset
+       8712, // 0xf0ce element
+       8713, // 0xf0cf notelement
+       8736, // 0xf0d0 angle
+       8711, // 0xf0d1 gradient
+       174, // 0xf0d2 registerserif
+       169, // 0xf0d3 copyrightserif
+       8482, // 0xf0d4 trademarkserif
+       8719, // 0xf0d5 product
+       8730, // 0xf0d6 radical
+       8901, // 0xf0d7 dotmath
+       172, // 0xf0d8 logicalnot
+       8743, // 0xf0d9 logicaland
+       8744, // 0xf0da logicalor
+       8660, // 0xf0db arrowdblboth
+       8656, // 0xf0dc arrowdblleft
+       8657, // 0xf0dd arrowdblup
+       8658, // 0xf0de arrowdblright
+       8659, // 0xf0df arrowdbldown
+       9674, // 0xf0e0 lozenge
+       9001, // 0xf0e1 angleleft
+       174, // 0xf0e2 registersans
+       169, // 0xf0e3 copyrightsans
+       8482, // 0xf0e4 trademarksans
+       8721, // 0xf0e5 summation
+       9115, // 0xf0e6 parenlefttp
+       9116, // 0xf0e7 parenleftex
+       9117, // 0xf0e8 parenleftbt
+       9121, // 0xf0e9 bracketlefttp
+       9122, // 0xf0ea bracketleftex
+       9123, // 0xf0eb bracketleftbt
+       9127, // 0xf0ec bracelefttp
+       9128, // 0xf0ed braceleftmid
+       9129, // 0xf0ee braceleftbt
+       9130, // 0xf0ef braceex
+       ' ', // 0xf0f0 not defined
+       9002, // 0xf0f1 angleright
+       8747, // 0xf0f2 integral
+       8992, // 0xf0f3 integraltp
+       9134, // 0xf0f4 integralex
+       8993, // 0xf0f5 integralbt
+       9118, // 0xf0f6 parenrighttp
+       9119, // 0xf0f7 parenrightex
+       9120, // 0xf0f8 parenrightbt
+       9124, // 0xf0f9 bracketrighttp
+       9125, // 0xf0fa bracketrightex
+       9126, // 0xf0fb bracketrightbt
+       9131, // 0xf0fc bracerighttp
+       9132, // 0xf0fd bracerightmid
+       9133, // 0xf0fe bracerightbt
+       ' ', // 0xf0ff not defined
+   };
 }
index 4f5d7ae10b9794809079bfa1f7307c6623c1c9ae..6b26c28d7db644bc40e5f751fa2ce6b92b3ce6c8 100644 (file)
@@ -58,6 +58,7 @@ import org.apache.poi.hslf.record.Record;
 import org.apache.poi.hslf.record.SlideListWithText;
 import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet;
 import org.apache.poi.hslf.record.TextHeaderAtom;
+import org.apache.poi.util.StringUtil;
 import org.junit.Test;
 
 /**
@@ -579,4 +580,19 @@ public final class TestBugs {
             inputStream.close();
         }
     }
+
+    /**
+     * Bug 49541: reads the text of the first shape inside the first shape
+     * group of the first slide of 49541_symbol_map.ppt and checks that, after
+     * the symbol mapping has been applied, it starts with a regular
+     * "greater-than or equal to" sign (U+2265).
+     */
+    @Test
+    public void bug49541() throws Exception {
+        InputStream pptStream = new FileInputStream(_slTests.getFile("49541_symbol_map.ppt"));
+        try {
+            Slide firstSlide = new SlideShow(pptStream).getSlides()[0];
+            ShapeGroup group = (ShapeGroup)firstSlide.getShapes()[0];
+            TextBox textBox = (TextBox)group.getShapes()[0];
+            // presumably the raw text holds the symbol-font code point 0xF0B3
+            // in place of U+2265 - verify against the test file
+            String rawText = textBox.getText();
+            assertEquals("\u226575 years", StringUtil.mapMsCodepointString(rawText));
+        } finally {
+            pptStream.close();
+        }
+    }
 }
diff --git a/test-data/slideshow/49541_symbol_map.ppt b/test-data/slideshow/49541_symbol_map.ppt
new file mode 100644 (file)
index 0000000..7810492
Binary files /dev/null and b/test-data/slideshow/49541_symbol_map.ppt differ