Support for escaped unicode characters in Shared String Table, see bug #49653

author Yegor Kozlov <yegor@apache.org>

Wed, 28 Jul 2010 05:48:27 +0000 (05:48 +0000)

committer Yegor Kozlov <yegor@apache.org>

Wed, 28 Jul 2010 05:48:27 +0000 (05:48 +0000)
author Yegor Kozlov <yegor@apache.org>
Wed, 28 Jul 2010 05:48:27 +0000 (05:48 +0000)
committer Yegor Kozlov <yegor@apache.org>
Wed, 28 Jul 2010 05:48:27 +0000 (05:48 +0000)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 6ec1c7c1a330ca5c9f54161a932e8d0325e595ca..4e21fa3a06bd074d1544763cc3e39e90bc16788d 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,10 +34,11 @@
  
      <changes>
          <release version="3.7-beta2" date="2010-??-??">
-           <action dev="POI-DEVELOPERS" type="add">49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord</action>
-           <action dev="POI-DEVELOPERS" type="add">49593 - preserve leading and trailing white spaces in  XWPFRun</action>
+           <action dev="POI-DEVELOPERS" type="fix">49653 - Support for escaped unicode characters in Shared String Table</action>
+           <action dev="POI-DEVELOPERS" type="fix">49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord</action>
+           <action dev="POI-DEVELOPERS" type="fix">49593 - preserve leading and trailing white spaces in  XWPFRun</action>
             <action dev="POI-DEVELOPERS" type="add">49455 - Insert the content of fldSimple fields into the XWPFWordTextExtractor output</action>
-           <action dev="POI-DEVELOPERS" type="add">49640 - Fixed parsing formulas containing defined names beginning with an underscore</action>
+           <action dev="POI-DEVELOPERS" type="fix">49640 - Fixed parsing formulas containing defined names beginning with an underscore</action>
             <action dev="POI-DEVELOPERS" type="add">49538 - Added implementation for POISSON()</action>
             <action dev="POI-DEVELOPERS" type="add">49524 - Support for setting cell text to be vertically rotated, via style.setRotation(0xff)</action>
             <action dev="POI-DEVELOPERS" type="fix">49609 - Case insensitive matching of OOXML part names</action>
diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java

index 0afd1a5971516a4420431c8533ec1353343d2af4..736a15b3ad81895946e37a8e92c53cebee832f47 100644 (file)
--- a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java
+++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java
@@ -18,6 +18,8 @@
  package org.apache.poi.xssf.usermodel;
  
  import java.util.ArrayList;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
  
  import javax.xml.namespace.QName;
  
@@ -75,6 +77,8 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.STXstring;
   * @author Yegor Kozlov
   */
  public class XSSFRichTextString implements RichTextString {
+    private static final Pattern utfPtrn = Pattern.compile("_x([0-9A-F]{4})_");
+
      private CTRst st;
      private StylesTable styles;
  
@@ -337,13 +341,13 @@ public class XSSFRichTextString implements RichTextString {
       */
      public String getString() {
          if(st.sizeOfRArray() == 0) {
-            return st.getT();
+            return utfDecode(st.getT());
          }
          StringBuffer buf = new StringBuffer();
          for(CTRElt r : st.getRList()){
              buf.append(r.getT());
          }
-        return buf.toString();
+        return utfDecode(buf.toString());
      }
  
      /**
@@ -490,4 +494,39 @@ public class XSSFRichTextString implements RichTextString {
              c.dispose();
          }
      }
+
+    /**
+     * Decodes the <code>_xHHHH_</code> escape sequences found in <code>value</code>. Characters which
+     * cannot be represented in XML as defined by the XML 1.0 specification are escaped in this
+     * format, where each H represents a hexadecimal digit of the character's value.
+     * <p>
+     * Example: The Unicode character 0D is invalid in an XML 1.0 document,
+     * so it shall be escaped as <code>_x000D_</code>, which is decoded here to a carriage return.
+     * </p>
+     * Only upper-case hex digits (0-9, A-F) are matched. See section 3.18.9 in the OOXML spec.
+     *
+     * @param   value the string to decode, may be <code>null</code>
+     * @return  the decoded string, or <code>null</code> if <code>value</code> is <code>null</code>
+     */
+    static String utfDecode(String value){
+        if(value == null) return null;
+        
+        StringBuffer buf = new StringBuffer();
+        Matcher m = utfPtrn.matcher(value);
+        int idx = 0; // end of the previous match; everything before it is already in buf
+        while(m.find()) {
+            int pos = m.start();
+            if( pos > idx) { // copy the literal text between two escape sequences unchanged
+                buf.append(value.substring(idx, pos));
+            }
+
+            String code = m.group(1); // the four hex digits captured between "_x" and "_"
+            int icode = Integer.decode("0x" + code);
+            buf.append((char)icode); // emit the single character with that code
+
+            idx = m.end();
+        }
+        buf.append(value.substring(idx)); // remainder after the last escape sequence (or the whole string)
+        return buf.toString();
+    }
  }
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java

index e9830795990c7683e4715334a73230ae237b29ee..5251e06bb29d69c59e5878ac7f1d29cbcb23d4a2 100644 (file)
--- a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java
+++ b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java
@@ -130,4 +130,16 @@ public final class TestXSSFRichTextString extends TestCase {
          assertEquals("<xml-fragment xml:space=\"preserve\">  Apache</xml-fragment>", xs.xmlText());
  
      }
+
+    /**
+     * test that the escaped unicode representation _xHHHH_ is decoded by getString()
+     */
+    public void testUtfDecode() {
+        CTRst st = CTRst.Factory.newInstance();
+        st.setT("abc_x000D_2ef_x000D_");
+        XSSFRichTextString rt = new XSSFRichTextString(st);
+        // each _x000D_ is converted into a carriage return; the surrounding text is kept as is
+        assertEquals("abc\r2ef\r", rt.getString());
+        
+    }
  }
author	Yegor Kozlov <yegor@apache.org>
	Wed, 28 Jul 2010 05:48:27 +0000 (05:48 +0000)
committer	Yegor Kozlov <yegor@apache.org>
	Wed, 28 Jul 2010 05:48:27 +0000 (05:48 +0000)
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java		patch \| blob \| history
src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java		patch \| blob \| history