From 8ddb1b6dbd35d98dd4933a9802a232e310a413bb Mon Sep 17 00:00:00 2001 From: Yegor Kozlov Date: Wed, 28 Jul 2010 05:48:27 +0000 Subject: [PATCH] Support for escaped unicode characters in Shared String Table, see bug #49653 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@979952 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 7 +-- .../xssf/usermodel/XSSFRichTextString.java | 43 ++++++++++++++++++- .../usermodel/TestXSSFRichTextString.java | 12 ++++++ 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 6ec1c7c1a3..4e21fa3a06 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,10 +34,11 @@ - 49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord - 49593 - preserve leading and trailing white spaces in XWPFRun + 49653 - Support for escaped unicode characters in Shared String Table + 49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord + 49593 - preserve leading and trailing white spaces in XWPFRun 49455 - Insert the content of fldSimple fields into the XWPFWordTextExtractor output - 49640 - Fixed parsing formulas containing defined names beginning with an underscore + 49640 - Fixed parsing formulas containing defined names beginning with an underscore 49538 - Added implementation for POISSON() 49524 - Support for setting cell text to be vertically rotated, via style.setRotation(0xff) 49609 - Case insensitive matching of OOXML part names diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java index 0afd1a5971..736a15b3ad 100644 --- a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java +++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java @@ -18,6 +18,8 @@ package org.apache.poi.xssf.usermodel; import 
java.util.ArrayList; +import java.util.regex.Pattern; +import java.util.regex.Matcher; import javax.xml.namespace.QName; @@ -75,6 +77,8 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.STXstring; * @author Yegor Kozlov */ public class XSSFRichTextString implements RichTextString { + private static final Pattern utfPtrn = Pattern.compile("_x([0-9A-F]{4})_"); + private CTRst st; private StylesTable styles; @@ -337,13 +341,13 @@ public class XSSFRichTextString implements RichTextString { */ public String getString() { if(st.sizeOfRArray() == 0) { - return st.getT(); + return utfDecode(st.getT()); } StringBuffer buf = new StringBuffer(); for(CTRElt r : st.getRList()){ buf.append(r.getT()); } - return buf.toString(); + return utfDecode(buf.toString()); } /** @@ -490,4 +494,39 @@ public class XSSFRichTextString implements RichTextString { c.dispose(); } } + + /** + * For all characters which cannot be represented in XML as defined by the XML 1.0 specification, + * the characters are escaped using the Unicode numerical character representation escape character + * format _xHHHH_, where H represents a hexadecimal character in the character's value. + *

+ * Example: The Unicode character 0D is invalid in an XML 1.0 document, + * so it shall be escaped as _x000D_. + *

+ * See section 3.18.9 in the OOXML spec. + * + * @param value the string to decode + * @return the decoded string + */ + static String utfDecode(String value){ + if(value == null) return null; + + StringBuffer buf = new StringBuffer(); + Matcher m = utfPtrn.matcher(value); + int idx = 0; + while(m.find()) { + int pos = m.start(); + if( pos > idx) { + buf.append(value.substring(idx, pos)); + } + + String code = m.group(1); + int icode = Integer.decode("0x" + code); + buf.append((char)icode); + + idx = m.end(); + } + buf.append(value.substring(idx)); + return buf.toString(); + } } diff --git a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java index e983079599..5251e06bb2 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java @@ -130,4 +130,16 @@ public final class TestXSSFRichTextString extends TestCase { assertEquals(" Apache", xs.xmlText()); } + + /** + * test that unicode representation _xHHHH_ is properly processed + */ + public void testUtfDecode() { + CTRst st = CTRst.Factory.newInstance(); + st.setT("abc_x000D_2ef_x000D_"); + XSSFRichTextString rt = new XSSFRichTextString(st); + //_x000D_ is converted into carriage return + assertEquals("abc\r2ef\r", rt.getString()); + + } } -- 2.39.5