diff options
author | Yegor Kozlov <yegor@apache.org> | 2010-07-28 05:48:27 +0000 |
---|---|---|
committer | Yegor Kozlov <yegor@apache.org> | 2010-07-28 05:48:27 +0000 |
commit | 8ddb1b6dbd35d98dd4933a9802a232e310a413bb (patch) | |
tree | 59a71474077d69b7cc4cf4a375971d8bbcce8c61 | |
parent | e46e2c44a73cff9ffeff0b6499d4078e41c2a022 (diff) | |
download | poi-8ddb1b6dbd35d98dd4933a9802a232e310a413bb.tar.gz poi-8ddb1b6dbd35d98dd4933a9802a232e310a413bb.zip |
Support for escaped unicode characters in Shared String Table, see bug #49653
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@979952 13f79535-47bb-0310-9956-ffa450edef68
3 files changed, 57 insertions, 5 deletions
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 6ec1c7c1a3..4e21fa3a06 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,10 +34,11 @@ <changes> <release version="3.7-beta2" date="2010-??-??"> - <action dev="POI-DEVELOPERS" type="add">49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord</action> - <action dev="POI-DEVELOPERS" type="add">49593 - preserve leading and trailing white spaces in XWPFRun</action> + <action dev="POI-DEVELOPERS" type="fix">49653 - Support for escaped unicode characters in Shared String Table</action> + <action dev="POI-DEVELOPERS" type="fix">49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord</action> + <action dev="POI-DEVELOPERS" type="fix">49593 - preserve leading and trailing white spaces in XWPFRun</action> <action dev="POI-DEVELOPERS" type="add">49455 - Insert the content of fldSimple fields into the XWPFWordTextExtractor output</action> - <action dev="POI-DEVELOPERS" type="add">49640 - Fixed parsing formulas containing defined names beginning with an underscore</action> + <action dev="POI-DEVELOPERS" type="fix">49640 - Fixed parsing formulas containing defined names beginning with an underscore</action> <action dev="POI-DEVELOPERS" type="add">49538 - Added implementation for POISSON()</action> <action dev="POI-DEVELOPERS" type="add">49524 - Support for setting cell text to be vertically rotated, via style.setRotation(0xff)</action> <action dev="POI-DEVELOPERS" type="fix">49609 - Case insensitive matching of OOXML part names</action> diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java index 0afd1a5971..736a15b3ad 100644 --- a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java +++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFRichTextString.java @@ -18,6 +18,8 @@ 
package org.apache.poi.xssf.usermodel; import java.util.ArrayList; +import java.util.regex.Pattern; +import java.util.regex.Matcher; import javax.xml.namespace.QName; @@ -75,6 +77,8 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.STXstring; * @author Yegor Kozlov */ public class XSSFRichTextString implements RichTextString { + private static final Pattern utfPtrn = Pattern.compile("_x([0-9A-F]{4})_"); + private CTRst st; private StylesTable styles; @@ -337,13 +341,13 @@ public class XSSFRichTextString implements RichTextString { */ public String getString() { if(st.sizeOfRArray() == 0) { - return st.getT(); + return utfDecode(st.getT()); } StringBuffer buf = new StringBuffer(); for(CTRElt r : st.getRList()){ buf.append(r.getT()); } - return buf.toString(); + return utfDecode(buf.toString()); } /** @@ -490,4 +494,39 @@ public class XSSFRichTextString implements RichTextString { c.dispose(); } } + + /** + * For all characters which cannot be represented in XML as defined by the XML 1.0 specification, + * the characters are escaped using the Unicode numerical character representation escape character + * format _xHHHH_, where H represents a hexadecimal character in the character's value. + * <p> + * Example: The Unicode character 0D is invalid in an XML 1.0 document, + * so it shall be escaped as <code>_x000D_</code>. + * </p> + * See section 3.18.9 in the OOXML spec. 
+ * + * @param value the string to decode + * @return the decoded string + */ + static String utfDecode(String value){ + if(value == null) return null; + + StringBuffer buf = new StringBuffer(); + Matcher m = utfPtrn.matcher(value); + int idx = 0; + while(m.find()) { + int pos = m.start(); + if( pos > idx) { + buf.append(value.substring(idx, pos)); + } + + String code = m.group(1); + int icode = Integer.decode("0x" + code); + buf.append((char)icode); + + idx = m.end(); + } + buf.append(value.substring(idx)); + return buf.toString(); + } } diff --git a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java index e983079599..5251e06bb2 100644 --- a/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java +++ b/src/ooxml/testcases/org/apache/poi/xssf/usermodel/TestXSSFRichTextString.java @@ -130,4 +130,16 @@ public final class TestXSSFRichTextString extends TestCase { assertEquals("<xml-fragment xml:space=\"preserve\"> Apache</xml-fragment>", xs.xmlText()); } + + /** + * test that unicode representation _xHHHH_ is properly processed + */ + public void testUtfDecode() { + CTRst st = CTRst.Factory.newInstance(); + st.setT("abc_x000D_2ef_x000D_"); + XSSFRichTextString rt = new XSSFRichTextString(st); + //_x000D_ is converted into carriage return + assertEquals("abc\r2ef\r", rt.getString()); + + } } |