<changes>
<release version="3.7-beta2" date="2010-??-??">
- <action dev="POI-DEVELOPERS" type="add">49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord</action>
- <action dev="POI-DEVELOPERS" type="add">49593 - preserve leading and trailing white spaces in XWPFRun</action>
+ <action dev="POI-DEVELOPERS" type="fix">49653 - Support for escaped unicode characters in Shared String Table</action>
+ <action dev="POI-DEVELOPERS" type="fix">49579 - prevent ArrayIndexOutOfBoundsException in UnknownEscherRecord</action>
+ <action dev="POI-DEVELOPERS" type="fix">49593 - preserve leading and trailing white spaces in XWPFRun</action>
<action dev="POI-DEVELOPERS" type="add">49455 - Insert the content of fldSimple fields into the XWPFWordTextExtractor output</action>
- <action dev="POI-DEVELOPERS" type="add">49640 - Fixed parsing formulas containing defined names beginning with an underscore</action>
+ <action dev="POI-DEVELOPERS" type="fix">49640 - Fixed parsing formulas containing defined names beginning with an underscore</action>
<action dev="POI-DEVELOPERS" type="add">49538 - Added implementation for POISSON()</action>
<action dev="POI-DEVELOPERS" type="add">49524 - Support for setting cell text to be vertically rotated, via style.setRotation(0xff)</action>
<action dev="POI-DEVELOPERS" type="fix">49609 - Case insensitive matching of OOXML part names</action>
package org.apache.poi.xssf.usermodel;
import java.util.ArrayList;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
import javax.xml.namespace.QName;
* @author Yegor Kozlov
*/
public class XSSFRichTextString implements RichTextString {
+ private static final Pattern utfPtrn = Pattern.compile("_x([0-9A-F]{4})_"); // _xHHHH_ escape; group 1 captures the four hex digits (upper-case A-F only)
+
private CTRst st;
private StylesTable styles;
*/
public String getString() {
if(st.sizeOfRArray() == 0) {
- return st.getT();
+ return utfDecode(st.getT()); // R array is empty: decode _xHHHH_ escapes in the text from st.getT()
}
StringBuffer buf = new StringBuffer();
for(CTRElt r : st.getRList()){
buf.append(r.getT());
}
- return buf.toString();
+ return utfDecode(buf.toString()); // decode _xHHHH_ escapes in the concatenated text of all R elements
}
/**
c.dispose();
}
}
+
+ /**
+ * Decodes the <code>_xHHHH_</code> escape sequences in a string.
+ * <p>
+ * For all characters which cannot be represented in XML as defined by the XML 1.0 specification,
+ * the characters are escaped using the Unicode numerical character representation escape character
+ * format _xHHHH_, where H represents a hexadecimal character in the character's value.
+ * </p>
+ * <p>
+ * Example: The Unicode character 0D is invalid in an XML 1.0 document,
+ * so it shall be escaped as <code>_x000D_</code>.
+ * </p>
+ * <p>
+ * Each sequence matched by <code>utfPtrn</code> is replaced by the single character with that
+ * hexadecimal code; all other text is copied through unchanged.
+ * </p>
+ * See section 3.18.9 in the OOXML spec.
+ *
+ * @param value the string to decode, may be <code>null</code>
+ * @return the decoded string, or <code>null</code> if <code>value</code> is <code>null</code>
+ */
+ static String utfDecode(String value){
+ if(value == null) return null;
+
+ StringBuffer buf = new StringBuffer();
+ Matcher m = utfPtrn.matcher(value);
+ int idx = 0; // end of the previous match, i.e. start of the text not yet copied
+ while(m.find()) {
+ int pos = m.start();
+ if( pos > idx) {
+ buf.append(value.substring(idx, pos)); // copy the literal text before this escape
+ }
+
+ String code = m.group(1); // the four hex digits HHHH
+ int icode = Integer.decode("0x" + code);
+ buf.append((char)icode); // append the character with that code
+
+ idx = m.end();
+ }
+ buf.append(value.substring(idx)); // copy whatever follows the last escape
+ return buf.toString();
+ }
}
assertEquals("<xml-fragment xml:space=\"preserve\"> Apache</xml-fragment>", xs.xmlText());
}
+
+ /**
+ * test that unicode representation _xHHHH_ is properly processed
+ */
+ public void testUtfDecode() {
+ CTRst st = CTRst.Factory.newInstance();
+ st.setT("abc_x000D_2ef_x000D_"); // two escapes: one mid-string, one at the very end
+ XSSFRichTextString rt = new XSSFRichTextString(st);
+ //_x000D_ is converted into carriage return
+ assertEquals("abc\r2ef\r", rt.getString());
+
+ }
}