source.dussan.org Git - poi.git/commitdiff
[Bug 61246] fix issue where SXSSF sheet data has unicode surrogate chars replaced with question marks
author: PJ Fanning <fanningpj@apache.org>
Mon, 3 Jul 2017 18:49:25 +0000 (18:49 +0000)
committer: PJ Fanning <fanningpj@apache.org>
Mon, 3 Jul 2017 18:49:25 +0000 (18:49 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1800705 13f79535-47bb-0310-9956-ffa450edef68

src/ooxml/java/org/apache/poi/xssf/streaming/SheetDataWriter.java
src/ooxml/testcases/org/apache/poi/xssf/streaming/TestSheetDataWriter.java [new file with mode: 0644]

index cec86e9669d5e774b3fd8c68234d092225d72432..d8394a800e60f2cdc366d17349abe22a61154878 100644 (file)
@@ -122,12 +122,16 @@ public class SheetDataWriter {
      * flush and close the temp data writer. 
      * This method <em>must</em> be invoked before calling {@link #getWorksheetXMLInputStream()}
      */
-    public void close() throws IOException{
-        _out.flush();
+    public void close() throws IOException {
+        flush();
         _out.close();
     }
+    
+    protected void flush() throws IOException {
+        _out.flush();
+    }
 
-    protected File getTempFile(){
+    protected File getTempFile() {
         return _fd;
     }
     
@@ -329,7 +333,7 @@ public class SheetDataWriter {
     }
 
     //Taken from jdk1.3/src/javax/swing/text/html/HTMLWriter.java
-     protected void outputQuotedString(String s) throws IOException {
+    protected void outputQuotedString(String s) throws IOException {
         if (s == null || s.length() == 0) {
             return;
         }
@@ -393,15 +397,21 @@ public class SheetDataWriter {
                     break;
                 default:
                     // YK: XmlBeans silently replaces all ISO control characters ( < 32) with question marks.
-                    // the same rule applies to unicode surrogates and "not a character" symbols.
-                    if( c < ' ' || Character.isLowSurrogate(c) || Character.isHighSurrogate(c) ||
-                            ('\uFFFE' <= c && c <= '\uFFFF')) {
+                    // the same rule applies to "not a character" symbols.
+                    if (replaceWithQuestionMark(c)) {
                         if (counter > last) {
                             _out.write(chars, last, counter - last);
                         }
                         _out.write('?');
                         last = counter + 1;
                     }
+                    else if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
+                        if (counter > last) {
+                            _out.write(chars, last, counter - last);
+                        }
+                        _out.write(c);
+                        last = counter + 1;
+                    }
                     else if (c > 127) {
                         if (counter > last) {
                             _out.write(chars, last, counter - last);
@@ -421,6 +431,10 @@ public class SheetDataWriter {
         }
     }
 
+    static boolean replaceWithQuestionMark(char c) {
+        return c < ' ' || ('\uFFFE' <= c && c <= '\uFFFF');
+    }
+     
     /**
      * Deletes the temporary file that backed this sheet on disk.
      * @return true if the file was deleted, false if it wasn't.
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/streaming/TestSheetDataWriter.java b/src/ooxml/testcases/org/apache/poi/xssf/streaming/TestSheetDataWriter.java
new file mode 100644 (file)
index 0000000..343180b
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ *  ====================================================================
+ *    Licensed to the Apache Software Foundation (ASF) under one or more
+ *    contributor license agreements.  See the NOTICE file distributed with
+ *    this work for additional information regarding copyright ownership.
+ *    The ASF licenses this file to You under the Apache License, Version 2.0
+ *    (the "License"); you may not use this file except in compliance with
+ *    the License.  You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ * ====================================================================
+ */
+
+package org.apache.poi.xssf.streaming;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+
+import org.apache.poi.util.IOUtils;
+import org.junit.Test;
+
+public final class TestSheetDataWriter {
+
+    final String unicodeSurrogates = "\uD835\uDF4A\uD835\uDF4B\uD835\uDF4C\uD835\uDF4D\uD835\uDF4E"
+            + "\uD835\uDF4F\uD835\uDF50\uD835\uDF51\uD835\uDF52\uD835\uDF53\uD835\uDF54\uD835"
+            + "\uDF55\uD835\uDF56\uD835\uDF57\uD835\uDF58\uD835\uDF59\uD835\uDF5A\uD835\uDF5B"
+            + "\uD835\uDF5C\uD835\uDF5D\uD835\uDF5E\uD835\uDF5F\uD835\uDF60\uD835\uDF61\uD835"
+            + "\uDF62\uD835\uDF63\uD835\uDF64\uD835\uDF65\uD835\uDF66\uD835\uDF67\uD835\uDF68"
+            + "\uD835\uDF69\uD835\uDF6A\uD835\uDF6B\uD835\uDF6C\uD835\uDF6D\uD835\uDF6E\uD835"
+            + "\uDF6F\uD835\uDF70\uD835\uDF71\uD835\uDF72\uD835\uDF73\uD835\uDF74\uD835\uDF75"
+            + "\uD835\uDF76\uD835\uDF77\uD835\uDF78\uD835\uDF79\uD835\uDF7A";
+
+    @Test
+       public void testReplaceWithQuestionMark() {
+        for(int i = 0; i < unicodeSurrogates.length(); i++) {
+            assertFalse(SheetDataWriter.replaceWithQuestionMark(unicodeSurrogates.charAt(i)));
+        }
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\uFFFE'));
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\uFFFF'));
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\u0000'));
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\u000F'));
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\u001F'));
+       }
+    
+    @Test
+    public void testWriteUnicodeSurrogates() throws IOException {
+        SheetDataWriter writer = new SheetDataWriter();
+        try {
+            writer.outputQuotedString(unicodeSurrogates);
+            writer.flush();
+            File file = writer.getTempFile();
+            FileInputStream is = new FileInputStream(file);
+            String text;
+            try {
+                text = new String(IOUtils.toByteArray(is), "UTF-8");
+            } finally {
+                is.close();
+            }
+            assertEquals(unicodeSurrogates, text);
+        } finally {
+            writer.close();
+        }
+    }
+}