FOP-2920: Surrogate pair edge-case causes Exception by Dave Roxburgh

author Simon Steiner <ssteiner@apache.org>

Tue, 1 Aug 2023 08:43:30 +0000 (09:43 +0100)

committer Simon Steiner <ssteiner@apache.org>

Tue, 1 Aug 2023 08:43:30 +0000 (09:43 +0100)
author Simon Steiner <ssteiner@apache.org>
Tue, 1 Aug 2023 08:43:30 +0000 (09:43 +0100)
committer Simon Steiner <ssteiner@apache.org>
Tue, 1 Aug 2023 08:43:30 +0000 (09:43 +0100)
diff --git a/fop-core/src/test/java/org/apache/fop/pdf/PDFToUnicodeCMapTestCase.java b/fop-core/src/test/java/org/apache/fop/pdf/PDFToUnicodeCMapTestCase.java

new file mode 100644 (file)

index 0000000..7469431
--- /dev/null
+++ b/fop-core/src/test/java/org/apache/fop/pdf/PDFToUnicodeCMapTestCase.java
@@ -0,0 +1,380 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id$ */
+
+package org.apache.fop.pdf;
+
+import java.io.CharArrayWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import org.apache.fop.apps.FopFactory;
+import org.apache.fop.apps.FopFactoryBuilder;
+import org.apache.fop.events.Event;
+import org.apache.fop.events.EventBroadcaster;
+import org.apache.fop.events.EventListener;
+import org.apache.fop.events.model.EventSeverity;
+
+public class PDFToUnicodeCMapTestCase {
+
+    /** Number of entries in the shared character map fixture. */
+    static final int UNICODE_CHAR_MAP_SIZE = 200;
+
+    /** Shared character map fixture; it is refilled before every test by a {@code @Before} method. */
+    static final char[] S_UNICODE_CHAR_MAP = new char[UNICODE_CHAR_MAP_SIZE];
+
+    /** Event broadcaster handed to every CMap under test; it is recreated before every test. */
+    EventBroadcaster eventBroadcaster;
+
+    @Before
+    public void initUnicodeChatMap() {
+        for (int i = 0; i < UNICODE_CHAR_MAP_SIZE; ++i) {
+            S_UNICODE_CHAR_MAP[i] = (char)(50 + i);
+        }
+    }
+
+    @Before
+    public void initEventBroadcaster() {
+        URI config = URI.create("");
+        FopFactoryBuilder fopFactoryBuilder = new FopFactoryBuilder(config);
+        FopFactory fopFactory = fopFactoryBuilder.build();
+        eventBroadcaster = fopFactory.newFOUserAgent().getEventBroadcaster();
+    }
+
+    /**
+     * Asserts that the given CMap text contains the expected header lines.
+     * @param cmap the generated CMap text
+     */
+    private void assertHeader(String cmap) {
+        String expectedHeader = "/CIDInit /ProcSet findresource begin\n"
+                + "12 dict begin\n"
+                + "begincmap\n"
+                + "/CIDSystemInfo 3 dict dup begin\n"
+                + "  /Registry (Adobe) def\n"
+                + "  /Ordering (UCS) def\n"
+                + "  /Supplement 0 def\n"
+                + "end def\n"
+                + "/CMapName /Adobe-Identity-UCS def\n"
+                + "/CMapType 2 def\n";
+        boolean headerPresent = cmap.contains(expectedHeader);
+        Assert.assertTrue(headerPresent);
+    }
+
+    /**
+     * Asserts that the given CMap text contains the expected footer lines.
+     * @param cmap the generated CMap text
+     */
+    private void assertFooter(String cmap) {
+        String expectedFooter = "endcmap\n"
+                + "CMapName currentdict /CMap defineresource pop\n"
+                + "end\n"
+                + "end\n";
+        boolean footerPresent = cmap.contains(expectedFooter);
+        Assert.assertTrue(footerPresent);
+    }
+
+    /**
+     * Asserts that the given CMap text contains the expected header and then the expected footer.
+     * @param cmap the generated CMap text
+     */
+    private void assertHeaderAndFooter(String cmap) {
+        assertHeader(cmap);
+        assertFooter(cmap);
+    }
+
+    /**
+     * Builds the ToUnicode CMap for the given character map and asserts that the generated text
+     * contains the expected fragment.
+     * @param unicodeCharMap the character map passed to the {@code PDFToUnicodeCMap} constructor
+     * @param singleByte the single-byte flag passed to the {@code PDFToUnicodeCMap} constructor
+     * @param expected the fragment that must occur in the generated CMap text
+     * @throws IOException if building or writing the CMap fails
+     */
+    private void buildAndAssertLine(char[] unicodeCharMap, Boolean singleByte, String expected) throws IOException {
+        PDFToUnicodeCMap cMap = new PDFToUnicodeCMap(unicodeCharMap,
+                PDFCMap.ENC_GB_EUC_H,
+                new PDFCIDSystemInfo("Adobe", "Identity", 0),
+                singleByte, eventBroadcaster);
+
+        CharArrayWriter writer = new CharArrayWriter();
+        CMapBuilder builder = cMap.createCMapBuilder(writer);
+        builder.writeCMap();
+        String cmap = writer.toString();
+
+        // Report both texts on failure; a bare assertTrue would only raise an empty AssertionError.
+        String failureMessage = "CMap (singleByte=" + singleByte + ") does not contain the expected fragment.\n"
+                + "Expected fragment:\n" + expected + "\n"
+                + "Generated CMap:\n" + cmap;
+        Assert.assertTrue(failureMessage, cmap.contains(expected));
+    }
+
+    /**
+     * Runs {@code buildAndAssertLine} once for every entry of the given map.
+     * @param unicodeCharMap the character map to build each CMap from
+     * @param configPairs maps the single-byte flag to the fragment expected in the resulting CMap
+     * @throws IOException if building or writing a CMap fails
+     */
+    private void buildAndAssert(char[] unicodeCharMap, Map<Boolean, String> configPairs) throws IOException {
+        for (Map.Entry<Boolean, String> pair : configPairs.entrySet()) {
+            Boolean singleByte = pair.getKey();
+            String expected = pair.getValue();
+            buildAndAssertLine(unicodeCharMap, singleByte, expected);
+        }
+    }
+
+    /**
+     * Checks entire CMap of unmodified unicodeCharMap, including header and footer.
+     * @throws IOException
+     */
+    @Test
+    public void simpleTest() throws IOException {
+        Map<Boolean, String> configPairs = new HashMap<>();
+        configPairs.put(true, "1 begincodespacerange\n"  // Single-byte char map
+                + "<00> <FF>\n"
+                + "endcodespacerange\n"
+                + "1 beginbfrange\n"
+                + "<00> <c7> <0032>\n"
+                + "endbfrange\n");
+        configPairs.put(false, "1 begincodespacerange\n"  // Double-byte char map
+                + "<0000> <FFFF>\n"
+                + "endcodespacerange\n"
+                + "1 beginbfrange\n"
+                + "<0000> <00c7> <0032>\n"
+                + "endbfrange\n");
+
+        Set<Map.Entry<Boolean, String>> configSet = configPairs.entrySet();
+        for (Map.Entry<Boolean, String> entry : configSet) {
+            PDFToUnicodeCMap cMap = new PDFToUnicodeCMap(S_UNICODE_CHAR_MAP,
+                    PDFCMap.ENC_GB_EUC_H,
+                    new PDFCIDSystemInfo("Adobe", "Identity", 0),
+                    entry.getKey(), eventBroadcaster);
+
+            CharArrayWriter writer = new CharArrayWriter();
+            CMapBuilder builder = cMap.createCMapBuilder(writer);
+            builder.writeCMap();
+            String cmap = writer.toString();
+            assertHeaderAndFooter(cmap);
+            Assert.assertTrue(cmap.contains(entry.getValue()));
+        }
+    }
+
+    /**
+     * Checks CMap of unicodeCharMap with one codepoint changed so it is out of sequence.
+     * @throws IOException
+     */
+    @Test
+    public void rangeTest() throws IOException {
+        S_UNICODE_CHAR_MAP[0x32] = 0xfa;  // Interrupt the range with an oddity.
+
+        Map<Boolean, String> configPairs = new HashMap<>();
+        configPairs.put(true, "1 begincodespacerange\n"
+                + "<00> <FF>\n"
+                + "endcodespacerange\n"
+                + "1 beginbfchar\n"
+                + "<32> <00fa>\n"
+                + "endbfchar\n"
+                + "2 beginbfrange\n"
+                + "<00> <31> <0032>\n"
+                + "<33> <c7> <0065>\n"
+                + "endbfrange");
+        configPairs.put(false, "1 begincodespacerange\n"
+                + "<0000> <FFFF>\n"
+                + "endcodespacerange\n"
+                + "1 beginbfchar\n"
+                + "<0032> <00fa>\n"
+                + "endbfchar\n"
+                + "2 beginbfrange\n"
+                + "<0000> <0031> <0032>\n"
+                + "<0033> <00c7> <0065>\n"
+                + "endbfrange");
+
+        buildAndAssert(S_UNICODE_CHAR_MAP, configPairs);
+    }
+
+    /**
+     * Checks that one surrogate pair is correctly handled, even when it crosses a section boundary.
+     * @throws IOException
+     */
+    @Test
+    public void surrogatePairTest() throws IOException {
+        final int charMapSize = 157;
+
+        char[] unicodeCharMap = new char[charMapSize];
+
+        for (int i = 0; i < charMapSize; ++i) {
+            unicodeCharMap[i] = (char)(50 + i * 2);
+        }
+
+        unicodeCharMap[99] = '\uD83C'; // High-surrogate code unit, last code unit of section.
+        unicodeCharMap[100] = '\uDF65';
+
+        Map<Boolean, String> configPairs = new HashMap<>();
+        configPairs.put(true, "<60> <00f2>\n"
+                + "<61> <00f4>\n"
+                + "<62> <00f6>\n"
+                + "<63> <d83cdf65>\n"
+                + "endbfchar\n"
+                + "56 beginbfchar\n"
+                + "<65> <00fc>\n"
+                + "<66> <00fe>");
+        configPairs.put(false, "<0060> <00f2>\n"
+                + "<0061> <00f4>\n"
+                + "<0062> <00f6>\n"
+                + "<0063> <d83cdf65>\n"
+                + "endbfchar\n"
+                + "56 beginbfchar\n"
+                + "<0065> <00fc>\n"
+                + "<0066> <00fe>");
+
+        buildAndAssert(unicodeCharMap, configPairs);
+    }
+
+    /**
+     * Checks that a range of surrogate pairs is correctly handled.
+     * @throws IOException
+     */
+    @Test
+    public void surrogatePairRangeTest() throws IOException {
+        final int charMapSize = 20;
+
+        char[] unicodeCharMap = new char[charMapSize];
+
+        for (int i = 0; i < charMapSize; ++i) {
+            unicodeCharMap[i] = (char)(50 + i * 2);
+        }
+
+        unicodeCharMap[9] = '\uD83C';
+        unicodeCharMap[10] = '\uDF65';
+        unicodeCharMap[11] = '\uD83C';
+        unicodeCharMap[12] = '\uDF66';
+
+        Map<Boolean, String> configPairs = new HashMap<>();
+        configPairs.put(true, "1 beginbfrange\n"
+                + "<09> <0b> <d83cdf65>\n"
+                + "endbfrange");
+        configPairs.put(false, "1 beginbfrange\n"
+                + "<0009> <000b> <d83cdf65>\n"
+                + "endbfrange");
+
+        buildAndAssert(unicodeCharMap, configPairs);
+    }
+
+    /**
+     * Checks that CMap is correct, even when made up of just one range of surrogate pairs.
+     * @throws IOException
+     */
+    @Test
+    public void surrogatePairsRangeTest() throws IOException {
+        final int charMapSize = 20;
+
+        char[] unicodeCharMap = new char[charMapSize];
+
+        for (int i = 0; i < charMapSize; i = i + 2) {
+            unicodeCharMap[i] = '\uD83C';
+        }
+        for (int i = 0; i < charMapSize / 2; ++i) {
+            unicodeCharMap[1 + i * 2] = (char)('\uDF65' + i);
+        }
+
+        Map<Boolean, String> configPairs = new HashMap<>();
+        configPairs.put(true, "1 beginbfrange\n"
+                + "<00> <12> <d83cdf65>\n"
+                + "endbfrange");
+        configPairs.put(false, "1 beginbfrange\n"
+                + "<0000> <0012> <d83cdf65>\n"
+                + "endbfrange");
+
+        buildAndAssert(unicodeCharMap, configPairs);
+    }
+
+    /**
+     * Checks that an unpaired surrogate (a high-surrogate as the last code unit) is correctly handled.
+     * @throws IOException
+     */
+    @Test
+    public void unpairedHighSurrogateTest() throws IOException {
+        final int charMapSize = 10;
+
+        char[] unicodeCharMap = new char[charMapSize];
+
+        for (int i = 0; i < charMapSize; ++i) {
+            unicodeCharMap[i] = (char)(50 + i);
+        }
+
+        unicodeCharMap[9] = '\uD83C'; // High-surrogate code unit.
+
+        Map<Boolean, String> configPairs = new HashMap<>();
+        configPairs.put(true, "1 beginbfchar\n"
+                + "<09> <d83c0000>\n"
+                + "endbfchar");
+        configPairs.put(false, "1 beginbfchar\n"
+                + "<0009> <d83c0000>\n"
+                + "endbfchar");
+
+        Set<Map.Entry<Boolean, String>> configSet = configPairs.entrySet();
+        for (Map.Entry<Boolean, String> entry : configSet) {
+            MyEventListener listener = new MyEventListener();
+
+            eventBroadcaster.addEventListener(listener);
+
+            buildAndAssertLine(unicodeCharMap, entry.getKey(), entry.getValue());
+
+            Event ev = listener.event;
+            assertNotNull(ev);
+            assertEquals("org.apache.fop.render.pdf.PDFEventProducer.unpairedSurrogate", listener.event.getEventID());
+            assertEquals(EventSeverity.ERROR, listener.event.getSeverity());
+
+            eventBroadcaster.removeEventListener(listener);
+        }
+    }
+
+    private class MyEventListener implements EventListener {
+
+        private Event event;
+
+        public void processEvent(Event event) {
+            if (this.event != null) {
+                fail("Multiple events received");
+            }
+            this.event = event;
+        }
+    }
+
+    /**
+     * Checks that a range of non-surrogate pairs is limited in size.
+     * @throws IOException
+     */
+    @Test
+    public void rangeSizeTest() throws IOException {
+        final int charMapSize = 300;
+
+        char[] unicodeCharMap = new char[charMapSize];
+
+        for (int i = 0; i < charMapSize; ++i) {
+            unicodeCharMap[i] = (char)(50 + i);
+        }
+
+        Map<Boolean, String> configPairs = new HashMap<>();
+        // PDFToUnicodeCMap CTOR rejects unicodeCharMap with > 256 elements where singleByte is true.
+        configPairs.put(false, "2 beginbfrange\n"
+                + "<0000> <00ff> <0032>\n"
+                + "<0100> <012b> <0132>\n"
+                + "endbfrange");
+
+        buildAndAssert(unicodeCharMap, configPairs);
+    }
+
+    /**
+     * Checks that a range of surrogate pairs is limited in size.
+     * @throws IOException
+     */
+    @Test
+    public void rangeSizeSurrogateTest() throws IOException {
+        final int charMapSize = 300;
+
+        char[] unicodeCharMap = new char[charMapSize];
+
+        for (int i = 0; i < charMapSize; i = i + 2) {
+            unicodeCharMap[i] = '\uD83C';
+        }
+        for (int i = 0; i < charMapSize / 2; ++i) {
+            unicodeCharMap[1 + i * 2] = (char)('\uDF65' + i);
+        }
+
+        Map<Boolean, String> configPairs = new HashMap<>();
+        // PDFToUnicodeCMap CTOR rejects unicodeCharMap with > 256 elements where singleByte is true.
+        configPairs.put(false, "2 beginbfrange\n"
+                + "<0000> <00fe> <d83cdf65>\n"
+                + "<0100> <012a> <d83cdfe5>\n"
+                + "endbfrange");
+
+        buildAndAssert(unicodeCharMap, configPairs);
+    }
+}
author	Simon Steiner <ssteiner@apache.org>
	Tue, 1 Aug 2023 08:43:30 +0000 (09:43 +0100)
committer	Simon Steiner <ssteiner@apache.org>
	Tue, 1 Aug 2023 08:43:30 +0000 (09:43 +0100)