From 68dd1750c65ec6ce5475f2f12ea63f6495a708ad Mon Sep 17 00:00:00 2001
From: Jeremias Maerki <jeremias@apache.org>
Date: Mon, 5 Jan 2009 07:47:02 +0000
Subject: [PATCH] FOP now creates ToUnicode CMaps for single-byte fonts that
 don't use built-in encodings to help PDF text extractors interpret
 characters. PDF CMaps now support single-byte characters.

git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@731479 13f79535-47bb-0310-9956-ffa450edef68
---
 .../fop/fonts/AbstractCodePointMapping.java   |  7 +--
 .../fop/fonts/SimpleSingleByteEncoding.java   | 14 ++++++
 .../apache/fop/fonts/SingleByteEncoding.java  |  8 ++++
 src/java/org/apache/fop/pdf/CMapBuilder.java  | 10 ++++-
 src/java/org/apache/fop/pdf/PDFFactory.java   | 44 ++++++++++++++-----
 src/java/org/apache/fop/pdf/PDFFont.java      |  8 ++++
 .../org/apache/fop/pdf/PDFFontNonBase14.java  |  8 ----
 .../org/apache/fop/pdf/PDFToUnicodeCMap.java  | 23 +++++++---
 status.xml                                    |  4 ++
 9 files changed, 95 insertions(+), 31 deletions(-)

diff --git a/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java b/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
index 56558b02e..f03d4beab 100644
--- a/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
+++ b/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
@@ -189,12 +189,7 @@ public class AbstractCodePointMapping implements SingleByteEncoding {
         return this.unicodeMap[idx];
     }
 
-    /**
-     * Returns a character array with Unicode scalar values which can be used to map encoding
-     * code points to Unicode values. Note that this does not return all possible Unicode values
-     * that the encoding maps.
-     * @return a character array with Unicode scalar values
-     */
+    /** {@inheritDoc} */
     public final char[] getUnicodeCharMap() {
         char[] copy = new char[this.unicodeMap.length];
         System.arraycopy(this.unicodeMap, 0, copy, 0, this.unicodeMap.length);
diff --git a/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java b/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
index 6517f0328..d55529d58 100644
--- a/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
+++ b/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
@@ -25,6 +25,8 @@ import java.util.Map;
 
 import org.apache.xmlgraphics.fonts.Glyphs;
 
+import org.apache.fop.util.CharUtilities;
+
 /**
  * A simple implementation of the OneByteEncoding mostly used for encodings that are constructed
  * on-the-fly.
@@ -137,6 +139,18 @@ public class SimpleSingleByteEncoding implements SingleByteEncoding {
         }
     }
 
+    /** {@inheritDoc} */
+    public char[] getUnicodeCharMap() {
+        char[] map = new char[getLastChar() + 1];
+        for (int i = 0; i < getFirstChar(); i++) {
+            map[i] = CharUtilities.NOT_A_CHARACTER;
+        }
+        for (int i = getFirstChar(); i <= getLastChar(); i++) {
+            map[i] = getCharacterForIndex(i).getSingleUnicodeValue();
+        }
+        return map;
+    }
+
     /** {@inheritDoc} */
     public String toString() {
         return getName() + " (" + getSize() + " chars)";
diff --git a/src/java/org/apache/fop/fonts/SingleByteEncoding.java b/src/java/org/apache/fop/fonts/SingleByteEncoding.java
index bc680f983..ad9d691f6 100644
--- a/src/java/org/apache/fop/fonts/SingleByteEncoding.java
+++ b/src/java/org/apache/fop/fonts/SingleByteEncoding.java
@@ -47,4 +47,12 @@ public interface SingleByteEncoding {
      */
     String[] getCharNameMap();
 
+    /**
+     * Returns a character array with Unicode scalar values which can be used to map encoding
+     * code points to Unicode values. Note that this does not return all possible Unicode values
+     * that the encoding maps.
+     * @return a character array with Unicode scalar values
+     */
+    char[] getUnicodeCharMap();
+
 }
diff --git a/src/java/org/apache/fop/pdf/CMapBuilder.java b/src/java/org/apache/fop/pdf/CMapBuilder.java
index 8e30ad04a..affb4fcad 100644
--- a/src/java/org/apache/fop/pdf/CMapBuilder.java
+++ b/src/java/org/apache/fop/pdf/CMapBuilder.java
@@ -110,8 +110,16 @@ public class CMapBuilder {
     }
 
     protected void writeCodeSpaceRange() throws IOException {
+        writeCodeSpaceRange(false);
+    }
+
+    protected void writeCodeSpaceRange(boolean singleByte) throws IOException {
         writer.write("1 begincodespacerange\n");
-        writer.write("<0000> <FFFF>\n");
+        if (singleByte) {
+            writer.write("<00> <FF>\n");
+        } else {
+            writer.write("<0000> <FFFF>\n");
+        }
         writer.write("endcodespacerange\n");
     }
 
diff --git a/src/java/org/apache/fop/pdf/PDFFactory.java b/src/java/org/apache/fop/pdf/PDFFactory.java
index a5beb4ec7..b820e12fe 100644
--- a/src/java/org/apache/fop/pdf/PDFFactory.java
+++ b/src/java/org/apache/fop/pdf/PDFFactory.java
@@ -1227,10 +1227,23 @@ public class PDFFactory {
             return preRegisteredfont;
         }
 
+        boolean forceToUnicode = true;
+
         if (descriptor == null) {
             //Usually Base 14 fonts
             PDFFont font = new PDFFont(fontname, FontType.TYPE1, basefont, encoding);
             getDocument().registerObject(font);
+            if (forceToUnicode && !PDFEncoding.isPredefinedEncoding(encoding)) {
+                SingleByteEncoding mapping;
+                if (encoding != null) {
+                    mapping = CodePointMapping.getMapping(encoding);
+                } else {
+                    //for Symbol and ZapfDingbats where encoding must be null in PDF
+                    Typeface tf = (Typeface)metrics;
+                    mapping = CodePointMapping.getMapping(tf.getEncodingName());
+                }
+                generateToUnicodeCmap(font, mapping);
+            }
             return font;
         } else {
             FontType fonttype = metrics.getFontType();
@@ -1266,7 +1279,7 @@ public class PDFFactory {
                         "fop-ucs-H",
                         new PDFCIDSystemInfo("Adobe",
                             "Identity",
-                            0));
+                            0), false);
                 getDocument().registerObject(cmap);
                 ((PDFFontType0)font).setCMAP(cmap);
                 ((PDFFontType0)font).setDescendantFonts(cidFont);
@@ -1290,8 +1303,13 @@ public class PDFFactory {
                 SingleByteEncoding mapping = singleByteFont.getEncoding();
                 if (singleByteFont.isSymbolicFont()) {
                     //no encoding, use the font's encoding
+                    if (forceToUnicode) {
+                        generateToUnicodeCmap(nonBase14, mapping);
+                    }
                 } else if (PDFEncoding.isPredefinedEncoding(mapping.getName())) {
                     font.setEncoding(mapping.getName());
+                    //No ToUnicode CMap necessary if PDF 1.4, chapter 5.9 (page 368) is to be
+                    //believed.
                 } else {
                     Object pdfEncoding = createPDFEncoding(mapping,
                             singleByteFont.getFontName());
@@ -1300,16 +1318,9 @@ public class PDFFactory {
                     } else {
                         font.setEncoding((String)pdfEncoding);
                     }
-
-                    /* JM: What I thought would be a necessity with custom encodings turned out to
-                     * be a bug in Adobe Acrobat 8. The following section just demonstrates how
-                     * to generate a ToUnicode CMap for a Type 1 font.
-                    PDFCMap cmap = new PDFToUnicodeCMap(mapping.getUnicodeCharMap(),
-                            "fop-ucs-H",
-                            new PDFCIDSystemInfo("Adobe", "Identity", 0));
-                    getDocument().registerObject(cmap);
-                    nonBase14.setToUnicode(cmap);
-                    */
+                    if (forceToUnicode) {
+                        generateToUnicodeCmap(nonBase14, mapping);
+                    }
                 }
 
                 //Handle additional encodings (characters outside the primary encoding)
@@ -1330,6 +1341,9 @@ public class PDFFactory {
                                 new PDFArray(null, singleByteFont.getAdditionalWidths(i)));
                         getDocument().registerObject(addFont);
                         getDocument().getResources().addFont(addFont);
+                        if (forceToUnicode) {
+                            generateToUnicodeCmap(addFont, addEncoding);
+                        }
                     }
                 }
             }
@@ -1338,6 +1352,14 @@ public class PDFFactory {
         }
     }
 
+    private void generateToUnicodeCmap(PDFFont font, SingleByteEncoding encoding) {
+        PDFCMap cmap = new PDFToUnicodeCMap(encoding.getUnicodeCharMap(),
+                "fop-ucs-H",
+                new PDFCIDSystemInfo("Adobe", "Identity", 0), true);
+        getDocument().registerObject(cmap);
+        font.setToUnicode(cmap);
+    }
+
     /**
      * Creates a PDFEncoding instance from a CodePointMapping instance.
      * @param encoding the code point mapping (encoding)
diff --git a/src/java/org/apache/fop/pdf/PDFFont.java b/src/java/org/apache/fop/pdf/PDFFont.java
index 6ba6ab9a2..2808e5ba7 100644
--- a/src/java/org/apache/fop/pdf/PDFFont.java
+++ b/src/java/org/apache/fop/pdf/PDFFont.java
@@ -85,6 +85,14 @@ public class PDFFont extends PDFDictionary {
         }
     }
 
+    /**
+     * Sets a ToUnicode CMap.
+     * @param cmap the ToUnicode character map
+     */
+    public void setToUnicode(PDFCMap cmap) {
+        put("ToUnicode", cmap);
+    }
+
     /**
      * factory method with the basic parameters
      *
diff --git a/src/java/org/apache/fop/pdf/PDFFontNonBase14.java b/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
index 9b6cef618..79ed10f48 100644
--- a/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
+++ b/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
@@ -71,14 +71,6 @@ public abstract class PDFFontNonBase14 extends PDFFont {
         return (PDFFontDescriptor)get("FontDescriptor");
     }
 
-    /**
-     * Sets a ToUnicode CMap.
-     * @param cmap the ToUnicode character map
-     */
-    public void setToUnicode(PDFCMap cmap) {
-        put("ToUnicode", cmap);
-    }
-
     /** {@inheritDoc} */
     protected void validate() {
         if (getDocumentSafely().getProfile().isFontEmbeddingRequired()) {
diff --git a/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java b/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
index b70430af4..95999b73f 100644
--- a/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
+++ b/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
@@ -43,6 +43,8 @@ public class PDFToUnicodeCMap extends PDFCMap {
      */
     protected char[] unicodeCharMap;
 
+    private boolean singleByte;
+
     /**
      * Constructor.
      *
@@ -51,10 +53,17 @@ public class PDFToUnicodeCMap extends PDFCMap {
      * @param name One of the registered names found in Table 5.14 in PDF
      * Reference, Second Edition.
      * @param sysInfo The attributes of the character collection of the CIDFont.
+     * @param singleByte true for single-byte, false for double-byte
      */
-    public PDFToUnicodeCMap(char[] unicodeCharMap, String name, PDFCIDSystemInfo sysInfo) {
+    public PDFToUnicodeCMap(char[] unicodeCharMap, String name, PDFCIDSystemInfo sysInfo,
+            boolean singleByte) {
         super(name, sysInfo);
+        if (singleByte && unicodeCharMap.length > 256) {
+            throw new IllegalArgumentException("unicodeCharMap may not contain more than"
+                    + " 256 characters for single-byte encodings");
+        }
         this.unicodeCharMap = unicodeCharMap;
+        this.singleByte = singleByte;
     }
 
     /** {@inheritDoc} */
@@ -78,7 +87,7 @@ public class PDFToUnicodeCMap extends PDFCMap {
             writeCIDSystemInfo("Adobe", "UCS", 0);
             writeName("Adobe-Identity-UCS");
             writeType("2");
-            writeCodeSpaceRange();
+            writeCodeSpaceRange(singleByte);
             writeBFEntries();
             writeWrapUp();
         }
@@ -122,7 +131,7 @@ public class PDFToUnicodeCMap extends PDFCMap {
                     while (partOfRange(charArray, charIndex)) {
                         charIndex++;
                     }
-                    writer.write("<" + padHexString(Integer.toHexString(charIndex), 4) + "> ");
+                    writer.write("<" + padCharIndex(charIndex) + "> ");
                     writer.write("<" + padHexString(Integer.toHexString(charArray[charIndex]), 4)
                             + ">\n");
                     charIndex++;
@@ -132,6 +141,10 @@ public class PDFToUnicodeCMap extends PDFCMap {
             } while (remainingEntries > 0);
         }
 
+        private String padCharIndex(int charIndex) {
+            return padHexString(Integer.toHexString(charIndex), (singleByte ? 2 : 4));
+        }
+
         /**
          * Writes the entries for character ranges for a base font.
          * @param p StringBuffer to write to
@@ -159,9 +172,9 @@ public class PDFToUnicodeCMap extends PDFCMap {
                     while (!startOfRange(charArray, charIndex)) {
                         charIndex++;
                     }
-                    writer.write("<" + padHexString(Integer.toHexString(charIndex), 4) + "> ");
+                    writer.write("<" + padCharIndex(charIndex) + "> ");
                     writer.write("<"
-                            + padHexString(Integer.toHexString(endOfRange(charArray, charIndex)), 4)
+                            + padCharIndex(endOfRange(charArray, charIndex))
                             + "> ");
                     writer.write("<" + padHexString(Integer.toHexString(charArray[charIndex]), 4)
                             + ">\n");
diff --git a/status.xml b/status.xml
index 9b1ad2e56..2a3f7a20f 100644
--- a/status.xml
+++ b/status.xml
@@ -53,6 +53,10 @@
 
   <changes>
     <release version="FOP Trunk" date="TBD">
+      <action context="Fonts" dev="JM" type="add">
+        FOP now creates ToUnicode CMaps for single-byte fonts that don't use built-in
+        encodings to help PDF text extractors interpret characters.
+      </action>
       <action context="Fonts" dev="JM" type="add">
         Added support for forcing single-byte encodings for TrueType fonts without
         creating an XML font metric file (see "encoding-mode" attribute on "font" element)
-- 
2.39.5