From 68dd1750c65ec6ce5475f2f12ea63f6495a708ad Mon Sep 17 00:00:00 2001 From: Jeremias Maerki Date: Mon, 5 Jan 2009 07:47:02 +0000 Subject: [PATCH] FOP now creates ToUnicode CMaps for single-byte fonts that don't use built-in encodings to help PDF text extractors interpreting characters. PDF CMaps now support single-byte characters. git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@731479 13f79535-47bb-0310-9956-ffa450edef68 --- .../fop/fonts/AbstractCodePointMapping.java | 7 +-- .../fop/fonts/SimpleSingleByteEncoding.java | 14 ++++++ .../apache/fop/fonts/SingleByteEncoding.java | 8 ++++ src/java/org/apache/fop/pdf/CMapBuilder.java | 10 ++++- src/java/org/apache/fop/pdf/PDFFactory.java | 44 ++++++++++++++----- src/java/org/apache/fop/pdf/PDFFont.java | 8 ++++ .../org/apache/fop/pdf/PDFFontNonBase14.java | 8 ---- .../org/apache/fop/pdf/PDFToUnicodeCMap.java | 23 +++++++--- status.xml | 4 ++ 9 files changed, 95 insertions(+), 31 deletions(-) diff --git a/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java b/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java index 56558b02e..f03d4beab 100644 --- a/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java +++ b/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java @@ -189,12 +189,7 @@ public class AbstractCodePointMapping implements SingleByteEncoding { return this.unicodeMap[idx]; } - /** - * Returns a character array with Unicode scalar values which can be used to map encoding - * code points to Unicode values. Note that this does not return all possible Unicode values - * that the encoding maps. - * @return a character array with Unicode scalar values - */ + /** {@inheritDoc} */ public final char[] getUnicodeCharMap() { char[] copy = new char[this.unicodeMap.length]; System.arraycopy(this.unicodeMap, 0, copy, 0, this.unicodeMap.length); diff --git a/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java b/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java index 6517f0328..d55529d58 100644 --- a/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java +++ b/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java @@ -25,6 +25,8 @@ import java.util.Map; import org.apache.xmlgraphics.fonts.Glyphs; +import org.apache.fop.util.CharUtilities; + /** * A simple implementation of the OneByteEncoding mostly used for encodings that are constructed * on-the-fly. @@ -137,6 +139,18 @@ public class SimpleSingleByteEncoding implements SingleByteEncoding { } } + /** {@inheritDoc} */ + public char[] getUnicodeCharMap() { + char[] map = new char[getLastChar() + 1]; + for (int i = 0; i < getFirstChar(); i++) { + map[i] = CharUtilities.NOT_A_CHARACTER; + } + for (int i = getFirstChar(); i <= getLastChar(); i++) { + map[i] = getCharacterForIndex(i).getSingleUnicodeValue(); + } + return map; + } + /** {@inheritDoc} */ public String toString() { return getName() + " (" + getSize() + " chars)"; diff --git a/src/java/org/apache/fop/fonts/SingleByteEncoding.java b/src/java/org/apache/fop/fonts/SingleByteEncoding.java index bc680f983..ad9d691f6 100644 --- a/src/java/org/apache/fop/fonts/SingleByteEncoding.java +++ b/src/java/org/apache/fop/fonts/SingleByteEncoding.java @@ -47,4 +47,12 @@ public interface SingleByteEncoding { */ String[] getCharNameMap(); + /** + * Returns a character array with Unicode scalar values which can be used to map encoding + * code points to Unicode values. Note that this does not return all possible Unicode values + * that the encoding maps. + * @return a character array with Unicode scalar values + */ + char[] getUnicodeCharMap(); + } diff --git a/src/java/org/apache/fop/pdf/CMapBuilder.java b/src/java/org/apache/fop/pdf/CMapBuilder.java index 8e30ad04a..affb4fcad 100644 --- a/src/java/org/apache/fop/pdf/CMapBuilder.java +++ b/src/java/org/apache/fop/pdf/CMapBuilder.java @@ -110,8 +110,16 @@ public class CMapBuilder { } protected void writeCodeSpaceRange() throws IOException { + writeCodeSpaceRange(false); + } + + protected void writeCodeSpaceRange(boolean singleByte) throws IOException { writer.write("1 begincodespacerange\n"); - writer.write("<0000> \n"); + if (singleByte) { + writer.write("<00> \n"); + } else { + writer.write("<0000> \n"); + } writer.write("endcodespacerange\n"); } diff --git a/src/java/org/apache/fop/pdf/PDFFactory.java b/src/java/org/apache/fop/pdf/PDFFactory.java index a5beb4ec7..b820e12fe 100644 --- a/src/java/org/apache/fop/pdf/PDFFactory.java +++ b/src/java/org/apache/fop/pdf/PDFFactory.java @@ -1227,10 +1227,23 @@ public class PDFFactory { return preRegisteredfont; } + boolean forceToUnicode = true; + if (descriptor == null) { //Usually Base 14 fonts PDFFont font = new PDFFont(fontname, FontType.TYPE1, basefont, encoding); getDocument().registerObject(font); + if (forceToUnicode && !PDFEncoding.isPredefinedEncoding(encoding)) { + SingleByteEncoding mapping; + if (encoding != null) { + mapping = CodePointMapping.getMapping(encoding); + } else { + //for Symbol and ZapfDingbats where encoding must be null in PDF + Typeface tf = (Typeface)metrics; + mapping = CodePointMapping.getMapping(tf.getEncodingName()); + } + generateToUnicodeCmap(font, mapping); + } return font; } else { FontType fonttype = metrics.getFontType(); @@ -1266,7 +1279,7 @@ public class PDFFactory { "fop-ucs-H", new PDFCIDSystemInfo("Adobe", "Identity", - 0)); + 0), false); getDocument().registerObject(cmap); ((PDFFontType0)font).setCMAP(cmap); ((PDFFontType0)font).setDescendantFonts(cidFont); @@ -1290,8 +1303,13 @@ public class PDFFactory { SingleByteEncoding mapping = singleByteFont.getEncoding(); if (singleByteFont.isSymbolicFont()) { //no encoding, use the font's encoding + if (forceToUnicode) { + generateToUnicodeCmap(nonBase14, mapping); + } } else if (PDFEncoding.isPredefinedEncoding(mapping.getName())) { font.setEncoding(mapping.getName()); + //No ToUnicode CMap necessary if PDF 1.4, chapter 5.9 (page 368) is to be + //believed. } else { Object pdfEncoding = createPDFEncoding(mapping, singleByteFont.getFontName()); @@ -1300,16 +1318,9 @@ public class PDFFactory { } else { font.setEncoding((String)pdfEncoding); } - - /* JM: What I thought would be a necessity with custom encodings turned out to - * be a bug in Adobe Acrobat 8. The following section just demonstrates how - * to generate a ToUnicode CMap for a Type 1 font. - PDFCMap cmap = new PDFToUnicodeCMap(mapping.getUnicodeCharMap(), - "fop-ucs-H", - new PDFCIDSystemInfo("Adobe", "Identity", 0)); - getDocument().registerObject(cmap); - nonBase14.setToUnicode(cmap); - */ + if (forceToUnicode) { + generateToUnicodeCmap(nonBase14, mapping); + } } //Handle additional encodings (characters outside the primary encoding) @@ -1330,6 +1341,9 @@ public class PDFFactory { new PDFArray(null, singleByteFont.getAdditionalWidths(i))); getDocument().registerObject(addFont); getDocument().getResources().addFont(addFont); + if (forceToUnicode) { + generateToUnicodeCmap(addFont, addEncoding); + } } } } @@ -1338,6 +1352,14 @@ public class PDFFactory { } } + private void generateToUnicodeCmap(PDFFont font, SingleByteEncoding encoding) { + PDFCMap cmap = new PDFToUnicodeCMap(encoding.getUnicodeCharMap(), + "fop-ucs-H", + new PDFCIDSystemInfo("Adobe", "Identity", 0), true); + getDocument().registerObject(cmap); + font.setToUnicode(cmap); + } + /** * Creates a PDFEncoding instance from a CodePointMapping instance. * @param encoding the code point mapping (encoding) diff --git a/src/java/org/apache/fop/pdf/PDFFont.java b/src/java/org/apache/fop/pdf/PDFFont.java index 6ba6ab9a2..2808e5ba7 100644 --- a/src/java/org/apache/fop/pdf/PDFFont.java +++ b/src/java/org/apache/fop/pdf/PDFFont.java @@ -85,6 +85,14 @@ public class PDFFont extends PDFDictionary { } } + /** + * Sets a ToUnicode CMap. + * @param cmap the ToUnicode character map + */ + public void setToUnicode(PDFCMap cmap) { + put("ToUnicode", cmap); + } + /** * factory method with the basic parameters * diff --git a/src/java/org/apache/fop/pdf/PDFFontNonBase14.java b/src/java/org/apache/fop/pdf/PDFFontNonBase14.java index 9b6cef618..79ed10f48 100644 --- a/src/java/org/apache/fop/pdf/PDFFontNonBase14.java +++ b/src/java/org/apache/fop/pdf/PDFFontNonBase14.java @@ -71,14 +71,6 @@ public abstract class PDFFontNonBase14 extends PDFFont { return (PDFFontDescriptor)get("FontDescriptor"); } - /** - * Sets a ToUnicode CMap. - * @param cmap the ToUnicode character map - */ - public void setToUnicode(PDFCMap cmap) { - put("ToUnicode", cmap); - } - /** {@inheritDoc} */ protected void validate() { if (getDocumentSafely().getProfile().isFontEmbeddingRequired()) { diff --git a/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java b/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java index b70430af4..95999b73f 100644 --- a/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java +++ b/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java @@ -43,6 +43,8 @@ public class PDFToUnicodeCMap extends PDFCMap { */ protected char[] unicodeCharMap; + private boolean singleByte; + /** * Constructor. * @@ -51,10 +53,17 @@ public class PDFToUnicodeCMap extends PDFCMap { * @param name One of the registered names found in Table 5.14 in PDF * Reference, Second Edition. * @param sysInfo The attributes of the character collection of the CIDFont. + * @param singleByte true for single-byte, false for double-byte */ - public PDFToUnicodeCMap(char[] unicodeCharMap, String name, PDFCIDSystemInfo sysInfo) { + public PDFToUnicodeCMap(char[] unicodeCharMap, String name, PDFCIDSystemInfo sysInfo, + boolean singleByte) { super(name, sysInfo); + if (singleByte && unicodeCharMap.length > 256) { + throw new IllegalArgumentException("unicodeCharMap may not contain more than" + + " 256 characters for single-byte encodings"); + } this.unicodeCharMap = unicodeCharMap; + this.singleByte = singleByte; } /** {@inheritDoc} */ @@ -78,7 +87,7 @@ public class PDFToUnicodeCMap extends PDFCMap { writeCIDSystemInfo("Adobe", "UCS", 0); writeName("Adobe-Identity-UCS"); writeType("2"); - writeCodeSpaceRange(); + writeCodeSpaceRange(singleByte); writeBFEntries(); writeWrapUp(); } @@ -122,7 +131,7 @@ public class PDFToUnicodeCMap extends PDFCMap { while (partOfRange(charArray, charIndex)) { charIndex++; } - writer.write("<" + padHexString(Integer.toHexString(charIndex), 4) + "> "); + writer.write("<" + padCharIndex(charIndex) + "> "); writer.write("<" + padHexString(Integer.toHexString(charArray[charIndex]), 4) + ">\n"); charIndex++; @@ -132,6 +141,10 @@ public class PDFToUnicodeCMap extends PDFCMap { } while (remainingEntries > 0); } + private String padCharIndex(int charIndex) { + return padHexString(Integer.toHexString(charIndex), (singleByte ? 2 : 4)); + } + /** * Writes the entries for character ranges for a base font. * @param p StringBuffer to write to @@ -159,9 +172,9 @@ public class PDFToUnicodeCMap extends PDFCMap { while (!startOfRange(charArray, charIndex)) { charIndex++; } - writer.write("<" + padHexString(Integer.toHexString(charIndex), 4) + "> "); + writer.write("<" + padCharIndex(charIndex) + "> "); writer.write("<" - + padHexString(Integer.toHexString(endOfRange(charArray, charIndex)), 4) + + padCharIndex(endOfRange(charArray, charIndex)) + "> "); writer.write("<" + padHexString(Integer.toHexString(charArray[charIndex]), 4) + ">\n"); diff --git a/status.xml b/status.xml index 9b1ad2e56..2a3f7a20f 100644 --- a/status.xml +++ b/status.xml @@ -53,6 +53,10 @@ + + FOP now creates ToUnicode CMaps for single-byte fonts that don't use built-in + encodings to help PDF text extractors interpreting characters. + Added support for forcing single-byte encodings for TrueType fonts without creating an XML font metric file (see "encoding-mode" attribute on "font" element) -- 2.39.5