aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jeremias Maerki <jeremias@apache.org> 2009-01-05 07:47:02 +0000
committer: Jeremias Maerki <jeremias@apache.org> 2009-01-05 07:47:02 +0000
commit: 68dd1750c65ec6ce5475f2f12ea63f6495a708ad (patch)
tree: dd3de5418809c273cbc5ad2a918f5115afa8cfd5
parent: 1b03a90eb44c5090c357c38c7606f795a63bb3f0 (diff)
download: xmlgraphics-fop-68dd1750c65ec6ce5475f2f12ea63f6495a708ad.tar.gz
download: xmlgraphics-fop-68dd1750c65ec6ce5475f2f12ea63f6495a708ad.zip
FOP now creates ToUnicode CMaps for single-byte fonts that don't use built-in encodings to help PDF text extractors interpret characters.
PDF CMaps now support single-byte characters. git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@731479 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- src/java/org/apache/fop/fonts/AbstractCodePointMapping.java 7
-rw-r--r-- src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java 14
-rw-r--r-- src/java/org/apache/fop/fonts/SingleByteEncoding.java 8
-rw-r--r-- src/java/org/apache/fop/pdf/CMapBuilder.java 10
-rw-r--r-- src/java/org/apache/fop/pdf/PDFFactory.java 44
-rw-r--r-- src/java/org/apache/fop/pdf/PDFFont.java 8
-rw-r--r-- src/java/org/apache/fop/pdf/PDFFontNonBase14.java 8
-rw-r--r-- src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java 23
-rw-r--r-- status.xml 4
9 files changed, 95 insertions, 31 deletions
diff --git a/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java b/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
index 56558b02e..f03d4beab 100644
--- a/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
+++ b/src/java/org/apache/fop/fonts/AbstractCodePointMapping.java
@@ -189,12 +189,7 @@ public class AbstractCodePointMapping implements SingleByteEncoding {
return this.unicodeMap[idx];
}
- /**
- * Returns a character array with Unicode scalar values which can be used to map encoding
- * code points to Unicode values. Note that this does not return all possible Unicode values
- * that the encoding maps.
- * @return a character array with Unicode scalar values
- */
+ /** {@inheritDoc} */
public final char[] getUnicodeCharMap() {
char[] copy = new char[this.unicodeMap.length];
System.arraycopy(this.unicodeMap, 0, copy, 0, this.unicodeMap.length);
diff --git a/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java b/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
index 6517f0328..d55529d58 100644
--- a/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
+++ b/src/java/org/apache/fop/fonts/SimpleSingleByteEncoding.java
@@ -25,6 +25,8 @@ import java.util.Map;
import org.apache.xmlgraphics.fonts.Glyphs;
+import org.apache.fop.util.CharUtilities;
+
/**
* A simple implementation of the OneByteEncoding mostly used for encodings that are constructed
* on-the-fly.
@@ -138,6 +140,18 @@ public class SimpleSingleByteEncoding implements SingleByteEncoding {
}
/** {@inheritDoc} */
+ public char[] getUnicodeCharMap() {
+ char[] map = new char[getLastChar() + 1];
+ for (int i = 0; i < getFirstChar(); i++) {
+ map[i] = CharUtilities.NOT_A_CHARACTER;
+ }
+ for (int i = getFirstChar(); i <= getLastChar(); i++) {
+ map[i] = getCharacterForIndex(i).getSingleUnicodeValue();
+ }
+ return map;
+ }
+
+ /** {@inheritDoc} */
public String toString() {
return getName() + " (" + getSize() + " chars)";
}
diff --git a/src/java/org/apache/fop/fonts/SingleByteEncoding.java b/src/java/org/apache/fop/fonts/SingleByteEncoding.java
index bc680f983..ad9d691f6 100644
--- a/src/java/org/apache/fop/fonts/SingleByteEncoding.java
+++ b/src/java/org/apache/fop/fonts/SingleByteEncoding.java
@@ -47,4 +47,12 @@ public interface SingleByteEncoding {
*/
String[] getCharNameMap();
+ /**
+ * Returns a character array with Unicode scalar values which can be used to map encoding
+ * code points to Unicode values. Note that this does not return all possible Unicode values
+ * that the encoding maps.
+ * @return a character array with Unicode scalar values
+ */
+ char[] getUnicodeCharMap();
+
}
diff --git a/src/java/org/apache/fop/pdf/CMapBuilder.java b/src/java/org/apache/fop/pdf/CMapBuilder.java
index 8e30ad04a..affb4fcad 100644
--- a/src/java/org/apache/fop/pdf/CMapBuilder.java
+++ b/src/java/org/apache/fop/pdf/CMapBuilder.java
@@ -110,8 +110,16 @@ public class CMapBuilder {
}
protected void writeCodeSpaceRange() throws IOException {
+ writeCodeSpaceRange(false);
+ }
+
+ protected void writeCodeSpaceRange(boolean singleByte) throws IOException {
writer.write("1 begincodespacerange\n");
- writer.write("<0000> <FFFF>\n");
+ if (singleByte) {
+ writer.write("<00> <FF>\n");
+ } else {
+ writer.write("<0000> <FFFF>\n");
+ }
writer.write("endcodespacerange\n");
}
diff --git a/src/java/org/apache/fop/pdf/PDFFactory.java b/src/java/org/apache/fop/pdf/PDFFactory.java
index a5beb4ec7..b820e12fe 100644
--- a/src/java/org/apache/fop/pdf/PDFFactory.java
+++ b/src/java/org/apache/fop/pdf/PDFFactory.java
@@ -1227,10 +1227,23 @@ public class PDFFactory {
return preRegisteredfont;
}
+ boolean forceToUnicode = true;
+
if (descriptor == null) {
//Usually Base 14 fonts
PDFFont font = new PDFFont(fontname, FontType.TYPE1, basefont, encoding);
getDocument().registerObject(font);
+ if (forceToUnicode && !PDFEncoding.isPredefinedEncoding(encoding)) {
+ SingleByteEncoding mapping;
+ if (encoding != null) {
+ mapping = CodePointMapping.getMapping(encoding);
+ } else {
+ //for Symbol and ZapfDingbats where encoding must be null in PDF
+ Typeface tf = (Typeface)metrics;
+ mapping = CodePointMapping.getMapping(tf.getEncodingName());
+ }
+ generateToUnicodeCmap(font, mapping);
+ }
return font;
} else {
FontType fonttype = metrics.getFontType();
@@ -1266,7 +1279,7 @@ public class PDFFactory {
"fop-ucs-H",
new PDFCIDSystemInfo("Adobe",
"Identity",
- 0));
+ 0), false);
getDocument().registerObject(cmap);
((PDFFontType0)font).setCMAP(cmap);
((PDFFontType0)font).setDescendantFonts(cidFont);
@@ -1290,8 +1303,13 @@ public class PDFFactory {
SingleByteEncoding mapping = singleByteFont.getEncoding();
if (singleByteFont.isSymbolicFont()) {
//no encoding, use the font's encoding
+ if (forceToUnicode) {
+ generateToUnicodeCmap(nonBase14, mapping);
+ }
} else if (PDFEncoding.isPredefinedEncoding(mapping.getName())) {
font.setEncoding(mapping.getName());
+ //No ToUnicode CMap necessary if PDF 1.4, chapter 5.9 (page 368) is to be
+ //believed.
} else {
Object pdfEncoding = createPDFEncoding(mapping,
singleByteFont.getFontName());
@@ -1300,16 +1318,9 @@ public class PDFFactory {
} else {
font.setEncoding((String)pdfEncoding);
}
-
- /* JM: What I thought would be a necessity with custom encodings turned out to
- * be a bug in Adobe Acrobat 8. The following section just demonstrates how
- * to generate a ToUnicode CMap for a Type 1 font.
- PDFCMap cmap = new PDFToUnicodeCMap(mapping.getUnicodeCharMap(),
- "fop-ucs-H",
- new PDFCIDSystemInfo("Adobe", "Identity", 0));
- getDocument().registerObject(cmap);
- nonBase14.setToUnicode(cmap);
- */
+ if (forceToUnicode) {
+ generateToUnicodeCmap(nonBase14, mapping);
+ }
}
//Handle additional encodings (characters outside the primary encoding)
@@ -1330,6 +1341,9 @@ public class PDFFactory {
new PDFArray(null, singleByteFont.getAdditionalWidths(i)));
getDocument().registerObject(addFont);
getDocument().getResources().addFont(addFont);
+ if (forceToUnicode) {
+ generateToUnicodeCmap(addFont, addEncoding);
+ }
}
}
}
@@ -1338,6 +1352,14 @@ public class PDFFactory {
}
}
+ private void generateToUnicodeCmap(PDFFont font, SingleByteEncoding encoding) {
+ PDFCMap cmap = new PDFToUnicodeCMap(encoding.getUnicodeCharMap(),
+ "fop-ucs-H",
+ new PDFCIDSystemInfo("Adobe", "Identity", 0), true);
+ getDocument().registerObject(cmap);
+ font.setToUnicode(cmap);
+ }
+
/**
* Creates a PDFEncoding instance from a CodePointMapping instance.
* @param encoding the code point mapping (encoding)
diff --git a/src/java/org/apache/fop/pdf/PDFFont.java b/src/java/org/apache/fop/pdf/PDFFont.java
index 6ba6ab9a2..2808e5ba7 100644
--- a/src/java/org/apache/fop/pdf/PDFFont.java
+++ b/src/java/org/apache/fop/pdf/PDFFont.java
@@ -86,6 +86,14 @@ public class PDFFont extends PDFDictionary {
}
/**
+ * Sets a ToUnicode CMap.
+ * @param cmap the ToUnicode character map
+ */
+ public void setToUnicode(PDFCMap cmap) {
+ put("ToUnicode", cmap);
+ }
+
+ /**
* factory method with the basic parameters
*
* @param fontname the internal name for the font
diff --git a/src/java/org/apache/fop/pdf/PDFFontNonBase14.java b/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
index 9b6cef618..79ed10f48 100644
--- a/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
+++ b/src/java/org/apache/fop/pdf/PDFFontNonBase14.java
@@ -71,14 +71,6 @@ public abstract class PDFFontNonBase14 extends PDFFont {
return (PDFFontDescriptor)get("FontDescriptor");
}
- /**
- * Sets a ToUnicode CMap.
- * @param cmap the ToUnicode character map
- */
- public void setToUnicode(PDFCMap cmap) {
- put("ToUnicode", cmap);
- }
-
/** {@inheritDoc} */
protected void validate() {
if (getDocumentSafely().getProfile().isFontEmbeddingRequired()) {
diff --git a/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java b/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
index b70430af4..95999b73f 100644
--- a/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
+++ b/src/java/org/apache/fop/pdf/PDFToUnicodeCMap.java
@@ -43,6 +43,8 @@ public class PDFToUnicodeCMap extends PDFCMap {
*/
protected char[] unicodeCharMap;
+ private boolean singleByte;
+
/**
* Constructor.
*
@@ -51,10 +53,17 @@ public class PDFToUnicodeCMap extends PDFCMap {
* @param name One of the registered names found in Table 5.14 in PDF
* Reference, Second Edition.
* @param sysInfo The attributes of the character collection of the CIDFont.
+ * @param singleByte true for single-byte, false for double-byte
*/
- public PDFToUnicodeCMap(char[] unicodeCharMap, String name, PDFCIDSystemInfo sysInfo) {
+ public PDFToUnicodeCMap(char[] unicodeCharMap, String name, PDFCIDSystemInfo sysInfo,
+ boolean singleByte) {
super(name, sysInfo);
+ if (singleByte && unicodeCharMap.length > 256) {
+ throw new IllegalArgumentException("unicodeCharMap may not contain more than"
+ + " 256 characters for single-byte encodings");
+ }
this.unicodeCharMap = unicodeCharMap;
+ this.singleByte = singleByte;
}
/** {@inheritDoc} */
@@ -78,7 +87,7 @@ public class PDFToUnicodeCMap extends PDFCMap {
writeCIDSystemInfo("Adobe", "UCS", 0);
writeName("Adobe-Identity-UCS");
writeType("2");
- writeCodeSpaceRange();
+ writeCodeSpaceRange(singleByte);
writeBFEntries();
writeWrapUp();
}
@@ -122,7 +131,7 @@ public class PDFToUnicodeCMap extends PDFCMap {
while (partOfRange(charArray, charIndex)) {
charIndex++;
}
- writer.write("<" + padHexString(Integer.toHexString(charIndex), 4) + "> ");
+ writer.write("<" + padCharIndex(charIndex) + "> ");
writer.write("<" + padHexString(Integer.toHexString(charArray[charIndex]), 4)
+ ">\n");
charIndex++;
@@ -132,6 +141,10 @@ public class PDFToUnicodeCMap extends PDFCMap {
} while (remainingEntries > 0);
}
+ private String padCharIndex(int charIndex) {
+ return padHexString(Integer.toHexString(charIndex), (singleByte ? 2 : 4));
+ }
+
/**
* Writes the entries for character ranges for a base font.
* @param p StringBuffer to write to
@@ -159,9 +172,9 @@ public class PDFToUnicodeCMap extends PDFCMap {
while (!startOfRange(charArray, charIndex)) {
charIndex++;
}
- writer.write("<" + padHexString(Integer.toHexString(charIndex), 4) + "> ");
+ writer.write("<" + padCharIndex(charIndex) + "> ");
writer.write("<"
- + padHexString(Integer.toHexString(endOfRange(charArray, charIndex)), 4)
+ + padCharIndex(endOfRange(charArray, charIndex))
+ "> ");
writer.write("<" + padHexString(Integer.toHexString(charArray[charIndex]), 4)
+ ">\n");
diff --git a/status.xml b/status.xml
index 9b1ad2e56..2a3f7a20f 100644
--- a/status.xml
+++ b/status.xml
@@ -54,6 +54,10 @@
<changes>
<release version="FOP Trunk" date="TBD">
<action context="Fonts" dev="JM" type="add">
+ FOP now creates ToUnicode CMaps for single-byte fonts that don't use built-in
+ encodings to help PDF text extractors interpret characters.
+ </action>
+ <action context="Fonts" dev="JM" type="add">
Added support for forcing single-byte encodings for TrueType fonts without
creating an XML font metric file (see "encoding-mode" attribute on "font" element)
</action>