source.dussan.org Git - xmlgraphics-fop.git/commitdiff
FOP-2287: support Tamil script (preliminary)
author Glenn Adams <gadams@apache.org>
Sun, 12 Oct 2014 04:18:37 +0000 (04:18 +0000)
committer Glenn Adams <gadams@apache.org>
Sun, 12 Oct 2014 04:18:37 +0000 (04:18 +0000)
git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@1631147 13f79535-47bb-0310-9956-ffa450edef68

src/java/org/apache/fop/complexscripts/fonts/GlyphDefinitionTable.java
src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java
src/java/org/apache/fop/complexscripts/scripts/DefaultScriptProcessor.java
src/java/org/apache/fop/complexscripts/scripts/GurmukhiScriptProcessor.java
src/java/org/apache/fop/complexscripts/scripts/IndicScriptProcessor.java
src/java/org/apache/fop/complexscripts/scripts/ScriptProcessor.java
src/java/org/apache/fop/complexscripts/scripts/TamilScriptProcessor.java [new file with mode: 0644]
src/java/org/apache/fop/complexscripts/util/CharNormalize.java [new file with mode: 0644]
src/java/org/apache/fop/fonts/MultiByteFont.java

index a5942536c0fec711d7df2ad8471c02383f646b5f..2ed1c2875f7184e0595d52e9604f2b4f58af6a74 100644 (file)
@@ -96,14 +96,15 @@ public class GlyphDefinitionTable extends GlyphTable {
      * method since when the segment is reversed by BIDI processing, marks are automatically reordered to precede
      * their base glyph.
      * @param gs an input glyph sequence
+     * @param widths associated advance widths (also reordered)
      * @param gpa associated glyph position adjustments (also reordered)
      * @param script a script identifier
      * @param language a language identifier
      * @return the reordered (output) glyph sequence
      */
-    public GlyphSequence reorderCombiningMarks(GlyphSequence gs, int[][] gpa, String script, String language) {
+    public GlyphSequence reorderCombiningMarks(GlyphSequence gs, int[] widths, int[][] gpa, String script, String language) {
         ScriptProcessor sp = ScriptProcessor.getInstance(script);
-        return sp.reorderCombiningMarks(this, gs, gpa, script, language);
+        return sp.reorderCombiningMarks(this, gs, widths, gpa, script, language);
     }
 
     /** {@inheritDoc} */
index 2753ca9451e6c14d6cd7027d3cd26449f7d61ca4..b108c5ebe189b3e9b69b3047b6c6e609459ba336 100644 (file)
@@ -141,7 +141,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor {
 
     /** {@inheritDoc} */
     @Override
-    public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[][] gpa, String script, String language) {
+    public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[] widths, int[][] gpa, String script, String language) {
         // a side effect of BIDI reordering is to order combining marks before their base, so we need to override the default here to
         // prevent double reordering
         return gs;
index ced4d4041f34e0d5ab83868de118cacb87349317..275fb20dc1acbacf2099fc8bbe2bb3fd1a35c376 100644 (file)
@@ -80,14 +80,15 @@ public class DefaultScriptProcessor extends ScriptProcessor {
 
     @Override
     /** {@inheritDoc} */
-    public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[][] gpa, String script, String language) {
+    public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[] unscaledWidths, int[][] gpa, String script, String language) {
         int   ng  = gs.getGlyphCount();
         int[] ga  = gs.getGlyphArray(false);
         int   nm  = 0;
         // count combining marks
         for (int i = 0; i < ng; i++) {
             int gid = ga [ i ];
-            if (gdef.isGlyphClass(gid, GlyphDefinitionTable.GLYPH_CLASS_MARK)) {
+            int gw = unscaledWidths [ i ];
+            if (isReorderedMark(gdef, ga, unscaledWidths, i)) {
                 nm++;
             }
         }
@@ -105,7 +106,7 @@ public class DefaultScriptProcessor extends ScriptProcessor {
                 int gid = ga [ i ];
                 int[] pa = (gpa != null) ? gpa [ i ] : null;
                 CharAssociation ca = aa [ i ];
-                if (gdef.isGlyphClass(gid, GlyphDefinitionTable.GLYPH_CLASS_MARK)) {
+                if (isReorderedMark(gdef, ga, unscaledWidths, i)) {
                     nga [ k ] = gid;
                     naa [ k ] = ca;
                     if (npa != null) {
@@ -149,4 +150,8 @@ public class DefaultScriptProcessor extends ScriptProcessor {
         }
     }
 
+    /**
+     * Determine whether the glyph at the given index takes part in combining mark reordering:
+     * it must be classified as a mark by the glyph definition table and have a non-zero
+     * unscaled width.
+     * @param gdef the glyph definition table used to classify the glyph
+     * @param glyphs the glyph identifiers
+     * @param unscaledWidths the unscaled widths, indexed in parallel with glyphs
+     * @param index index of the glyph to test
+     * @return true if the glyph is a mark with non-zero unscaled width
+     */
+    protected boolean isReorderedMark(GlyphDefinitionTable gdef, int[] glyphs, int[] unscaledWidths, int index) {
+        if (!gdef.isGlyphClass(glyphs[index], GlyphDefinitionTable.GLYPH_CLASS_MARK)) {
+            return false;
+        }
+        return unscaledWidths[index] != 0;
+    }
+
 }
index 94f5893a9e8e02cf693cd1de01447766023e4467..cc50761bbcaa14a97d21f88b8965d671ded08172 100644 (file)
@@ -22,7 +22,6 @@ package org.apache.fop.complexscripts.scripts;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
-import org.apache.fop.complexscripts.fonts.GlyphDefinitionTable;
 import org.apache.fop.complexscripts.util.CharAssociation;
 import org.apache.fop.complexscripts.util.GlyphSequence;
 
@@ -538,9 +537,4 @@ public class GurmukhiScriptProcessor extends IndicScriptProcessor {
         return hasFlag(c, C_N);
     }
 
-    @Override
-    public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[][] gpa, String script, String language) {
-        return super.reorderCombiningMarks(gdef, gs, gpa, script, language);
-    }
-
 }
index ad533b843033b796254e44352ac900355c111a03..01d5c6d70ecf268d1666405e36c023898af659e9 100644 (file)
@@ -126,6 +126,9 @@ public class IndicScriptProcessor extends DefaultScriptProcessor {
         case CharScript.SCRIPT_GURMUKHI:
         case CharScript.SCRIPT_GURMUKHI_2:
             return new GurmukhiScriptProcessor(script);
+        case CharScript.SCRIPT_TAMIL:
+        case CharScript.SCRIPT_TAMIL_2:
+            return new TamilScriptProcessor(script);
         // [TBD] implement other script processors
         default:
             return new IndicScriptProcessor(script);
@@ -460,22 +463,30 @@ public class IndicScriptProcessor extends DefaultScriptProcessor {
             Vector<Segment> sv = new Vector<Segment>(nc);
             for (int s = 0, e = nc; s < e; ) {
                 int i;
-                if ((i = findStartOfSyllable(ca, s, e)) > s) {
-                    // from s to i is non-syllable segment
-                    sv.add(new Segment(s, i, Segment.OTHER));
+                if ((i = findStartOfSyllable(ca, s, e)) < e) {
+                    if (s < i) {
+                        // from s to i is non-syllable segment
+                        sv.add(new Segment(s, i, Segment.OTHER));
+                    }
                     s = i; // move s to start of syllable
-                } else if (i > s) {
-                    // from s to e is non-syllable segment
-                    sv.add(new Segment(s, e, Segment.OTHER));
+                } else {
+                    if (s < e) {
+                        // from s to e is non-syllable segment
+                        sv.add(new Segment(s, e, Segment.OTHER));
+                    }
                     s = e; // move s to end of input sequence
                 }
                 if ((i = findEndOfSyllable(ca, s, e)) > s) {
-                    // from s to i is syllable segment
-                    sv.add(new Segment(s, i, Segment.SYLLABLE));
+                    if (s < i) {
+                        // from s to i is syllable segment
+                        sv.add(new Segment(s, i, Segment.SYLLABLE));
+                    }
                     s = i; // move s to end of syllable
                 } else {
-                    // from s to e is non-syllable segment
-                    sv.add(new Segment(s, e, Segment.OTHER));
+                    if (s < e) {
+                        // from s to e is non-syllable segment
+                        sv.add(new Segment(s, e, Segment.OTHER));
+                    }
                     s = e; // move s to end of input sequence
                 }
             }
index cfcc4ff59c4fcf86d22b7260cd6b916ccadcadb1..d492045f1e0d5997378275a1cbe87d2b4b758ce7 100644 (file)
@@ -123,12 +123,13 @@ public abstract class ScriptProcessor {
      * their base glyph.
      * @param gdef the glyph definition table that applies
      * @param gs an input glyph sequence
+     * @param unscaledWidths associated unscaled advance widths (also reordered)
      * @param gpa associated glyph position adjustments (also reordered)
      * @param script a script identifier
      * @param language a language identifier
      * @return the reordered (output) glyph sequence
      */
-    public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[][] gpa, String script, String language) {
+    public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[] unscaledWidths, int[][] gpa, String script, String language) {
         return gs;
     }
 
diff --git a/src/java/org/apache/fop/complexscripts/scripts/TamilScriptProcessor.java b/src/java/org/apache/fop/complexscripts/scripts/TamilScriptProcessor.java
new file mode 100644 (file)
index 0000000..6df0bf8
--- /dev/null
@@ -0,0 +1,542 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id$ */
+
+package org.apache.fop.complexscripts.scripts;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.fop.complexscripts.util.CharAssociation;
+import org.apache.fop.complexscripts.util.GlyphSequence;
+
+// CSOFF: LineLengthCheck
+
+/**
+ * <p>The <code>TamilScriptProcessor</code> class implements a script processor for
+ * performing glyph substitution and positioning operations on content associated with the Tamil script.</p>
+ *
+ * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
+ */
+public class TamilScriptProcessor extends IndicScriptProcessor {
+
+    /** logging instance */
+    private static final Log log = LogFactory.getLog(TamilScriptProcessor.class);
+
+    // package-private constructor; hands the script identifier to the IndicScriptProcessor constructor
+    TamilScriptProcessor(String script) {
+        super(script);
+    }
+
+    @Override
+    // select the Tamil specific syllabizer defined as a nested class below
+    protected Class<? extends TamilSyllabizer> getSyllabizerClass() {
+        return TamilSyllabizer.class;
+    }
+
+    @Override
+    // scan glyphs from right to left and return the index of the first (rightmost) glyph
+    // containing a pre-base matra, or -1 if there is none
+    protected int findPreBaseMatra(GlyphSequence gs) {
+        for (int k = gs.getGlyphCount() - 1; k >= 0; k--) {
+            if (containsPreBaseMatra(gs, k)) {
+                return k;
+            }
+        }
+        return -1;
+    }
+
+    @Override
+    // find leftmost pre-base matra target, starting from source; scans glyphs leftwards
+    // from index (source - 1), or from the last glyph if source is not less than the glyph
+    // count, and returns -1 if no consonant-bearing glyph is encountered
+    protected int findPreBaseMatraTarget(GlyphSequence gs, int source) {
+        int   ng = gs.getGlyphCount();
+        int   lk = -1;
+        for (int i = (source < ng) ? source : ng; i > 0; i--) {
+            int k = i - 1;
+            if (containsConsonant(gs, k)) {
+                if (containsHalfConsonant(gs, k)) {
+                    // a half consonant always moves the target further left
+                    lk = k;
+                } else if (lk == -1) {
+                    // first non-half consonant seen while no target has been recorded yet
+                    lk = k;
+                } else {
+                    // a non-half consonant reached after a target was recorded ends the scan
+                    break;
+                }
+            }
+        }
+        return lk;
+    }
+
+    // test whether any character associated with glyph k is a pre-base matra
+    private static boolean containsPreBaseMatra(GlyphSequence gs, int k) {
+        CharAssociation assoc = gs.getAssociation(k);
+        int[] chars = gs.getCharacterArray(false);
+        int index = assoc.getStart();
+        int limit = assoc.getEnd();
+        while (index < limit) {
+            if (isPreM(chars [ index ])) {
+                return true;
+            }
+            index++;
+        }
+        return false;
+    }
+
+    // test whether any character associated with glyph k is a consonant
+    private static boolean containsConsonant(GlyphSequence gs, int k) {
+        CharAssociation assoc = gs.getAssociation(k);
+        int[] chars = gs.getCharacterArray(false);
+        int index = assoc.getStart();
+        int limit = assoc.getEnd();
+        while (index < limit) {
+            if (isC(chars [ index ])) {
+                return true;
+            }
+            index++;
+        }
+        return false;
+    }
+
+    // test whether the association of glyph k carries a "half" predication whose value is true;
+    // an absent predication counts as false
+    private static boolean containsHalfConsonant(GlyphSequence gs, int k) {
+        Boolean flag = (Boolean) gs.getAssociation(k).getPredication("half");
+        if (flag == null) {
+            return false;
+        }
+        return flag.booleanValue();
+    }
+
+    @Override
+    // return the index of the first (leftmost) glyph whose association is marked as reph,
+    // or -1 if there is none
+    protected int findReph(GlyphSequence gs) {
+        int count = gs.getGlyphCount();
+        int k = 0;
+        while (k < count) {
+            if (containsReph(gs, k)) {
+                return k;
+            }
+            k++;
+        }
+        return -1;
+    }
+
+    @Override
+    // find the glyph index associated with the reph located at source by computing two
+    // candidate positions; falls back to source itself when neither candidate is found
+    protected int findRephTarget(GlyphSequence gs, int source) {
+        int   ng = gs.getGlyphCount();
+        int   c1 = -1;
+        int   c2 = -1;
+        // first candidate target is after first non-half consonant
+        for (int i = 0; i < ng; i++) {
+            if ((i != source) && containsConsonant(gs, i)) {
+                if (!containsHalfConsonant(gs, i)) {
+                    c1 = i + 1;
+                    break;
+                }
+            }
+        }
+        // second candidate target is after last non-prebase matra after first candidate or before first syllable or vedic mark
+        for (int i = (c1 >= 0) ? c1 : 0; i < ng; i++) {
+            if (containsMatra(gs, i) && !containsPreBaseMatra(gs, i)) {
+                c2 = i + 1;
+            } else if (containsOtherMark(gs, i)) {
+                c2 = i;
+                break;
+            }
+        }
+        // the second candidate takes precedence over the first
+        if (c2 >= 0) {
+            return c2;
+        } else if (c1 >= 0) {
+            return c1;
+        } else {
+            return source;
+        }
+    }
+
+    // test whether the association of glyph k carries an "rphf" predication whose value is true;
+    // an absent predication counts as false
+    private static boolean containsReph(GlyphSequence gs, int k) {
+        Boolean flag = (Boolean) gs.getAssociation(k).getPredication("rphf");
+        if (flag == null) {
+            return false;
+        }
+        return flag.booleanValue();
+    }
+
+    // test whether any character associated with glyph k is a matra (vowel sign)
+    private static boolean containsMatra(GlyphSequence gs, int k) {
+        CharAssociation assoc = gs.getAssociation(k);
+        int[] chars = gs.getCharacterArray(false);
+        int index = assoc.getStart();
+        int limit = assoc.getEnd();
+        while (index < limit) {
+            if (isM(chars [ index ])) {
+                return true;
+            }
+            index++;
+        }
+        return false;
+    }
+
+    // test whether any character associated with glyph k is a tone mark (C_T), an accent
+    // mark (C_A), or an other sign (C_O)
+    private static boolean containsOtherMark(GlyphSequence gs, int k) {
+        CharAssociation assoc = gs.getAssociation(k);
+        int[] chars = gs.getCharacterArray(false);
+        int limit = assoc.getEnd();
+        for (int index = assoc.getStart(); index < limit; index++) {
+            int type = typeOf(chars [ index ]);
+            if ((type == C_T) || (type == C_A) || (type == C_O)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private static class TamilSyllabizer extends DefaultSyllabizer {
+        TamilSyllabizer(String script, String language) {
+            super(script, language);
+        }
+        @Override
+        // | C ...
+        // advance to the first consonant in [s,e); yields e when no consonant is present and
+        // -1 when s is negative or the range is empty
+        protected int findStartOfSyllable(int[] ca, int s, int e) {
+            if ((s < 0) || (s >= e)) {
+                return -1;
+            }
+            int i = s;
+            while ((i < e) && !isC(ca [ i ])) {
+                i++;
+            }
+            return i;
+        }
+        @Override
+        // D* L? | ...
+        // returns the index just past the syllable that starts at s, or -1 when s is negative,
+        // the range is empty, or neither a dead nor a live consonant could be matched
+        protected int findEndOfSyllable(int[] ca, int s, int e) {
+            if ((s < 0) || (s >= e)) {
+                return -1;
+            } else {
+                int nd = 0; // number of dead consonants consumed
+                int nl = 0; // number of live consonants consumed (at most one)
+                int i;
+                // consume dead consonants
+                while ((i = isDeadConsonant(ca, s, e)) > s) {
+                    s = i;
+                    nd++;
+                }
+                // consume zero or one live consonant
+                if ((i = isLiveConsonant(ca, s, e)) > s) {
+                    s = i;
+                    nl++;
+                }
+                return ((nd > 0) || (nl > 0)) ? s : -1;
+            }
+        }
+        // D := ( C N? H )?
+        // Attempts to match a single dead consonant (consonant, optional nukta, halant)
+        // starting at index s. Returns the index just past the match, or -1 when s is
+        // negative or no consonant followed by a halant is found before e.
+        private int isDeadConsonant(int[] ca, int s, int e) {
+            if (s < 0) {
+                return -1;
+            } else {
+                int c;
+                int i = 0;  // number of characters consumed so far
+                int nc = 0; // number of consonants matched
+                int nh = 0; // number of halants matched
+                // single pass block; 'break' abandons the remaining steps
+                do {
+                    // C
+                    if ((s + i) < e) {
+                        c = ca [ s + i ];
+                        if (isC(c)) {
+                            i++;
+                            nc++;
+                        } else {
+                            break;
+                        }
+                    }
+                    // N?
+                    if ((s + i) < e) {
+                        // index by the running offset i, like the C and H steps, rather than a fixed offset of 1
+                        c = ca [ s + i ];
+                        if (isN(c)) {
+                            i++;
+                        }
+                    }
+                    // H
+                    if ((s + i) < e) {
+                        c = ca [ s + i ];
+                        if (isH(c)) {
+                            i++;
+                            nh++;
+                        } else {
+                            break;
+                        }
+                    }
+                } while (false);
+                // both a consonant and a halant are required for a dead consonant
+                return (nc > 0) && (nh > 0) ? s + i : -1;
+            }
+        }
+        // L := ( (C|V) N? X* )?; where X = ( MATRA | ACCENT MARK | TONE MARK | OTHER MARK )
+        // Attempts to match a single live consonant (or independent vowel) with optional nukta
+        // and trailing marks, starting at index s. Returns the index just past the match, or
+        // -1 when s is negative or nothing acceptable was matched.
+        private int isLiveConsonant(int[] ca, int s, int e) {
+            if (s < 0) {
+                return -1;
+            } else {
+                int c;
+                int i = 0;  // number of characters consumed so far
+                int nc = 0; // number of consonants matched
+                int nv = 0; // number of vowels matched
+                int nx = 0; // number of trailing marks (X) matched
+                // single pass block; 'break' abandons the remaining steps
+                do {
+                    // C
+                    if ((s + i) < e) {
+                        c = ca [ s + i ];
+                        if (isC(c)) {
+                            i++;
+                            nc++;
+                        } else if (isV(c)) {
+                            i++;
+                            nv++;
+                        } else {
+                            break;
+                        }
+                    }
+                    // N?
+                    if ((s + i) < e) {
+                        c = ca [ s + i ];
+                        if (isN(c)) {
+                            i++;
+                        }
+                    }
+                    // X*
+                    while ((s + i) < e) {
+                        c = ca [ s + i ];
+                        if (isX(c)) {
+                            i++;
+                            nx++;
+                        } else {
+                            break;
+                        }
+                    }
+                } while (false);
+                // if no X but has H, then ignore C|V; the decrement below makes the final
+                // test fail so that -1 is returned
+                if (nx == 0) {
+                    if ((s + i) < e) {
+                        c = ca [ s + i ];
+                        if (isH(c)) {
+                            if (nc > 0) {
+                                nc--;
+                            } else if (nv > 0) {
+                                nv--;
+                            }
+                        }
+                    }
+                }
+                return ((nc > 0) || (nv > 0)) ? s + i : -1;
+            }
+        }
+    }
+
+    // tamil character types
+    static final short C_U          = 0;                        // unassigned
+    static final short C_C          = 1;                        // consonant
+    static final short C_V          = 2;                        // vowel
+    static final short C_M          = 3;                        // vowel sign (matra)
+    static final short C_S          = 4;                        // symbol or sign
+    static final short C_T          = 5;                        // tone mark
+    static final short C_A          = 6;                        // accent mark
+    static final short C_P          = 7;                        // punctuation
+    static final short C_D          = 8;                        // digit
+    static final short C_H          = 9;                        // halant (virama)
+    static final short C_O          = 10;                       // other signs
+    static final short C_N          = 0x0100;                   // nukta(ized)
+    static final short C_R          = 0x0200;                   // reph(ized)
+    static final short C_PRE        = 0x0400;                   // pre-base
+    static final short C_POST       = 0x1000;                   // post-base
+    static final short C_WRAP       = C_PRE | C_POST;           // wrap (two part) vowel
+    static final short C_M_TYPE     = 0x00FF;                   // type mask
+    static final short C_M_FLAGS    = 0x7F00;                   // flag mask
+    // tamil block range
+    static final int CCA_START       =  0x0B80;                 // first code point mapped by cca
+    static final int CCA_END         =  0x0C00;                 // last code point + 1 mapped by cca
+    // tamil character type lookups
+    static final short[] CCA = {
+        C_U,                        // 0x0B80                   //
+        C_U,                        // 0x0B81                   //
+        C_O,                        // 0x0B82                   // ANUSVARA
+        C_O,                        // 0x0B83                   // VISARGA
+        C_U,                        // 0x0B84                   //
+        C_V,                        // 0x0B85                   // A
+        C_V,                        // 0x0B86                   // AA
+        C_V,                        // 0x0B87                   // I
+        C_V,                        // 0x0B88                   // II
+        C_V,                        // 0x0B89                   // U
+        C_V,                        // 0x0B8A                   // UU
+        C_U,                        // 0x0B8B                   //
+        C_U,                        // 0x0B8C                   //
+        C_U,                        // 0x0B8D                   //
+        C_V,                        // 0x0B8E                   // E
+        C_V,                        // 0x0B8F                   // EE
+        C_V,                        // 0x0B90                   // AI
+        C_U,                        // 0x0B91                   //
+        C_V,                        // 0x0B92                   // O
+        C_V,                        // 0x0B93                   // OO
+        C_V,                        // 0x0B94                   // AU
+        C_C,                        // 0x0B95                   // KA
+        C_U,                        // 0x0B96                   //
+        C_U,                        // 0x0B97                   //
+        C_U,                        // 0x0B98                   //
+        C_C,                        // 0x0B99                   // NGA
+        C_C,                        // 0x0B9A                   // CA
+        C_U,                        // 0x0B9B                   //
+        C_C,                        // 0x0B9C                   // JA
+        C_U,                        // 0x0B9D                   //
+        C_C,                        // 0x0B9E                   // NYA
+        C_C,                        // 0x0B9F                   // TTA
+        C_U,                        // 0x0BA0                   //
+        C_U,                        // 0x0BA1                   //
+        C_U,                        // 0x0BA2                   //
+        C_C,                        // 0x0BA3                   // NNA
+        C_C,                        // 0x0BA4                   // TA
+        C_U,                        // 0x0BA5                   //
+        C_U,                        // 0x0BA6                   //
+        C_U,                        // 0x0BA7                   //
+        C_C,                        // 0x0BA8                   // NA
+        C_C,                        // 0x0BA9                   // NNNA
+        C_C,                        // 0x0BAA                   // PA
+        C_U,                        // 0x0BAB                   //
+        C_U,                        // 0x0BAC                   //
+        C_U,                        // 0x0BAD                   //
+        C_C,                        // 0x0BAE                   // MA
+        C_C,                        // 0x0BAF                   // YA
+        C_C | C_R,                  // 0x0BB0                   // RA
+        C_C | C_R,                  // 0x0BB1                   // RRA
+        C_C,                        // 0x0BB2                   // LA
+        C_C,                        // 0x0BB3                   // LLA
+        C_C,                        // 0x0BB4                   // LLLA
+        C_C,                        // 0x0BB5                   // VA
+        C_C,                        // 0x0BB6                   // SHA
+        C_C,                        // 0x0BB7                   // SSA
+        C_C,                        // 0x0BB8                   // SA
+        C_C,                        // 0x0BB9                   // HA
+        C_U,                        // 0x0BBA                   //
+        C_U,                        // 0x0BBB                   //
+        C_U,                        // 0x0BBC                   //
+        C_U,                        // 0x0BBD                   //
+        C_M,                        // 0x0BBE                   // AA
+        C_M,                        // 0x0BBF                   // I
+        C_M,                        // 0x0BC0                   // II
+        C_M,                        // 0x0BC1                   // U
+        C_M,                        // 0x0BC2                   // UU
+        C_U,                        // 0x0BC3                   //
+        C_U,                        // 0x0BC4                   //
+        C_U,                        // 0x0BC5                   //
+        C_M | C_PRE,                // 0x0BC6                   // E
+        C_M | C_PRE,                // 0x0BC7                   // EE
+        C_M | C_PRE,                // 0x0BC8                   // AI
+        C_U,                        // 0x0BC9                   //
+        C_M | C_WRAP,               // 0x0BCA                   // O
+        C_M | C_WRAP,               // 0x0BCB                   // OO
+        C_M | C_WRAP,               // 0x0BCC                   // AU
+        C_H,                        // 0x0BCD                   // VIRAMA (HALANT)
+        C_U,                        // 0x0BCE                   //
+        C_U,                        // 0x0BCF                   //
+        C_S,                        // 0x0BD0                   // OM
+        C_U,                        // 0x0BD1                   //
+        C_U,                        // 0x0BD2                   //
+        C_U,                        // 0x0BD3                   //
+        C_U,                        // 0x0BD4                   //
+        C_U,                        // 0x0BD5                   //
+        C_U,                        // 0x0BD6                   //
+        C_M,                        // 0x0BD7                   // AU LENGTH MARK
+        C_U,                        // 0x0BD8                   //
+        C_U,                        // 0x0BD9                   //
+        C_U,                        // 0x0BDA                   //
+        C_U,                        // 0x0BDB                   //
+        C_U,                        // 0x0BDC                   //
+        C_U,                        // 0x0BDD                   //
+        C_U,                        // 0x0BDE                   //
+        C_U,                        // 0x0BDF                   //
+        C_U,                        // 0x0BE0                   //
+        C_U,                        // 0x0BE1                   //
+        C_U,                        // 0x0BE2                   //
+        C_U,                        // 0x0BE3                   //
+        C_U,                        // 0x0BE4                   //
+        C_U,                        // 0x0BE5                   //
+        C_D,                        // 0x0BE6                   // ZERO
+        C_D,                        // 0x0BE7                   // ONE
+        C_D,                        // 0x0BE8                   // TWO
+        C_D,                        // 0x0BE9                   // THREE
+        C_D,                        // 0x0BEA                   // FOUR
+        C_D,                        // 0x0BEB                   // FIVE
+        C_D,                        // 0x0BEC                   // SIX
+        C_D,                        // 0x0BED                   // SEVEN
+        C_D,                        // 0x0BEE                   // EIGHT
+        C_D,                        // 0x0BEF                   // NINE
+        C_S,                        // 0x0BF0                   // TEN
+        C_S,                        // 0x0BF1                   // ONE HUNDRED
+        C_S,                        // 0x0BF2                   // ONE THOUSAND
+        C_S,                        // 0x0BF3                   // DAY SIGN (naal)
+        C_S,                        // 0x0BF4                   // MONTH SIGN (maatham)
+        C_S,                        // 0x0BF5                   // YEAR SIGN (varudam)
+        C_S,                        // 0x0BF6                   // DEBIT SIGN (patru)
+        C_S,                        // 0x0BF7                   // CREDIT SIGN (varavu)
+        C_S,                        // 0x0BF8                   // AS ABOVE SIGN (merpadi)
+        C_S,                        // 0x0BF9                   // RUPEE SIGN (rupai)
+        C_S,                        // 0x0BFA                   // NUMBER SIGN (enn)
+        C_U,                        // 0x0BFB                   //
+        C_U,                        // 0x0BFC                   //
+        C_U,                        // 0x0BFD                   //
+        C_U,                        // 0x0BFE                   //
+        C_U                         // 0x0BFF                   //
+    };
+    // look up the base type (flag bits masked off) of code point c; code points outside
+    // [CCA_START,CCA_END) are reported as unassigned (C_U)
+    static int typeOf(int c) {
+        boolean mapped = (c >= CCA_START) && (c < CCA_END);
+        return mapped ? (CCA [ c - CCA_START ] & C_M_TYPE) : C_U;
+    }
+    // test whether the base type of code point c, as reported by typeOf, equals t
+    static boolean isType(int c, int t) {
+        return typeOf(c) == t;
+    }
+    // test whether every bit of f is set in the table entry for code point c; code points
+    // outside [CCA_START,CCA_END) have no flags
+    static boolean hasFlag(int c, int f) {
+        if ((c < CCA_START) || (c >= CCA_END)) {
+            return false;
+        }
+        int bits = CCA [ c - CCA_START ];
+        return (bits & f) == f;
+    }
+    // test whether c has base type consonant (C_C)
+    static boolean isC(int c) {
+        return isType(c, C_C);
+    }
+    // test whether c is a consonant (C_C) that also carries the reph flag (C_R)
+    static boolean isR(int c) {
+        return isType(c, C_C) && hasR(c);
+    }
+    // test whether c has base type vowel (C_V)
+    static boolean isV(int c) {
+        return isType(c, C_V);
+    }
+    // test whether c is a nukta sign
+    // NOTE(review): 0x093C is DEVANAGARI SIGN NUKTA, and the lookup table in this class types
+    // no code point as a nukta, so this presumably never matches Tamil text - confirm intent
+    static boolean isN(int c) {
+        return c == 0x093C;
+    }
+    // test whether c is the Tamil virama (halant), U+0BCD, which the lookup table in this
+    // class types as C_H; the Devanagari virama (U+094D) does not occur in the Tamil block
+    static boolean isH(int c) {
+        return c == 0x0BCD;
+    }
+    // test whether c has base type vowel sign / matra (C_M)
+    static boolean isM(int c) {
+        return isType(c, C_M);
+    }
+    // test whether c is a matra (C_M) that carries the pre-base flag (C_PRE); wrap (two part)
+    // matras qualify as well since C_WRAP includes C_PRE
+    static boolean isPreM(int c) {
+        return isType(c, C_M) && hasFlag(c, C_PRE);
+    }
+    // test whether c is one of the marks that may trail a live consonant: matra (C_M),
+    // accent mark (C_A), tone mark (C_T), or other (modifying) mark (C_O)
+    static boolean isX(int c) {
+        int type = typeOf(c);
+        return (type == C_M) || (type == C_A) || (type == C_T) || (type == C_O);
+    }
+    // test whether the table entry for c carries the reph flag (C_R)
+    static boolean hasR(int c) {
+        return hasFlag(c, C_R);
+    }
+    // test whether the table entry for c carries the nukta flag (C_N)
+    static boolean hasN(int c) {
+        return hasFlag(c, C_N);
+    }
+
+}
diff --git a/src/java/org/apache/fop/complexscripts/util/CharNormalize.java b/src/java/org/apache/fop/complexscripts/util/CharNormalize.java
new file mode 100644 (file)
index 0000000..a0c8e96
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id$ */
+
+package org.apache.fop.complexscripts.util;
+
+import java.util.Arrays;
+
+/**
+ * <p>Normalization related utilities. N.B. This implementation is an experimental
+ * shortcut, the full version of which would require either using ICU4J or an extraction
+ * of its normalization function, either being a significant undertaking. At present
+ * we handle only specialized decomposition of Indic two part matras.</p>
+ *
+ * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p>
+ */
+public final class CharNormalize {
+
+    // CSOFF: LineLength
+
+    private CharNormalize() {
+    }
+
+    /**
+     * Code points for which a decomposition is known. This array is binary searched, so it
+     * must remain sorted in ascending order, and entry <code>i</code> must correspond to
+     * entry <code>i</code> of {@link #DECOMPOSITIONS}.
+     */
+    private static final int[] DECOMPOSABLES = {
+        // bengali
+        0x09CB,
+        0x09CC,
+        // oriya
+        0x0B4B,
+        0x0B4C,
+        // tamil
+        0x0BCA,
+        0x0BCB,
+        0x0BCC,
+        // malayalam
+        0x0D4A,
+        0x0D4B,
+        0x0D4C,
+        // sinhala
+        0x0DDA,
+        0x0DDC,
+        0x0DDD,
+        0x0DDE,
+    };
+
+    /** Decompositions, parallel to {@link #DECOMPOSABLES}. */
+    private static final int[][] DECOMPOSITIONS = {
+        // bengali
+        { 0x09C7, 0x09BE },             // 0x09CB
+        { 0x09C7, 0x09D7 },             // 0x09CC
+        // oriya
+        { 0x0B47, 0x0B3E },             // 0x0B4B (second part is vowel sign AA; 0x0B4E is unassigned)
+        { 0x0B47, 0x0B57 },             // 0x0B4C
+        // tamil
+        { 0x0BC6, 0x0BBE },             // 0x0BCA
+        { 0x0BC7, 0x0BBE },             // 0x0BCB
+        { 0x0BC6, 0x0BD7 },             // 0x0BCC
+        // malayalam
+        { 0x0D46, 0x0D3E },             // 0x0D4A
+        { 0x0D47, 0x0D3E },             // 0x0D4B
+        { 0x0D46, 0x0D57 },             // 0x0D4C
+        // sinhala
+        { 0x0DD9, 0x0DCA },             // 0x0DDA
+        { 0x0DD9, 0x0DCF },             // 0x0DDC
+        { 0x0DD9, 0x0DCF, 0x0DCA },     // 0x0DDD
+        { 0x0DD9, 0x0DDF },             // 0x0DDE
+    };
+
+    /** Length of the longest entry of {@link #DECOMPOSITIONS}. */
+    private static final int MAX_DECOMPOSITION_LENGTH = 3;
+
+    /**
+     * Determine if a code point has a decomposition known to this class.
+     * @param c a code point
+     * @return true if <code>c</code> is listed as decomposable
+     */
+    public static boolean isDecomposable(int c) {
+        return Arrays.binarySearch(DECOMPOSABLES, c) >= 0;
+    }
+
+    /**
+     * Obtain the maximum number of code points any decomposition may produce.
+     * @return the maximum decomposition length
+     */
+    public static int maximumDecompositionLength() {
+        return MAX_DECOMPOSITION_LENGTH;
+    }
+
+    /**
+     * Decompose a code point.
+     * @param c a code point
+     * @param da an optional buffer, used only when <code>c</code> is not decomposable and the
+     * buffer holds more than one element
+     * @return if <code>c</code> is decomposable, a new array holding exactly its decomposition;
+     * otherwise, if <code>da</code> is usable, <code>da</code> itself with <code>c</code> in its
+     * first element followed by a zero terminator; otherwise a new single element array
+     * holding <code>c</code>
+     */
+    public static int[] decompose(int c, int[] da) {
+        int di = Arrays.binarySearch(DECOMPOSABLES, c);
+        if (di >= 0) {
+            // hand out a copy so that callers cannot modify the shared table
+            return DECOMPOSITIONS[di].clone();
+        } else if ((da != null) && (da.length > 1)) {
+            da[0] = c;
+            da[1] = 0;
+            return da;
+        } else {
+            return new int[] { c };
+        }
+    }
+
+}
index 5b2f62a26363234c6cbc678a4104bc4f3a43d8cf..d95958c19f1072543a282f87052fdd3aa0c0e5cb 100644 (file)
@@ -38,6 +38,7 @@ import org.apache.fop.complexscripts.fonts.GlyphSubstitutionTable;
 import org.apache.fop.complexscripts.fonts.GlyphTable;
 import org.apache.fop.complexscripts.fonts.Positionable;
 import org.apache.fop.complexscripts.fonts.Substitutable;
+import org.apache.fop.complexscripts.util.CharNormalize;
 import org.apache.fop.complexscripts.util.GlyphSequence;
 import org.apache.fop.util.CharUtilities;
 
@@ -491,7 +492,8 @@ public class MultiByteFont extends CIDFont implements Substitutable, Positionabl
     /** {@inheritDoc} */
     public CharSequence performSubstitution(CharSequence cs, String script, String language, List associations) {
         if (gsub != null) {
-            GlyphSequence igs = mapCharsToGlyphs(cs, associations);
+            CharSequence  ncs = normalize(cs, associations);
+            GlyphSequence igs = mapCharsToGlyphs(ncs, associations);
             GlyphSequence ogs = gsub.substitute(igs, script, language);
             if (associations != null) {
                 associations.clear();
@@ -509,7 +511,7 @@ public class MultiByteFont extends CIDFont implements Substitutable, Positionabl
         CharSequence cs, int[][] gpa, String script, String language, List associations) {
         if (gdef != null) {
             GlyphSequence igs = mapCharsToGlyphs(cs, associations);
-            GlyphSequence ogs = gdef.reorderCombiningMarks(igs, gpa, script, language);
+            GlyphSequence ogs = gdef.reorderCombiningMarks(igs, getUnscaledWidths(igs), gpa, script, language);
             if (associations != null) {
                 associations.clear();
                 associations.addAll(ogs.getAssociations());
@@ -521,6 +523,16 @@ public class MultiByteFont extends CIDFont implements Substitutable, Positionabl
         }
     }
 
+    /**
+     * Build an array of unscaled widths having one entry per glyph of a glyph sequence.
+     * Entry <code>i</code> is copied from <code>width[i]</code> when that index exists in
+     * <code>width</code>, and is left at zero otherwise.
+     * NOTE(review): the lookup uses the position within the sequence, not the glyph found at
+     * that position - confirm this is what the consumers of these widths expect.
+     * @param gs a glyph sequence
+     * @return a new array whose length is the glyph count of <code>gs</code>
+     */
+    protected int[] getUnscaledWidths(GlyphSequence gs) {
+        final int count = gs.getGlyphCount();
+        final int[] result = new int[count];
+        for (int k = 0; (k < count) && (k < width.length); k++) {
+            result[k] = width[k];
+        }
+        return result;
+    }
+
     /** {@inheritDoc} */
     public boolean performsPositioning() {
         return gpos != null;
@@ -652,6 +664,37 @@ public class MultiByteFont extends CIDFont implements Substitutable, Positionabl
         return cb;
     }
 
+    /**
+     * Normalize a character sequence by decomposing its decomposable characters.
+     * @param cs a character sequence
+     * @param associations passed through to the decomposition step
+     * @return <code>cs</code> itself if it contains no decomposable character, otherwise the
+     * decomposed sequence
+     */
+    private CharSequence normalize(CharSequence cs, List associations) {
+        if (!hasDecomposable(cs)) {
+            return cs;
+        }
+        return decompose(cs, associations);
+    }
+
+    /**
+     * Determine if a character sequence contains at least one character that
+     * <code>CharNormalize</code> reports as decomposable.
+     * @param cs a character sequence
+     * @return true if a decomposable character is present
+     */
+    private boolean hasDecomposable(CharSequence cs) {
+        final int length = cs.length();
+        int index = 0;
+        while (index < length) {
+            if (CharNormalize.isDecomposable(cs.charAt(index))) {
+                return true;
+            }
+            index++;
+        }
+        return false;
+    }
+
+    /**
+     * Produce a copy of a character sequence in which each character is replaced by the
+     * result of <code>CharNormalize.decompose</code>.
+     * @param cs a character sequence
+     * @param associations not used by this method
+     * (NOTE(review): associations are not adjusted for the inserted characters - confirm
+     * that callers do not depend on that)
+     * @return the decomposed character sequence
+     */
+    private CharSequence decompose(CharSequence cs, List associations) {
+        // the buffer is local to this call, so the synchronization of StringBuffer is unnecessary
+        StringBuilder sb = new StringBuilder(cs.length());
+        int[] daBuffer = new int[CharNormalize.maximumDecompositionLength()];
+        for (int i = 0, n = cs.length(); i < n; i++) {
+            int cc = cs.charAt(i);
+            int[] da = CharNormalize.decompose(cc, daBuffer);
+            for (int j = 0; j < da.length; j++) {
+                if (da[j] > 0) {
+                    sb.append((char) da[j]);
+                } else {
+                    // a non-positive element terminates the decomposition written into the buffer
+                    break;
+                }
+            }
+        }
+        return sb;
+    }
+
     @Override
     public boolean hasFeature(int tableType, String script, String language, String feature) {
         GlyphTable table;